Merge branch 'main' into msvc-rocm

This commit is contained in:
Ean Garvey
2023-08-24 01:04:39 -05:00
committed by GitHub
24 changed files with 817 additions and 312 deletions

View File

@@ -29,14 +29,8 @@ from brevitas_examples.llm.llm_quant.quantize import quantize_model
from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
def brevitasmatmul_rhs_group_quant〡shape(
lhs: List[int],
rhs: List[int],
rhs_scale: List[int],
rhs_zero_point: List[int],
rhs_bit_width: int,
rhs_group_size: int,
) -> List[int]:
# fmt: off
def quantmatmul_rhs_group_quant〡shape(lhs: List[int], rhs: List[int], rhs_scale: List[int], rhs_zero_point: List[int], rhs_bit_width: int, rhs_group_size: int) -> List[int]:
if len(lhs) == 3 and len(rhs) == 2:
return [lhs[0], lhs[1], rhs[0]]
elif len(lhs) == 2 and len(rhs) == 2:
@@ -45,30 +39,21 @@ def brevitasmatmul_rhs_group_quant〡shape(
raise ValueError("Input shapes not supported.")
def brevitasmatmul_rhs_group_quant〡dtype(
lhs_rank_dtype: Tuple[int, int],
rhs_rank_dtype: Tuple[int, int],
rhs_scale_rank_dtype: Tuple[int, int],
rhs_zero_point_rank_dtype: Tuple[int, int],
rhs_bit_width: int,
rhs_group_size: int,
) -> int:
def quantmatmul_rhs_group_quant〡dtype(lhs_rank_dtype: Tuple[int, int], rhs_rank_dtype: Tuple[int, int], rhs_scale_rank_dtype: Tuple[int, int], rhs_zero_point_rank_dtype: Tuple[int, int], rhs_bit_width: int, rhs_group_size: int) -> int:
# output dtype is the dtype of the lhs float input
lhs_rank, lhs_dtype = lhs_rank_dtype
return lhs_dtype
def brevitasmatmul_rhs_group_quant〡has_value_semantics(
lhs, rhs, rhs_scale, rhs_zero_point, rhs_bit_width, rhs_group_size
) -> None:
def quantmatmul_rhs_group_quant〡has_value_semantics(lhs, rhs, rhs_scale, rhs_zero_point, rhs_bit_width, rhs_group_size) -> None:
return
brevitas_matmul_rhs_group_quant_library = [
brevitasmatmul_rhs_group_quant〡shape,
brevitasmatmul_rhs_group_quant〡dtype,
brevitasmatmul_rhs_group_quant〡has_value_semantics,
]
quantmatmul_rhs_group_quant〡shape,
quantmatmul_rhs_group_quant〡dtype,
quantmatmul_rhs_group_quant〡has_value_semantics]
# fmt: on
global_device = "cuda"
global_precision = "fp16"
@@ -244,7 +229,7 @@ class H2OGPTSHARKModel(torch.nn.Module):
ts_graph,
[*h2ogptCompileInput],
output_type=torch_mlir.OutputType.TORCH,
backend_legal_ops=["brevitas.matmul_rhs_group_quant"],
backend_legal_ops=["quant.matmul_rhs_group_quant"],
extra_library=brevitas_matmul_rhs_group_quant_library,
use_tracing=False,
verbose=False,
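For reference, a minimal standalone sketch (not part of this change) of how the renamed quantmatmul_rhs_group_quant〡shape helper above resolves output shapes; it drops the quantization parameters, the concrete dimensions are invented, and it assumes the quantized RHS is stored as (out_features, in_features):

from typing import List

def expected_output_shape(lhs: List[int], rhs: List[int]) -> List[int]:
    # batched matmul against a 2-D quantized RHS: (B, M, K) x (N, K) -> (B, M, N)
    if len(lhs) == 3 and len(rhs) == 2:
        return [lhs[0], lhs[1], rhs[0]]
    # plain matmul: (M, K) x (N, K) -> (M, N)
    elif len(lhs) == 2 and len(rhs) == 2:
        return [lhs[0], rhs[0]]
    raise ValueError("Input shapes not supported.")

# e.g. a [1, 19, 4096] activation against an [11008, 4096] quantized weight
assert expected_output_shape([1, 19, 4096], [11008, 4096]) == [1, 19, 11008]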

View File

@@ -37,7 +37,8 @@ from apps.language_models.src.model_wrappers.vicuna4 import (
)
from apps.language_models.src.model_wrappers.vicuna_model import (
FirstVicuna,
SecondVicuna,
SecondVicuna7B,
SecondVicuna13B,
)
from apps.language_models.utils import (
get_vmfb_from_path,
@@ -48,13 +49,12 @@ from shark.shark_importer import import_with_fx
from shark.shark_inference import SharkInference
parser = argparse.ArgumentParser(
prog="vicuna runner",
description="runs a vicuna model",
)
parser.add_argument(
"--precision", "-p", default="fp32", help="fp32, fp16, int8, int4"
"--precision", "-p", default="int8", help="fp32, fp16, int8, int4"
)
parser.add_argument("--device", "-d", default="cuda", help="vulkan, cpu, cuda")
parser.add_argument(
@@ -106,7 +106,7 @@ parser.add_argument(
"--model_name",
type=str,
default="vicuna",
choices=["vicuna", "llama2_7b", "llama2_70b"],
choices=["vicuna", "llama2_7b", "llama2_13b", "llama2_70b"],
help="Specify which model to run.",
)
parser.add_argument(
@@ -129,7 +129,7 @@ parser.add_argument(
)
# fmt: off
def brevitasmatmul_rhs_group_quant〡shape(lhs: List[int], rhs: List[int], rhs_scale: List[int], rhs_zero_point: List[int], rhs_bit_width: int, rhs_group_size: int) -> List[int]:
def quantmatmul_rhs_group_quant〡shape(lhs: List[int], rhs: List[int], rhs_scale: List[int], rhs_zero_point: List[int], rhs_bit_width: int, rhs_group_size: int) -> List[int]:
if len(lhs) == 3 and len(rhs) == 2:
return [lhs[0], lhs[1], rhs[0]]
elif len(lhs) == 2 and len(rhs) == 2:
@@ -138,20 +138,20 @@ def brevitasmatmul_rhs_group_quant〡shape(lhs: List[int], rhs: List[int], rh
raise ValueError("Input shapes not supported.")
def brevitasmatmul_rhs_group_quant〡dtype(lhs_rank_dtype: Tuple[int, int], rhs_rank_dtype: Tuple[int, int], rhs_scale_rank_dtype: Tuple[int, int], rhs_zero_point_rank_dtype: Tuple[int, int], rhs_bit_width: int, rhs_group_size: int) -> int:
def quantmatmul_rhs_group_quant〡dtype(lhs_rank_dtype: Tuple[int, int], rhs_rank_dtype: Tuple[int, int], rhs_scale_rank_dtype: Tuple[int, int], rhs_zero_point_rank_dtype: Tuple[int, int], rhs_bit_width: int, rhs_group_size: int) -> int:
# output dtype is the dtype of the lhs float input
lhs_rank, lhs_dtype = lhs_rank_dtype
return lhs_dtype
def brevitasmatmul_rhs_group_quant〡has_value_semantics(lhs, rhs, rhs_scale, rhs_zero_point, rhs_bit_width, rhs_group_size) -> None:
def quantmatmul_rhs_group_quant〡has_value_semantics(lhs, rhs, rhs_scale, rhs_zero_point, rhs_bit_width, rhs_group_size) -> None:
return
brevitas_matmul_rhs_group_quant_library = [
brevitasmatmul_rhs_group_quant〡shape,
brevitasmatmul_rhs_group_quant〡dtype,
brevitasmatmul_rhs_group_quant〡has_value_semantics]
quantmatmul_rhs_group_quant〡shape,
quantmatmul_rhs_group_quant〡dtype,
quantmatmul_rhs_group_quant〡has_value_semantics]
# fmt: on
@@ -187,13 +187,14 @@ class VicunaBase(SharkLLMBase):
return vicuna_model
def combine_mlir_scripts(
self, first_vicuna_mlir, second_vicuna_mlir, output_name
self, first_vicuna_mlir, second_vicuna_mlir, output_name, model_name=None
):
print(f"[DEBUG] combining first and second mlir")
print(f"[DEBIG] output_name = {output_name}")
maps1 = []
maps2 = []
constants = set()
constants_1 = set()
constants_2 = set()
f1 = []
f2 = []
@@ -204,7 +205,7 @@ class VicunaBase(SharkLLMBase):
if re.search("#map\d*\s*=", line):
maps1.append(line)
elif re.search("arith.constant", line):
constants.add(line)
constants_1.add(line)
elif not re.search("module", line):
line = re.sub("forward", "first_vicuna_forward", line)
f1.append(line)
@@ -230,7 +231,7 @@ class VicunaBase(SharkLLMBase):
elif "global_seed" in line:
continue
elif re.search("arith.constant", line):
constants.add(line)
constants_2.add(line)
elif not re.search("module", line):
line = re.sub("forward", "second_vicuna_forward", line)
f2.append(line)
@@ -253,15 +254,25 @@ class VicunaBase(SharkLLMBase):
module_end = "}"
global_vars = []
vnames = []
global_var_loading1 = []
global_var_loading2 = []
global_var_loading1 = dict()
global_var_loading2 = dict()
print(f"[DEBUG] processing constants")
counter = 0
constants = list(constants)
# in both 1 and 2
constants = [(e, "") for e in list(constants_1 & constants_2)]
# only in 1
constants.extend(
[(e, "_1") for e in list(constants_1.difference(constants_2))]
)
# only in 2
constants.extend(
[(e, "_2") for e in list(constants_2.difference(constants_1))]
)
del constants_1, constants_2
gc.collect()
while constants:
constant = constants.pop(0)
constant, vname_suf = constants.pop(0)
vname, vbody = constant.split("=")
vname = re.sub("%", "", vname)
vname = vname.strip()
@@ -271,41 +282,42 @@ class VicunaBase(SharkLLMBase):
print(constant)
vdtype = vbody.split(":")[-1].strip()
fixed_vdtype = vdtype
if "c1_i64" in vname:
print(constant)
counter += 1
if counter == 2:
counter = 0
print("detected duplicate")
continue
vnames.append(vname)
noinline = "{noinline}" if "tensor" in fixed_vdtype else ""
if "true" not in vname:
global_vars.append(
f"ml_program.global public @{vname}({vbody}) : {fixed_vdtype}"
)
global_var_loading1.append(
f"\t\t%{vname} = ml_program.global_load_const @{vname} : {fixed_vdtype}"
)
global_var_loading2.append(
f"\t\t%{vname} = ml_program.global_load_const @{vname} : {fixed_vdtype}"
f"util.global private @{vname}{vname_suf} {noinline} = {vbody} : {fixed_vdtype}"
)
if vname_suf != "_2":
global_var_loading1[
f"\t\t%{vname} = util.global_load @{vname}{vname_suf} : {fixed_vdtype}"
] = ""
if vname_suf != "_1":
global_var_loading2[
f"\t\t%{vname} = util.global_load @{vname}{vname_suf} : {fixed_vdtype}"
] = ""
else:
global_vars.append(
f"ml_program.global public @{vname}({vbody}) : i1"
)
global_var_loading1.append(
f"\t\t%{vname} = ml_program.global_load_const @{vname} : i1"
)
global_var_loading2.append(
f"\t\t%{vname} = ml_program.global_load_const @{vname} : i1"
f"util.global private @{vname}{vname_suf} = {vbody} : i1"
)
if vname_suf != "_2":
global_var_loading1[
f"\t\t%{vname} = util.global_load @{vname}{vname_suf} : i1"
] = ""
if vname_suf != "_1":
global_var_loading2[
f"\t\t%{vname} = util.global_load @{vname}{vname_suf} : i1"
] = ""
del constants
gc.collect()
new_f1, new_f2 = [], []
print(f"[DEBUG] processing f1")
for line in f1:
if "func.func" in line:
new_f1.append(line)
for global_var in global_var_loading1:
for global_var in global_var_loading1.keys():
new_f1.append(global_var)
else:
new_f1.append(line)
@@ -314,7 +326,7 @@ class VicunaBase(SharkLLMBase):
for line in f2:
if "func.func" in line:
new_f2.append(line)
for global_var in global_var_loading2:
for global_var in global_var_loading2.keys():
if (
"c20_i64 = arith.addi %dim_i64, %c1_i64 : i64"
in global_var
@@ -322,10 +334,7 @@ class VicunaBase(SharkLLMBase):
print(global_var)
new_f2.append(global_var)
else:
if "c20_i64 = arith.addi %dim_i64, %c1_i64 : i64" in line:
new_f2.append("%" + line)
else:
new_f2.append(line)
new_f2.append(line)
f1 = new_f1
f2 = new_f2
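The constant handling above replaces the single constants set with a three-way split; a small sketch (the constant strings are made up) of the partitioning and suffixing scheme:

# constants emitted by both sub-modules keep their name; module-specific ones get _1/_2
constants_1 = {"%cst = arith.constant 0.0 : f32", "%c19 = arith.constant 19 : index"}
constants_2 = {"%cst = arith.constant 0.0 : f32", "%c20 = arith.constant 20 : index"}
tagged = [(c, "") for c in constants_1 & constants_2]
tagged += [(c, "_1") for c in constants_1 - constants_2]
tagged += [(c, "_2") for c in constants_2 - constants_1]
# each (constant, suffix) pair becomes one `util.global private @<name><suffix>` declaration,
# and a matching `util.global_load` is emitted into first_vicuna_forward and/or
# second_vicuna_forward depending on the suffix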
@@ -352,7 +361,8 @@ class VicunaBase(SharkLLMBase):
f_.writelines(line + "\n" for line in global_vars)
f_.writelines(line + "\n" for line in f1)
f_.writelines(line + "\n" for line in f2)
f_.writelines(line + "\n" for line in [module_end])
if not (model_name and "llama2_13b" in model_name):
f_.writelines(line + "\n" for line in [module_end])
del maps1
del maps2
@@ -406,7 +416,6 @@ class VicunaBase(SharkLLMBase):
_past_key_values = output["past_key_values"]
_token = int(torch.argmax(_logits[:, -1, :], dim=1)[0])
else:
print(len(output))
_logits = torch.tensor(output[0])
_past_key_values = torch.tensor(output[1:])
_token = torch.argmax(_logits[:, -1, :], dim=1)
@@ -440,7 +449,12 @@ class ShardedVicuna(VicunaBase):
compressed=False,
extra_args_cmd=[],
) -> None:
super().__init__(model_name, hf_model_path, max_num_tokens, extra_args_cmd=extra_args_cmd)
super().__init__(
model_name,
hf_model_path,
max_num_tokens,
extra_args_cmd=extra_args_cmd,
)
self.max_sequence_length = 256
self.device = device
self.precision = precision
@@ -839,7 +853,7 @@ class ShardedVicuna(VicunaBase):
inputs0[2],
),
output_type="torch",
backend_legal_ops=["brevitas.matmul_rhs_group_quant"],
backend_legal_ops=["quant.matmul_rhs_group_quant"],
extra_library=brevitas_matmul_rhs_group_quant_library,
use_tracing=False,
verbose=False,
@@ -883,7 +897,7 @@ class ShardedVicuna(VicunaBase):
pkv1_placeholder,
),
output_type="torch",
backend_legal_ops=["brevitas.matmul_rhs_group_quant"],
backend_legal_ops=["quant.matmul_rhs_group_quant"],
extra_library=brevitas_matmul_rhs_group_quant_library,
use_tracing=False,
verbose=False,
@@ -946,7 +960,8 @@ class ShardedVicuna(VicunaBase):
"--iree-vm-target-truncate-unsupported-floats",
"--iree-codegen-check-ir-before-llvm-conversion=false",
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
] + self.extra_args,
]
+ self.extra_args,
)
module.load_module(vmfb_path)
modules.append(module)
@@ -1012,7 +1027,8 @@ class ShardedVicuna(VicunaBase):
"--iree-vm-target-truncate-unsupported-floats",
"--iree-codegen-check-ir-before-llvm-conversion=false",
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
] + self.extra_args,
]
+ self.extra_args,
)
module.load_module(vmfb_path)
modules.append(module)
@@ -1228,7 +1244,12 @@ class UnshardedVicuna(VicunaBase):
cache_vicunas=False,
extra_args_cmd=[],
) -> None:
super().__init__(model_name, hf_model_path, max_num_tokens, extra_args_cmd=extra_args_cmd)
super().__init__(
model_name,
hf_model_path,
max_num_tokens,
extra_args_cmd=extra_args_cmd,
)
if "llama2" in self.model_name and hf_auth_token == None:
raise ValueError(
"HF auth token required. Pass it using --hf_auth_token flag."
@@ -1236,6 +1257,8 @@ class UnshardedVicuna(VicunaBase):
self.hf_auth_token = hf_auth_token
if self.model_name == "llama2_7b":
self.hf_model_path = "meta-llama/Llama-2-7b-chat-hf"
elif self.model_name == "llama2_13b":
self.hf_model_path = "meta-llama/Llama-2-13b-chat-hf"
elif self.model_name == "llama2_70b":
self.hf_model_path = "meta-llama/Llama-2-70b-chat-hf"
print(f"[DEBUG] hf model name: {self.hf_model_path}")
@@ -1318,7 +1341,7 @@ class UnshardedVicuna(VicunaBase):
new_lines.append(line)
return "\n".join(new_lines)
def write_in_dynamic_inputs1(self, module):
def write_in_dynamic_inputs1(self, module, model_name):
print("[DEBUG] writing dynamic inputs to second vicuna")
def remove_constant_dim(line):
@@ -1337,7 +1360,7 @@ class UnshardedVicuna(VicunaBase):
line = re.sub("c19", "dim", line)
if " 19," in line:
line = re.sub(" 19,", " %dim,", line)
if "20x" in line:
if "x20x" in line or "<20x" in line:
line = re.sub("20x", "?x", line)
line = re.sub("tensor.empty\(\)", "tensor.empty(%dimp1)", line)
if " 20," in line:
@@ -1347,12 +1370,21 @@ class UnshardedVicuna(VicunaBase):
module = module.splitlines()
new_lines = []
# Using a while loop and the pop method to avoid creating a copy of module
if "llama2_13b" in model_name:
pkv_tensor_shape = "tensor<1x40x?x128x"
else:
pkv_tensor_shape = "tensor<1x32x?x128x"
if self.precision in ["fp16", "int4", "int8"]:
pkv_tensor_shape += "f16>"
else:
pkv_tensor_shape += "f32>"
while module:
line = module.pop(0)
if "%c19_i64 = arith.constant 19 : i64" in line:
new_lines.append("%c2 = arith.constant 2 : index")
new_lines.append(
f"%dim_4_int = tensor.dim %arg1, %c2 : tensor<1x32x?x128x{'f16' if self.precision == 'fp16' else 'f32'}>"
f"%dim_4_int = tensor.dim %arg1, %c2 : {pkv_tensor_shape}"
)
new_lines.append(
"%dim_i64 = arith.index_cast %dim_4_int : index to i64"
@@ -1363,7 +1395,7 @@ class UnshardedVicuna(VicunaBase):
if "%c20_i64 = arith.constant 20 : i64" in line:
new_lines.append("%c1_i64 = arith.constant 1 : i64")
new_lines.append(
"c20_i64 = arith.addi %dim_i64, %c1_i64 : i64"
"%c20_i64 = arith.addi %dim_i64, %c1_i64 : i64"
)
new_lines.append(
"%dimp1 = arith.index_cast %c20_i64 : i64 to index"
@@ -1377,6 +1409,7 @@ class UnshardedVicuna(VicunaBase):
def compile(self, download_vmfb=False):
# Testing : DO NOT Download Vmfbs if not found. Modify later
# download vmfbs for A100
print(f"Looking into gs://shark_tank/{self.model_name}/unsharded/vmfb/{self.vicuna_vmfb_path.name}")
if not self.vicuna_vmfb_path.exists() and download_vmfb:
download_public_file(
f"gs://shark_tank/{self.model_name}/unsharded/vmfb/{self.vicuna_vmfb_path.name}",
@@ -1402,16 +1435,20 @@ class UnshardedVicuna(VicunaBase):
mlir_generated = False
if self.load_mlir_from_shark_tank:
# download MLIR from shark tank
download_public_file(
f"gs://shark_tank/{self.model_name}/unsharded/mlir/{self.vicuna_mlir_path.name}",
self.vicuna_mlir_path.absolute(),
single_file=True,
)
if self.vicuna_mlir_path.exists():
with open(self.vicuna_mlir_path, "rb") as f:
combined_module = f.read()
mlir_generated = True
else:
for suffix in ["mlir", "mlirbc"]:
self.vicuna_mlir_path = self.get_model_path(suffix)
download_public_file(
f"gs://shark_tank/{self.model_name}/unsharded/mlir/{self.vicuna_mlir_path.name}",
self.vicuna_mlir_path.absolute(),
single_file=True,
)
if self.vicuna_mlir_path.exists():
with open(self.vicuna_mlir_path, "rb") as f:
combined_module = f.read()
mlir_generated = True
break
self.vicuna_mlir_path = self.get_model_path("mlir")
if not mlir_generated:
print(
f"[DEBUG] failed to download {self.vicuna_mlir_path.name} from shark tank"
)
@@ -1447,10 +1484,11 @@ class UnshardedVicuna(VicunaBase):
self.hf_auth_token,
)
print(f"[DEBUG] generating torchscript graph")
is_f16 = self.precision in ["fp16", "int4"]
ts_graph = import_with_fx(
model,
firstVicunaCompileInput,
is_f16=self.precision == "fp16",
is_f16=is_f16,
precision=self.precision,
f16_input_mask=[False, False],
mlir_type="torchscript",
@@ -1471,9 +1509,7 @@ class UnshardedVicuna(VicunaBase):
ts_graph,
[*firstVicunaCompileInput],
output_type=torch_mlir.OutputType.TORCH,
backend_legal_ops=[
"brevitas.matmul_rhs_group_quant"
],
backend_legal_ops=["quant.matmul_rhs_group_quant"],
extra_library=brevitas_matmul_rhs_group_quant_library,
use_tracing=False,
verbose=False,
@@ -1505,6 +1541,7 @@ class UnshardedVicuna(VicunaBase):
if self.cache_vicunas:
with open(f"first_{self.precision}.mlir", "w+") as f:
f.write(first_module)
print("Finished writing IR after dynamic")
if Path(f"second_{self.precision}.mlir").exists():
print(f"loading second_{self.precision}.mlir")
@@ -1515,33 +1552,49 @@ class UnshardedVicuna(VicunaBase):
compilation_input_ids = torch.zeros(
[1, 1], dtype=torch.int64
)
if self.model_name == "llama2_13b":
dim1 = 40
total_tuple = 80
else:
dim1 = 32
total_tuple = 64
pkv = tuple(
(torch.zeros([1, 32, 19, 128], dtype=torch.float32))
for _ in range(64)
(torch.zeros([1, dim1, 19, 128], dtype=torch.float32))
for _ in range(total_tuple)
)
secondVicunaCompileInput = (compilation_input_ids,) + pkv
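The placeholder cache above scales with the model: llama2_13b has 40 decoder layers (80 flat key/value tensors, 40 KV heads), while the 7B-class models have 32 layers and 32 heads. A hedged sketch of the same construction, with the sequence length 19 taken from the code above:

import torch

def placeholder_pkv(model_name: str):
    # llama2_13b: 40 layers / 40 KV heads; 7B-class models: 32 layers / 32 KV heads
    layers, heads = (40, 40) if model_name == "llama2_13b" else (32, 32)
    return tuple(
        torch.zeros([1, heads, 19, 128], dtype=torch.float32)
        for _ in range(2 * layers)
    )

assert len(placeholder_pkv("llama2_13b")) == 80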
model = SecondVicuna(
self.hf_model_path,
self.precision,
self.weight_group_size,
self.model_name,
self.hf_auth_token,
)
if self.model_name == "llama2_13b":
model = SecondVicuna13B(
self.hf_model_path,
self.precision,
self.weight_group_size,
self.model_name,
self.hf_auth_token,
)
else:
model = SecondVicuna7B(
self.hf_model_path,
self.precision,
self.weight_group_size,
self.model_name,
self.hf_auth_token,
)
print(f"[DEBUG] generating torchscript graph")
is_f16 = self.precision in ["fp16", "int4"]
ts_graph = import_with_fx(
model,
secondVicunaCompileInput,
is_f16=self.precision == "fp16",
is_f16=is_f16,
precision=self.precision,
f16_input_mask=[False] + [True] * 64,
f16_input_mask=[False] + [True] * total_tuple,
mlir_type="torchscript",
)
del model
if self.precision == "fp16":
if self.precision in ["fp16", "int4"]:
secondVicunaCompileInput = get_f16_inputs(
secondVicunaCompileInput,
True,
f16_input_mask=[False] + [True] * 64,
f16_input_mask=[False] + [True] * total_tuple,
)
secondVicunaCompileInput = list(secondVicunaCompileInput)
for i in range(len(secondVicunaCompileInput)):
@@ -1558,14 +1611,11 @@ class UnshardedVicuna(VicunaBase):
ts_graph,
[*secondVicunaCompileInput],
output_type=torch_mlir.OutputType.TORCH,
backend_legal_ops=[
"brevitas.matmul_rhs_group_quant"
],
backend_legal_ops=["quant.matmul_rhs_group_quant"],
extra_library=brevitas_matmul_rhs_group_quant_library,
use_tracing=False,
verbose=False,
)
print(f"[DEBUG] converting torch to linalg")
run_pipeline_with_repro_report(
second_module,
"builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
@@ -1589,11 +1639,13 @@ class UnshardedVicuna(VicunaBase):
str(second_module)
)
if self.cache_vicunas:
with open(f"second_{self.precision}.mlir", "w+") as f:
with open(f"second_{self.precision}.mlir", 'w') as f:
f.write(second_module)
print("Finished writing IR after dynamic")
combined_module = self.combine_mlir_scripts(
first_module, second_module, self.vicuna_mlir_path
first_module, second_module, self.vicuna_mlir_path, self.model_name
)
del first_module, second_module
@@ -1612,7 +1664,8 @@ class UnshardedVicuna(VicunaBase):
"--iree-vm-target-truncate-unsupported-floats",
"--iree-codegen-check-ir-before-llvm-conversion=false",
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
] + self.extra_args,
]
+ self.extra_args,
)
print("Saved vic vmfb at ", str(path))
shark_module.load_module(path)
@@ -1629,7 +1682,7 @@ class UnshardedVicuna(VicunaBase):
)
return res_str
def generate(self, prompt, cli=True):
def generate(self, prompt, cli):
# TODO: refactor for cleaner integration
if self.shark_model is None:
self.compile()
@@ -1637,7 +1690,7 @@ class UnshardedVicuna(VicunaBase):
params = {"prompt": prompt, "is_first": True, "fv": self.shark_model}
generated_token_op = self.generate_new_token(
params=params, sharded=False, cli=False
params=params, sharded=False, cli=cli
)
token = generated_token_op["token"]
@@ -1660,7 +1713,7 @@ class UnshardedVicuna(VicunaBase):
}
generated_token_op = self.generate_new_token(
params=params, sharded=False
params=params, sharded=False, cli=cli
)
token = generated_token_op["token"]
@@ -1688,6 +1741,79 @@ class UnshardedVicuna(VicunaBase):
pass
# NOTE: Each `model_name` should have its own start message
start_message = {
"llama2_7b": (
"System: You are a helpful, respectful and honest assistant. Always answer "
"as helpfully as possible, while being safe. Your answers should not "
"include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal "
"content. Please ensure that your responses are socially unbiased and positive "
"in nature. If a question does not make any sense, or is not factually coherent, "
"explain why instead of answering something not correct. If you don't know the "
"answer to a question, please don't share false information."
),
"llama2_13b": (
"System: You are a helpful, respectful and honest assistant. Always answer "
"as helpfully as possible, while being safe. Your answers should not "
"include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal "
"content. Please ensure that your responses are socially unbiased and positive "
"in nature. If a question does not make any sense, or is not factually coherent, "
"explain why instead of answering something not correct. If you don't know the "
"answer to a question, please don't share false information."
),
"llama2_70b": (
"System: You are a helpful, respectful and honest assistant. Always answer "
"as helpfully as possible, while being safe. Your answers should not "
"include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal "
"content. Please ensure that your responses are socially unbiased and positive "
"in nature. If a question does not make any sense, or is not factually coherent, "
"explain why instead of answering something not correct. If you don't know the "
"answer to a question, please don't share false information."
),
"StableLM": (
"<|SYSTEM|># StableLM Tuned (Alpha version)"
"\n- StableLM is a helpful and harmless open-source AI language model "
"developed by StabilityAI."
"\n- StableLM is excited to be able to help the user, but will refuse "
"to do anything that could be considered harmful to the user."
"\n- StableLM is more than just an information source, StableLM is also "
"able to write poetry, short stories, and make jokes."
"\n- StableLM will refuse to participate in anything that "
"could harm a human."
),
"vicuna": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"vicuna4": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"vicuna1p3": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"codegen": "",
}
def create_prompt(model_name, history):
global start_message
system_message = start_message[model_name]
conversation = "".join(
[
"".join(["<|USER|>" + item[0], "<|ASSISTANT|>" + item[1]])
for item in history
]
)
msg = system_message + conversation
msg = msg.strip()
return msg
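A hypothetical call to create_prompt, showing how a chat history is flattened into a single prompt string (the history contents are invented):

history = [["What is MLIR?", "MLIR is a compiler infrastructure."]]
prompt = create_prompt("llama2_7b", history)
# prompt == start_message["llama2_7b"] + "<|USER|>What is MLIR?<|ASSISTANT|>MLIR is a compiler infrastructure."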
if __name__ == "__main__":
args, unknown = parser.parse_known_args()
@@ -1750,28 +1876,20 @@ if __name__ == "__main__":
answer to a question, please don't share false information."""
prologue_prompt = "ASSISTANT:\n"
from apps.stable_diffusion.web.ui.stablelm_ui import chat, set_vicuna_model
history = []
set_vicuna_model(vic)
model_list = {
"vicuna": "vicuna=>TheBloke/vicuna-7B-1.1-HF",
"llama2_7b": "llama2_7b=>meta-llama/Llama-2-7b-chat-hf",
"llama2_13b": "llama2_13b=>meta-llama/Llama-2-13b-chat-hf",
"llama2_70b": "llama2_70b=>meta-llama/Llama-2-70b-chat-hf",
}
while True:
# TODO: Add break condition from user input
user_prompt = input("User: ")
history.append([user_prompt, ""])
history = list(
chat(
system_message,
history,
model=model_list[args.model_name],
devices=args.device,
precision=args.precision,
config_file=None,
cli=args.cli,
)
)[0]
prompt = create_prompt(args.model_name, history)
for text, msg in vic.generate(prompt, cli=True):
if "formatted" in msg:
print("Response:", text)
history[-1][1] = text

View File

@@ -47,7 +47,7 @@ from apps.language_models.src.model_wrappers.vicuna_sharded_model import (
)
from apps.language_models.src.model_wrappers.vicuna_model import (
FirstVicuna,
SecondVicuna,
SecondVicuna7B,
)
from apps.language_models.utils import (
get_vmfb_from_path,

View File

@@ -28,7 +28,7 @@ class FirstVicuna(torch.nn.Module):
weight_bit_width = 4 if precision == "int4" else 8
quantize_model(
get_model_impl(self.model).layers,
dtype=torch.float32,
dtype=torch.float16 if precision == "int4" else torch.float32,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float",
@@ -50,7 +50,7 @@ class FirstVicuna(torch.nn.Module):
return tuple(return_vals)
class SecondVicuna(torch.nn.Module):
class SecondVicuna7B(torch.nn.Module):
def __init__(
self,
model_path,
@@ -76,7 +76,7 @@ class SecondVicuna(torch.nn.Module):
weight_bit_width = 4 if precision == "int4" else 8
quantize_model(
get_model_impl(self.model).layers,
dtype=torch.float32,
dtype=torch.float16 if precision == "int4" else torch.float32,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float",
@@ -297,6 +297,296 @@ class SecondVicuna(torch.nn.Module):
return tuple(return_vals)
class SecondVicuna13B(torch.nn.Module):
def __init__(
self,
model_path,
precision="fp32",
weight_group_size=128,
model_name="vicuna",
hf_auth_token: str = None,
):
super().__init__()
kwargs = {"torch_dtype": torch.float32}
if "llama2" in model_name:
kwargs["use_auth_token"] = hf_auth_token
self.model = AutoModelForCausalLM.from_pretrained(
model_path, low_cpu_mem_usage=True, **kwargs
)
if precision in ["int4", "int8"]:
print("Second Vicuna applying weight quantization..")
weight_bit_width = 4 if precision == "int4" else 8
quantize_model(
get_model_impl(self.model).layers,
dtype=torch.float16 if precision == "int4" else torch.float32,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float",
weight_quant_type="asym",
weight_quant_granularity="per_group",
weight_group_size=weight_group_size,
quantize_weight_zero_point=False,
)
print("Weight quantization applied.")
def forward(
self,
i0,
i1,
i2,
i3,
i4,
i5,
i6,
i7,
i8,
i9,
i10,
i11,
i12,
i13,
i14,
i15,
i16,
i17,
i18,
i19,
i20,
i21,
i22,
i23,
i24,
i25,
i26,
i27,
i28,
i29,
i30,
i31,
i32,
i33,
i34,
i35,
i36,
i37,
i38,
i39,
i40,
i41,
i42,
i43,
i44,
i45,
i46,
i47,
i48,
i49,
i50,
i51,
i52,
i53,
i54,
i55,
i56,
i57,
i58,
i59,
i60,
i61,
i62,
i63,
i64,
i65,
i66,
i67,
i68,
i69,
i70,
i71,
i72,
i73,
i74,
i75,
i76,
i77,
i78,
i79,
i80,
):
# input_ids = input_tuple[0]
# input_tuple = torch.unbind(pkv, dim=0)
token = i0
past_key_values = (
(i1, i2),
(
i3,
i4,
),
(
i5,
i6,
),
(
i7,
i8,
),
(
i9,
i10,
),
(
i11,
i12,
),
(
i13,
i14,
),
(
i15,
i16,
),
(
i17,
i18,
),
(
i19,
i20,
),
(
i21,
i22,
),
(
i23,
i24,
),
(
i25,
i26,
),
(
i27,
i28,
),
(
i29,
i30,
),
(
i31,
i32,
),
(
i33,
i34,
),
(
i35,
i36,
),
(
i37,
i38,
),
(
i39,
i40,
),
(
i41,
i42,
),
(
i43,
i44,
),
(
i45,
i46,
),
(
i47,
i48,
),
(
i49,
i50,
),
(
i51,
i52,
),
(
i53,
i54,
),
(
i55,
i56,
),
(
i57,
i58,
),
(
i59,
i60,
),
(
i61,
i62,
),
(
i63,
i64,
),
(
i65,
i66,
),
(
i67,
i68,
),
(
i69,
i70,
),
(
i71,
i72,
),
(
i73,
i74,
),
(
i75,
i76,
),
(
i77,
i78,
),
(
i79,
i80,
),
)
op = self.model(
input_ids=token, use_cache=True, past_key_values=past_key_values
)
return_vals = []
return_vals.append(op.logits)
temp_past_key_values = op.past_key_values
for item in temp_past_key_values:
return_vals.append(item[0])
return_vals.append(item[1])
return tuple(return_vals)
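The 80 positional arguments above are only there to keep the traced signature flat; the pairing they reconstruct is the generic one below (a sketch for illustration, not used by the class):

def regroup_past_key_values(flat_tensors):
    # 80 flat tensors for the 13B model -> 40 (key, value) tuples, one per decoder layer
    assert len(flat_tensors) % 2 == 0
    return tuple(
        (flat_tensors[i], flat_tensors[i + 1])
        for i in range(0, len(flat_tensors), 2)
    )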
class CombinedModel(torch.nn.Module):
def __init__(
self,
@@ -305,7 +595,8 @@ class CombinedModel(torch.nn.Module):
):
super().__init__()
self.first_vicuna = FirstVicuna(first_vicuna_model_path)
self.second_vicuna = SecondVicuna(second_vicuna_model_path)
# NOT using this path for 13B currently, hence using `SecondVicuna7B`.
self.second_vicuna = SecondVicuna7B(second_vicuna_model_path)
def forward(self, input_ids):
first_output = self.first_vicuna(input_ids=input_ids)

View File

@@ -136,7 +136,8 @@ from brevitas_examples.llm.llm_quant.quantize import quantize_model
from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
def brevitasmatmul_rhs_group_quant〡shape(lhs: List[int], rhs: List[int], rhs_scale: List[int], rhs_zero_point: List[int], rhs_bit_width: int, rhs_group_size: int) -> List[int]:
# fmt: off
def quantmatmul_rhs_group_quant〡shape(lhs: List[int], rhs: List[int], rhs_scale: List[int], rhs_zero_point: List[int], rhs_bit_width: int, rhs_group_size: int) -> List[int]:
if len(lhs) == 3 and len(rhs) == 2:
return [lhs[0], lhs[1], rhs[0]]
elif len(lhs) == 2 and len(rhs) == 2:
@@ -145,20 +146,21 @@ def brevitasmatmul_rhs_group_quant〡shape(lhs: List[int], rhs: List[int], rh
raise ValueError("Input shapes not supported.")
def brevitasmatmul_rhs_group_quant〡dtype(lhs_rank_dtype: Tuple[int, int], rhs_rank_dtype: Tuple[int, int], rhs_scale_rank_dtype: Tuple[int, int], rhs_zero_point_rank_dtype: Tuple[int, int], rhs_bit_width: int, rhs_group_size: int) -> int:
def quantmatmul_rhs_group_quant〡dtype(lhs_rank_dtype: Tuple[int, int], rhs_rank_dtype: Tuple[int, int], rhs_scale_rank_dtype: Tuple[int, int], rhs_zero_point_rank_dtype: Tuple[int, int], rhs_bit_width: int, rhs_group_size: int) -> int:
# output dtype is the dtype of the lhs float input
lhs_rank, lhs_dtype = lhs_rank_dtype
return lhs_dtype
def brevitasmatmul_rhs_group_quant〡has_value_semantics(lhs, rhs, rhs_scale, rhs_zero_point, rhs_bit_width, rhs_group_size) -> None:
def quantmatmul_rhs_group_quant〡has_value_semantics(lhs, rhs, rhs_scale, rhs_zero_point, rhs_bit_width, rhs_group_size) -> None:
return
brevitas_matmul_rhs_group_quant_library = [
brevitasmatmul_rhs_group_quant〡shape,
brevitasmatmul_rhs_group_quant〡dtype,
brevitasmatmul_rhs_group_quant〡has_value_semantics]
quantmatmul_rhs_group_quant〡shape,
quantmatmul_rhs_group_quant〡dtype,
quantmatmul_rhs_group_quant〡has_value_semantics]
# fmt: on
def load_vmfb(extended_model_name, device, mlir_dialect, extra_args=[]):
@@ -209,7 +211,7 @@ def compile_int_precision(
torchscript_module,
inputs,
output_type="torch",
backend_legal_ops=["brevitas.matmul_rhs_group_quant"],
backend_legal_ops=["quant.matmul_rhs_group_quant"],
extra_library=brevitas_matmul_rhs_group_quant_library,
use_tracing=False,
verbose=False,

View File

@@ -34,7 +34,7 @@ from PIL import Image
from tqdm.auto import tqdm
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
from diffusers.loaders import AttnProcsLayers
from diffusers.models.cross_attention import LoRACrossAttnProcessor
from diffusers.models.attention_processor import LoRAXFormersAttnProcessor
import torch_mlir
from torch_mlir.dynamo import make_simple_dynamo_backend
@@ -287,7 +287,7 @@ def lora_train(
block_id = int(name[len("down_blocks.")])
hidden_size = unet.config.block_out_channels[block_id]
lora_attn_procs[name] = LoRACrossAttnProcessor(
lora_attn_procs[name] = LoRAXFormersAttnProcessor(
hidden_size=hidden_size,
cross_attention_dim=cross_attention_dim,
)

View File

@@ -177,9 +177,11 @@ class SharkifyStableDiffusionModel:
"unet",
"unet512",
"stencil_unet",
"stencil_unet_512",
"vae",
"vae_encode",
"stencil_adaptor",
"stencil_adaptor_512",
]
index = 0
for model in sub_model_list:
@@ -339,7 +341,7 @@ class SharkifyStableDiffusionModel:
)
return shark_vae, vae_mlir
def get_controlled_unet(self):
def get_controlled_unet(self, use_large=False):
class ControlledUnetModel(torch.nn.Module):
def __init__(
self,
@@ -415,6 +417,16 @@ class SharkifyStableDiffusionModel:
is_f16 = True if self.precision == "fp16" else False
inputs = tuple(self.inputs["unet"])
model_name = "stencil_unet"
if use_large:
pad = (0, 0) * (len(inputs[2].shape) - 2)
pad = pad + (0, 512 - inputs[2].shape[1])
inputs = (
inputs[:2]
+ (torch.nn.functional.pad(inputs[2], pad),)
+ inputs[3:]
)
model_name = "stencil_unet_512"
input_mask = [
True,
True,
@@ -437,19 +449,19 @@ class SharkifyStableDiffusionModel:
shark_controlled_unet, controlled_unet_mlir = compile_through_fx(
unet,
inputs,
extended_model_name=self.model_name["stencil_unet"],
extended_model_name=self.model_name[model_name],
is_f16=is_f16,
f16_input_mask=input_mask,
use_tuned=self.use_tuned,
extra_args=get_opt_flags("unet", precision=self.precision),
base_model_id=self.base_model_id,
model_name="stencil_unet",
model_name=model_name,
precision=self.precision,
return_mlir=self.return_mlir,
)
return shark_controlled_unet, controlled_unet_mlir
def get_control_net(self):
def get_control_net(self, use_large=False):
class StencilControlNetModel(torch.nn.Module):
def __init__(
self, model_id=self.use_stencil, low_cpu_mem_usage=False
@@ -497,17 +509,34 @@ class SharkifyStableDiffusionModel:
is_f16 = True if self.precision == "fp16" else False
inputs = tuple(self.inputs["stencil_adaptor"])
if use_large:
pad = (0, 0) * (len(inputs[2].shape) - 2)
pad = pad + (0, 512 - inputs[2].shape[1])
inputs = (
inputs[0],
inputs[1],
torch.nn.functional.pad(inputs[2], pad),
inputs[3],
)
save_dir = os.path.join(
self.sharktank_dir, self.model_name["stencil_adaptor_512"]
)
else:
save_dir = os.path.join(
self.sharktank_dir, self.model_name["stencil_adaptor"]
)
input_mask = [True, True, True, True]
model_name = "stencil_adaptor" if use_large else "stencil_adaptor_512"
shark_cnet, cnet_mlir = compile_through_fx(
scnet,
inputs,
extended_model_name=self.model_name["stencil_adaptor"],
extended_model_name=self.model_name[model_name],
is_f16=is_f16,
f16_input_mask=input_mask,
use_tuned=self.use_tuned,
extra_args=get_opt_flags("unet", precision=self.precision),
base_model_id=self.base_model_id,
model_name="stencil_adaptor",
model_name=model_name,
precision=self.precision,
return_mlir=self.return_mlir,
)
@@ -748,7 +777,7 @@ class SharkifyStableDiffusionModel:
else:
return self.get_unet(use_large=use_large)
else:
return self.get_controlled_unet()
return self.get_controlled_unet(use_large=use_large)
def vae_encode(self):
try:
@@ -847,12 +876,14 @@ class SharkifyStableDiffusionModel:
except Exception as e:
sys.exit(e)
def controlnet(self):
def controlnet(self, use_large=False):
try:
self.inputs["stencil_adaptor"] = self.get_input_info_for(
base_models["stencil_adaptor"]
)
compiled_stencil_adaptor, controlnet_mlir = self.get_control_net()
compiled_stencil_adaptor, controlnet_mlir = self.get_control_net(
use_large=use_large
)
check_compilation(compiled_stencil_adaptor, "Stencil")
if self.return_mlir:

View File

@@ -58,6 +58,7 @@ class StencilPipeline(StableDiffusionPipeline):
):
super().__init__(scheduler, sd_model, import_mlir, use_lora, ondemand)
self.controlnet = None
self.controlnet_512 = None
def load_controlnet(self):
if self.controlnet is not None:
@@ -68,6 +69,15 @@ class StencilPipeline(StableDiffusionPipeline):
del self.controlnet
self.controlnet = None
def load_controlnet_512(self):
if self.controlnet_512 is not None:
return
self.controlnet_512 = self.sd_model.controlnet(use_large=True)
def unload_controlnet_512(self):
del self.controlnet_512
self.controlnet_512 = None
def prepare_latents(
self,
batch_size,
@@ -111,8 +121,12 @@ class StencilPipeline(StableDiffusionPipeline):
latent_history = [latents]
text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
text_embeddings_numpy = text_embeddings.detach().numpy()
self.load_unet()
self.load_controlnet()
if text_embeddings.shape[1] <= self.model_max_length:
self.load_unet()
self.load_controlnet()
else:
self.load_unet_512()
self.load_controlnet_512()
for i, t in tqdm(enumerate(total_timesteps)):
step_start_time = time.time()
timestep = torch.tensor([t]).to(dtype)
@@ -135,43 +149,82 @@ class StencilPipeline(StableDiffusionPipeline):
).to(dtype)
else:
latent_model_input_1 = latent_model_input
control = self.controlnet(
"forward",
(
latent_model_input_1,
timestep,
text_embeddings,
controlnet_hint,
),
send_to_host=False,
)
if text_embeddings.shape[1] <= self.model_max_length:
control = self.controlnet(
"forward",
(
latent_model_input_1,
timestep,
text_embeddings,
controlnet_hint,
),
send_to_host=False,
)
else:
control = self.controlnet_512(
"forward",
(
latent_model_input_1,
timestep,
text_embeddings,
controlnet_hint,
),
send_to_host=False,
)
timestep = timestep.detach().numpy()
# Profiling Unet.
profile_device = start_profiling(file_path="unet.rdc")
# TODO: Pass `control` as it is to Unet. Same as TODO mentioned in model_wrappers.py.
noise_pred = self.unet(
"forward",
(
latent_model_input,
timestep,
text_embeddings_numpy,
guidance_scale,
control[0],
control[1],
control[2],
control[3],
control[4],
control[5],
control[6],
control[7],
control[8],
control[9],
control[10],
control[11],
control[12],
),
send_to_host=False,
)
if text_embeddings.shape[1] <= self.model_max_length:
noise_pred = self.unet(
"forward",
(
latent_model_input,
timestep,
text_embeddings_numpy,
guidance_scale,
control[0],
control[1],
control[2],
control[3],
control[4],
control[5],
control[6],
control[7],
control[8],
control[9],
control[10],
control[11],
control[12],
),
send_to_host=False,
)
else:
print(self.unet_512)
noise_pred = self.unet_512(
"forward",
(
latent_model_input,
timestep,
text_embeddings_numpy,
guidance_scale,
control[0],
control[1],
control[2],
control[3],
control[4],
control[5],
control[6],
control[7],
control[8],
control[9],
control[10],
control[11],
control[12],
),
send_to_host=False,
)
end_profiling(profile_device)
if cpu_scheduling:
@@ -191,7 +244,9 @@ class StencilPipeline(StableDiffusionPipeline):
if self.ondemand:
self.unload_unet()
self.unload_unet_512()
self.unload_controlnet()
self.unload_controlnet_512()
avg_step_time = step_time_sum / len(total_timesteps)
self.log += f"\nAverage step time: {avg_step_time}ms/it"

View File

@@ -37,7 +37,7 @@ def launch_app(address):
height=height,
text_select=True,
)
webview.start(private_mode=False)
webview.start(private_mode=False, storage_path=os.getcwd())
if __name__ == "__main__":

View File

@@ -24,6 +24,7 @@ past_key_values = None
model_map = {
"llama2_7b": "meta-llama/Llama-2-7b-chat-hf",
"llama2_13b": "meta-llama/Llama-2-13b-chat-hf",
"llama2_70b": "meta-llama/Llama-2-70b-chat-hf",
"codegen": "Salesforce/codegen25-7b-multi",
"vicuna1p3": "lmsys/vicuna-7b-v1.3",
@@ -43,6 +44,15 @@ start_message = {
"explain why instead of answering something not correct. If you don't know the "
"answer to a question, please don't share false information."
),
"llama2_13b": (
"System: You are a helpful, respectful and honest assistant. Always answer "
"as helpfully as possible, while being safe. Your answers should not "
"include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal "
"content. Please ensure that your responses are socially unbiased and positive "
"in nature. If a question does not make any sense, or is not factually coherent, "
"explain why instead of answering something not correct. If you don't know the "
"answer to a question, please don't share false information."
),
"llama2_70b": (
"System: You are a helpful, respectful and honest assistant. Always answer "
"as helpfully as possible, while being safe. Your answers should not "
@@ -91,6 +101,7 @@ def create_prompt(model_name, history):
"vicuna4",
"vicuna1p3",
"llama2_7b",
"llama2_13b",
"llama2_70b",
]:
conversation = "".join(
@@ -139,6 +150,9 @@ def get_default_config():
c.split_into_layers()
model_vmfb_key = ""
# TODO: Make chat reusable for UI and API
def chat(
curr_system_message,
@@ -147,20 +161,33 @@ def chat(
device,
precision,
config_file,
cli=True,
cli=False,
progress=gr.Progress(),
):
global past_key_values
global model_vmfb_key
global vicuna_model
model_name, model_path = list(map(str.strip, model.split("=>")))
if "cuda" in device:
device = "cuda"
elif "sync" in device:
device = "cpu-sync"
elif "task" in device:
device = "cpu-task"
elif "vulkan" in device:
device = "vulkan"
else:
print("unrecognized device")
new_model_vmfb_key = f"{model_name}#{model_path}#{device}#{precision}"
if model_name in [
"vicuna",
"vicuna4",
"vicuna1p3",
"codegen",
"llama2_7b",
"llama2_13b",
"llama2_70b",
]:
from apps.language_models.scripts.vicuna import ShardedVicuna
@@ -181,6 +208,8 @@ def chat(
else:
print("unrecognized device")
if new_model_vmfb_key != model_vmfb_key:
model_vmfb_key = new_model_vmfb_key
max_toks = 128 if model_name == "codegen" else 512
# get iree flags that need to be overridden, from commandline args
@@ -232,7 +261,7 @@ def chat(
count = 0
start_time = time.time()
for text, msg in progress.tqdm(
vicuna_model.generate(prompt, cli=False),
vicuna_model.generate(prompt, cli=cli),
desc="generating response",
):
count += 1
@@ -256,7 +285,8 @@ def chat(
SharkStableLM,
)
if sharkModel == 0:
if new_model_vmfb_key != model_vmfb_key:
model_vmfb_key = new_model_vmfb_key
# max_new_tokens=512
shark_slm = SharkStableLM(
model_name

View File

@@ -24,13 +24,13 @@ def get_image(url, local_filename):
shutil.copyfileobj(res.raw, f)
def compare_images(new_filename, golden_filename):
def compare_images(new_filename, golden_filename, upload=False):
new = np.array(Image.open(new_filename)) / 255.0
golden = np.array(Image.open(golden_filename)) / 255.0
diff = np.abs(new - golden)
mean = np.mean(diff)
if mean > 0.1:
if os.name != "nt":
if os.name != "nt" and upload == True:
subprocess.run(
[
"gsutil",
@@ -39,7 +39,7 @@ def compare_images(new_filename, golden_filename):
"gs://shark_tank/testdata/builder/",
]
)
raise SystemExit("new and golden not close")
raise AssertionError("new and golden not close")
else:
print("SUCCESS")

View File

@@ -1,5 +1,6 @@
#!/bin/bash
IMPORTER=1 BENCHMARK=1 ./setup_venv.sh
IMPORTER=1 BENCHMARK=1 NO_BREVITAS=1 ./setup_venv.sh
source $GITHUB_WORKSPACE/shark.venv/bin/activate
python build_tools/stable_diffusion_testing.py --gen
python tank/generate_sharktank.py

View File

@@ -63,7 +63,14 @@ def get_inpaint_inputs():
open("./test_images/inputs/mask.png", "wb").write(mask.content)
def test_loop(device="vulkan", beta=False, extra_flags=[]):
def test_loop(
device="vulkan",
beta=False,
extra_flags=[],
upload_bool=True,
exit_on_fail=True,
do_gen=False,
):
# Get golden values from tank
shutil.rmtree("./test_images", ignore_errors=True)
model_metrics = []
@@ -81,6 +88,8 @@ def test_loop(device="vulkan", beta=False, extra_flags=[]):
if beta:
extra_flags.append("--beta_models=True")
extra_flags.append("--no-progress_bar")
if do_gen:
extra_flags.append("--import_debug")
to_skip = [
"Linaqruf/anything-v3.0",
"prompthero/openjourney",
@@ -181,7 +190,14 @@ def test_loop(device="vulkan", beta=False, extra_flags=[]):
"./test_images/golden/" + model_name + "/*.png"
)
golden_file = glob(golden_path)[0]
compare_images(test_file, golden_file)
try:
compare_images(
test_file, golden_file, upload=upload_bool
)
except AssertionError as e:
print(e)
if exit_on_fail == True:
raise
else:
print(command)
print("failed to generate image for this configuration")
@@ -200,6 +216,9 @@ def test_loop(device="vulkan", beta=False, extra_flags=[]):
extra_flags.remove(
"--iree_vulkan_target_triple=rdna2-unknown-windows"
)
if do_gen:
prepare_artifacts()
with open(os.path.join(os.getcwd(), "sd_testing_metrics.csv"), "w+") as f:
header = "model_name;device;use_tune;import_opt;Clip Inference time(ms);Average Step (ms/it);VAE Inference time(ms);total image generation(s);command\n"
f.write(header)
@@ -218,15 +237,49 @@ def test_loop(device="vulkan", beta=False, extra_flags=[]):
f.write(";".join(output) + "\n")
def prepare_artifacts():
gen_path = os.path.join(os.getcwd(), "gen_shark_tank")
if not os.path.isdir(gen_path):
os.mkdir(gen_path)
for dirname in os.listdir(os.getcwd()):
for modelname in ["clip", "unet", "vae"]:
if modelname in dirname and "vmfb" not in dirname:
if not os.path.isdir(os.path.join(gen_path, dirname)):
shutil.move(os.path.join(os.getcwd(), dirname), gen_path)
print(f"Moved dir: {dirname} to {gen_path}.")
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--device", default="vulkan")
parser.add_argument(
"-b", "--beta", action=argparse.BooleanOptionalAction, default=False
)
parser.add_argument("-e", "--extra_args", type=str, default=None)
parser.add_argument(
"-u", "--upload", action=argparse.BooleanOptionalAction, default=True
)
parser.add_argument(
"-x", "--exit_on_fail", action=argparse.BooleanOptionalAction, default=True
)
parser.add_argument(
"-g", "--gen", action=argparse.BooleanOptionalAction, default=False
)
if __name__ == "__main__":
args = parser.parse_args()
print(args)
test_loop(args.device, args.beta, [])
extra_args = []
if args.extra_args:
for arg in args.extra_args.split(","):
extra_args.append(arg)
test_loop(
args.device,
args.beta,
extra_args,
args.upload,
args.exit_on_fail,
args.gen,
)
if args.gen:
prepare_artifacts()

View File

@@ -5,7 +5,7 @@ requires = [
"packaging",
"numpy>=1.22.4",
"torch-mlir>=20221021.633",
"torch-mlir>=20230620.875",
"iree-compiler>=20221022.190",
"iree-runtime>=20221022.190",
]

View File

@@ -3,7 +3,7 @@
numpy>1.22.4
pytorch-triton
torchvision==0.16.0.dev20230322
torchvision
tabulate
tqdm
@@ -15,7 +15,7 @@ iree-tools-tf
# TensorFlow and JAX.
gin-config
tensorflow>2.11
tf-nightly
keras
#tf-models-nightly
#tensorflow-text-nightly

View File

@@ -128,7 +128,7 @@ if [[ ! -z "${IMPORTER}" ]]; then
fi
fi
$PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f https://download.pytorch.org/whl/nightly/torch/
$PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f https://download.pytorch.org/whl/nightly/cpu/
if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
T_VER=$($PYTHON -m pip show torch | grep Version)
@@ -145,14 +145,8 @@ if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
fi
fi
if [[ ! -z "${ONNX}" ]]; then
echo "${Yellow}Installing ONNX and onnxruntime for benchmarks..."
$PYTHON -m pip install onnx onnxruntime psutil
if [ $? -eq 0 ];then
echo "Successfully installed ONNX and ONNX runtime."
else
echo "Could not install ONNX." >&2
fi
if [[ -z "${NO_BREVITAS}" ]]; then
$PYTHON -m pip install git+https://github.com/Xilinx/brevitas.git@dev
fi
if [[ -z "${CONDA_PREFIX}" && "$SKIP_VENV" != "1" ]]; then

View File

@@ -124,42 +124,41 @@ class SharkBenchmarkRunner(SharkRunner):
elif self.mlir_dialect in ["mhlo", "tf"]:
return self.benchmark_tf(modelname)
def benchmark_torch(self, modelname):
def benchmark_torch(self, modelname, device="cpu"):
import torch
from tank.model_utils import get_torch_model
if self.device == "cuda":
torch.set_default_tensor_type(torch.cuda.FloatTensor)
if self.enable_tf32:
torch.backends.cuda.matmul.allow_tf32 = True
# TODO: Pass this as an arg. Currently the best way is to set up with BENCHMARK=1 if we want to use torch+cuda, else use cpu.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
torch.set_default_device("cuda:0")
# if self.enable_tf32:
# torch.backends.cuda.matmul.allow_tf32 = True
else:
torch.set_default_tensor_type(torch.FloatTensor)
torch_device = torch.device(
"cuda:0" if self.device == "cuda" else "cpu"
)
torch.set_default_dtype(torch.float32)
torch.set_default_device("cpu")
torch_device = torch.device("cuda:0" if device == "cuda" else "cpu")
HFmodel, input = get_torch_model(modelname, self.import_args)[:2]
frontend_model = HFmodel.model
frontend_model.to(torch_device)
input.to(torch_device)
# TODO: re-enable as soon as pytorch CUDA context issues are resolved
try:
frontend_model = torch.compile(
frontend_model, mode="max-autotune", backend="inductor"
)
except RuntimeError:
frontend_model = HFmodel.model
if device == "cuda":
frontend_model.cuda()
input.to(torch.device("cuda:0"))
print(input)
else:
frontend_model.cpu()
input.cpu()
for i in range(shark_args.num_warmup_iterations):
frontend_model.forward(input)
if self.device == "cuda":
if device == "cuda":
torch.cuda.reset_peak_memory_stats()
begin = time.time()
for i in range(shark_args.num_iterations):
out = frontend_model.forward(input)
end = time.time()
if self.device == "cuda":
if device == "cuda":
stats = torch.cuda.memory_stats()
device_peak_b = stats["allocated_bytes.all.peak"]
frontend_model.to(torch.device("cpu"))
@@ -171,7 +170,7 @@ class SharkBenchmarkRunner(SharkRunner):
print(
f"Torch benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
)
if self.device == "cuda":
if device == "cuda":
# Set device to CPU so we don't run into segfaults exiting pytest subprocesses.
torch_device = torch.device("cpu")
return [

View File

@@ -11,14 +11,8 @@ from brevitas_examples.llm.llm_quant.quantize import quantize_model
from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
def brevitasmatmul_rhs_group_quant〡shape(
lhs: List[int],
rhs: List[int],
rhs_scale: List[int],
rhs_zero_point: List[int],
rhs_bit_width: int,
rhs_group_size: int,
) -> List[int]:
# fmt: off
def quantmatmul_rhs_group_quant〡shape(lhs: List[int], rhs: List[int], rhs_scale: List[int], rhs_zero_point: List[int], rhs_bit_width: int, rhs_group_size: int) -> List[int]:
if len(lhs) == 3 and len(rhs) == 2:
return [lhs[0], lhs[1], rhs[0]]
elif len(lhs) == 2 and len(rhs) == 2:
@@ -27,30 +21,21 @@ def brevitasmatmul_rhs_group_quant〡shape(
raise ValueError("Input shapes not supported.")
def brevitasmatmul_rhs_group_quant〡dtype(
lhs_rank_dtype: Tuple[int, int],
rhs_rank_dtype: Tuple[int, int],
rhs_scale_rank_dtype: Tuple[int, int],
rhs_zero_point_rank_dtype: Tuple[int, int],
rhs_bit_width: int,
rhs_group_size: int,
) -> int:
def quantmatmul_rhs_group_quant〡dtype(lhs_rank_dtype: Tuple[int, int], rhs_rank_dtype: Tuple[int, int], rhs_scale_rank_dtype: Tuple[int, int], rhs_zero_point_rank_dtype: Tuple[int, int], rhs_bit_width: int, rhs_group_size: int) -> int:
# output dtype is the dtype of the lhs float input
lhs_rank, lhs_dtype = lhs_rank_dtype
return lhs_dtype
def brevitasmatmul_rhs_group_quant〡has_value_semantics(
lhs, rhs, rhs_scale, rhs_zero_point, rhs_bit_width, rhs_group_size
) -> None:
def quantmatmul_rhs_group_quant〡has_value_semantics(lhs, rhs, rhs_scale, rhs_zero_point, rhs_bit_width, rhs_group_size) -> None:
return
brevitas_matmul_rhs_group_quant_library = [
brevitasmatmul_rhs_group_quant〡shape,
brevitasmatmul_rhs_group_quant〡dtype,
brevitasmatmul_rhs_group_quant〡has_value_semantics,
]
quantmatmul_rhs_group_quant〡shape,
quantmatmul_rhs_group_quant〡dtype,
quantmatmul_rhs_group_quant〡has_value_semantics]
# fmt: on
def load_vmfb(extended_model_name, device, mlir_dialect, extra_args=[]):
@@ -122,7 +107,7 @@ def compile_int_precision(
torchscript_module,
inputs,
output_type="torch",
backend_legal_ops=["brevitas.matmul_rhs_group_quant"],
backend_legal_ops=["quant.matmul_rhs_group_quant"],
extra_library=brevitas_matmul_rhs_group_quant_library,
use_tracing=False,
verbose=False,

View File

@@ -138,7 +138,7 @@ if __name__ == "__main__":
firstVicunaCompileInput = (compilation_input_ids,)
from apps.language_models.src.model_wrappers.vicuna_model import (
FirstVicuna,
SecondVicuna,
SecondVicuna7B,
CombinedModel,
)

View File

@@ -615,7 +615,7 @@ def import_with_fx(
replace_call_fn_target(
fx_g,
src=matmul_rhs_group_quant_placeholder,
target=torch.ops.brevitas.matmul_rhs_group_quant,
target=torch.ops.quant.matmul_rhs_group_quant,
)
fx_g.recompile()

View File

@@ -13,7 +13,6 @@ google/vit-base-patch16-224,stablehlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,False,False,
microsoft/MiniLM-L12-H384-uncased,stablehlo,tf,1e-2,1e-3,tf_hf,None,True,False,False,"Fails during iree-compile.",""
microsoft/layoutlm-base-uncased,stablehlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
microsoft/mpnet-base,stablehlo,tf,1e-2,1e-2,default,None,True,True,True,"",""
albert-base-v2,linalg,torch,1e-2,1e-3,default,None,True,True,True,"issue with aten.tanh in torch-mlir",""
alexnet,linalg,torch,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/879",""
bert-base-cased,linalg,torch,1e-2,1e-3,default,None,False,True,False,"",""
bert-base-uncased,linalg,torch,1e-2,1e-3,default,None,False,True,False,"",""

View File

@@ -16,12 +16,6 @@ import subprocess as sp
import hashlib
import numpy as np
from pathlib import Path
from apps.stable_diffusion.src.models import (
model_wrappers as mw,
)
from apps.stable_diffusion.src.utils.stable_args import (
args,
)
def create_hash(file_name):
@@ -60,31 +54,6 @@ def save_torch_model(torch_model_list, local_tank_cache, import_args):
print("generating artifacts for: " + torch_model_name)
model = None
input = None
if model_type == "stable_diffusion":
args.use_tuned = False
args.import_mlir = True
args.local_tank_cache = local_tank_cache
precision_values = ["fp16"]
seq_lengths = [64, 77]
for precision_value in precision_values:
args.precision = precision_value
for length in seq_lengths:
model = mw.SharkifyStableDiffusionModel(
model_id=torch_model_name,
custom_weights="",
precision=precision_value,
max_len=length,
width=512,
height=512,
use_base_vae=False,
custom_vae="",
debug=True,
sharktank_dir=local_tank_cache,
generate_vmfb=False,
)
model()
continue
if model_type == "vision":
model, input, _ = get_vision_model(
torch_model_name, import_args
@@ -103,10 +72,11 @@ def save_torch_model(torch_model_list, local_tank_cache, import_args):
model, input, _ = get_hf_img_cls_model(
torch_model_name, import_args
)
elif model_type == "fp16":
model, input, _ = get_fp16_model(torch_model_name, import_args)
torch_model_name = torch_model_name.replace("/", "_")
if import_args["batch_size"] != 1:
if import_args["batch_size"] > 1:
print(
f"Batch size for this model set to {import_args['batch_size']}"
)
torch_model_dir = os.path.join(
local_tank_cache,
str(torch_model_name)
@@ -391,7 +361,7 @@ if __name__ == "__main__":
# old_import_args = parser.parse_import_args()
import_args = {
"batch_size": "1",
"batch_size": 1,
}
print(import_args)
home = str(Path.home())
@@ -404,11 +374,6 @@ if __name__ == "__main__":
os.path.dirname(__file__), "tflite", "tflite_model_list.csv"
)
save_torch_model(
os.path.join(os.path.dirname(__file__), "torch_sd_list.csv"),
WORKDIR,
import_args,
)
save_torch_model(torch_model_csv, WORKDIR, import_args)
save_tf_model(tf_model_csv, WORKDIR, import_args)
save_tflite_model(tflite_model_csv, WORKDIR, import_args)
# save_tf_model(tf_model_csv, WORKDIR, import_args)
# save_tflite_model(tflite_model_csv, WORKDIR, import_args)

View File

@@ -278,7 +278,7 @@ def get_vision_model(torch_model, import_args):
int(import_args["batch_size"]), 3, *input_image_size
)
actual_out = model(test_input)
if fp16_model is not None:
if fp16_model == True:
test_input_fp16 = test_input.to(
device=torch.device("cuda"), dtype=torch.half
)

View File

@@ -5,7 +5,6 @@ microsoft/MiniLM-L12-H384-uncased,True,hf,True,linalg,False,66M,"nlp;bert-varian
bert-base-uncased,True,hf,True,linalg,False,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
bert-base-cased,True,hf,True,linalg,False,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
google/mobilebert-uncased,True,hf,True,linalg,False,25M,"nlp,bert-variant,transformer-encoder,mobile","24 layers, 512 hidden size, 128 embedding"
alexnet,False,vision,True,linalg,False,61M,"cnn,parallel-layers","The CNN that revolutionized computer vision (move away from hand-crafted features to neural networks),10 years old now and probably no longer used in prod."
resnet18,False,vision,True,linalg,False,11M,"cnn,image-classification,residuals,resnet-variant","1 7x7 conv2d and the rest are 3x3 conv2d"
resnet50,False,vision,True,linalg,False,23M,"cnn,image-classification,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
resnet101,False,vision,True,linalg,False,29M,"cnn,image-classification,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
@@ -18,11 +17,9 @@ facebook/deit-small-distilled-patch16-224,True,hf_img_cls,False,linalg,False,22M
microsoft/beit-base-patch16-224-pt22k-ft22k,True,hf_img_cls,False,linalg,False,86M,"image-classification,transformer-encoder,bert-variant,vision-transformer",N/A
nvidia/mit-b0,True,hf_img_cls,False,linalg,False,3.7M,"image-classification,transformer-encoder",SegFormer
mnasnet1_0,False,vision,True,linalg,False,-,"cnn, torchvision, mobile, architecture-search","Outperforms other mobile CNNs on Accuracy vs. Latency"
resnet50_fp16,False,vision,True,linalg,False,23M,"cnn,image-classification,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
bert-base-uncased_fp16,True,fp16,False,linalg,False,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
bert-large-uncased,True,hf,True,linalg,False,330M,"nlp;bert-variant;transformer-encoder","24 layers, 1024 hidden units, 16 attention heads"
bert-base-uncased,True,hf,False,stablehlo,False,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
gpt2,True,hf_causallm,False,stablehlo,True,125M,"nlp;transformer-encoder","-"
facebook/opt-125m,True,hf,False,stablehlo,True,125M,"nlp;transformer-encoder","-"
distilgpt2,True,hf,False,stablehlo,True,88M,"nlp;transformer-encoder","-"
microsoft/deberta-v3-base,True,hf,False,stablehlo,True,88M,"nlp;transformer-encoder","-"
microsoft/deberta-v3-base,True,hf,False,stablehlo,True,88M,"nlp;transformer-encoder","-"