Update README.md (#1830 )

[flags] Fix vulkan runtime flags as vma is dropped from iree (#1831 )
(SD) Fix tokenizers imports in pyinstaller builds. (#1828 )
2026-04-20 03:00:34 -04:00 · 2023-09-14 10:33:57 -05:00 · 2023-09-14 08:58:59 -05:00 · 2023-09-12 12:23:48 -05:00 · 2023-09-12 10:59:28 +05:30 · 2023-09-11 13:42:52 -05:00
49 changed files with 2195 additions and 689 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -193,3 +193,9 @@ stencil_annotator/
 # For DocuChat
 apps/language_models/langchain/user_path/
 db_dir_UserData
+
+# Embeded browser cache and other
+apps/stable_diffusion/web/EBWebView/
+
+# Llama2 tokenizer configs
+llama2_tokenizer_configs/
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ High Performance Machine Learning Distribution
  <summary>Prerequisites - Drivers </summary>
  
 #### Install your Windows hardware drivers
-* [AMD RDNA Users] Download the latest driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-23-2-1).
+* [AMD RDNA Users] Download the latest driver (23.2.1 is the oldest supported) [here](https://www.amd.com/en/support).
 * [macOS Users] Download and install the 1.3.216 Vulkan SDK from [here](https://sdk.lunarg.com/sdk/download/1.3.216.0/mac/vulkansdk-macos-1.3.216.0.dmg). Newer versions of the SDK will not work. 
 * [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)
  
--- a/apps/language_models/langchain/h2oai_pipeline.py
+++ b/apps/language_models/langchain/h2oai_pipeline.py
@@ -237,7 +237,7 @@ class H2OGPTSHARKModel(torch.nn.Module):
            print(f"[DEBUG] converting torch to linalg")
            run_pipeline_with_repro_report(
                module,
-                "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+                "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
                description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
            )
        else:
--- a/apps/language_models/scripts/stablelm.py
+++ b/apps/language_models/scripts/stablelm.py
@@ -46,6 +46,7 @@ def compile_stableLM(
    model_vmfb_name,
    device="cuda",
    precision="fp32",
+    debug=False,
 ):
    from shark.shark_inference import SharkInference

@@ -92,7 +93,7 @@ def compile_stableLM(
    shark_module.compile()

    path = shark_module.save_module(
-        vmfb_path.parent.absolute(), vmfb_path.stem
+        vmfb_path.parent.absolute(), vmfb_path.stem, debug=debug
    )
    print("Saved vmfb at ", str(path))

--- a/apps/language_models/scripts/vicuna.py
+++ b/apps/language_models/scripts/vicuna.py
@@ -7,6 +7,7 @@ from pathlib import Path
 from tqdm import tqdm
 from typing import List, Tuple
 import subprocess
+import sys

 import torch
 import torch_mlir
@@ -37,7 +38,9 @@ from apps.language_models.src.model_wrappers.vicuna4 import (
 )
 from apps.language_models.src.model_wrappers.vicuna_model import (
    FirstVicuna,
-    SecondVicuna,
+    SecondVicuna7B,
+    SecondVicuna13B,
+    SecondVicuna70B,
 )
 from apps.language_models.utils import (
    get_vmfb_from_path,
@@ -47,9 +50,6 @@ from shark.shark_importer import get_f16_inputs
 from shark.shark_importer import import_with_fx
 from shark.shark_inference import SharkInference

-from brevitas_examples.llm.llm_quant.quantize import quantize_model
-from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
-

 parser = argparse.ArgumentParser(
    prog="vicuna runner",
@@ -108,7 +108,7 @@ parser.add_argument(
    "--model_name",
    type=str,
    default="vicuna",
-    choices=["vicuna", "llama2_7b", "llama2_70b"],
+    choices=["vicuna", "llama2_7b", "llama2_13b", "llama2_70b"],
    help="Specify which model to run.",
 )
 parser.add_argument(
@@ -189,17 +189,20 @@ class VicunaBase(SharkLLMBase):
        return vicuna_model

    def combine_mlir_scripts(
-        self, first_vicuna_mlir, second_vicuna_mlir, output_name
+        self,
+        first_vicuna_mlir,
+        second_vicuna_mlir,
+        output_name,
    ):
        print(f"[DEBUG] combining first and second mlir")
-        print(f"[DEBIG] output_name = {output_name}")
+        print(f"[DEBUG] output_name = {output_name}")
        maps1 = []
        maps2 = []
        constants = set()
        f1 = []
        f2 = []

-        print(f"[DEBUG] processing first vircuna mlir")
+        print(f"[DEBUG] processing first vicuna mlir")
        first_vicuna_mlir = first_vicuna_mlir.splitlines()
        while first_vicuna_mlir:
            line = first_vicuna_mlir.pop(0)
@@ -223,7 +226,7 @@ class VicunaBase(SharkLLMBase):
                for func_line in f1
            ]

-        print(f"[DEBUG] processing second vircuna mlir")
+        print(f"[DEBUG] processing second vicuna mlir")
        second_vicuna_mlir = second_vicuna_mlir.splitlines()
        while second_vicuna_mlir:
            line = second_vicuna_mlir.pop(0)
@@ -273,6 +276,7 @@ class VicunaBase(SharkLLMBase):
                print(constant)
            vdtype = vbody.split(":")[-1].strip()
            fixed_vdtype = vdtype
+            noinline = "{noinline}" if "tensor" in fixed_vdtype else ""
            if "c1_i64" in vname:
                print(constant)
                counter += 1
@@ -301,6 +305,7 @@ class VicunaBase(SharkLLMBase):
                global_var_loading2.append(
                    f"\t\t%{vname} = ml_program.global_load_const @{vname} : i1"
                )
+
        new_f1, new_f2 = [], []

        print(f"[DEBUG] processing f1")
@@ -324,10 +329,7 @@ class VicunaBase(SharkLLMBase):
                        print(global_var)
                    new_f2.append(global_var)
            else:
-                if "c20_i64 = arith.addi %dim_i64, %c1_i64 : i64" in line:
-                    new_f2.append("%" + line)
-                else:
-                    new_f2.append(line)
+                new_f2.append(line)

        f1 = new_f1
        f2 = new_f2
@@ -412,8 +414,7 @@ class VicunaBase(SharkLLMBase):
            _past_key_values = torch.tensor(output[1:])
            _token = torch.argmax(_logits[:, -1, :], dim=1)

-        skip_sp_tok = True if self.model_name == "codegen" else False
-        _detok = self.tokenizer.decode(_token, skip_special_tokens=skip_sp_tok)
+        _detok = self.tokenizer.decode(_token, skip_special_tokens=False)
        ret_dict = {
            "token": _token,
            "detok": _detok,
@@ -440,11 +441,18 @@ class ShardedVicuna(VicunaBase):
        weight_group_size=128,
        compressed=False,
        extra_args_cmd=[],
+        debug=False,
    ) -> None:
-        super().__init__(model_name, hf_model_path, max_num_tokens, extra_args_cmd=extra_args_cmd)
+        super().__init__(
+            model_name,
+            hf_model_path,
+            max_num_tokens,
+            extra_args_cmd=extra_args_cmd,
+        )
        self.max_sequence_length = 256
        self.device = device
        self.precision = precision
+        self.debug = debug
        self.tokenizer = self.get_tokenizer()
        self.config = config_json
        self.weight_group_size = weight_group_size
@@ -457,17 +465,11 @@ class ShardedVicuna(VicunaBase):
            kwargs = {
                "use_auth_token": "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"
            }
-        if self.model_name == "codegen":
-            tokenizer = AutoTokenizer.from_pretrained(
-                self.hf_model_path,
-                trust_remote_code=True,
-            )
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(
-                self.hf_model_path,
-                use_fast=False,
-                **kwargs,
-            )
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.hf_model_path,
+            use_fast=False,
+            **kwargs,
+        )
        return tokenizer

    def get_src_model(self):
@@ -632,7 +634,7 @@ class ShardedVicuna(VicunaBase):
        return device_idx

    def compile_lmhead(
-        self, lmh, hidden_states, device="cpu", device_idx=None
+        self, lmh, hidden_states, device="cpu", device_idx=None,
    ):
        # compile the lm head of the vicuna model
        # This can be used for both first and second vicuna, so only needs to be run once
@@ -680,7 +682,7 @@ class ShardedVicuna(VicunaBase):
        if vmfb_path.exists():
            shark_module.load_module(vmfb_path)
        else:
-            shark_module.save_module(module_name="lmhead")
+            shark_module.save_module(module_name="lmhead", debug=self.debug)
            shark_module.load_module(vmfb_path)
        compiled_module = LMHeadCompiled(shark_module)
        return compiled_module
@@ -726,7 +728,7 @@ class ShardedVicuna(VicunaBase):
        if vmfb_path.exists():
            shark_module.load_module(vmfb_path)
        else:
-            shark_module.save_module(module_name="norm")
+            shark_module.save_module(module_name="norm", debug=self.debug)
            shark_module.load_module(vmfb_path)
        compiled_module = VicunaNormCompiled(shark_module)
        return compiled_module
@@ -777,14 +779,14 @@ class ShardedVicuna(VicunaBase):
        if vmfb_path.exists():
            shark_module.load_module(vmfb_path)
        else:
-            shark_module.save_module(module_name="embedding")
+            shark_module.save_module(module_name="embedding", debug=self.debug)
            shark_module.load_module(vmfb_path)
        compiled_module = VicunaEmbeddingCompiled(shark_module)

        return compiled_module

    def compile_to_vmfb_one_model(
-        self, inputs0, layers0, inputs1, layers1, device="cpu"
+        self, inputs0, layers0, inputs1, layers1, device="cpu",
    ):
        mlirs, modules = [], []
        assert len(layers0) == len(layers1)
@@ -794,7 +796,6 @@ class ShardedVicuna(VicunaBase):
            # if vmfb_path.exists():
            #    continue
            if mlir_path.exists():
-                # print(f"Found layer {idx} mlir")
                f_ = open(mlir_path, "rb")
                bytecode = f_.read()
                f_.close()
@@ -830,6 +831,8 @@ class ShardedVicuna(VicunaBase):
                    layer0, inputs0[0], inputs0[1], inputs0[2]
                )
                if self.precision in ["int4", "int8"]:
+                    from brevitas_examples.llm.llm_quant.quantize import quantize_model
+                    from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
                    module0 = torch_mlir.compile(
                        ts_g,
                        (
@@ -846,7 +849,7 @@ class ShardedVicuna(VicunaBase):
                    print(f"[DEBUG] converting torch to linalg")
                    run_pipeline_with_repro_report(
                        module0,
-                        "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+                        "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
                        description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
                    )
                else:
@@ -890,7 +893,7 @@ class ShardedVicuna(VicunaBase):
                    print(f"[DEBUG] converting torch to linalg")
                    run_pipeline_with_repro_report(
                        module1,
-                        "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+                        "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
                        description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
                    )
                else:
@@ -915,7 +918,6 @@ class ShardedVicuna(VicunaBase):
                mlirs.append(module_combined)

            if vmfb_path.exists():
-                # print(f"Found layer {idx} vmfb")
                device_idx = self.get_device_index(
                    f"first_vicuna.model.model.layers.{idx}[\s.$]"
                )
@@ -945,7 +947,9 @@ class ShardedVicuna(VicunaBase):
                        "--iree-vm-target-truncate-unsupported-floats",
                        "--iree-codegen-check-ir-before-llvm-conversion=false",
                        "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
-                    ] + self.extra_args,
+                    ]
+                    + self.extra_args,
+                    debug=self.debug,
                )
                module.load_module(vmfb_path)
            modules.append(module)
@@ -962,7 +966,6 @@ class ShardedVicuna(VicunaBase):
            # if vmfb_path.exists():
            #    continue
            if mlir_path.exists():
-                # print(f"Found layer {idx} mlir")
                f_ = open(mlir_path, "rb")
                bytecode = f_.read()
                f_.close()
@@ -981,7 +984,6 @@ class ShardedVicuna(VicunaBase):
                mlirs.append(bytecode)

            if vmfb_path.exists():
-                # print(f"Found layer {idx} vmfb")
                device_idx = self.get_device_index(
                    f"first_vicuna.model.model.layers.{idx}[\s.$]"
                )
@@ -1011,7 +1013,9 @@ class ShardedVicuna(VicunaBase):
                        "--iree-vm-target-truncate-unsupported-floats",
                        "--iree-codegen-check-ir-before-llvm-conversion=false",
                        "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
-                    ] + self.extra_args,
+                    ]
+                    + self.extra_args,
+                    debug=self.debug,
                )
                module.load_module(vmfb_path)
            modules.append(module)
@@ -1028,6 +1032,8 @@ class ShardedVicuna(VicunaBase):
            )

        if self.precision in ["int4", "int8"]:
+            from brevitas_examples.llm.llm_quant.quantize import quantize_model
+            from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
            print("Applying weight quantization..")
            weight_bit_width = 4 if self.precision == "int4" else 8
            quantize_model(
@@ -1218,21 +1224,25 @@ class UnshardedVicuna(VicunaBase):
        precision="int8",
        vicuna_mlir_path=None,
        vicuna_vmfb_path=None,
-        load_mlir_from_shark_tank=True,
+        load_mlir_from_shark_tank=False,
        low_device_memory=False,
        weight_group_size=128,
        download_vmfb=False,
        cache_vicunas=False,
        extra_args_cmd=[],
+        debug=False,
    ) -> None:
-        super().__init__(model_name, hf_model_path, max_num_tokens, extra_args_cmd=extra_args_cmd)
-        if "llama2" in self.model_name and hf_auth_token == None:
-            raise ValueError(
-                "HF auth token required. Pass it using --hf_auth_token flag."
-            )
+        super().__init__(
+            model_name,
+            hf_model_path,
+            max_num_tokens,
+            extra_args_cmd=extra_args_cmd,
+        )
        self.hf_auth_token = hf_auth_token
        if self.model_name == "llama2_7b":
            self.hf_model_path = "meta-llama/Llama-2-7b-chat-hf"
+        elif self.model_name == "llama2_13b":
+            self.hf_model_path = "meta-llama/Llama-2-13b-chat-hf"
        elif self.model_name == "llama2_70b":
            self.hf_model_path = "meta-llama/Llama-2-70b-chat-hf"
        print(f"[DEBUG] hf model name: {self.hf_model_path}")
@@ -1245,6 +1255,7 @@ class UnshardedVicuna(VicunaBase):
        self.load_mlir_from_shark_tank = load_mlir_from_shark_tank
        self.low_device_memory = low_device_memory
        self.weight_group_size = weight_group_size
+        self.debug = debug
        if self.vicuna_mlir_path == None:
            self.vicuna_mlir_path = self.get_model_path()
        if self.vicuna_vmfb_path == None:
@@ -1255,25 +1266,28 @@ class UnshardedVicuna(VicunaBase):

    def get_model_path(self, suffix="mlir"):
        safe_device = self.device.split("-")[0]
-        if suffix == "mlir":
+        if suffix in ["mlirbc", "mlir"]:
            return Path(f"{self.model_name}_{self.precision}.{suffix}")
        return Path(
            f"{self.model_name}_{self.precision}_{safe_device}.{suffix}"
        )

    def get_tokenizer(self):
-        kwargs = {"use_auth_token": self.hf_auth_token}
-        if self.model_name == "codegen":
-            tokenizer = AutoTokenizer.from_pretrained(
-                self.hf_model_path,
-                trust_remote_code=True,
-            )
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(
-                self.hf_model_path,
-                use_fast=False,
-                **kwargs,
+        local_tokenizer_path = Path(Path.cwd(), "llama2_tokenizer_configs")
+        local_tokenizer_path.mkdir(parents=True, exist_ok=True)
+        tokenizer_files_to_download = [
+            "config.json",
+            "special_tokens_map.json",
+            "tokenizer.model",
+            "tokenizer_config.json",
+        ]
+        for tokenizer_file in tokenizer_files_to_download:
+            download_public_file(
+                f"gs://shark_tank/llama2_tokenizer/{tokenizer_file}",
+                Path(local_tokenizer_path, tokenizer_file),
+                single_file=True,
            )
+        tokenizer = AutoTokenizer.from_pretrained(str(local_tokenizer_path))
        return tokenizer

    def get_src_model(self):
@@ -1334,7 +1348,7 @@ class UnshardedVicuna(VicunaBase):
                line = re.sub("c19", "dim", line)
            if " 19," in line:
                line = re.sub(" 19,", " %dim,", line)
-            if "20x" in line:
+            if "x20x" in line or "<20x" in line:
                line = re.sub("20x", "?x", line)
                line = re.sub("tensor.empty\(\)", "tensor.empty(%dimp1)", line)
            if " 20," in line:
@@ -1343,13 +1357,25 @@ class UnshardedVicuna(VicunaBase):

        module = module.splitlines()
        new_lines = []
+
        # Using a while loop and the pop method to avoid creating a copy of module
+        if "llama2_13b" in self.model_name:
+            pkv_tensor_shape = "tensor<1x40x?x128x"
+        elif "llama2_70b" in self.model_name:
+            pkv_tensor_shape = "tensor<1x8x?x128x"
+        else:
+            pkv_tensor_shape = "tensor<1x32x?x128x"
+        if self.precision in ["fp16", "int4", "int8"]:
+            pkv_tensor_shape += "f16>"
+        else:
+            pkv_tensor_shape += "f32>"
+
        while module:
            line = module.pop(0)
            if "%c19_i64 = arith.constant 19 : i64" in line:
                new_lines.append("%c2 = arith.constant 2 : index")
                new_lines.append(
-                    f"%dim_4_int = tensor.dim %arg1, %c2 : tensor<1x32x?x128x{'f16' if self.precision == 'fp16' else 'f32'}>"
+                    f"%dim_4_int = tensor.dim %arg1, %c2 : {pkv_tensor_shape}"
                )
                new_lines.append(
                    "%dim_i64 = arith.index_cast %dim_4_int : index to i64"
@@ -1360,7 +1386,7 @@ class UnshardedVicuna(VicunaBase):
            if "%c20_i64 = arith.constant 20 : i64" in line:
                new_lines.append("%c1_i64 = arith.constant 1 : i64")
                new_lines.append(
-                    "c20_i64 = arith.addi %dim_i64, %c1_i64 : i64"
+                    "%c20_i64 = arith.addi %dim_i64, %c1_i64 : i64"
                )
                new_lines.append(
                    "%dimp1 = arith.index_cast %c20_i64 : i64 to index"
@@ -1371,10 +1397,13 @@ class UnshardedVicuna(VicunaBase):

        return "\n".join(new_lines)

-    def compile(self, download_vmfb=False):
+    def compile(self):
        # Testing : DO NOT Download Vmfbs if not found. Modify later
        # download vmfbs for A100
-        if not self.vicuna_vmfb_path.exists() and download_vmfb:
+        if not self.vicuna_vmfb_path.exists() and self.download_vmfb:
+            print(
+                f"Looking into gs://shark_tank/{self.model_name}/unsharded/vmfb/{self.vicuna_vmfb_path.name}"
+            )
            download_public_file(
                f"gs://shark_tank/{self.model_name}/unsharded/vmfb/{self.vicuna_vmfb_path.name}",
                self.vicuna_vmfb_path.absolute(),
@@ -1387,213 +1416,244 @@ class UnshardedVicuna(VicunaBase):
            print(f"[DEBUG] vmfb found at {self.vicuna_vmfb_path.absolute()}")
            return

-        print(f"[DEBUG] vmfb not found at {self.vicuna_vmfb_path.absolute()}")
-        if self.vicuna_mlir_path.exists():
-            print(f"[DEBUG] mlir found at {self.vicuna_mlir_path.absolute()}")
-            with open(self.vicuna_mlir_path, "rb") as f:
-                combined_module = f.read()
-        else:
-            print(
-                f"[DEBUG] mlir not found at {self.vicuna_mlir_path.absolute()}"
-            )
-            mlir_generated = False
-            if self.load_mlir_from_shark_tank:
-                # download MLIR from shark tank
+        print(f"[DEBUG] vmfb not found")
+        mlir_generated = False
+        for suffix in ["mlirbc", "mlir"]:
+            self.vicuna_mlir_path = self.get_model_path(suffix)
+            if not self.vicuna_mlir_path.exists() and self.load_mlir_from_shark_tank:
+                print(
+                    f"Looking into gs://shark_tank/{self.model_name}/unsharded/mlir/{self.vicuna_mlir_path.name}"
+                )
                download_public_file(
                    f"gs://shark_tank/{self.model_name}/unsharded/mlir/{self.vicuna_mlir_path.name}",
                    self.vicuna_mlir_path.absolute(),
                    single_file=True,
                )
-                if self.vicuna_mlir_path.exists():
-                    with open(self.vicuna_mlir_path, "rb") as f:
-                        combined_module = f.read()
-                    mlir_generated = True
-                else:
-                    print(
-                        f"[DEBUG] failed to download {self.vicuna_mlir_path.name} from shark tank"
-                    )
+            if self.vicuna_mlir_path.exists():
+                print(f"[DEBUG] mlir found at {self.vicuna_mlir_path.absolute()}")
+                with open(self.vicuna_mlir_path, "rb") as f:
+                    combined_module = f.read()
+                mlir_generated = True
+                break

-            if not mlir_generated:
-                print("[DEBUG] generating mlir on device")
-                # Select a compilation prompt such that the resulting input_ids
-                # from the model's tokenizer has shape [1, 19]
-                if self.model_name == "codegen":
-                    compilation_prompt = "def hello_world():\n    print('Hello World')\n    print('Hello World')"
-                else:
-                    compilation_prompt = "".join(["0" for _ in range(17)])
+        if not mlir_generated:
+            print(f"[DEBUG] mlir not found")
+            # Disabling this path of IR generation for now as it is broken.
+            print("Please check if the mlir file is present at the shark tank. Exiting.")
+            self.shark_model = None
+            sys.exit()
+            return

-                if Path(f"first_{self.precision}.mlir").exists():
-                    print(f"loading first_{self.precision}.mlir")
-                    with open(Path(f"first_{self.precision}.mlir"), "r") as f:
-                        first_module = f.read()
-                else:
-                    # generate first vicuna
-                    compilation_input_ids = self.tokenizer(
-                        compilation_prompt,
-                        return_tensors="pt",
-                    ).input_ids
-                    compilation_input_ids = torch.tensor(
-                        compilation_input_ids
-                    ).reshape([1, 19])
-                    firstVicunaCompileInput = (compilation_input_ids,)
-                    model = FirstVicuna(
-                        self.hf_model_path,
-                        self.precision,
-                        self.weight_group_size,
-                        self.model_name,
-                        self.hf_auth_token,
-                    )
-                    print(f"[DEBUG] generating torchscript graph")
-                    ts_graph = import_with_fx(
-                        model,
-                        firstVicunaCompileInput,
-                        is_f16=self.precision == "fp16",
-                        precision=self.precision,
-                        f16_input_mask=[False, False],
-                        mlir_type="torchscript",
-                    )
-                    del model
-                    firstVicunaCompileInput = list(firstVicunaCompileInput)
-                    firstVicunaCompileInput[
-                        0
-                    ] = torch_mlir.TensorPlaceholder.like(
-                        firstVicunaCompileInput[0], dynamic_axes=[1]
-                    )
+            print("[DEBUG] generating mlir on device")
+            # Select a compilation prompt such that the resulting input_ids
+            # from the model's tokenizer has shape [1, 19]
+            compilation_prompt = "".join(["0" for _ in range(17)])

-                    firstVicunaCompileInput = tuple(firstVicunaCompileInput)
-                    first_module = None
-                    print(f"[DEBUG] generating torch mlir")
-                    if self.precision in ["int4", "int8"]:
-                        first_module = torch_mlir.compile(
-                            ts_graph,
-                            [*firstVicunaCompileInput],
-                            output_type=torch_mlir.OutputType.TORCH,
-                            backend_legal_ops=[
-                                "quant.matmul_rhs_group_quant"
-                            ],
-                            extra_library=brevitas_matmul_rhs_group_quant_library,
-                            use_tracing=False,
-                            verbose=False,
-                        )
-                        print(f"[DEBUG] converting torch to linalg")
-                        run_pipeline_with_repro_report(
-                            first_module,
-                            "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
-                            description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
-                        )
-                    else:
-                        first_module = torch_mlir.compile(
-                            ts_graph,
-                            [*firstVicunaCompileInput],
-                            torch_mlir.OutputType.LINALG_ON_TENSORS,
-                            use_tracing=False,
-                            verbose=False,
-                        )
-                    del ts_graph
-                    del firstVicunaCompileInput
-                    gc.collect()
-
-                    print(
-                        "[DEBUG] successfully generated first vicuna linalg mlir"
-                    )
-                    first_module = self.write_in_dynamic_inputs0(
-                        str(first_module), dynamic_input_size=19
-                    )
-                    if self.cache_vicunas:
-                        with open(f"first_{self.precision}.mlir", "w+") as f:
-                            f.write(first_module)
-
-                if Path(f"second_{self.precision}.mlir").exists():
-                    print(f"loading second_{self.precision}.mlir")
-                    with open(Path(f"second_{self.precision}.mlir"), "r") as f:
-                        second_module = f.read()
-                else:
-                    # generate second vicuna
-                    compilation_input_ids = torch.zeros(
-                        [1, 1], dtype=torch.int64
-                    )
-                    pkv = tuple(
-                        (torch.zeros([1, 32, 19, 128], dtype=torch.float32))
-                        for _ in range(64)
-                    )
-                    secondVicunaCompileInput = (compilation_input_ids,) + pkv
-                    model = SecondVicuna(
-                        self.hf_model_path,
-                        self.precision,
-                        self.weight_group_size,
-                        self.model_name,
-                        self.hf_auth_token,
-                    )
-                    print(f"[DEBUG] generating torchscript graph")
-                    ts_graph = import_with_fx(
-                        model,
-                        secondVicunaCompileInput,
-                        is_f16=self.precision == "fp16",
-                        precision=self.precision,
-                        f16_input_mask=[False] + [True] * 64,
-                        mlir_type="torchscript",
-                    )
-                    del model
-                    if self.precision == "fp16":
-                        secondVicunaCompileInput = get_f16_inputs(
-                            secondVicunaCompileInput,
-                            True,
-                            f16_input_mask=[False] + [True] * 64,
-                        )
-                    secondVicunaCompileInput = list(secondVicunaCompileInput)
-                    for i in range(len(secondVicunaCompileInput)):
-                        if i != 0:
-                            secondVicunaCompileInput[
-                                i
-                            ] = torch_mlir.TensorPlaceholder.like(
-                                secondVicunaCompileInput[i], dynamic_axes=[2]
-                            )
-                    secondVicunaCompileInput = tuple(secondVicunaCompileInput)
-                    print(f"[DEBUG] generating torch mlir")
-                    if self.precision in ["int4", "int8"]:
-                        second_module = torch_mlir.compile(
-                            ts_graph,
-                            [*secondVicunaCompileInput],
-                            output_type=torch_mlir.OutputType.TORCH,
-                            backend_legal_ops=[
-                                "quant.matmul_rhs_group_quant"
-                            ],
-                            extra_library=brevitas_matmul_rhs_group_quant_library,
-                            use_tracing=False,
-                            verbose=False,
-                        )
-                        print(f"[DEBUG] converting torch to linalg")
-                        run_pipeline_with_repro_report(
-                            second_module,
-                            "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
-                            description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
-                        )
-                    else:
-                        second_module = torch_mlir.compile(
-                            ts_graph,
-                            [*secondVicunaCompileInput],
-                            torch_mlir.OutputType.LINALG_ON_TENSORS,
-                            use_tracing=False,
-                            verbose=False,
-                        )
-                    del ts_graph
-                    del secondVicunaCompileInput
-                    gc.collect()
-                    print(
-                        "[DEBUG] successfully generated second vicuna linalg mlir"
-                    )
-                    second_module = self.write_in_dynamic_inputs1(
-                        str(second_module)
-                    )
-                    if self.cache_vicunas:
-                        with open(f"second_{self.precision}.mlir", "w+") as f:
-                            f.write(second_module)
-
-                combined_module = self.combine_mlir_scripts(
-                    first_module, second_module, self.vicuna_mlir_path
+            first_model_path = f"first_{self.model_name}_{self.precision}.mlir"
+            if Path(first_model_path).exists():
+                print(f"loading {first_model_path}")
+                with open(Path(first_model_path), "r") as f:
+                    first_module = f.read()
+            else:
+                # generate first vicuna
+                compilation_input_ids = self.tokenizer(
+                    compilation_prompt,
+                    return_tensors="pt",
+                ).input_ids
+                compilation_input_ids = torch.tensor(
+                    compilation_input_ids
+                ).reshape([1, 19])
+                firstVicunaCompileInput = (compilation_input_ids,)
+                model = FirstVicuna(
+                    self.hf_model_path,
+                    self.precision,
+                    self.weight_group_size,
+                    self.model_name,
+                    self.hf_auth_token,
+                )
+                print(f"[DEBUG] generating torchscript graph")
+                is_f16 = self.precision in ["fp16", "int4"]
+                ts_graph = import_with_fx(
+                    model,
+                    firstVicunaCompileInput,
+                    is_f16=is_f16,
+                    precision=self.precision,
+                    f16_input_mask=[False, False],
+                    mlir_type="torchscript",
+                )
+                del model
+                firstVicunaCompileInput = list(firstVicunaCompileInput)
+                firstVicunaCompileInput[
+                    0
+                ] = torch_mlir.TensorPlaceholder.like(
+                    firstVicunaCompileInput[0], dynamic_axes=[1]
                )
-                del first_module, second_module

+                firstVicunaCompileInput = tuple(firstVicunaCompileInput)
+                first_module = None
+                print(f"[DEBUG] generating torch mlir")
+                if self.precision in ["int4", "int8"]:
+                    first_module = torch_mlir.compile(
+                        ts_graph,
+                        [*firstVicunaCompileInput],
+                        output_type=torch_mlir.OutputType.TORCH,
+                        backend_legal_ops=["quant.matmul_rhs_group_quant"],
+                        extra_library=brevitas_matmul_rhs_group_quant_library,
+                        use_tracing=False,
+                        verbose=False,
+                    )
+                    print(f"[DEBUG] converting torch to linalg")
+                    run_pipeline_with_repro_report(
+                        first_module,
+                        "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+                        description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
+                    )
+                else:
+                    first_module = torch_mlir.compile(
+                        ts_graph,
+                        [*firstVicunaCompileInput],
+                        torch_mlir.OutputType.LINALG_ON_TENSORS,
+                        use_tracing=False,
+                        verbose=False,
+                    )
+                del ts_graph
+                del firstVicunaCompileInput
+                gc.collect()
+
+                print(
+                    "[DEBUG] successfully generated first vicuna linalg mlir"
+                )
+                first_module = self.write_in_dynamic_inputs0(
+                    str(first_module), dynamic_input_size=19
+                )
+                if self.cache_vicunas:
+                    with open(first_model_path, "w+") as f:
+                        f.write(first_module)
+                    print("Finished writing IR after dynamic")
+
+            print(f"[DEBUG] Starting generation of second llama")
+            second_model_path = f"second_{self.model_name}_{self.precision}.mlir"
+            if Path(second_model_path).exists():
+                print(f"loading {second_model_path}")
+                with open(Path(second_model_path), "r") as f:
+                    second_module = f.read()
+            else:
+                # generate second vicuna
+                compilation_input_ids = torch.zeros(
+                    [1, 1], dtype=torch.int64
+                )
+                if self.model_name == "llama2_13b":
+                    dim1 = 40
+                    total_tuple = 80
+                elif self.model_name == "llama2_70b":
+                    dim1 = 8
+                    total_tuple = 160
+                else:
+                    dim1 = 32
+                    total_tuple = 64
+                pkv = tuple(
+                    (torch.zeros([1, dim1, 19, 128], dtype=torch.float32))
+                    for _ in range(total_tuple)
+                )
+                secondVicunaCompileInput = (compilation_input_ids,) + pkv
+                if self.model_name == "llama2_13b":
+                    model = SecondVicuna13B(
+                        self.hf_model_path,
+                        self.precision,
+                        self.weight_group_size,
+                        self.model_name,
+                        self.hf_auth_token,
+                    )
+                elif self.model_name == "llama2_70b":
+                    model = SecondVicuna70B(
+                        self.hf_model_path,
+                        self.precision,
+                        self.weight_group_size,
+                        self.model_name,
+                        self.hf_auth_token,
+                    )
+                else:
+                    model = SecondVicuna7B(
+                        self.hf_model_path,
+                        self.precision,
+                        self.weight_group_size,
+                        self.model_name,
+                        self.hf_auth_token,
+                    )
+                print(f"[DEBUG] generating torchscript graph")
+                is_f16 = self.precision in ["fp16", "int4"]
+                ts_graph = import_with_fx(
+                    model,
+                    secondVicunaCompileInput,
+                    is_f16=is_f16,
+                    precision=self.precision,
+                    f16_input_mask=[False] + [True] * total_tuple,
+                    mlir_type="torchscript",
+                )
+                del model
+                if self.precision in ["fp16", "int4"]:
+                    secondVicunaCompileInput = get_f16_inputs(
+                        secondVicunaCompileInput,
+                        True,
+                        f16_input_mask=[False] + [True] * total_tuple,
+                    )
+                secondVicunaCompileInput = list(secondVicunaCompileInput)
+                for i in range(len(secondVicunaCompileInput)):
+                    if i != 0:
+                        secondVicunaCompileInput[i] = torch_mlir.TensorPlaceholder.like(
+                            secondVicunaCompileInput[i], dynamic_axes=[2]
+                        )
+                secondVicunaCompileInput = tuple(secondVicunaCompileInput)
+                print(f"[DEBUG] generating torch mlir")
+                if self.precision in ["int4", "int8"]:
+                    second_module = torch_mlir.compile(
+                        ts_graph,
+                        [*secondVicunaCompileInput],
+                        output_type=torch_mlir.OutputType.TORCH,
+                        backend_legal_ops=["quant.matmul_rhs_group_quant"],
+                        extra_library=brevitas_matmul_rhs_group_quant_library,
+                        use_tracing=False,
+                        verbose=False,
+                    )
+                    print(f"[DEBUG] converting torch to linalg")
+                    run_pipeline_with_repro_report(
+                        second_module,
+                        "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+                        description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
+                    )
+                else:
+                    second_module = torch_mlir.compile(
+                        ts_graph,
+                        [*secondVicunaCompileInput],
+                        torch_mlir.OutputType.LINALG_ON_TENSORS,
+                        use_tracing=False,
+                        verbose=False,
+                    )
+                del ts_graph
+                del secondVicunaCompileInput
+                gc.collect()
+
+                print(
+                    "[DEBUG] successfully generated second vicuna linalg mlir"
+                )
+                second_module = self.write_in_dynamic_inputs1(
+                    str(second_module)
+                )
+                if self.cache_vicunas:
+                    with open(second_model_path, "w+") as f:
+                        f.write(second_module)
+                    print("Finished writing IR after dynamic")
+
+            combined_module = self.combine_mlir_scripts(
+                first_module,
+                second_module,
+                self.vicuna_mlir_path,
+            )
+            del first_module, second_module
+
+        print(self.device)
+        if "rocm" in self.device:
+            self.device = "rocm"
        shark_module = SharkInference(
            mlir_module=combined_module,
            device=self.device,
@@ -1606,7 +1666,9 @@ class UnshardedVicuna(VicunaBase):
                "--iree-vm-target-truncate-unsupported-floats",
                "--iree-codegen-check-ir-before-llvm-conversion=false",
                "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
-            ] + self.extra_args,
+            ]
+            + self.extra_args,
+            debug=self.debug,
        )
        print("Saved vic vmfb at ", str(path))
        shark_module.load_module(path)
@@ -1617,9 +1679,8 @@ class UnshardedVicuna(VicunaBase):
            if type(res_tokens[i]) != int:
                res_tokens[i] = int(res_tokens[i][0])

-        skip_sp_tok = True if self.model_name == "codegen" else False
        res_str = self.tokenizer.decode(
-            res_tokens, skip_special_tokens=skip_sp_tok
+            res_tokens, skip_special_tokens=False
        )
        return res_str

@@ -1662,7 +1723,7 @@ class UnshardedVicuna(VicunaBase):
            pkv = generated_token_op["past_key_values"]
            detok = generated_token_op["detok"]

-            if token == 2 and self.model_name != "codegen":
+            if token == 2:
                break
            res_tokens.append(token)
            if detok == "<0x0A>":
@@ -1674,13 +1735,13 @@ class UnshardedVicuna(VicunaBase):
            yield detok, ""

        res_str = self.decode_tokens(res_tokens)
-        # print(f"[DEBUG] final output : \n{res_str}")
        yield res_str, "formatted"

    def autocomplete(self, prompt):
        # use First vic alone to complete a story / prompt / sentence.
        pass

+
 # NOTE: Each `model_name` should have its own start message
 start_message = {
    "llama2_7b": (
@@ -1692,6 +1753,15 @@ start_message = {
        "explain why instead of answering something not correct. If you don't know the "
        "answer to a question, please don't share false information."
    ),
+    "llama2_13b": (
+        "System: You are a helpful, respectful and honest assistant. Always answer "
+        "as helpfully as possible, while being safe.  Your answers should not "
+        "include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal "
+        "content. Please ensure that your responses are socially unbiased and positive "
+        "in nature. If a question does not make any sense, or is not factually coherent, "
+        "explain why instead of answering something not correct. If you don't know the "
+        "answer to a question, please don't share false information."
+    ),
    "llama2_70b": (
        "System: You are a helpful, respectful and honest assistant. Always answer "
        "as helpfully as possible, while being safe.  Your answers should not "
@@ -1701,46 +1771,37 @@ start_message = {
        "explain why instead of answering something not correct. If you don't know the "
        "answer to a question, please don't share false information."
    ),
-    "StableLM": (
-        "<|SYSTEM|># StableLM Tuned (Alpha version)"
-        "\n- StableLM is a helpful and harmless open-source AI language model "
-        "developed by StabilityAI."
-        "\n- StableLM is excited to be able to help the user, but will refuse "
-        "to do anything that could be considered harmful to the user."
-        "\n- StableLM is more than just an information source, StableLM is also "
-        "able to write poetry, short stories, and make jokes."
-        "\n- StableLM will refuse to participate in anything that "
-        "could harm a human."
-    ),
    "vicuna": (
        "A chat between a curious user and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's "
        "questions.\n"
    ),
-    "vicuna4": (
-        "A chat between a curious user and an artificial intelligence assistant. "
-        "The assistant gives helpful, detailed, and polite answers to the user's "
-        "questions.\n"
-    ),
-    "vicuna1p3": (
-        "A chat between a curious user and an artificial intelligence assistant. "
-        "The assistant gives helpful, detailed, and polite answers to the user's "
-        "questions.\n"
-    ),
-    "codegen": "",
 }

+
 def create_prompt(model_name, history):
    global start_message
    system_message = start_message[model_name]
-    conversation = "".join(
-        [
-            "".join(["<|USER|>" + item[0], "<|ASSISTANT|>" + item[1]])
-            for item in history
-        ]
-    )
-    msg = system_message + conversation
-    msg = msg.strip()
+    if "llama2" in model_name:
+        B_INST, E_INST = "[INST]", "[/INST]"
+        B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+        conversation = "".join(
+            [
+                f"{B_INST} {item[0].strip()} {E_INST} {item[1].strip()} "
+                for item in history[1:]
+            ]
+        )
+        msg = f"{B_INST} {B_SYS} {system_message} {E_SYS} {history[0][0]} {E_INST} {history[0][1]} {conversation}"
+
+    else:
+        conversation = "".join(
+            [
+                "".join(["<|USER|>" + item[0], "<|ASSISTANT|>" + item[1]])
+                for item in history
+            ]
+        )
+        msg = system_message + conversation
+        msg = msg.strip()
    return msg


@@ -1811,6 +1872,7 @@ if __name__ == "__main__":
    model_list = {
        "vicuna": "vicuna=>TheBloke/vicuna-7B-1.1-HF",
        "llama2_7b": "llama2_7b=>meta-llama/Llama-2-7b-chat-hf",
+        "llama2_13b": "llama2_13b=>meta-llama/Llama-2-13b-chat-hf",
        "llama2_70b": "llama2_70b=>meta-llama/Llama-2-70b-chat-hf",
    }
    while True:
@@ -1820,5 +1882,5 @@ if __name__ == "__main__":
        prompt = create_prompt(args.model_name, history)
        for text, msg in vic.generate(prompt, cli=True):
            if "formatted" in msg:
-                print("Response:",text)
+                print("Response:", text)
                history[-1][1] = text
--- a/apps/language_models/shark_llama_cli.spec
+++ b/apps/language_models/shark_llama_cli.spec
@@ -0,0 +1,94 @@
+# -*- mode: python ; coding: utf-8 -*-
+from PyInstaller.utils.hooks import collect_data_files
+from PyInstaller.utils.hooks import collect_submodules
+from PyInstaller.utils.hooks import copy_metadata
+
+import sys ; sys.setrecursionlimit(sys.getrecursionlimit() * 5)
+
+datas = []
+datas += collect_data_files('torch')
+datas += copy_metadata('torch')
+datas += copy_metadata('tqdm')
+datas += copy_metadata('regex')
+datas += copy_metadata('requests')
+datas += copy_metadata('packaging')
+datas += copy_metadata('filelock')
+datas += copy_metadata('numpy')
+datas += copy_metadata('tokenizers')
+datas += copy_metadata('importlib_metadata')
+datas += copy_metadata('torch-mlir')
+datas += copy_metadata('omegaconf')
+datas += copy_metadata('safetensors')
+datas += copy_metadata('huggingface-hub')
+datas += copy_metadata('sentencepiece')
+datas += copy_metadata("pyyaml")
+datas += collect_data_files("tokenizers")
+datas += collect_data_files("tiktoken")
+datas += collect_data_files("accelerate")
+datas += collect_data_files('diffusers')
+datas += collect_data_files('transformers')
+datas += collect_data_files('opencv-python')
+datas += collect_data_files('pytorch_lightning')
+datas += collect_data_files('skimage')
+datas += collect_data_files('gradio')
+datas += collect_data_files('gradio_client')
+datas += collect_data_files('iree')
+datas += collect_data_files('google-cloud-storage')
+datas += collect_data_files('py-cpuinfo')
+datas += collect_data_files("shark", include_py_files=True)
+datas += collect_data_files("timm", include_py_files=True)
+datas += collect_data_files("tqdm")
+datas += collect_data_files("tkinter")
+datas += collect_data_files("webview")
+datas += collect_data_files("sentencepiece")
+datas += collect_data_files("jsonschema")
+datas += collect_data_files("jsonschema_specifications")
+datas += collect_data_files("cpuinfo")
+datas += collect_data_files("langchain")
+
+binaries = []
+
+block_cipher = None
+
+hiddenimports = ['shark', 'shark.shark_inference', 'apps']
+hiddenimports += [x for x in collect_submodules("skimage") if "tests" not in x]
+hiddenimports += [x for x in collect_submodules("iree") if "tests" not in x]
+
+a = Analysis(
+    ['scripts/vicuna.py'],
+    pathex=['.'],
+    binaries=binaries,
+    datas=datas,
+    hiddenimports=hiddenimports,
+    hookspath=[],
+    hooksconfig={},
+    runtime_hooks=[],
+    excludes=[],
+    win_no_prefer_redirects=False,
+    win_private_assemblies=False,
+    cipher=block_cipher,
+    noarchive=False,
+)
+pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
+
+exe = EXE(
+    pyz,
+    a.scripts,
+    a.binaries,
+    a.zipfiles,
+    a.datas,
+    [],
+    name='shark_llama_cli',
+    debug=False,
+    bootloader_ignore_signals=False,
+    strip=False,
+    upx=True,
+    upx_exclude=[],
+    runtime_tmpdir=None,
+    console=True,
+    disable_windowed_traceback=False,
+    argv_emulation=False,
+    target_arch=None,
+    codesign_identity=None,
+    entitlements_file=None,
+)
--- a/apps/language_models/src/model_wrappers/vicuna4.py
+++ b/apps/language_models/src/model_wrappers/vicuna4.py
@@ -47,7 +47,7 @@ from apps.language_models.src.model_wrappers.vicuna_sharded_model import (
 )
 from apps.language_models.src.model_wrappers.vicuna_model import (
    FirstVicuna,
-    SecondVicuna,
+    SecondVicuna7B,
 )
 from apps.language_models.utils import (
    get_vmfb_from_path,
@@ -57,8 +57,6 @@ from shark.shark_importer import get_f16_inputs
 from shark.shark_importer import import_with_fx
 from shark.shark_inference import SharkInference

-from brevitas_examples.llm.llm_quant.quantize import quantize_model
-from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
 from transformers.models.llama.configuration_llama import LlamaConfig
 from transformers.models.llama.modeling_llama import (
    LlamaDecoderLayer,
--- a/apps/language_models/src/model_wrappers/vicuna_model.py
+++ b/apps/language_models/src/model_wrappers/vicuna_model.py
@@ -1,9 +1,6 @@
 import torch
 from transformers import AutoModelForCausalLM

-from brevitas_examples.llm.llm_quant.quantize import quantize_model
-from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
-

 class FirstVicuna(torch.nn.Module):
    def __init__(
@@ -21,12 +18,18 @@ class FirstVicuna(torch.nn.Module):
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **kwargs
        )
+        print(f"[DEBUG] model_path : {model_path}")
        if precision in ["int4", "int8"]:
+            from brevitas_examples.llm.llm_quant.quantize import quantize_model
+            from brevitas_examples.llm.llm_quant.run_utils import (
+                get_model_impl,
+            )
+
            print("First Vicuna applying weight quantization..")
            weight_bit_width = 4 if precision == "int4" else 8
            quantize_model(
                get_model_impl(self.model).layers,
-                dtype=torch.float32,
+                dtype=torch.float16 if precision == "int4" else torch.float32,
                weight_bit_width=weight_bit_width,
                weight_param_method="stats",
                weight_scale_precision="float",
@@ -48,7 +51,7 @@ class FirstVicuna(torch.nn.Module):
        return tuple(return_vals)


-class SecondVicuna(torch.nn.Module):
+class SecondVicuna7B(torch.nn.Module):
    def __init__(
        self,
        model_path,
@@ -64,12 +67,18 @@ class SecondVicuna(torch.nn.Module):
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **kwargs
        )
+        print(f"[DEBUG] model_path : {model_path}")
        if precision in ["int4", "int8"]:
+            from brevitas_examples.llm.llm_quant.quantize import quantize_model
+            from brevitas_examples.llm.llm_quant.run_utils import (
+                get_model_impl,
+            )
+
            print("Second Vicuna applying weight quantization..")
            weight_bit_width = 4 if precision == "int4" else 8
            quantize_model(
                get_model_impl(self.model).layers,
-                dtype=torch.float32,
+                dtype=torch.float16 if precision == "int4" else torch.float32,
                weight_bit_width=weight_bit_width,
                weight_param_method="stats",
                weight_scale_precision="float",
@@ -148,8 +157,6 @@ class SecondVicuna(torch.nn.Module):
        i63,
        i64,
    ):
-        # input_ids = input_tuple[0]
-        # input_tuple = torch.unbind(pkv, dim=0)
        token = i0
        past_key_values = (
            (i1, i2),
@@ -290,6 +297,833 @@ class SecondVicuna(torch.nn.Module):
        return tuple(return_vals)


+class SecondVicuna13B(torch.nn.Module):
+    def __init__(
+        self,
+        model_path,
+        precision="int8",
+        weight_group_size=128,
+        model_name="vicuna",
+        hf_auth_token: str = None,
+    ):
+        super().__init__()
+        kwargs = {"torch_dtype": torch.float32}
+        if "llama2" in model_name:
+            kwargs["use_auth_token"] = hf_auth_token
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, **kwargs
+        )
+        if precision in ["int4", "int8"]:
+            from brevitas_examples.llm.llm_quant.quantize import quantize_model
+            from brevitas_examples.llm.llm_quant.run_utils import (
+                get_model_impl,
+            )
+
+            print("Second Vicuna applying weight quantization..")
+            weight_bit_width = 4 if precision == "int4" else 8
+            quantize_model(
+                get_model_impl(self.model).layers,
+                dtype=torch.float16 if precision == "int4" else torch.float32,
+                weight_bit_width=weight_bit_width,
+                weight_param_method="stats",
+                weight_scale_precision="float",
+                weight_quant_type="asym",
+                weight_quant_granularity="per_group",
+                weight_group_size=weight_group_size,
+                quantize_weight_zero_point=False,
+            )
+            print("Weight quantization applied.")
+
+    def forward(
+        self,
+        i0,
+        i1,
+        i2,
+        i3,
+        i4,
+        i5,
+        i6,
+        i7,
+        i8,
+        i9,
+        i10,
+        i11,
+        i12,
+        i13,
+        i14,
+        i15,
+        i16,
+        i17,
+        i18,
+        i19,
+        i20,
+        i21,
+        i22,
+        i23,
+        i24,
+        i25,
+        i26,
+        i27,
+        i28,
+        i29,
+        i30,
+        i31,
+        i32,
+        i33,
+        i34,
+        i35,
+        i36,
+        i37,
+        i38,
+        i39,
+        i40,
+        i41,
+        i42,
+        i43,
+        i44,
+        i45,
+        i46,
+        i47,
+        i48,
+        i49,
+        i50,
+        i51,
+        i52,
+        i53,
+        i54,
+        i55,
+        i56,
+        i57,
+        i58,
+        i59,
+        i60,
+        i61,
+        i62,
+        i63,
+        i64,
+        i65,
+        i66,
+        i67,
+        i68,
+        i69,
+        i70,
+        i71,
+        i72,
+        i73,
+        i74,
+        i75,
+        i76,
+        i77,
+        i78,
+        i79,
+        i80,
+    ):
+        token = i0
+        past_key_values = (
+            (i1, i2),
+            (
+                i3,
+                i4,
+            ),
+            (
+                i5,
+                i6,
+            ),
+            (
+                i7,
+                i8,
+            ),
+            (
+                i9,
+                i10,
+            ),
+            (
+                i11,
+                i12,
+            ),
+            (
+                i13,
+                i14,
+            ),
+            (
+                i15,
+                i16,
+            ),
+            (
+                i17,
+                i18,
+            ),
+            (
+                i19,
+                i20,
+            ),
+            (
+                i21,
+                i22,
+            ),
+            (
+                i23,
+                i24,
+            ),
+            (
+                i25,
+                i26,
+            ),
+            (
+                i27,
+                i28,
+            ),
+            (
+                i29,
+                i30,
+            ),
+            (
+                i31,
+                i32,
+            ),
+            (
+                i33,
+                i34,
+            ),
+            (
+                i35,
+                i36,
+            ),
+            (
+                i37,
+                i38,
+            ),
+            (
+                i39,
+                i40,
+            ),
+            (
+                i41,
+                i42,
+            ),
+            (
+                i43,
+                i44,
+            ),
+            (
+                i45,
+                i46,
+            ),
+            (
+                i47,
+                i48,
+            ),
+            (
+                i49,
+                i50,
+            ),
+            (
+                i51,
+                i52,
+            ),
+            (
+                i53,
+                i54,
+            ),
+            (
+                i55,
+                i56,
+            ),
+            (
+                i57,
+                i58,
+            ),
+            (
+                i59,
+                i60,
+            ),
+            (
+                i61,
+                i62,
+            ),
+            (
+                i63,
+                i64,
+            ),
+            (
+                i65,
+                i66,
+            ),
+            (
+                i67,
+                i68,
+            ),
+            (
+                i69,
+                i70,
+            ),
+            (
+                i71,
+                i72,
+            ),
+            (
+                i73,
+                i74,
+            ),
+            (
+                i75,
+                i76,
+            ),
+            (
+                i77,
+                i78,
+            ),
+            (
+                i79,
+                i80,
+            ),
+        )
+        op = self.model(
+            input_ids=token, use_cache=True, past_key_values=past_key_values
+        )
+        return_vals = []
+        return_vals.append(op.logits)
+        temp_past_key_values = op.past_key_values
+        for item in temp_past_key_values:
+            return_vals.append(item[0])
+            return_vals.append(item[1])
+        return tuple(return_vals)
+
+
+class SecondVicuna70B(torch.nn.Module):
+    def __init__(
+        self,
+        model_path,
+        precision="fp32",
+        weight_group_size=128,
+        model_name="vicuna",
+        hf_auth_token: str = None,
+    ):
+        super().__init__()
+        kwargs = {"torch_dtype": torch.float32}
+        if "llama2" in model_name:
+            kwargs["use_auth_token"] = hf_auth_token
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, **kwargs
+        )
+        print(f"[DEBUG] model_path : {model_path}")
+        if precision in ["int4", "int8"]:
+            from brevitas_examples.llm.llm_quant.quantize import quantize_model
+            from brevitas_examples.llm.llm_quant.run_utils import (
+                get_model_impl,
+            )
+
+            print("Second Vicuna applying weight quantization..")
+            weight_bit_width = 4 if precision == "int4" else 8
+            quantize_model(
+                get_model_impl(self.model).layers,
+                dtype=torch.float16,
+                weight_bit_width=weight_bit_width,
+                weight_param_method="stats",
+                weight_scale_precision="float",
+                weight_quant_type="asym",
+                weight_quant_granularity="per_group",
+                weight_group_size=weight_group_size,
+                quantize_weight_zero_point=False,
+            )
+            print("Weight quantization applied.")
+
+    def forward(
+        self,
+        i0,
+        i1,
+        i2,
+        i3,
+        i4,
+        i5,
+        i6,
+        i7,
+        i8,
+        i9,
+        i10,
+        i11,
+        i12,
+        i13,
+        i14,
+        i15,
+        i16,
+        i17,
+        i18,
+        i19,
+        i20,
+        i21,
+        i22,
+        i23,
+        i24,
+        i25,
+        i26,
+        i27,
+        i28,
+        i29,
+        i30,
+        i31,
+        i32,
+        i33,
+        i34,
+        i35,
+        i36,
+        i37,
+        i38,
+        i39,
+        i40,
+        i41,
+        i42,
+        i43,
+        i44,
+        i45,
+        i46,
+        i47,
+        i48,
+        i49,
+        i50,
+        i51,
+        i52,
+        i53,
+        i54,
+        i55,
+        i56,
+        i57,
+        i58,
+        i59,
+        i60,
+        i61,
+        i62,
+        i63,
+        i64,
+        i65,
+        i66,
+        i67,
+        i68,
+        i69,
+        i70,
+        i71,
+        i72,
+        i73,
+        i74,
+        i75,
+        i76,
+        i77,
+        i78,
+        i79,
+        i80,
+        i81,
+        i82,
+        i83,
+        i84,
+        i85,
+        i86,
+        i87,
+        i88,
+        i89,
+        i90,
+        i91,
+        i92,
+        i93,
+        i94,
+        i95,
+        i96,
+        i97,
+        i98,
+        i99,
+        i100,
+        i101,
+        i102,
+        i103,
+        i104,
+        i105,
+        i106,
+        i107,
+        i108,
+        i109,
+        i110,
+        i111,
+        i112,
+        i113,
+        i114,
+        i115,
+        i116,
+        i117,
+        i118,
+        i119,
+        i120,
+        i121,
+        i122,
+        i123,
+        i124,
+        i125,
+        i126,
+        i127,
+        i128,
+        i129,
+        i130,
+        i131,
+        i132,
+        i133,
+        i134,
+        i135,
+        i136,
+        i137,
+        i138,
+        i139,
+        i140,
+        i141,
+        i142,
+        i143,
+        i144,
+        i145,
+        i146,
+        i147,
+        i148,
+        i149,
+        i150,
+        i151,
+        i152,
+        i153,
+        i154,
+        i155,
+        i156,
+        i157,
+        i158,
+        i159,
+        i160,
+    ):
+        token = i0
+        past_key_values = (
+            (i1, i2),
+            (
+                i3,
+                i4,
+            ),
+            (
+                i5,
+                i6,
+            ),
+            (
+                i7,
+                i8,
+            ),
+            (
+                i9,
+                i10,
+            ),
+            (
+                i11,
+                i12,
+            ),
+            (
+                i13,
+                i14,
+            ),
+            (
+                i15,
+                i16,
+            ),
+            (
+                i17,
+                i18,
+            ),
+            (
+                i19,
+                i20,
+            ),
+            (
+                i21,
+                i22,
+            ),
+            (
+                i23,
+                i24,
+            ),
+            (
+                i25,
+                i26,
+            ),
+            (
+                i27,
+                i28,
+            ),
+            (
+                i29,
+                i30,
+            ),
+            (
+                i31,
+                i32,
+            ),
+            (
+                i33,
+                i34,
+            ),
+            (
+                i35,
+                i36,
+            ),
+            (
+                i37,
+                i38,
+            ),
+            (
+                i39,
+                i40,
+            ),
+            (
+                i41,
+                i42,
+            ),
+            (
+                i43,
+                i44,
+            ),
+            (
+                i45,
+                i46,
+            ),
+            (
+                i47,
+                i48,
+            ),
+            (
+                i49,
+                i50,
+            ),
+            (
+                i51,
+                i52,
+            ),
+            (
+                i53,
+                i54,
+            ),
+            (
+                i55,
+                i56,
+            ),
+            (
+                i57,
+                i58,
+            ),
+            (
+                i59,
+                i60,
+            ),
+            (
+                i61,
+                i62,
+            ),
+            (
+                i63,
+                i64,
+            ),
+            (
+                i65,
+                i66,
+            ),
+            (
+                i67,
+                i68,
+            ),
+            (
+                i69,
+                i70,
+            ),
+            (
+                i71,
+                i72,
+            ),
+            (
+                i73,
+                i74,
+            ),
+            (
+                i75,
+                i76,
+            ),
+            (
+                i77,
+                i78,
+            ),
+            (
+                i79,
+                i80,
+            ),
+            (
+                i81,
+                i82,
+            ),
+            (
+                i83,
+                i84,
+            ),
+            (
+                i85,
+                i86,
+            ),
+            (
+                i87,
+                i88,
+            ),
+            (
+                i89,
+                i90,
+            ),
+            (
+                i91,
+                i92,
+            ),
+            (
+                i93,
+                i94,
+            ),
+            (
+                i95,
+                i96,
+            ),
+            (
+                i97,
+                i98,
+            ),
+            (
+                i99,
+                i100,
+            ),
+            (
+                i101,
+                i102,
+            ),
+            (
+                i103,
+                i104,
+            ),
+            (
+                i105,
+                i106,
+            ),
+            (
+                i107,
+                i108,
+            ),
+            (
+                i109,
+                i110,
+            ),
+            (
+                i111,
+                i112,
+            ),
+            (
+                i113,
+                i114,
+            ),
+            (
+                i115,
+                i116,
+            ),
+            (
+                i117,
+                i118,
+            ),
+            (
+                i119,
+                i120,
+            ),
+            (
+                i121,
+                i122,
+            ),
+            (
+                i123,
+                i124,
+            ),
+            (
+                i125,
+                i126,
+            ),
+            (
+                i127,
+                i128,
+            ),
+            (
+                i129,
+                i130,
+            ),
+            (
+                i131,
+                i132,
+            ),
+            (
+                i133,
+                i134,
+            ),
+            (
+                i135,
+                i136,
+            ),
+            (
+                i137,
+                i138,
+            ),
+            (
+                i139,
+                i140,
+            ),
+            (
+                i141,
+                i142,
+            ),
+            (
+                i143,
+                i144,
+            ),
+            (
+                i145,
+                i146,
+            ),
+            (
+                i147,
+                i148,
+            ),
+            (
+                i149,
+                i150,
+            ),
+            (
+                i151,
+                i152,
+            ),
+            (
+                i153,
+                i154,
+            ),
+            (
+                i155,
+                i156,
+            ),
+            (
+                i157,
+                i158,
+            ),
+            (
+                i159,
+                i160,
+            ),
+        )
+        op = self.model(
+            input_ids=token, use_cache=True, past_key_values=past_key_values
+        )
+        return_vals = []
+        return_vals.append(op.logits)
+        temp_past_key_values = op.past_key_values
+        for item in temp_past_key_values:
+            return_vals.append(item[0])
+            return_vals.append(item[1])
+        return tuple(return_vals)
+
+
 class CombinedModel(torch.nn.Module):
    def __init__(
        self,
@@ -298,7 +1132,8 @@ class CombinedModel(torch.nn.Module):
    ):
        super().__init__()
        self.first_vicuna = FirstVicuna(first_vicuna_model_path)
-        self.second_vicuna = SecondVicuna(second_vicuna_model_path)
+        # NOT using this path for 13B currently, hence using `SecondVicuna7B`.
+        self.second_vicuna = SecondVicuna7B(second_vicuna_model_path)

    def forward(self, input_ids):
        first_output = self.first_vicuna(input_ids=input_ids)
--- a/apps/language_models/src/pipelines/SharkLLMBase.py
+++ b/apps/language_models/src/pipelines/SharkLLMBase.py
@@ -3,7 +3,10 @@ from abc import ABC, abstractmethod

 class SharkLLMBase(ABC):
    def __init__(
-        self, model_name, hf_model_path=None, max_num_tokens=512
+        self,
+        model_name,
+        hf_model_path=None,
+        max_num_tokens=512,
    ) -> None:
        self.model_name = model_name
        self.hf_model_path = hf_model_path
--- a/apps/language_models/src/pipelines/falcon_pipeline.py
+++ b/apps/language_models/src/pipelines/falcon_pipeline.py
@@ -71,6 +71,7 @@ class Falcon(SharkLLMBase):
        precision="fp32",
        falcon_mlir_path=None,
        falcon_vmfb_path=None,
+        debug=False,
    ) -> None:
        super().__init__(model_name, hf_model_path, max_num_tokens)
        self.max_padding_length = 100
@@ -78,6 +79,7 @@ class Falcon(SharkLLMBase):
        self.precision = precision
        self.falcon_vmfb_path = falcon_vmfb_path
        self.falcon_mlir_path = falcon_mlir_path
+        self.debug = debug
        self.tokenizer = self.get_tokenizer()
        self.shark_model = self.compile()
        self.src_model = self.get_src_model()
@@ -208,6 +210,7 @@ class Falcon(SharkLLMBase):
                "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
                "--iree-spirv-index-bits=64",
            ],
+            debug=self.debug,
        )
        print("Saved falcon vmfb at ", str(path))
        shark_module.load_module(path)
--- a/apps/language_models/src/pipelines/minigpt4_pipeline.py
+++ b/apps/language_models/src/pipelines/minigpt4_pipeline.py
@@ -178,7 +178,7 @@ def load_vmfb(extended_model_name, device, mlir_dialect, extra_args=[]):


 def compile_module(
-    shark_module, extended_model_name, generate_vmfb, extra_args=[]
+    shark_module, extended_model_name, generate_vmfb, extra_args=[], debug=False,
 ):
    if generate_vmfb:
        vmfb_path = os.path.join(os.getcwd(), extended_model_name + ".vmfb")
@@ -190,7 +190,7 @@ def compile_module(
                "No vmfb found. Compiling and saving to {}".format(vmfb_path)
            )
            path = shark_module.save_module(
-                os.getcwd(), extended_model_name, extra_args
+                os.getcwd(), extended_model_name, extra_args, debug=debug
            )
            shark_module.load_module(path, extra_args=extra_args)
    else:
@@ -199,7 +199,7 @@ def compile_module(


 def compile_int_precision(
-    model, inputs, precision, device, generate_vmfb, extended_model_name
+    model, inputs, precision, device, generate_vmfb, extended_model_name, debug=False
 ):
    torchscript_module = import_with_fx(
        model,
@@ -219,7 +219,7 @@ def compile_int_precision(
    print(f"[DEBUG] converting torch to linalg")
    run_pipeline_with_repro_report(
        mlir_module,
-        "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+        "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
        description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
    )
    from contextlib import redirect_stdout
@@ -251,6 +251,7 @@ def compile_int_precision(
            extended_model_name=extended_model_name,
            generate_vmfb=generate_vmfb,
            extra_args=extra_args,
+            debug=debug,
        ),
        bytecode,
    )
@@ -294,6 +295,7 @@ def shark_compile_through_fx_int(
        device,
        generate_or_load_vmfb,
        extended_model_name,
+        debug,
    )
    extra_args = [
        "--iree-hal-dump-executable-sources-to=ies",
--- a/apps/language_models/src/pipelines/stablelm_pipeline.py
+++ b/apps/language_models/src/pipelines/stablelm_pipeline.py
@@ -32,11 +32,13 @@ class SharkStableLM(SharkLLMBase):
        max_num_tokens=512,
        device="cuda",
        precision="fp32",
+        debug="False",
    ) -> None:
        super().__init__(model_name, hf_model_path, max_num_tokens)
        self.max_sequence_len = 256
        self.device = device
        self.precision = precision
+        self.debug = debug
        self.tokenizer = self.get_tokenizer()
        self.shark_model = self.compile()

@@ -111,7 +113,7 @@ class SharkStableLM(SharkLLMBase):
        shark_module.compile()

        path = shark_module.save_module(
-            vmfb_path.parent.absolute(), vmfb_path.stem
+            vmfb_path.parent.absolute(), vmfb_path.stem, debug=self.debug
        )
        print("Saved vmfb at ", str(path))

--- a/apps/stable_diffusion/profiling_with_iree.md
+++ b/apps/stable_diffusion/profiling_with_iree.md
@@ -7,16 +7,16 @@ Compile Commands FP32/FP16:

 ```shell
 Vulkan AMD: 
-iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
+iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna2-unknown-linux /path/to/input/mlir -o /path/to/output/vmfb

 #  add --mlir-print-debuginfo --mlir-print-op-on-diagnostic=true for debug
 #  use –iree-input-type=auto or "mhlo_legacy" or "stablehlo" for TF models

 CUDA NVIDIA:
-iree-compile --iree-input-type=none --iree-hal-target-backends=cuda --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
+iree-compile --iree-input-type=none --iree-hal-target-backends=cuda /path/to/input/mlir -o /path/to/output/vmfb

 CPU:
-iree-compile --iree-input-type=none --iree-hal-target-backends=llvm-cpu  --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
+iree-compile --iree-input-type=none --iree-hal-target-backends=llvm-cpu /path/to/input/mlir -o /path/to/output/vmfb
 ```


--- a/apps/stable_diffusion/shark_studio_imports.py
+++ b/apps/stable_diffusion/shark_studio_imports.py
@@ -15,8 +15,8 @@ pathex = [

 # datafiles for pyinstaller
 datas = []
-datas += collect_data_files("torch")
 datas += copy_metadata("torch")
+datas += copy_metadata("tokenizers")
 datas += copy_metadata("tqdm")
 datas += copy_metadata("regex")
 datas += copy_metadata("requests")
@@ -31,20 +31,20 @@ datas += copy_metadata("Pillow")
 datas += copy_metadata("sentencepiece")
 datas += copy_metadata("pyyaml")
 datas += copy_metadata("huggingface-hub")
+datas += collect_data_files("torch")
 datas += collect_data_files("tokenizers")
 datas += collect_data_files("tiktoken")
 datas += collect_data_files("accelerate")
 datas += collect_data_files("diffusers")
 datas += collect_data_files("transformers")
 datas += collect_data_files("pytorch_lightning")
-datas += collect_data_files("opencv_python")
 datas += collect_data_files("skimage")
 datas += collect_data_files("gradio")
 datas += collect_data_files("gradio_client")
 datas += collect_data_files("iree")
-datas += collect_data_files("google_cloud_storage")
 datas += collect_data_files("shark", include_py_files=True)
 datas += collect_data_files("timm", include_py_files=True)
+datas += collect_data_files("tqdm")
 datas += collect_data_files("tkinter")
 datas += collect_data_files("webview")
 datas += collect_data_files("sentencepiece")
@@ -52,6 +52,7 @@ datas += collect_data_files("jsonschema")
 datas += collect_data_files("jsonschema_specifications")
 datas += collect_data_files("cpuinfo")
 datas += collect_data_files("langchain")
+datas += collect_data_files("cv2")
 datas += [
    ("src/utils/resources/prompts.json", "resources"),
    ("src/utils/resources/model_db.json", "resources"),
@@ -73,8 +74,11 @@ datas += [
 # hidden imports for pyinstaller
 hiddenimports = ["shark", "shark.shark_inference", "apps"]
 hiddenimports += [x for x in collect_submodules("skimage") if "tests" not in x]
+blacklist = ["tests", "convert"]
 hiddenimports += [
-    x for x in collect_submodules("transformers") if "tests" not in x
+    x
+    for x in collect_submodules("transformers")
+    if not any(kw in x for kw in blacklist)
 ]
 hiddenimports += [x for x in collect_submodules("iree") if "tests" not in x]
-hiddenimports += ["iree._runtime", "iree._runtime_libs"]
+hiddenimports += ["iree._runtime", "iree.compiler._mlir_libs._mlir.ir"]
--- a/apps/stable_diffusion/src/models/model_wrappers.py
+++ b/apps/stable_diffusion/src/models/model_wrappers.py
@@ -177,6 +177,7 @@ class SharkifyStableDiffusionModel:
            "unet",
            "unet512",
            "stencil_unet",
+            "stencil_unet_512",
            "vae",
            "vae_encode",
            "stencil_adaptor",
@@ -340,7 +341,7 @@ class SharkifyStableDiffusionModel:
        )
        return shark_vae, vae_mlir

-    def get_controlled_unet(self):
+    def get_controlled_unet(self, use_large=False):
        class ControlledUnetModel(torch.nn.Module):
            def __init__(
                self,
@@ -416,6 +417,16 @@ class SharkifyStableDiffusionModel:
        is_f16 = True if self.precision == "fp16" else False

        inputs = tuple(self.inputs["unet"])
+        model_name = "stencil_unet"
+        if use_large:
+            pad = (0, 0) * (len(inputs[2].shape) - 2)
+            pad = pad + (0, 512 - inputs[2].shape[1])
+            inputs = (
+                inputs[:2]
+                + (torch.nn.functional.pad(inputs[2], pad),)
+                + inputs[3:]
+            )
+            model_name = "stencil_unet_512"
        input_mask = [
            True,
            True,
@@ -438,13 +449,13 @@ class SharkifyStableDiffusionModel:
        shark_controlled_unet, controlled_unet_mlir = compile_through_fx(
            unet,
            inputs,
-            extended_model_name=self.model_name["stencil_unet"],
+            extended_model_name=self.model_name[model_name],
            is_f16=is_f16,
            f16_input_mask=input_mask,
            use_tuned=self.use_tuned,
            extra_args=get_opt_flags("unet", precision=self.precision),
            base_model_id=self.base_model_id,
-            model_name="stencil_unet",
+            model_name=model_name,
            precision=self.precision,
            return_mlir=self.return_mlir,
        )
@@ -766,7 +777,7 @@ class SharkifyStableDiffusionModel:
            else:
                return self.get_unet(use_large=use_large)
        else:
-            return self.get_controlled_unet()
+            return self.get_controlled_unet(use_large=use_large)

    def vae_encode(self):
        try:
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_img2img.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_img2img.py
@@ -84,13 +84,35 @@ class Image2ImagePipeline(StableDiffusionPipeline):
        num_inference_steps,
        strength,
        dtype,
+        resample_type,
    ):
        # Pre process image -> get image encoded -> process latents

        # TODO: process with variable HxW combos

-        # Pre process image
-        image = image.resize((width, height))
+        # Pre-process image
+        if resample_type == "Lanczos":
+            resample_type = Image.LANCZOS
+        elif resample_type == "Nearest Neighbor":
+            resample_type = Image.NEAREST
+        elif resample_type == "Bilinear":
+            resample_type = Image.BILINEAR
+        elif resample_type == "Bicubic":
+            resample_type = Image.BICUBIC
+        elif resample_type == "Adaptive":
+            resample_type = Image.ADAPTIVE
+        elif resample_type == "Antialias":
+            resample_type = Image.ANTIALIAS
+        elif resample_type == "Box":
+            resample_type = Image.BOX
+        elif resample_type == "Affine":
+            resample_type = Image.AFFINE
+        elif resample_type == "Cubic":
+            resample_type = Image.CUBIC
+        else:  # Fallback to Lanczos
+            resample_type = Image.LANCZOS
+
+        image = image.resize((width, height), resample=resample_type)
        image_arr = np.stack([np.array(i) for i in (image,)], axis=0)
        image_arr = image_arr / 255.0
        image_arr = torch.from_numpy(image_arr).permute(0, 3, 1, 2).to(dtype)
@@ -147,6 +169,7 @@ class Image2ImagePipeline(StableDiffusionPipeline):
        cpu_scheduling,
        max_embeddings_multiples,
        use_stencil,
+        resample_type,
    ):
        # prompts and negative prompts must be a list.
        if isinstance(prompts, str):
@@ -186,6 +209,7 @@ class Image2ImagePipeline(StableDiffusionPipeline):
            num_inference_steps=num_inference_steps,
            strength=strength,
            dtype=dtype,
+            resample_type=resample_type,
        )

        # Get Image latents
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_stencil.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_stencil.py
@@ -149,7 +149,7 @@ class StencilPipeline(StableDiffusionPipeline):
                ).to(dtype)
            else:
                latent_model_input_1 = latent_model_input
-            if text_embeddings.shapes[1] <= self.model_max_length:
+            if text_embeddings.shape[1] <= self.model_max_length:
                control = self.controlnet(
                    "forward",
                    (
@@ -175,29 +175,56 @@ class StencilPipeline(StableDiffusionPipeline):
            # Profiling Unet.
            profile_device = start_profiling(file_path="unet.rdc")
            # TODO: Pass `control` as it is to Unet. Same as TODO mentioned in model_wrappers.py.
-            noise_pred = self.unet(
-                "forward",
-                (
-                    latent_model_input,
-                    timestep,
-                    text_embeddings_numpy,
-                    guidance_scale,
-                    control[0],
-                    control[1],
-                    control[2],
-                    control[3],
-                    control[4],
-                    control[5],
-                    control[6],
-                    control[7],
-                    control[8],
-                    control[9],
-                    control[10],
-                    control[11],
-                    control[12],
-                ),
-                send_to_host=False,
-            )
+
+            if text_embeddings.shape[1] <= self.model_max_length:
+                noise_pred = self.unet(
+                    "forward",
+                    (
+                        latent_model_input,
+                        timestep,
+                        text_embeddings_numpy,
+                        guidance_scale,
+                        control[0],
+                        control[1],
+                        control[2],
+                        control[3],
+                        control[4],
+                        control[5],
+                        control[6],
+                        control[7],
+                        control[8],
+                        control[9],
+                        control[10],
+                        control[11],
+                        control[12],
+                    ),
+                    send_to_host=False,
+                )
+            else:
+                print(self.unet_512)
+                noise_pred = self.unet_512(
+                    "forward",
+                    (
+                        latent_model_input,
+                        timestep,
+                        text_embeddings_numpy,
+                        guidance_scale,
+                        control[0],
+                        control[1],
+                        control[2],
+                        control[3],
+                        control[4],
+                        control[5],
+                        control[6],
+                        control[7],
+                        control[8],
+                        control[9],
+                        control[10],
+                        control[11],
+                        control[12],
+                    ),
+                    send_to_host=False,
+                )
            end_profiling(profile_device)

            if cpu_scheduling:
@@ -246,6 +273,7 @@ class StencilPipeline(StableDiffusionPipeline):
        cpu_scheduling,
        max_embeddings_multiples,
        use_stencil,
+        resample_type,
    ):
        # Control Embedding check & conversion
        # TODO: 1. Change `num_images_per_prompt`.
--- a/apps/stable_diffusion/src/utils/sd_annotation.py
+++ b/apps/stable_diffusion/src/utils/sd_annotation.py
@@ -158,9 +158,9 @@ def load_lower_configs(base_model_id=None):
                f"{spec}.json"
            )

-    full_gs_url = config_bucket + config_name
    lowering_config_dir = os.path.join(WORKDIR, "configs", config_name)
    print("Loading lowering config file from ", lowering_config_dir)
+    full_gs_url = config_bucket + config_name
    download_public_file(full_gs_url, lowering_config_dir, True)
    return lowering_config_dir

@@ -281,13 +281,9 @@ def sd_model_annotation(mlir_model, model_name, base_model_id=None):
        if "rdna2" not in args.iree_vulkan_target_triple.split("-")[0]:
            use_winograd = True
            winograd_config_dir = load_winograd_configs()
-            winograd_model = annotate_with_winograd(
+            tuned_model = annotate_with_winograd(
                mlir_model, winograd_config_dir, model_name
            )
-            lowering_config_dir = load_lower_configs(base_model_id)
-            tuned_model = annotate_with_lower_configs(
-                winograd_model, lowering_config_dir, model_name, use_winograd
-            )
        else:
            tuned_model = mlir_model
    else:
--- a/apps/stable_diffusion/src/utils/stable_args.py
+++ b/apps/stable_diffusion/src/utils/stable_args.py
@@ -132,6 +132,57 @@ p.add_argument(
    "img2img.",
 )

+p.add_argument(
+    "--use_hiresfix",
+    type=bool,
+    default=False,
+    help="Use Hires Fix to do higher resolution images, while trying to "
+    "avoid the issues that come with it. This is accomplished by first "
+    "generating an image using txt2img, then running it through img2img.",
+)
+
+p.add_argument(
+    "--hiresfix_height",
+    type=int,
+    default=768,
+    choices=range(128, 769, 8),
+    help="The height of the Hires Fix image.",
+)
+
+p.add_argument(
+    "--hiresfix_width",
+    type=int,
+    default=768,
+    choices=range(128, 769, 8),
+    help="The width of the Hires Fix image.",
+)
+
+p.add_argument(
+    "--hiresfix_strength",
+    type=float,
+    default=0.6,
+    help="The denoising strength to apply for the Hires Fix.",
+)
+
+p.add_argument(
+    "--resample_type",
+    type=str,
+    default="Nearest Neighbor",
+    choices=[
+        "Lanczos",
+        "Nearest Neighbor",
+        "Bilinear",
+        "Bicubic",
+        "Adaptive",
+        "Antialias",
+        "Box",
+        "Affine",
+        "Cubic",
+    ],
+    help="The resample type to use when resizing an image before being run "
+    "through stable diffusion.",
+)
+
 ##############################################################################
 # Stable Diffusion Training Params
 ##############################################################################
@@ -519,6 +570,14 @@ p.add_argument(
    "in shark importer. Does nothing if import_mlir is false (the default).",
 )

+p.add_argument(
+    "--compile_debug",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Flag to toggle debug assert/verify flags for imported IR in the"
+    "iree-compiler. Default to false.",
+)
+
 p.add_argument(
    "--iree_constant_folding",
    default=True,
@@ -574,6 +633,13 @@ p.add_argument(
    help="Flag for enabling rest API.",
 )

+p.add_argument(
+    "--debug",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Flag for enabling debugging log in WebUI.",
+)
+
 p.add_argument(
    "--output_gallery",
    default=True,
--- a/apps/stable_diffusion/src/utils/utils.py
+++ b/apps/stable_diffusion/src/utils/utils.py
@@ -25,7 +25,7 @@ from shark.iree_utils.vulkan_utils import (
    get_iree_vulkan_runtime_flags,
 )
 from shark.iree_utils.metal_utils import get_metal_target_triple
-from shark.iree_utils.gpu_utils import get_cuda_sm_cc
+from shark.iree_utils.gpu_utils import get_cuda_sm_cc, get_iree_rocm_args
 from apps.stable_diffusion.src.utils.stable_args import args
 from apps.stable_diffusion.src.utils.resources import opt_flags
 from apps.stable_diffusion.src.utils.sd_annotation import sd_model_annotation
@@ -78,7 +78,7 @@ def _compile_module(shark_module, model_name, extra_args=[]):
                    )
                )
            path = shark_module.save_module(
-                os.getcwd(), model_name, extra_args
+                os.getcwd(), model_name, extra_args, debug=args.compile_debug
            )
            shark_module.load_module(path, extra_args=extra_args)
    else:
@@ -476,6 +476,8 @@ def get_available_devices():
    available_devices.extend(metal_devices)
    cuda_devices = get_devices_by_name("cuda")
    available_devices.extend(cuda_devices)
+    rocm_devices = get_devices_by_name("rocm")
+    available_devices.extend(rocm_devices)
    cpu_device = get_devices_by_name("cpu-sync")
    available_devices.extend(cpu_device)
    cpu_device = get_devices_by_name("cpu-task")
@@ -499,7 +501,10 @@ def get_opt_flags(model, precision="fp16"):
        iree_flags.append(
            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
        )
-
+    if "rocm" in args.device:
+        rocm_args = get_iree_rocm_args()
+        iree_flags.extend(rocm_args)
+        print(iree_flags)
    if args.iree_constant_folding == False:
        iree_flags.append("--iree-opt-const-expr-hoisting=False")
        iree_flags.append(
--- a/apps/stable_diffusion/web/index.py
+++ b/apps/stable_diffusion/web/index.py
@@ -1,6 +1,7 @@
 from multiprocessing import Process, freeze_support
 import os
 import sys
+import logging

 if sys.platform == "darwin":
    # import before IREE to avoid torch-MLIR library issues
@@ -41,6 +42,8 @@ def launch_app(address):


 if __name__ == "__main__":
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
    # required to do multiprocessing in a pyinstaller freeze
    freeze_support()
    if args.api or "api" in args.ui.split(","):
--- a/apps/stable_diffusion/web/ui/img2img_ui.py
+++ b/apps/stable_diffusion/web/ui/img2img_ui.py
@@ -3,6 +3,7 @@ import torch
 import time
 import gradio as gr
 import PIL
+from math import ceil
 from PIL import Image
 import base64
 from io import BytesIO
@@ -67,6 +68,7 @@ def img2img_inf(
    lora_hf_id: str,
    ondemand: bool,
    repeatable_seeds: bool,
+    resample_type: str,
 ):
    from apps.stable_diffusion.web.ui.utils import (
        get_custom_model_pathfile,
@@ -245,7 +247,7 @@ def img2img_inf(
            batch_size,
            height,
            width,
-            steps,
+            ceil(steps / strength),
            strength,
            guidance_scale,
            seeds[current_batch],
@@ -255,6 +257,7 @@ def img2img_inf(
            cpu_scheduling,
            args.max_embeddings_multiples,
            use_stencil=use_stencil,
+            resample_type=resample_type,
        )
        total_time = time.time() - start_time
        text_output = get_generation_text_info(
@@ -348,6 +351,7 @@ def img2img_api(
        lora_hf_id="",
        ondemand=False,
        repeatable_seeds=False,
+        resample_type="Lanczos",
    )

    # Converts generator type to subscriptable
@@ -432,7 +436,7 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                        lines=2,
                        elem_id="negative_prompt_box",
                    )
-
+                # TODO: make this import image prompt info if it exists
                img2img_init_image = gr.Image(
                    label="Input Image",
                    source="upload",
@@ -550,15 +554,6 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                        width = gr.Slider(
                            384, 768, value=args.width, step=8, label="Width"
                        )
-                        precision = gr.Radio(
-                            label="Precision",
-                            value=args.precision,
-                            choices=[
-                                "fp16",
-                                "fp32",
-                            ],
-                            visible=True,
-                        )
                        max_length = gr.Radio(
                            label="Max Length",
                            value=args.max_length,
@@ -581,11 +576,35 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                                step=0.01,
                                label="Denoising Strength",
                            )
+                            resample_type = gr.Dropdown(
+                                value=args.resample_type,
+                                choices=[
+                                    "Lanczos",
+                                    "Nearest Neighbor",
+                                    "Bilinear",
+                                    "Bicubic",
+                                    "Adaptive",
+                                    "Antialias",
+                                    "Box",
+                                    "Affine",
+                                    "Cubic",
+                                ],
+                                label="Resample Type",
+                            )
                        ondemand = gr.Checkbox(
                            value=args.ondemand,
                            label="Low VRAM",
                            interactive=True,
                        )
+                        precision = gr.Radio(
+                            label="Precision",
+                            value=args.precision,
+                            choices=[
+                                "fp16",
+                                "fp32",
+                            ],
+                            visible=True,
+                        )
                    with gr.Row():
                        with gr.Column(scale=3):
                            guidance_scale = gr.Slider(
@@ -695,6 +714,7 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                lora_hf_id,
                ondemand,
                repeatable_seeds,
+                resample_type,
            ],
            outputs=[img2img_gallery, std_output, img2img_status],
            show_progress="minimal" if args.progress_bar else "none",
--- a/apps/stable_diffusion/web/ui/minigpt4_ui.py
+++ b/apps/stable_diffusion/web/ui/minigpt4_ui.py
@@ -109,7 +109,7 @@ with gr.Blocks() as minigpt4_web:
    gr.Markdown(description)

    with gr.Row():
-        with gr.Column(scale=0.5):
+        with gr.Column():
            image = gr.Image(type="pil")
            upload_button = gr.Button(
                value="Upload & Start Chat",
--- a/apps/stable_diffusion/web/ui/stablelm_ui.py
+++ b/apps/stable_diffusion/web/ui/stablelm_ui.py
@@ -24,12 +24,9 @@ past_key_values = None

 model_map = {
    "llama2_7b": "meta-llama/Llama-2-7b-chat-hf",
+    "llama2_13b": "meta-llama/Llama-2-13b-chat-hf",
    "llama2_70b": "meta-llama/Llama-2-70b-chat-hf",
-    "codegen": "Salesforce/codegen25-7b-multi",
-    "vicuna1p3": "lmsys/vicuna-7b-v1.3",
    "vicuna": "TheBloke/vicuna-7B-1.1-HF",
-    "vicuna4": "TheBloke/vicuna-7B-1.1-HF",
-    "StableLM": "stabilityai/stablelm-tuned-alpha-3b",
 }

 # NOTE: Each `model_name` should have its own start message
@@ -43,6 +40,15 @@ start_message = {
        "explain why instead of answering something not correct. If you don't know the "
        "answer to a question, please don't share false information."
    ),
+    "llama2_13b": (
+        "System: You are a helpful, respectful and honest assistant. Always answer "
+        "as helpfully as possible, while being safe.  Your answers should not "
+        "include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal "
+        "content. Please ensure that your responses are socially unbiased and positive "
+        "in nature. If a question does not make any sense, or is not factually coherent, "
+        "explain why instead of answering something not correct. If you don't know the "
+        "answer to a question, please don't share false information."
+    ),
    "llama2_70b": (
        "System: You are a helpful, respectful and honest assistant. Always answer "
        "as helpfully as possible, while being safe.  Your answers should not "
@@ -52,60 +58,39 @@ start_message = {
        "explain why instead of answering something not correct. If you don't know the "
        "answer to a question, please don't share false information."
    ),
-    "StableLM": (
-        "<|SYSTEM|># StableLM Tuned (Alpha version)"
-        "\n- StableLM is a helpful and harmless open-source AI language model "
-        "developed by StabilityAI."
-        "\n- StableLM is excited to be able to help the user, but will refuse "
-        "to do anything that could be considered harmful to the user."
-        "\n- StableLM is more than just an information source, StableLM is also "
-        "able to write poetry, short stories, and make jokes."
-        "\n- StableLM will refuse to participate in anything that "
-        "could harm a human."
-    ),
    "vicuna": (
        "A chat between a curious user and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's "
        "questions.\n"
    ),
-    "vicuna4": (
-        "A chat between a curious user and an artificial intelligence assistant. "
-        "The assistant gives helpful, detailed, and polite answers to the user's "
-        "questions.\n"
-    ),
-    "vicuna1p3": (
-        "A chat between a curious user and an artificial intelligence assistant. "
-        "The assistant gives helpful, detailed, and polite answers to the user's "
-        "questions.\n"
-    ),
-    "codegen": "",
 }


 def create_prompt(model_name, history):
    system_message = start_message[model_name]

-    if model_name in [
-        "StableLM",
-        "vicuna",
-        "vicuna4",
-        "vicuna1p3",
-        "llama2_7b",
-        "llama2_70b",
-    ]:
+    if "llama2" in model_name:
+        B_INST, E_INST = "[INST]", "[/INST]"
+        B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+        conversation = "".join(
+            [f"{B_INST} {item[0]} {E_INST} {item[1]} " for item in history[1:]]
+        )
+        msg = f"{B_INST} {B_SYS} {system_message} {E_SYS} {history[0][0]} {E_INST} {history[0][1]} {conversation}"
+    elif model_name in ["vicuna"]:
        conversation = "".join(
            [
                "".join(["<|USER|>" + item[0], "<|ASSISTANT|>" + item[1]])
                for item in history
            ]
        )
+        msg = system_message + conversation
+        msg = msg.strip()
    else:
        conversation = "".join(
            ["".join([item[0], item[1]]) for item in history]
        )
-
-    msg = system_message + conversation
-    msg = msg.strip()
+        msg = system_message + conversation
+        msg = msg.strip()
    return msg


@@ -149,14 +134,15 @@ def chat(
    model,
    device,
    precision,
+    download_vmfb,
    config_file,
    cli=False,
    progress=gr.Progress(),
 ):
    global past_key_values
    global model_vmfb_key
-
    global vicuna_model
+
    model_name, model_path = list(map(str.strip, model.split("=>")))
    if "cuda" in device:
        device = "cuda"
@@ -166,123 +152,73 @@ def chat(
        device = "cpu-task"
    elif "vulkan" in device:
        device = "vulkan"
+    elif "rocm" in device:
+        device = "rocm"
    else:
        print("unrecognized device")

+    from apps.language_models.scripts.vicuna import ShardedVicuna
+    from apps.language_models.scripts.vicuna import UnshardedVicuna
+    from apps.stable_diffusion.src import args
+
    new_model_vmfb_key = f"{model_name}#{model_path}#{device}#{precision}"
-    if model_name in [
-        "vicuna",
-        "vicuna4",
-        "vicuna1p3",
-        "codegen",
-        "llama2_7b",
-        "llama2_70b",
-    ]:
-        from apps.language_models.scripts.vicuna import ShardedVicuna
-        from apps.language_models.scripts.vicuna import UnshardedVicuna
-        from apps.stable_diffusion.src import args
-
-        if new_model_vmfb_key != model_vmfb_key:
-            model_vmfb_key = new_model_vmfb_key
-            max_toks = 128 if model_name == "codegen" else 512
-
-            # get iree flags that need to be overridden, from commandline args
-            _extra_args = []
-            # vulkan target triple
-            if args.iree_vulkan_target_triple != "":
-                _extra_args.append(
-                    f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
-                )
-
-            if model_name == "vicuna4":
-                vicuna_model = ShardedVicuna(
-                    model_name,
-                    hf_model_path=model_path,
-                    device=device,
-                    precision=precision,
-                    max_num_tokens=max_toks,
-                    compressed=True,
-                    extra_args_cmd=_extra_args,
-                )
-            else:
-                #  if config_file is None:
-                vicuna_model = UnshardedVicuna(
-                    model_name,
-                    hf_model_path=model_path,
-                    hf_auth_token=args.hf_auth_token,
-                    device=device,
-                    precision=precision,
-                    max_num_tokens=max_toks,
-                    extra_args_cmd=_extra_args,
-                )
-                #  else:
-                #      if config_file is not None:
-                #          config_file = open(config_file)
-                #          config_json = json.load(config_file)
-                #          config_file.close()
-                #      else:
-                #          config_json = get_default_config()
-                #      vicuna_model = ShardedVicuna(
-                #          model_name,
-                #          device=device,
-                #          precision=precision,
-                #          config_json=config_json,
-                #      )
-
-        prompt = create_prompt(model_name, history)
-
-        partial_text = ""
-        count = 0
-        start_time = time.time()
-        for text, msg in progress.tqdm(
-            vicuna_model.generate(prompt, cli=cli),
-            desc="generating response",
-        ):
-            count += 1
-            if "formatted" in msg:
-                history[-1][1] = text
-                end_time = time.time()
-                tokens_per_sec = count / (end_time - start_time)
-                yield history, str(
-                    format(tokens_per_sec, ".2f")
-                ) + " tokens/sec"
-            else:
-                partial_text += text + " "
-                history[-1][1] = partial_text
-                yield history, ""
-
-        return history, ""
-
-    # else Model is StableLM
-    global sharkModel
-    from apps.language_models.src.pipelines.stablelm_pipeline import (
-        SharkStableLM,
-    )
-
    if new_model_vmfb_key != model_vmfb_key:
        model_vmfb_key = new_model_vmfb_key
-        # max_new_tokens=512
-        shark_slm = SharkStableLM(
-            model_name
-        )  # pass elements from UI as required
+        max_toks = 128 if model_name == "codegen" else 512
+
+        # get iree flags that need to be overridden, from commandline args
+        _extra_args = []
+        # vulkan target triple
+        if args.iree_vulkan_target_triple != "":
+            _extra_args.append(
+                f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
+            )
+
+        if model_name == "vicuna4":
+            vicuna_model = ShardedVicuna(
+                model_name,
+                hf_model_path=model_path,
+                device=device,
+                precision=precision,
+                max_num_tokens=max_toks,
+                compressed=True,
+                extra_args_cmd=_extra_args,
+            )
+        else:
+            #  if config_file is None:
+            vicuna_model = UnshardedVicuna(
+                model_name,
+                hf_model_path=model_path,
+                hf_auth_token=args.hf_auth_token,
+                device=device,
+                precision=precision,
+                max_num_tokens=max_toks,
+                download_vmfb=download_vmfb,
+                load_mlir_from_shark_tank=True,
+                extra_args_cmd=_extra_args,
+            )

-    # Construct the input message string for the model by concatenating the
-    # current system message and conversation history
-    if len(curr_system_message.split()) > 160:
-        print("clearing context")
    prompt = create_prompt(model_name, history)
-    generate_kwargs = dict(prompt=prompt)
-
-    words_list = shark_slm.generate(**generate_kwargs)

    partial_text = ""
-    for new_text in words_list:
-        partial_text += new_text
-        history[-1][1] = partial_text
-        # Yield an empty string to clean up the message textbox and the updated
-        # conversation history
-        yield history
-    return words_list
+    count = 0
+    start_time = time.time()
+    for text, msg in progress.tqdm(
+        vicuna_model.generate(prompt, cli=cli),
+        desc="generating response",
+    ):
+        count += 1
+        if "formatted" in msg:
+            history[-1][1] = text
+            end_time = time.time()
+            tokens_per_sec = count / (end_time - start_time)
+            yield history, str(format(tokens_per_sec, ".2f")) + " tokens/sec"
+        else:
+            partial_text += text + " "
+            history[-1][1] = partial_text
+            yield history, ""
+
+    return history, ""


 def llm_chat_api(InputData: dict):
@@ -336,6 +272,8 @@ def llm_chat_api(InputData: dict):
            device=device,
            precision=precision,
            max_num_tokens=max_toks,
+            download_vmfb=True,
+            load_mlir_from_shark_tank=True,
        )

    # TODO: add role dict for different models
@@ -398,7 +336,7 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
        )
        model = gr.Dropdown(
            label="Select Model",
-            value=model_choices[4],
+            value=model_choices[0],
            choices=model_choices,
        )
        supported_devices = available_devices
@@ -406,15 +344,14 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
        # show cpu-task device first in list for chatbot
        supported_devices = supported_devices[-1:] + supported_devices[:-1]
        supported_devices = [x for x in supported_devices if "sync" not in x]
-        #  print(supported_devices)
-        devices = gr.Dropdown(
+        device = gr.Dropdown(
            label="Device",
            value=supported_devices[0]
            if enabled
            else "Only CUDA Supported for now",
            choices=supported_devices,
            interactive=enabled,
-            #  multiselect=True,
+            # multiselect=True,
        )
        precision = gr.Radio(
            label="Precision",
@@ -426,7 +363,13 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
            ],
            visible=True,
        )
-        tokens_time = gr.Textbox(label="Tokens generated per second")
+        with gr.Column():
+            download_vmfb = gr.Checkbox(
+                label="Download vmfb from Shark tank if available",
+                value=True,
+                interactive=True,
+            )
+            tokens_time = gr.Textbox(label="Tokens generated per second")

    with gr.Row(visible=False):
        with gr.Group():
@@ -461,7 +404,15 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
        fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
    ).then(
        fn=chat,
-        inputs=[system_msg, chatbot, model, devices, precision, config_file],
+        inputs=[
+            system_msg,
+            chatbot,
+            model,
+            device,
+            precision,
+            download_vmfb,
+            config_file,
+        ],
        outputs=[chatbot, tokens_time],
        queue=True,
    )
@@ -469,7 +420,15 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
        fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
    ).then(
        fn=chat,
-        inputs=[system_msg, chatbot, model, devices, precision, config_file],
+        inputs=[
+            system_msg,
+            chatbot,
+            model,
+            device,
+            precision,
+            download_vmfb,
+            config_file,
+        ],
        outputs=[chatbot, tokens_time],
        queue=True,
    )
--- a/apps/stable_diffusion/web/ui/txt2img_ui.py
+++ b/apps/stable_diffusion/web/ui/txt2img_ui.py
@@ -4,6 +4,7 @@ import time
 import sys
 import gradio as gr
 from PIL import Image
+from math import ceil
 import base64
 from io import BytesIO
 from fastapi.exceptions import HTTPException
@@ -26,6 +27,7 @@ from apps.stable_diffusion.src import (
    utils,
    save_output_img,
    prompt_examples,
+    Image2ImagePipeline,
 )
 from apps.stable_diffusion.src.utils import (
    get_generated_imgs_path,
@@ -62,6 +64,11 @@ def txt2img_inf(
    lora_hf_id: str,
    ondemand: bool,
    repeatable_seeds: bool,
+    use_hiresfix: bool,
+    hiresfix_height: int,
+    hiresfix_width: int,
+    hiresfix_strength: float,
+    resample_type: str,
 ):
    from apps.stable_diffusion.web.ui.utils import (
        get_custom_model_pathfile,
@@ -200,6 +207,81 @@ def txt2img_inf(
            cpu_scheduling,
            args.max_embeddings_multiples,
        )
+        # TODO: allow user to save original image
+        # TODO: add option to let user keep both pipelines loaded, and unload
+        #  either at will
+        # TODO: add custom step value slider
+        # TODO: add option to use secondary model for the img2img pass
+        if use_hiresfix is True:
+            new_config_obj = Config(
+                "img2img",
+                args.hf_model_id,
+                args.ckpt_loc,
+                args.custom_vae,
+                precision,
+                1,
+                max_length,
+                height,
+                width,
+                device,
+                use_lora=args.use_lora,
+                use_stencil="None",
+                ondemand=ondemand,
+            )
+
+            global_obj.clear_cache()
+            global_obj.set_cfg_obj(new_config_obj)
+            set_init_device_flags()
+            model_id = (
+                args.hf_model_id
+                if args.hf_model_id
+                else "stabilityai/stable-diffusion-2-1-base"
+            )
+            global_obj.set_schedulers(get_schedulers(model_id))
+            scheduler_obj = global_obj.get_scheduler(args.scheduler)
+
+            global_obj.set_sd_obj(
+                Image2ImagePipeline.from_pretrained(
+                    scheduler_obj,
+                    args.import_mlir,
+                    args.hf_model_id,
+                    args.ckpt_loc,
+                    args.custom_vae,
+                    args.precision,
+                    args.max_length,
+                    1,
+                    hiresfix_height,
+                    hiresfix_width,
+                    args.use_base_vae,
+                    args.use_tuned,
+                    low_cpu_mem_usage=args.low_cpu_mem_usage,
+                    debug=args.import_debug if args.import_mlir else False,
+                    use_lora=args.use_lora,
+                    ondemand=args.ondemand,
+                )
+            )
+
+            global_obj.set_sd_scheduler(args.scheduler)
+
+            out_imgs = global_obj.get_sd_obj().generate_images(
+                prompt,
+                negative_prompt,
+                out_imgs[0],
+                batch_size,
+                hiresfix_height,
+                hiresfix_width,
+                ceil(steps / hiresfix_strength),
+                hiresfix_strength,
+                guidance_scale,
+                seeds[current_batch],
+                args.max_length,
+                dtype,
+                args.use_base_vae,
+                cpu_scheduling,
+                args.max_embeddings_multiples,
+                use_stencil="None",
+                resample_type=resample_type,
+            )
        total_time = time.time() - start_time
        text_output = get_generation_text_info(
            seeds[: current_batch + 1], device
@@ -271,6 +353,11 @@ def txt2img_api(
        lora_hf_id="",
        ondemand=False,
        repeatable_seeds=False,
+        use_hiresfix=False,
+        hiresfix_height=512,
+        hiresfix_width=512,
+        hiresfix_strength=0.6,
+        resample_type="Nearest Neighbor",
    )

    # Convert Generator to Subscriptable
@@ -460,6 +547,49 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                            label="Low VRAM",
                            interactive=True,
                        )
+                    with gr.Group():
+                        with gr.Row():
+                            use_hiresfix = gr.Checkbox(
+                                value=args.use_hiresfix,
+                                label="Use Hires Fix",
+                                interactive=True,
+                            )
+                            resample_type = gr.Dropdown(
+                                value=args.resample_type,
+                                choices=[
+                                    "Lanczos",
+                                    "Nearest Neighbor",
+                                    "Bilinear",
+                                    "Bicubic",
+                                    "Adaptive",
+                                    "Antialias",
+                                    "Box",
+                                    "Affine",
+                                    "Cubic",
+                                ],
+                                label="Resample Type",
+                            )
+                        hiresfix_height = gr.Slider(
+                            384,
+                            768,
+                            value=args.hiresfix_height,
+                            step=8,
+                            label="Hires Fix Height",
+                        )
+                        hiresfix_width = gr.Slider(
+                            384,
+                            768,
+                            value=args.hiresfix_width,
+                            step=8,
+                            label="Hires Fix Width",
+                        )
+                        hiresfix_strength = gr.Slider(
+                            0,
+                            1,
+                            value=args.hiresfix_strength,
+                            step=0.01,
+                            label="Hires Fix Denoising Strength",
+                        )
                    with gr.Row():
                        with gr.Column(scale=3):
                            batch_count = gr.Slider(
@@ -495,16 +625,6 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                        value=available_devices[0],
                        choices=available_devices,
                    )
-                with gr.Row():
-                    random_seed = gr.Button("Randomize Seed")
-                    random_seed.click(
-                        lambda: -1,
-                        inputs=[],
-                        outputs=[seed],
-                        queue=False,
-                    )
-                    stop_batch = gr.Button("Stop Batch")
-                    stable_diffusion = gr.Button("Generate Image(s)")
                with gr.Accordion(label="Prompt Examples!", open=False):
                    ex = gr.Examples(
                        examples=prompt_examples,
@@ -530,6 +650,18 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                        show_label=False,
                    )
                    txt2img_status = gr.Textbox(visible=False)
+                with gr.Row():
+                    stable_diffusion = gr.Button("Generate Image(s)")
+                    random_seed = gr.Button("Randomize Seed")
+                    random_seed.click(
+                        lambda: -1,
+                        inputs=[],
+                        outputs=[seed],
+                        queue=False,
+                    )
+                    stop_batch = gr.Button("Stop Batch")
+                with gr.Row():
+                    blank_thing_for_row = None
                with gr.Row():
                    txt2img_sendto_img2img = gr.Button(value="SendTo Img2Img")
                    txt2img_sendto_inpaint = gr.Button(value="SendTo Inpaint")
@@ -565,6 +697,11 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                lora_hf_id,
                ondemand,
                repeatable_seeds,
+                use_hiresfix,
+                hiresfix_height,
+                hiresfix_width,
+                hiresfix_strength,
+                resample_type,
            ],
            outputs=[txt2img_gallery, std_output, txt2img_status],
            show_progress="minimal" if args.progress_bar else "none",
--- a/apps/stable_diffusion/web/ui/utils.py
+++ b/apps/stable_diffusion/web/ui/utils.py
@@ -25,7 +25,7 @@ class Config:
    device: str
    use_lora: str
    use_stencil: str
-    ondemand: str
+    ondemand: str  # should this be expecting a bool instead?


 custom_model_filetypes = (
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -40,7 +40,7 @@ cmake --build build/
 *Prepare the model*
 ```bash
 wget https://storage.googleapis.com/shark_tank/latest/resnet50_tf/resnet50_tf.mlir
-iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvmcpu-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 resnet50_tf.mlir -o resnet50_tf.vmfb
+iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvmcpu-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux  resnet50_tf.mlir -o resnet50_tf.vmfb
 ```
 *Prepare the input*

@@ -65,18 +65,18 @@ A tool for benchmarking other models is built and can be invoked with a command
 see `./build/vulkan_gui/iree-vulkan-gui --help` for an explanation on the function input. For example, stable diffusion unet can be tested with the following commands:
 ```bash
 wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/stable_diff_tf.mlir
-iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 stable_diff_tf.mlir -o stable_diff_tf.vmfb
+iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux  stable_diff_tf.mlir -o stable_diff_tf.vmfb
 ./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=2x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32
 ```
 VAE and Autoencoder are also available
 ```bash
 # VAE
 wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/vae_tf/vae.mlir
-iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 vae.mlir -o vae.vmfb
+iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux  vae.mlir -o vae.vmfb
 ./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=1x4x64x64xf32

 # CLIP Autoencoder
 wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/clip_tf/clip_autoencoder.mlir
-iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 clip_autoencoder.mlir -o clip_autoencoder.vmfb
+iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux  clip_autoencoder.mlir -o clip_autoencoder.vmfb
 ./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=1x77xi32 --function_input=1x77xi32
 ```
--- a/docs/shark_iree_profiling.md
+++ b/docs/shark_iree_profiling.md
@@ -55,7 +55,7 @@ The command line for compilation will start something like this, where the `-` n
 The `-o output_filename.vmfb` flag can be used to specify the location to save the compiled vmfb. Note that a dump of the
 dispatches that can be compiled + run in isolation can be generated by adding `--iree-hal-dump-executable-benchmarks-to=/some/directory`. Say, if they are in the `benchmarks` directory, the following compile/run commands would work for Vulkan on RDNA3.
 ```
-iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna3-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.mlir -o benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb
+iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna3-unknown-linux  benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.mlir -o benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb

 iree-benchmark-module --module=benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb --function=forward --device=vulkan
 ```
--- a/process_skipfiles.py
+++ b/process_skipfiles.py
@@ -6,15 +6,15 @@ from distutils.sysconfig import get_python_lib
 import fileinput
 from pathlib import Path

-# Temorary workaround for transformers/__init__.py.
-path_to_tranformers_hook = Path(
+# Temporary workaround for transformers/__init__.py.
+path_to_transformers_hook = Path(
    get_python_lib()
    + "/_pyinstaller_hooks_contrib/hooks/stdhooks/hook-transformers.py"
 )
-if path_to_tranformers_hook.is_file():
+if path_to_transformers_hook.is_file():
    pass
 else:
-    with open(path_to_tranformers_hook, "w") as f:
+    with open(path_to_transformers_hook, "w") as f:
        f.write("module_collection_mode = 'pyz+py'")

 path_to_skipfiles = Path(get_python_lib() + "/torch/_dynamo/skipfiles.py")
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,6 +18,7 @@ Pillow
 parameterized

 # Add transformers, diffusers and scipy since it most commonly used
+tokenizers==0.13.3
 transformers
 diffusers
 #accelerate is now required for diffusers import from ckpt.
--- a/setup_venv.sh
+++ b/setup_venv.sh
@@ -130,14 +130,13 @@ fi

 $PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f https://download.pytorch.org/whl/nightly/cpu/

-if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
+if [[ $(uname -s) = 'Linux' && ! -z "${IMPORTER}" ]]; then
  T_VER=$($PYTHON -m pip show torch | grep Version)
-  TORCH_VERSION=${T_VER:9:17}
+  T_VER_MIN=${T_VER:14:12}
  TV_VER=$($PYTHON -m pip show torchvision | grep Version)
-  TV_VERSION=${TV_VER:9:18}
-  $PYTHON -m pip uninstall -y torch torchvision
-  $PYTHON -m pip install -U --pre --no-warn-conflicts triton
-  $PYTHON -m pip install --no-deps https://download.pytorch.org/whl/nightly/cu118/torch-${TORCH_VERSION}%2Bcu118-cp311-cp311-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu118/torchvision-${TV_VERSION}%2Bcu118-cp311-cp311-linux_x86_64.whl
+  TV_VER_MAJ=${TV_VER:9:6}
+  $PYTHON -m pip uninstall -y torchvision
+  $PYTHON -m pip install torchvision==${TV_VER_MAJ}${T_VER_MIN} --no-deps -f https://download.pytorch.org/whl/nightly/cpu/torchvision/
  if [ $? -eq 0 ];then
    echo "Successfully Installed torch + cu118."
  else
@@ -146,7 +145,7 @@ if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
 fi

 if [[ -z "${NO_BREVITAS}" ]]; then
-  $PYTHON -m pip install git+https://github.com/Xilinx/brevitas.git@llm
+  $PYTHON -m pip install git+https://github.com/Xilinx/brevitas.git@dev
 fi

 if [[ -z "${CONDA_PREFIX}" && "$SKIP_VENV" != "1" ]]; then
--- a/shark/iree_utils/_common.py
+++ b/shark/iree_utils/_common.py
@@ -52,6 +52,8 @@ def iree_device_map(device):
    )
    if len(uri_parts) == 1:
        return iree_driver
+    elif "rocm" in uri_parts:
+        return "rocm"
    else:
        return f"{iree_driver}://{uri_parts[1]}"

@@ -63,7 +65,6 @@ def get_supported_device_list():
 _IREE_DEVICE_MAP = {
    "cpu": "local-task",
    "cpu-task": "local-task",
-    "AMD-AIE": "local-task",
    "cpu-sync": "local-sync",
    "cuda": "cuda",
    "vulkan": "vulkan",
@@ -82,7 +83,6 @@ def iree_target_map(device):
 _IREE_TARGET_MAP = {
    "cpu": "llvm-cpu",
    "cpu-task": "llvm-cpu",
-    "AMD-AIE": "llvm-cpu",
    "cpu-sync": "llvm-cpu",
    "cuda": "cuda",
    "vulkan": "vulkan",
@@ -121,7 +121,10 @@ def check_device_drivers(device):
        return False
    elif device == "rocm":
        try:
-            subprocess.check_output("rocminfo")
+            if sys.platform == "win32":
+                subprocess.check_output("hipinfo")
+            else:
+                subprocess.check_output("rocminfo")
        except Exception:
            return True

--- a/shark/iree_utils/benchmark_utils.py
+++ b/shark/iree_utils/benchmark_utils.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import iree._runtime.scripts.iree_benchmark_module as benchmark_module
 from shark.iree_utils._common import run_cmd, iree_device_map
 from shark.iree_utils.cpu_utils import get_cpu_count
 import numpy as np
@@ -102,15 +101,13 @@ def build_benchmark_args_non_tensor_input(
    and whether it is training or not.
    Outputs: string that execute benchmark-module on target model.
    """
-    path = benchmark_module.__path__[0]
+    path = os.path.join(os.environ["VIRTUAL_ENV"], "bin")
    if platform.system() == "Windows":
-        benchmarker_path = os.path.join(
-            path, "..", "..", "iree-benchmark-module.exe"
-        )
+        benchmarker_path = os.path.join(path, "iree-benchmark-module.exe")
+        time_extractor = None
    else:
-        benchmarker_path = os.path.join(
-            path, "..", "..", "iree-benchmark-module"
-        )
+        benchmarker_path = os.path.join(path, "iree-benchmark-module")
+        time_extractor = "| awk 'END{{print $2 $3}}'"
    benchmark_cl = [benchmarker_path, f"--module={input_file}"]
    # TODO: The function named can be passed as one of the args.
    if function_name:
@@ -135,7 +132,7 @@ def run_benchmark_module(benchmark_cl):
    benchmark_path = benchmark_cl[0]
    assert os.path.exists(
        benchmark_path
-    ), "Cannot find benchmark_module, Please contact SHARK maintainer on discord."
+    ), "Cannot find iree_benchmark_module, Please contact SHARK maintainer on discord."
    bench_stdout, bench_stderr = run_cmd(" ".join(benchmark_cl))
    try:
        regex_split = re.compile("(\d+[.]*\d*)(  *)([a-zA-Z]+)")
--- a/shark/iree_utils/compile_utils.py
+++ b/shark/iree_utils/compile_utils.py
@@ -46,7 +46,7 @@ def get_iree_device_args(device, extra_args=[]):
    if device_uri[0] == "cpu":
        from shark.iree_utils.cpu_utils import get_iree_cpu_args

-        data_tiling_flag = ["--iree-flow-enable-data-tiling"]
+        data_tiling_flag = ["--iree-opt-data-tiling"]
        u_kernel_flag = ["--iree-llvmcpu-enable-microkernels"]
        stack_size_flag = ["--iree-llvmcpu-stack-allocation-limit=256000"]

@@ -84,7 +84,7 @@ def get_iree_frontend_args(frontend):
    elif frontend in ["tensorflow", "tf", "mhlo", "stablehlo"]:
        return [
            "--iree-llvmcpu-target-cpu-features=host",
-            "--iree-flow-demote-i64-to-i32",
+            "--iree-input-demote-i64-to-i32",
        ]
    else:
        # Frontend not found.
@@ -92,14 +92,27 @@ def get_iree_frontend_args(frontend):


 # Common args to be used given any frontend or device.
-def get_iree_common_args():
-    return [
-        "--iree-stream-resource-index-bits=64",
+def get_iree_common_args(debug=False):
+    common_args = [
        "--iree-stream-resource-max-allocation-size=4294967295",
-        "--iree-vm-target-index-bits=64",
        "--iree-vm-bytecode-module-strip-source-map=true",
        "--iree-util-zero-fill-elided-attrs",
    ]
+    if debug == True:
+        common_args.extend(
+            [
+                "--iree-opt-strip-assertions=false",
+                "--verify=true",
+            ]
+        )
+    else:
+        common_args.extend(
+            [
+                "--iree-opt-strip-assertions=true",
+                "--verify=false",
+            ]
+        )
+    return common_args


 # Args that are suitable only for certain models or groups of models.
@@ -278,12 +291,13 @@ def compile_module_to_flatbuffer(
    model_config_path,
    extra_args,
    model_name="None",
+    debug=False,
 ):
    # Setup Compile arguments wrt to frontends.
    input_type = ""
    args = get_iree_frontend_args(frontend)
    args += get_iree_device_args(device, extra_args)
-    args += get_iree_common_args()
+    args += get_iree_common_args(debug=debug)
    args += get_model_specific_args()
    args += extra_args

@@ -343,7 +357,8 @@ def load_vmfb_using_mmap(
    flatbuffer_blob_or_path, device: str, device_idx: int = None
 ):
    print(f"Loading module {flatbuffer_blob_or_path}...")
-
+    if "rocm" in device:
+        device = "rocm"
    with DetailLogger(timeout=2.5) as dl:
        # First get configs.
        if device_idx is not None:
@@ -410,10 +425,11 @@ def get_iree_compiled_module(
    extra_args: list = [],
    device_idx: int = None,
    mmap: bool = False,
+    debug: bool = False,
 ):
    """Given a module returns the compiled .vmfb and configs"""
    flatbuffer_blob = compile_module_to_flatbuffer(
-        module, device, frontend, model_config_path, extra_args
+        module, device, frontend, model_config_path, extra_args, debug
    )
    temp_file_to_unlink = None
    # TODO: Currently mmap=True control flow path has been switched off for mmap.
@@ -469,10 +485,11 @@ def export_iree_module_to_vmfb(
    model_config_path: str = None,
    module_name: str = None,
    extra_args: list = [],
+    debug: bool = False,
 ):
    # Compiles the module given specs and saves it as .vmfb file.
    flatbuffer_blob = compile_module_to_flatbuffer(
-        module, device, mlir_dialect, model_config_path, extra_args
+        module, device, mlir_dialect, model_config_path, extra_args, debug
    )
    if module_name is None:
        device_name = (
--- a/shark/iree_utils/gpu_utils.py
+++ b/shark/iree_utils/gpu_utils.py
@@ -17,6 +17,7 @@
 import functools
 import iree.runtime as ireert
 import ctypes
+import sys
 from shark.parser import shark_args


@@ -42,21 +43,51 @@ def get_iree_gpu_args():
@functools.cache
 def get_iree_rocm_args():
    ireert.flags.FUNCTION_INPUT_VALIDATION = False
-    # get arch from rocminfo.
+    # get arch from hipinfo.
+    import os
    import re
    import subprocess

-    rocm_arch = re.match(
-        r".*(gfx\w+)",
-        subprocess.check_output(
-            "rocminfo | grep -i 'gfx'", shell=True, text=True
-        ),
-    ).group(1)
-    print(f"Found rocm arch {rocm_arch}...")
+    if sys.platform == "win32":
+        if "HIP_PATH" in os.environ:
+            rocm_path = os.environ["HIP_PATH"]
+            print(f"Found a ROCm installation at {rocm_path}.")
+        else:
+            print("Failed to find ROCM_PATH. Defaulting to C:\\AMD\\ROCM\\5.5")
+            rocm_path = "C:\\AMD\\ROCM\\5.5"
+    else:
+        if "ROCM_PATH" in os.environ:
+            rocm_path = os.environ["ROCM_PATH"]
+            print(f"Found a ROCm installation at {rocm_path}.")
+        else:
+            print("Failed to find ROCM_PATH. Defaulting to /opt/rocm")
+            rocm_path = "/opt/rocm/"
+
+    try:
+        if sys.platform == "win32":
+            rocm_arch = re.search(
+                r"gfx\d{3,}",
+                subprocess.check_output("hipinfo", shell=True, text=True),
+            ).group(0)
+        else:
+            rocm_arch = re.match(
+                r".*(gfx\w+)",
+                subprocess.check_output(
+                    "rocminfo | grep -i 'gfx'", shell=True, text=True
+                ),
+            ).group(1)
+        print(f"Found rocm arch {rocm_arch}...")
+    except:
+        print(
+            "Failed to find ROCm architecture from hipinfo / rocminfo. Defaulting to gfx1100."
+        )
+        rocm_arch = "gfx1100"
+
+    bc_path = os.path.join(rocm_path, "amdgcn", "bitcode")
    return [
        f"--iree-rocm-target-chip={rocm_arch}",
        "--iree-rocm-link-bc=true",
-        "--iree-rocm-bc-dir=/opt/rocm/amdgcn/bitcode",
+        f"--iree-rocm-bc-dir={bc_path}",
    ]


--- a/shark/iree_utils/vulkan_target_env_utils.py
+++ b/shark/iree_utils/vulkan_target_env_utils.py
@@ -57,11 +57,8 @@ def get_version(triple):
@functools.cache
 def get_extensions(triple):
    def make_ext_list(ext_list):
-        res = ""
-        for e in ext_list:
-            res += e + ", "
-        res = f"[{res[:-2]}]"
-        return res
+        res = ", ".join(ext_list)
+        return f"[{res}]"

    arch, product, os = triple
    if arch == "m1":
--- a/shark/iree_utils/vulkan_utils.py
+++ b/shark/iree_utils/vulkan_utils.py
@@ -178,9 +178,7 @@ def get_iree_vulkan_args(device_num=0, extra_args=[]):
@functools.cache
 def get_iree_vulkan_runtime_flags():
    vulkan_runtime_flags = [
-        f"--vulkan_large_heap_block_size={shark_args.vulkan_large_heap_block_size}",
        f"--vulkan_validation_layers={'true' if shark_args.vulkan_validation_layers else 'false'}",
-        f"--vulkan_vma_allocator={'true' if shark_args.vulkan_vma_allocator else 'false'}",
    ]
    return vulkan_runtime_flags

--- a/shark/parser.py
+++ b/shark/parser.py
@@ -133,13 +133,6 @@ parser.add_argument(
    help="Profiles vulkan device and collects the .rdc info.",
 )

-parser.add_argument(
-    "--vulkan_large_heap_block_size",
-    default="2073741824",
-    help="Flag for setting VMA preferredLargeHeapBlockSize for "
-    "vulkan device, default is 4G.",
-)
-
 parser.add_argument(
    "--vulkan_validation_layers",
    default=False,
@@ -147,11 +140,4 @@ parser.add_argument(
    help="Flag for disabling vulkan validation layers when benchmarking.",
 )

-parser.add_argument(
-    "--vulkan_vma_allocator",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="Flag for enabling / disabling Vulkan VMA Allocator.",
-)
-
 shark_args, unknown = parser.parse_known_args()
--- a/shark/shark_compile.py
+++ b/shark/shark_compile.py
@@ -115,7 +115,7 @@ def compile_int_precision(
    print(f"[DEBUG] converting torch to linalg")
    run_pipeline_with_repro_report(
        mlir_module,
-        "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+        "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
        description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
    )
    from contextlib import redirect_stdout
--- a/shark/shark_generate_model_config.py
+++ b/shark/shark_generate_model_config.py
@@ -138,7 +138,7 @@ if __name__ == "__main__":
    firstVicunaCompileInput = (compilation_input_ids,)
    from apps.language_models.src.model_wrappers.vicuna_model import (
        FirstVicuna,
-        SecondVicuna,
+        SecondVicuna7B,
        CombinedModel,
    )

--- a/shark/shark_importer.py
+++ b/shark/shark_importer.py
@@ -509,22 +509,6 @@ def import_with_fx(
    from torch.fx.experimental.proxy_tensor import make_fx
    from torch._decomp import get_decompositions
    from typing import List
-    from brevitas_examples.llm.llm_quant.export import (
-        block_quant_layer_level_manager,
-    )
-    from brevitas_examples.llm.llm_quant.export import (
-        brevitas_layer_export_mode,
-    )
-    from brevitas_examples.llm.llm_quant.sharded_mlir_group_export import (
-        LinearWeightBlockQuantHandlerFwd,
-    )
-    from brevitas_examples.llm.llm_quant.export import replace_call_fn_target
-    from brevitas_examples.llm.llm_quant.sharded_mlir_group_export import (
-        matmul_rhs_group_quant_placeholder,
-    )
-    from brevitas.backport.fx.experimental.proxy_tensor import (
-        make_fx as brevitas_make_fx,
-    )

    golden_values = None
    if debug:
@@ -596,8 +580,30 @@ def import_with_fx(
        torch.ops.aten.native_layer_norm,
        torch.ops.aten.masked_fill.Tensor,
        torch.ops.aten.masked_fill.Scalar,
+        torch.ops.aten._scaled_dot_product_flash_attention.default,
+        torch.ops.aten.index_add,
+        torch.ops.aten.index_add_,
    ]
    if precision in ["int4", "int8"]:
+        from brevitas_examples.llm.llm_quant.export import (
+            block_quant_layer_level_manager,
+        )
+        from brevitas_examples.llm.llm_quant.export import (
+            brevitas_layer_export_mode,
+        )
+        from brevitas_examples.llm.llm_quant.sharded_mlir_group_export import (
+            LinearWeightBlockQuantHandlerFwd,
+        )
+        from brevitas_examples.llm.llm_quant.export import (
+            replace_call_fn_target,
+        )
+        from brevitas_examples.llm.llm_quant.sharded_mlir_group_export import (
+            matmul_rhs_group_quant_placeholder,
+        )
+        from brevitas.backport.fx.experimental.proxy_tensor import (
+            make_fx as brevitas_make_fx,
+        )
+
        export_context_manager = brevitas_layer_export_mode
        export_class = block_quant_layer_level_manager(
            export_handlers=[LinearWeightBlockQuantHandlerFwd]
@@ -677,5 +683,5 @@ def import_with_fx(
        )
        return mlir_module, func_name

-    mlir_module, func_name = mlir_importer.import_mlir()
+    mlir_module, func_name = mlir_importer.import_mlir(mlir_type=mlir_type)
    return mlir_module, func_name
--- a/shark/shark_inference.py
+++ b/shark/shark_inference.py
@@ -192,7 +192,9 @@ class SharkInference:

    # TODO: Instead of passing directory and having names decided by the module
    # , user may want to save the module with manual names.
-    def save_module(self, dir=os.getcwd(), module_name=None, extra_args=[]):
+    def save_module(
+        self, dir=os.getcwd(), module_name=None, extra_args=[], debug=False
+    ):
        return export_iree_module_to_vmfb(
            self.mlir_module,
            self.device,
@@ -200,6 +202,7 @@ class SharkInference:
            self.mlir_dialect,
            module_name=module_name,
            extra_args=extra_args,
+            debug=debug,
        )

    # load and return the module.
--- a/shark/shark_trainer.py
+++ b/shark/shark_trainer.py
@@ -69,7 +69,7 @@ class SharkTrainer:
            self.frontend = frontend

    # Training function is needed in the case of torch_fn.
-    def compile(self, training_fn=None, extra_args=[]):
+    def compile(self, training_fn=None, mlir_type="linalg", extra_args=[]):
        if self.frontend in ["torch", "pytorch"]:
            packed_inputs = (
                dict(self.model.named_parameters()),
@@ -77,7 +77,12 @@ class SharkTrainer:
                tuple(self.input),
            )
            mlir_module, func_name = import_with_fx(
-                training_fn, packed_inputs, False, [], training=True
+                training_fn,
+                packed_inputs,
+                False,
+                [],
+                training=True,
+                mlir_type=mlir_type,
            )
            self.shark_runner = SharkRunner(
                mlir_module,
--- a/tank/examples/bert_tf/bert_large_run.py
+++ b/tank/examples/bert_tf/bert_large_run.py
@@ -85,8 +85,6 @@ if __name__ == "__main__":
    args = [
        "--iree-llvmcpu-target-cpu-features=host",
        "--iree-mhlo-demote-i64-to-i32=false",
-        "--iree-stream-resource-index-bits=64",
-        "--iree-vm-target-index-bits=64",
    ]
    backend_config = "dylib"
    # backend = "cuda"
--- a/tank/examples/opt/README.md
+++ b/tank/examples/opt/README.md
@@ -1,3 +1,26 @@
-# Running Different OPT Variants
+# Run OPT for sentence completion through SHARK

-To run different sizes of OPT, change the string `OPT_MODEL` string in `opt_torch_test.py`. The default is 350m parameters. 66b cases also exist in the file, simply uncomment the test cases.
+From base SHARK directory, follow instructions to set up a virtual environment with SHARK. (`./setup_venv.sh` or `./setup_venv.ps1`)
+Then, you may run opt_causallm.py to get a very simple sentence completion application running through SHARK
+```
+python opt_causallm.py
+```
+
+# Run OPT performance comparison on SHARK vs. PyTorch
+
+```
+python opt_perf_comparison.py --max-seq-len=512 --model-name=facebook/opt-1.3b \
+        --platform=shark
+```
+Any OPT model from huggingface should work with this script, and you can choose between `--platform=shark` or `--platform=huggingface` to generate benchmarks of OPT inference on SHARK / PyTorch. 
+
+# Run a small suite of OPT models through the benchmark script
+
+```
+python opt_perf_comparison_batch.py
+```
+This script will run benchmarks from a suite of OPT configurations:
+- Sequence Lengths: 32, 128, 256, 512
+- Parameter Counts: 125m, 350m, 1.3b
+
+note: Most of these scripts are written for use on CPU, as perf comparisons against pytorch can be problematic across platforms otherwise.
--- a/tank/examples/opt/opt_causallm.py
+++ b/tank/examples/opt/opt_causallm.py
@@ -59,7 +59,7 @@ def create_module(model_name, tokenizer, device):
    )

    vmfb_name = f"{OPT_FS_NAME}_causallm_{MAX_SEQUENCE_LENGTH}_torch_{device}"
-    shark_module.save_module(module_name=vmfb_name)
+    shark_module.save_module(module_name=vmfb_name, debug=False)
    vmfb_path = vmfb_name + ".vmfb"
    return vmfb_path

--- a/tank/examples/opt/opt_perf_comparison.py
+++ b/tank/examples/opt/opt_perf_comparison.py
@@ -1,18 +1,46 @@
+"""
+Script for comparing OPT model performance between SHARK and Huggingface
+PyTorch.
+
+Usage Example:
+
+python opt_perf_comparison.py --max-seq-len=32 --model-name=facebook/opt-125m \
+        --platform=shark
+
+python opt_perf_comparison.py --max-seq-len=512 --model-name=facebook/opt-1.3b \
+        --platform=shark
+
+See parse_args() below for command line argument usage.
+"""
+
+import argparse
 import collections
 import json
-import time
 import os
+import psutil
+import resource
+import time
+from typing import Tuple

 from shark.shark_inference import SharkInference
 from shark.shark_importer import import_with_fx
 from transformers import AutoTokenizer, OPTForCausalLM
 from shark_opt_wrapper import OPTForCausalLMModel

-MODEL_NAME = "facebook/opt-1.3b"
-OPT_MODELNAME = "opt-1.3b"
-OPT_FS_NAME = "opt_1-3b"
-MAX_SEQUENCE_LENGTH = 512
 DEVICE = "cpu"
+PLATFORM_SHARK = "shark"
+PLATFORM_HUGGINGFACE = "huggingface"
+
+# Dict keys for reports.
+REPORT_PLATFORM = "platform"
+REPORT_MODEL_NAME = "model"
+REPORT_MAX_SEQ_LEN = "max_seq_len"
+REPORT_LOAD_TIME = "load_time_sec"
+REPORT_RUN_TIME = "run_time_sec"
+REPORT_LOAD_PHYSICAL_MEMORY_MB = "load_physical_MB"
+REPORT_LOAD_VIRTUAL_MEMORY_MB = "load_virtual_MB"
+REPORT_RUN_PHYSICAL_MEMORY_MB = "run_physical_MB"
+REPORT_RUN_VIRTUAL_MEMORY_MB = "run_virtual_MB"

 PROMPTS = [
    "What is the meaning of life?",
@@ -30,15 +58,27 @@ PROMPTS = [
 ModelWrapper = collections.namedtuple("ModelWrapper", ["model", "tokenizer"])


-def create_vmfb_module(model_name, tokenizer, device):
-    opt_base_model = OPTForCausalLM.from_pretrained("facebook/" + model_name)
+def get_memory_info():
+    pid = os.getpid()
+    process = psutil.Process(pid)
+    return process.memory_info()
+
+
+def create_vmfb_module(
+    model_name: str,
+    tokenizer,
+    device: str,
+    max_seq_len: int,
+    recompile_shark: bool,
+):
+    opt_base_model = OPTForCausalLM.from_pretrained(model_name)
    opt_base_model.eval()
    opt_model = OPTForCausalLMModel(opt_base_model)
    encoded_inputs = tokenizer(
-        "What is the meaning of life?",
+        PROMPTS[0],
        padding="max_length",
        truncation=True,
-        max_length=MAX_SEQUENCE_LENGTH,
+        max_length=max_seq_len,
        return_tensors="pt",
    )
    inputs = (
@@ -48,8 +88,16 @@ def create_vmfb_module(model_name, tokenizer, device):
    # np.save("model_inputs_0.npy", inputs[0])
    # np.save("model_inputs_1.npy", inputs[1])

-    mlir_path = f"./{OPT_FS_NAME}_causallm_{MAX_SEQUENCE_LENGTH}_torch.mlir"
-    if os.path.isfile(mlir_path):
+    opt_fs_name = get_opt_fs_name(model_name)
+    mlir_path = f"./{opt_fs_name}_causallm_{max_seq_len}_torch.mlir"
+    # If MLIR has already been loaded and recompilation is not requested, use
+    # the loaded MLIR file.
+    has_mlir = os.path.isfile(mlir_path)
+    # The purpose of recompile_shark is to measure compilation time; the
+    # compilation time can be correctly measured only when MLIR has already been
+    # loaded.
+    assert not recompile_shark or has_mlir
+    if has_mlir:
        with open(mlir_path, "r") as f:
            model_mlir = f.read()
        print(f"Loaded .mlir from {mlir_path}")
@@ -58,7 +106,7 @@ def create_vmfb_module(model_name, tokenizer, device):
            model=opt_model,
            inputs=inputs,
            is_f16=False,
-            model_name=OPT_FS_NAME,
+            model_name=opt_fs_name,
            return_str=True,
        )
        with open(mlir_path, "w") as f:
@@ -72,18 +120,25 @@ def create_vmfb_module(model_name, tokenizer, device):
        is_benchmark=False,
    )

-    vmfb_name = f"{OPT_FS_NAME}_causallm_{MAX_SEQUENCE_LENGTH}_torch_{DEVICE}_tiled_ukernels"
+    vmfb_name = (
+        f"{opt_fs_name}_causallm_{max_seq_len}_torch_{DEVICE}_tiled_ukernels"
+    )
    shark_module.save_module(module_name=vmfb_name)
    vmfb_path = vmfb_name + ".vmfb"
    return vmfb_path


-def load_shark_model() -> ModelWrapper:
-    vmfb_name = f"{OPT_FS_NAME}_causallm_{MAX_SEQUENCE_LENGTH}_torch_{DEVICE}_tiled_ukernels.vmfb"
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
-    if not os.path.isfile(vmfb_name):
+def load_shark_model(
+    model_name: str, max_seq_len: int, recompile_shark: bool
+) -> ModelWrapper:
+    opt_fs_name = get_opt_fs_name(model_name)
+    vmfb_name = f"{opt_fs_name}_causallm_{max_seq_len}_torch_{DEVICE}_tiled_ukernels.vmfb"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
+    if recompile_shark or not os.path.isfile(vmfb_name):
        print(f"vmfb not found. compiling and saving to {vmfb_name}")
-        create_vmfb_module(OPT_MODELNAME, tokenizer, DEVICE)
+        create_vmfb_module(
+            model_name, tokenizer, DEVICE, max_seq_len, recompile_shark
+        )
    shark_module = SharkInference(mlir_module=None, device="cpu-task")
    shark_module.load_module(vmfb_name)
    return ModelWrapper(model=shark_module, tokenizer=tokenizer)
@@ -94,20 +149,10 @@ def run_shark_model(model_wrapper: ModelWrapper, tokens):
    return model_wrapper.model("forward", tokens)


-def run_shark():
-    model_wrapper = load_shark_model()
-
-    prompt = "What is the meaning of life?"
-    logits = run_shark_model(model_wrapper, prompt)
-
-    # Print output logits to validate vs. pytorch + base transformers
-    print(logits[0])
-
-
-def load_huggingface_model() -> ModelWrapper:
+def load_huggingface_model(model_name: str) -> ModelWrapper:
    return ModelWrapper(
-        model=OPTForCausalLM.from_pretrained(MODEL_NAME),
-        tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME),
+        model=OPTForCausalLM.from_pretrained(model_name),
+        tokenizer=AutoTokenizer.from_pretrained(model_name),
    )


@@ -117,47 +162,68 @@ def run_huggingface_model(model_wrapper: ModelWrapper, tokens):
    )


-def run_huggingface():
-    model_wrapper = load_huggingface_model()
-    prompt = "What is the meaning of life?"
-    logits = run_huggingface_model(model_wrapper, prompt)
-
-    print(logits[0])
-
-
 def save_json(data, filename):
    with open(filename, "w") as file:
        json.dump(data, file)


-def collect_huggingface_logits():
+def collect_huggingface_logits(
+    model_name: str, max_seq_len: int, save_json: bool
+) -> Tuple[float, float]:
+    # Load
    t0 = time.time()
-    model_wrapper = load_huggingface_model()
-    print("--- Took {} seconds to load Huggingface.".format(time.time() - t0))
+    model_wrapper = load_huggingface_model(model_name)
+    load_time = time.time() - t0
+    print("--- Took {} seconds to load Huggingface.".format(load_time))
+    load_memory_info = get_memory_info()
+
    results = []
    tokenized_prompts = []
    for prompt in PROMPTS:
        tokens = model_wrapper.tokenizer(
            prompt,
            padding="max_length",
-            max_length=MAX_SEQUENCE_LENGTH,
+            max_length=max_seq_len,
            truncation=True,
            return_tensors="pt",
        )
        tokenized_prompts.append(tokens)
+
+    # Run
    t0 = time.time()
    for idx, tokens in enumerate(tokenized_prompts):
        print("prompt: {}".format(PROMPTS[idx]))
        logits = run_huggingface_model(model_wrapper, tokens)
-        results.append([PROMPTS[idx], logits[0].tolist()])
-    print("--- Took {} seconds to run Huggingface.".format(time.time() - t0))
-    save_json(results, "/tmp/huggingface.json")
+        if save_json:
+            results.append([PROMPTS[idx], logits[0].tolist()])
+    run_time = time.time() - t0
+    print("--- Took {} seconds to run Huggingface.".format(run_time))
+    if save_json:
+        save_json(results, "/tmp/huggingface.json")
+    run_memory_info = get_memory_info()
+    return {
+        REPORT_PLATFORM: PLATFORM_HUGGINGFACE,
+        REPORT_MODEL_NAME: model_name,
+        REPORT_MAX_SEQ_LEN: max_seq_len,
+        REPORT_LOAD_TIME: load_time,
+        REPORT_RUN_TIME: run_time / len(PROMPTS),
+        REPORT_LOAD_PHYSICAL_MEMORY_MB: load_memory_info.rss >> 20,
+        REPORT_LOAD_VIRTUAL_MEMORY_MB: load_memory_info.vms >> 20,
+        REPORT_RUN_PHYSICAL_MEMORY_MB: run_memory_info.rss >> 20,
+        REPORT_RUN_VIRTUAL_MEMORY_MB: run_memory_info.vms >> 20,
+    }


-def collect_shark_logits():
+def collect_shark_logits(
+    model_name: str, max_seq_len: int, recompile_shark: bool, save_json: bool
+) -> Tuple[float, float]:
+    # Load
    t0 = time.time()
-    model_wrapper = load_shark_model()
-    print("--- Took {} seconds to load Shark.".format(time.time() - t0))
+    model_wrapper = load_shark_model(model_name, max_seq_len, recompile_shark)
+    load_time = time.time() - t0
+    print("--- Took {} seconds to load Shark.".format(load_time))
+    load_memory_info = get_memory_info()
+
    results = []
    tokenized_prompts = []
    for prompt in PROMPTS:
@@ -165,7 +231,7 @@ def collect_shark_logits():
            prompt,
            padding="max_length",
            truncation=True,
-            max_length=MAX_SEQUENCE_LENGTH,
+            max_length=max_seq_len,
            return_tensors="pt",
        )
        inputs = (
@@ -173,16 +239,100 @@ def collect_shark_logits():
            tokens["attention_mask"],
        )
        tokenized_prompts.append(inputs)
+
+    # Run
    t0 = time.time()
    for idx, tokens in enumerate(tokenized_prompts):
        print("prompt: {}".format(PROMPTS[idx]))
        logits = run_shark_model(model_wrapper, tokens)
        lst = [e.tolist() for e in logits]
-        results.append([PROMPTS[idx], lst])
-    print("--- Took {} seconds to run Shark.".format(time.time() - t0))
-    save_json(results, "/tmp/shark.json")
+        if save_json:
+            results.append([PROMPTS[idx], lst])
+    run_time = time.time() - t0
+    print("--- Took {} seconds to run Shark.".format(run_time))
+    if save_json:
+        save_json(results, "/tmp/shark.json")
+    platform_postfix = "-compile" if recompile_shark else "-precompiled"
+    run_memory_info = get_memory_info()
+    return {
+        REPORT_PLATFORM: PLATFORM_SHARK + platform_postfix,
+        REPORT_MODEL_NAME: model_name,
+        REPORT_MAX_SEQ_LEN: max_seq_len,
+        REPORT_LOAD_TIME: load_time,
+        REPORT_RUN_TIME: run_time / len(PROMPTS),
+        REPORT_LOAD_PHYSICAL_MEMORY_MB: load_memory_info.rss >> 20,
+        REPORT_LOAD_VIRTUAL_MEMORY_MB: load_memory_info.vms >> 20,
+        REPORT_RUN_PHYSICAL_MEMORY_MB: run_memory_info.rss >> 20,
+        REPORT_RUN_VIRTUAL_MEMORY_MB: run_memory_info.vms >> 20,
+    }
+
+
+def get_opt_fs_name(model_name: str) -> str:
+    """Cleanses the model name ino a file system-friendly name.
+
+    Example: get_opt_fs_name('facebook/opt-1.3b') == 'opt_1-3b'
+    """
+    slash_split = model_name.split("/")
+    assert 1 <= len(slash_split) <= 2, "There should be at most one slash."
+    model_name = slash_split[-1]
+    for src_pattern, dest_pattern in (("-", "_"), (".", "-")):
+        model_name = model_name.replace(src_pattern, dest_pattern)
+    return model_name
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--save-json",
+        help="If set, saves output JSON.",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+    )
+    parser.add_argument(
+        "--max-seq-len", help="Max sequence length", type=int, default=32
+    )
+    parser.add_argument(
+        "--model-name",
+        help="Model name",
+        type=str,
+        choices=[
+            "facebook/opt-125m",
+            "facebook/opt-350m",
+            "facebook/opt-1.3b",
+            "facebook/opt-6.7b",
+        ],
+        default="facebook/opt-1.3b",
+    )
+    parser.add_argument(
+        "--recompile-shark",
+        help="If set, recompiles MLIR",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+    )
+    parser.add_argument(
+        "--platform",
+        help="Either shark or huggingface",
+        type=str,
+        choices=[PLATFORM_SHARK, PLATFORM_HUGGINGFACE],
+        default=PLATFORM_SHARK,
+    )
+    args = parser.parse_args()
+    print("args={}".format(args))
+    return args


 if __name__ == "__main__":
-    collect_shark_logits()
-    collect_huggingface_logits()
+    args = parse_args()
+    if args.platform == PLATFORM_SHARK:
+        shark_report = collect_shark_logits(
+            args.model_name,
+            args.max_seq_len,
+            args.recompile_shark,
+            args.save_json,
+        )
+        print("# Summary: {}".format(json.dumps(shark_report)))
+    else:
+        huggingface_report = collect_huggingface_logits(
+            args.model_name, args.max_seq_len, args.save_json
+        )
+        print("# Summary: {}".format(json.dumps(huggingface_report)))
--- a/tank/examples/opt/opt_perf_comparison_batch.py
+++ b/tank/examples/opt/opt_perf_comparison_batch.py
@@ -0,0 +1,30 @@
+"""
+Script for running opt_perf_comparison.py in batch with a series of arguments.
+
+Usage: python opt_perf_comparison_batch.py
+"""
+
+from typing import Iterable, List
+import shlex
+import subprocess
+
+
+def make_commands() -> Iterable[List[str]]:
+    command = shlex.split("python opt_perf_comparison.py --no-save-json")
+    max_seq_lens = [32, 128, 256, 512]
+    model_names = ["facebook/opt-" + e for e in ["125m", "350m", "1.3b"]]
+    for max_seq_len in max_seq_lens:
+        for model_name in model_names:
+            yield command + [
+                f"--max-seq-len={max_seq_len}",
+                f"--model-name={model_name}",
+            ]
+
+
+def main():
+    for command in make_commands():
+        result = subprocess.run(command, check=True)
+
+
+if __name__ == "__main__":
+    main()
--- a/tank/test_models.py
+++ b/tank/test_models.py
@@ -287,6 +287,9 @@ class SharkModuleTester:
        repro_path = os.path.join("reproducers", self.tmp_prefix, "*")

        bashCommand = f"gsutil cp -r {repro_path} gs://shark-public/builder/repro_artifacts/{self.ci_sha}/{self.tmp_prefix}/"
+        print(
+            f"Uploading reproducer {repro_path} to gs://shark-public/builder/repro_artifacts/{self.ci_sha}/{self.tmp_prefix}/"
+        )
        process = subprocess.run(bashCommand.split())

    def postprocess_outputs(self, golden_out, result):
Author	SHA1	Message	Date
Ean Garvey	ca609afb6a	Update README.md (#1830 )	2023-09-14 10:33:57 -05:00
Gaurav Shukla	11bdce9790	[flags] Fix vulkan runtime flags as vma is dropped from iree (#1831 )	2023-09-14 08:58:59 -05:00
Ean Garvey	684943a4a6	(SD) Fix tokenizers imports in pyinstaller builds. (#1828 ) * Fix tokenizers metadata. * (SD) Disable VAE lowering configs (rdna3) and add versioned tunings. * Update sd_annotation.py * (SD) Add cv2 to spec. * Update stencil pipeline with the new img2img arg.	2023-09-12 12:23:48 -05:00
PhaneeshB	b817bb8455	add roles for llama2	2023-09-12 10:59:28 +05:30
Ean Garvey	780f520f02	Fix vk.target_env extensions and remove redundant SD imports. (#1826 ) * Remove redundant IREE runtime imports. * Fix vulkan target env extensions.	2023-09-11 13:42:52 -05:00
Dom	c61b6f8d65	Code refactoring (#1817 ) * use join * fix bug * further code optimizations --------- Co-authored-by: Daniel Garvey <34486624+dan-garvey@users.noreply.github.com>	2023-09-11 11:30:56 -05:00
Abhishek Varma	c854208d49	[Llama2] Prefetch llama2 tokenizer configs (#1824 ) -- This commit prefetches llama2 tokenizer configs from shark_tank. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-09-08 11:29:54 -07:00
Gaurav Shukla	c5dcfc1f13	[vicuna] Exit when mlir is not present in shark tank (#1825 ) Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-09-08 10:30:29 -07:00
Abhishek Varma	bde63ee8ae	Add logging feature in WebUI (#1821 )	2023-09-08 05:48:05 -07:00
Vivek Khandelwal	9681d494eb	Update decomp list and shark trainer for DLRM	2023-09-06 21:24:50 +05:30
Gaurav Shukla	ede6bf83e2	[vicuna] Disabling the IR generation path Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-09-06 20:13:17 +05:30
Ean Garvey	2c2693fb7d	Fix torchvision versioning in Linux importer setup. (#1809 )	2023-09-05 12:57:03 -05:00
Vivek Khandelwal	1d31b2b2c6	Fix StableHLO Compilation flag	2023-09-05 21:32:33 +05:30
Gaurav Shukla	d2f64eefa3	[chatbot] Remove few outdated models from list (#1814 )	2023-09-04 09:26:32 -07:00
Abhishek Varma	87ae14b6ff	[SD] Add sdpfa decomposition + update IREE flag -- This commit adds Scaled Dot Product Flash Attention's decomposition in shark_importer. -- It also updates `iree-flow-enable-data-tiling` to `iree-opt-data-tiling`. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-09-04 18:03:53 +05:30
Phaneesh Barwaria	1ccafa1fc1	fix llama2-70b rewrite tensor dim	2023-09-01 17:27:06 +05:30
jinchen62	4c3d8a0a7f	Enable downloading vmfb/mlir for webui (#1807 )	2023-08-31 11:05:47 -07:00
jinchen62	3601dc7c3b	Fix llama2 13b combined ir (#1803 )	2023-08-28 11:34:44 -07:00
Daniel Garvey	671881cf87	Llama2 70b (#1783 ) * llama2 70b IR gen * fix IR sec llama2 + debug * llama270b --------- Co-authored-by: PhaneeshB <b.phaneesh@gmail.com>	2023-08-25 23:04:28 -07:00
Gaurav Shukla	4e9be6be59	[chatbot] Add debug as class attribute (#1799 ) Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-08-25 21:46:29 -07:00
Ean Garvey	9c8cbaf498	Add support for ROCM (Windows) in Studio + compile utils (#1770 ) * WIP: MSVC ROCM support for SHARK Studio * Make get_iree_rocm_args platform-agnostic. * Update stable_args.py * Update rocm arg handling in SD utils * Guard quantization imports. Co-authored-by: jam https://github.com/jammm	2023-08-25 20:56:05 -07:00
Ean Garvey	9e348a114e	Revert changes process_skipfiles.py (#1798 ) Keeps a small typo fix but reverts the rest of changes to this file from `450c231171`	2023-08-25 15:31:49 -07:00
jinchen62	51f90a4d56	Update conversion passes for brevitas quant op (#1795 )	2023-08-25 17:28:07 -05:00
Abhishek Varma	310d5d0a49	Fix llama2 13b crashing + add spec file for CLI execution of Llama (#1797 ) * [Llama2] Add a fix for Llama2 13B downloading/crashing -- This commit fixes downloading/crashing of llama2 13B on wrong .mlir file. -- Also adds support for downloading vmfb from shark_tank in CLI. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com> * [llama2] Add a spec file to run Llama/Vicuna CLI exe -- This commit adds a spec file to run Llama/Vicuna CLI exe. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com> --------- Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-08-25 09:36:09 -05:00
Ean Garvey	9697981004	Pipe through a debug option to iree compile utils. (#1796 ) * Update compile_utils.py * Pipe through a flag to toggle debug options in compile utils. * Update SharkLLMBase.py	2023-08-25 07:11:11 -07:00
Ean Garvey	450c231171	Add tokenizers to requirements.txt (#1790 ) * Add tokenizers to requirements and pin version * Update process_skipfiles.py	2023-08-24 19:44:04 -05:00
Ean Garvey	07f6f4a2f7	Add a short README for the OPT examples and small tweaks. (#1793 ) * Small changes to OPT example. * Update opt README. * Add a few modes to batch script. * Update README.md	2023-08-24 17:26:11 -07:00
jinchen62	610813c72f	Add iree flag to strip assertions (#1791 )	2023-08-24 10:51:19 -07:00
Ean Garvey	8e3860c9e6	Remove flags that are default in upstream IREE (#1785 ) * Remove index bits flags now set by default * Update shark_studio_imports.py	2023-08-24 11:57:54 -05:00
xzuyn	e37d6720eb	Add Hires Fix (#1787 ) * improper test hiresfix * add sliders & use `clear_cache` * add resample choices & fix step adjustment * add step adjustment to img2img * add resample options to img2img * simplify hiresfix - import `img2img_inf` from `img2img_ui.py` instead of just copying it into `txt2img_ui.py` * set `hri` to None after using * add more resample types, and don't show output until hiresfix is done * cleaner implementation * ran black * ran black again with jupyter dependencies	2023-08-24 09:01:41 -07:00
Vivek Khandelwal	16160d9a7d	Fix combine mlir script	2023-08-24 19:10:49 +05:30
Sungsoon Cho	79075a1a07	Opt perf (#1786 ) * Define command line args, model-name, max-seq-len, platform, etc. * Add usage example. * Add opt_perf_comparision_batch.py. * Use shlex instead.	2023-08-24 08:33:12 -05:00
Abhishek Varma	db990826d3	Add Llama2 13B int4 fp16 support (#1784 ) Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-08-23 10:00:32 -07:00
gpetters94	7ee3e4ba5d	Add stencil_unet_512 support (#1778 ) This should fix any remaining issues with stencils and long prompts.	2023-08-22 12:23:46 -04:00
Vivek Khandelwal	05889a8fe1	Add LLaMa2-int4-fp16 support (#1782 )	2023-08-22 07:45:50 -07:00
jinchen62	b87efe7686	Fix venv setup for brevitas (#1779 )	2023-08-21 11:58:51 -07:00