Fix llama2 13b combined IR (#1803)

Llama2 70b (#1783)

* llama2 70b IR gen
* fix IR sec llama2 + debug
* llama270b

---------

Co-authored-by: PhaneeshB <b.phaneesh@gmail.com>
2026-04-20 03:00:34 -04:00 · 2023-08-28 11:34:44 -07:00 · 2023-08-25 23:04:28 -07:00 · 2023-08-25 21:46:29 -07:00 · 2023-08-25 20:56:05 -07:00 · 2023-08-25 15:31:49 -07:00
23 changed files with 860 additions and 118 deletions
--- a/apps/language_models/langchain/h2oai_pipeline.py
+++ b/apps/language_models/langchain/h2oai_pipeline.py
@@ -237,7 +237,7 @@ class H2OGPTSHARKModel(torch.nn.Module):
            print(f"[DEBUG] converting torch to linalg")
            run_pipeline_with_repro_report(
                module,
-                "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+                "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
                description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
            )
        else:
--- a/apps/language_models/scripts/stablelm.py
+++ b/apps/language_models/scripts/stablelm.py
@@ -46,6 +46,7 @@ def compile_stableLM(
    model_vmfb_name,
    device="cuda",
    precision="fp32",
+    debug=False,
 ):
    from shark.shark_inference import SharkInference

@@ -92,7 +93,7 @@ def compile_stableLM(
    shark_module.compile()

    path = shark_module.save_module(
-        vmfb_path.parent.absolute(), vmfb_path.stem
+        vmfb_path.parent.absolute(), vmfb_path.stem, debug=debug
    )
    print("Saved vmfb at ", str(path))

--- a/apps/language_models/scripts/vicuna.py
+++ b/apps/language_models/scripts/vicuna.py
@@ -39,6 +39,7 @@ from apps.language_models.src.model_wrappers.vicuna_model import (
    FirstVicuna,
    SecondVicuna7B,
    SecondVicuna13B,
+    SecondVicuna70B,
 )
 from apps.language_models.utils import (
    get_vmfb_from_path,
@@ -48,8 +49,6 @@ from shark.shark_importer import get_f16_inputs
 from shark.shark_importer import import_with_fx
 from shark.shark_inference import SharkInference

-from brevitas_examples.llm.llm_quant.quantize import quantize_model
-from brevitas_examples.llm.llm_quant.run_utils import get_model_impl

 parser = argparse.ArgumentParser(
    prog="vicuna runner",
@@ -193,7 +192,6 @@ class VicunaBase(SharkLLMBase):
        first_vicuna_mlir,
        second_vicuna_mlir,
        output_name,
-        model_name=None,
    ):
        print(f"[DEBUG] combining first and second mlir")
        print(f"[DEBUG] output_name = {output_name}")
@@ -357,8 +355,7 @@ class VicunaBase(SharkLLMBase):
            f_.writelines(line + "\n" for line in global_vars)
            f_.writelines(line + "\n" for line in f1)
            f_.writelines(line + "\n" for line in f2)
-            if not (model_name and "llama2_13b" in model_name):
-                f_.writelines(line + "\n" for line in [module_end])
+            f_.writelines(line + "\n" for line in [module_end])

        del maps1
        del maps2
@@ -444,6 +441,7 @@ class ShardedVicuna(VicunaBase):
        weight_group_size=128,
        compressed=False,
        extra_args_cmd=[],
+        debug=False,
    ) -> None:
        super().__init__(
            model_name,
@@ -454,6 +452,7 @@ class ShardedVicuna(VicunaBase):
        self.max_sequence_length = 256
        self.device = device
        self.precision = precision
+        self.debug = debug
        self.tokenizer = self.get_tokenizer()
        self.config = config_json
        self.weight_group_size = weight_group_size
@@ -641,7 +640,7 @@ class ShardedVicuna(VicunaBase):
        return device_idx

    def compile_lmhead(
-        self, lmh, hidden_states, device="cpu", device_idx=None
+        self, lmh, hidden_states, device="cpu", device_idx=None,
    ):
        # compile the lm head of the vicuna model
        # This can be used for both first and second vicuna, so only needs to be run once
@@ -689,7 +688,7 @@ class ShardedVicuna(VicunaBase):
        if vmfb_path.exists():
            shark_module.load_module(vmfb_path)
        else:
-            shark_module.save_module(module_name="lmhead")
+            shark_module.save_module(module_name="lmhead", debug=self.debug)
            shark_module.load_module(vmfb_path)
        compiled_module = LMHeadCompiled(shark_module)
        return compiled_module
@@ -735,7 +734,7 @@ class ShardedVicuna(VicunaBase):
        if vmfb_path.exists():
            shark_module.load_module(vmfb_path)
        else:
-            shark_module.save_module(module_name="norm")
+            shark_module.save_module(module_name="norm", debug=self.debug)
            shark_module.load_module(vmfb_path)
        compiled_module = VicunaNormCompiled(shark_module)
        return compiled_module
@@ -786,14 +785,14 @@ class ShardedVicuna(VicunaBase):
        if vmfb_path.exists():
            shark_module.load_module(vmfb_path)
        else:
-            shark_module.save_module(module_name="embedding")
+            shark_module.save_module(module_name="embedding", debug=self.debug)
            shark_module.load_module(vmfb_path)
        compiled_module = VicunaEmbeddingCompiled(shark_module)

        return compiled_module

    def compile_to_vmfb_one_model(
-        self, inputs0, layers0, inputs1, layers1, device="cpu"
+        self, inputs0, layers0, inputs1, layers1, device="cpu",
    ):
        mlirs, modules = [], []
        assert len(layers0) == len(layers1)
@@ -803,7 +802,6 @@ class ShardedVicuna(VicunaBase):
            # if vmfb_path.exists():
            #    continue
            if mlir_path.exists():
-                # print(f"Found layer {idx} mlir")
                f_ = open(mlir_path, "rb")
                bytecode = f_.read()
                f_.close()
@@ -839,6 +837,8 @@ class ShardedVicuna(VicunaBase):
                    layer0, inputs0[0], inputs0[1], inputs0[2]
                )
                if self.precision in ["int4", "int8"]:
+                    from brevitas_examples.llm.llm_quant.quantize import quantize_model
+                    from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
                    module0 = torch_mlir.compile(
                        ts_g,
                        (
@@ -855,7 +855,7 @@ class ShardedVicuna(VicunaBase):
                    print(f"[DEBUG] converting torch to linalg")
                    run_pipeline_with_repro_report(
                        module0,
-                        "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+                        "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
                        description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
                    )
                else:
@@ -899,7 +899,7 @@ class ShardedVicuna(VicunaBase):
                    print(f"[DEBUG] converting torch to linalg")
                    run_pipeline_with_repro_report(
                        module1,
-                        "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+                        "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
                        description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
                    )
                else:
@@ -924,7 +924,6 @@ class ShardedVicuna(VicunaBase):
                mlirs.append(module_combined)

            if vmfb_path.exists():
-                # print(f"Found layer {idx} vmfb")
                device_idx = self.get_device_index(
                    f"first_vicuna.model.model.layers.{idx}[\s.$]"
                )
@@ -956,6 +955,7 @@ class ShardedVicuna(VicunaBase):
                        "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
                    ]
                    + self.extra_args,
+                    debug=self.debug,
                )
                module.load_module(vmfb_path)
            modules.append(module)
@@ -972,7 +972,6 @@ class ShardedVicuna(VicunaBase):
            # if vmfb_path.exists():
            #    continue
            if mlir_path.exists():
-                # print(f"Found layer {idx} mlir")
                f_ = open(mlir_path, "rb")
                bytecode = f_.read()
                f_.close()
@@ -991,7 +990,6 @@ class ShardedVicuna(VicunaBase):
                mlirs.append(bytecode)

            if vmfb_path.exists():
-                # print(f"Found layer {idx} vmfb")
                device_idx = self.get_device_index(
                    f"first_vicuna.model.model.layers.{idx}[\s.$]"
                )
@@ -1023,6 +1021,7 @@ class ShardedVicuna(VicunaBase):
                        "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
                    ]
                    + self.extra_args,
+                    debug=self.debug,
                )
                module.load_module(vmfb_path)
            modules.append(module)
@@ -1039,6 +1038,8 @@ class ShardedVicuna(VicunaBase):
            )

        if self.precision in ["int4", "int8"]:
+            from brevitas_examples.llm.llm_quant.quantize import quantize_model
+            from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
            print("Applying weight quantization..")
            weight_bit_width = 4 if self.precision == "int4" else 8
            quantize_model(
@@ -1229,12 +1230,13 @@ class UnshardedVicuna(VicunaBase):
        precision="int8",
        vicuna_mlir_path=None,
        vicuna_vmfb_path=None,
-        load_mlir_from_shark_tank=True,
+        load_mlir_from_shark_tank=False,
        low_device_memory=False,
        weight_group_size=128,
        download_vmfb=False,
        cache_vicunas=False,
        extra_args_cmd=[],
+        debug=False,
    ) -> None:
        super().__init__(
            model_name,
@@ -1263,17 +1265,18 @@ class UnshardedVicuna(VicunaBase):
        self.load_mlir_from_shark_tank = load_mlir_from_shark_tank
        self.low_device_memory = low_device_memory
        self.weight_group_size = weight_group_size
+        self.debug = debug
        if self.vicuna_mlir_path == None:
            self.vicuna_mlir_path = self.get_model_path()
        if self.vicuna_vmfb_path == None:
            self.vicuna_vmfb_path = self.get_model_path(suffix="vmfb")
        self.tokenizer = self.get_tokenizer()
        self.cache_vicunas = cache_vicunas
-        self.compile()
+        self.compile(download_vmfb)

    def get_model_path(self, suffix="mlir"):
        safe_device = self.device.split("-")[0]
-        if suffix == "mlir":
+        if suffix in ["mlirbc", "mlir"]:
            return Path(f"{self.model_name}_{self.precision}.{suffix}")
        return Path(
            f"{self.model_name}_{self.precision}_{safe_device}.{suffix}"
@@ -1333,7 +1336,7 @@ class UnshardedVicuna(VicunaBase):
            new_lines.append(line)
        return "\n".join(new_lines)

-    def write_in_dynamic_inputs1(self, module, model_name):
+    def write_in_dynamic_inputs1(self, module):
        print("[DEBUG] writing dynamic inputs to second vicuna")

        def remove_constant_dim(line):
@@ -1361,9 +1364,12 @@ class UnshardedVicuna(VicunaBase):

        module = module.splitlines()
        new_lines = []
+
        # Using a while loop and the pop method to avoid creating a copy of module
-        if "llama2_13b" in model_name:
+        if "llama2_13b" in self.model_name:
            pkv_tensor_shape = "tensor<1x40x?x128x"
+        elif "llama2_70b" in self.model_name:
+            pkv_tensor_shape = "tensor<1x60x?x128x"
        else:
            pkv_tensor_shape = "tensor<1x32x?x128x"
        if self.precision in ["fp16", "int4", "int8"]:
@@ -1429,7 +1435,7 @@ class UnshardedVicuna(VicunaBase):
            mlir_generated = False
            if self.load_mlir_from_shark_tank:
                # download MLIR from shark tank
-                for suffix in ["mlir", "mlirbc"]:
+                for suffix in ["mlirbc", "mlir"]:
                    self.vicuna_mlir_path = self.get_model_path(suffix)
                    download_public_file(
                        f"gs://shark_tank/{self.model_name}/unsharded/mlir/{self.vicuna_mlir_path.name}",
@@ -1456,9 +1462,10 @@ class UnshardedVicuna(VicunaBase):
                else:
                    compilation_prompt = "".join(["0" for _ in range(17)])

-                if Path(f"first_{self.precision}.mlir").exists():
-                    print(f"loading first_{self.precision}.mlir")
-                    with open(Path(f"first_{self.precision}.mlir"), "r") as f:
+                first_model_path = f"first_{self.model_name}_{self.precision}.mlir"
+                if Path(first_model_path).exists():
+                    print(f"loading {first_model_path}")
+                    with open(Path(first_model_path), "r") as f:
                        first_module = f.read()
                else:
                    # generate first vicuna
@@ -1511,7 +1518,7 @@ class UnshardedVicuna(VicunaBase):
                        print(f"[DEBUG] converting torch to linalg")
                        run_pipeline_with_repro_report(
                            first_module,
-                            "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+                            "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
                            description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
                        )
                    else:
@@ -1533,13 +1540,14 @@ class UnshardedVicuna(VicunaBase):
                        str(first_module), dynamic_input_size=19
                    )
                    if self.cache_vicunas:
-                        with open(f"first_{self.precision}.mlir", "w+") as f:
+                        with open(first_model_path, "w+") as f:
                            f.write(first_module)
                        print("Finished writing IR after dynamic")
-
-                if Path(f"second_{self.precision}.mlir").exists():
-                    print(f"loading second_{self.precision}.mlir")
-                    with open(Path(f"second_{self.precision}.mlir"), "r") as f:
+                print(f"[DEBUG] Starting generation of second llama")
+                second_model_path = f"second_{self.model_name}_{self.precision}.mlir"
+                if Path(second_model_path).exists():
+                    print(f"loading {second_model_path}")
+                    with open(Path(second_model_path), "r") as f:
                        second_module = f.read()
                else:
                    # generate second vicuna
@@ -1549,6 +1557,9 @@ class UnshardedVicuna(VicunaBase):
                    if self.model_name == "llama2_13b":
                        dim1 = 40
                        total_tuple = 80
+                    elif self.model_name == "llama2_70b":
+                        dim1 = 8
+                        total_tuple = 160
                    else:
                        dim1 = 32
                        total_tuple = 64
@@ -1565,6 +1576,14 @@ class UnshardedVicuna(VicunaBase):
                            self.model_name,
                            self.hf_auth_token,
                        )
+                    elif self.model_name == "llama2_70b":
+                        model = SecondVicuna70B(
+                            self.hf_model_path,
+                            self.precision,
+                            self.weight_group_size,
+                            self.model_name,
+                            self.hf_auth_token,
+                        )
                    else:
                        model = SecondVicuna7B(
                            self.hf_model_path,
@@ -1593,9 +1612,7 @@ class UnshardedVicuna(VicunaBase):
                    secondVicunaCompileInput = list(secondVicunaCompileInput)
                    for i in range(len(secondVicunaCompileInput)):
                        if i != 0:
-                            secondVicunaCompileInput[
-                                i
-                            ] = torch_mlir.TensorPlaceholder.like(
+                            secondVicunaCompileInput[i] = torch_mlir.TensorPlaceholder.like(
                                secondVicunaCompileInput[i], dynamic_axes=[2]
                            )
                    secondVicunaCompileInput = tuple(secondVicunaCompileInput)
@@ -1610,9 +1627,10 @@ class UnshardedVicuna(VicunaBase):
                            use_tracing=False,
                            verbose=False,
                        )
+                        print(f"[DEBUG] converting torch to linalg")
                        run_pipeline_with_repro_report(
                            second_module,
-                            "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+                            "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
                            description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
                        )
                    else:
@@ -1626,6 +1644,7 @@ class UnshardedVicuna(VicunaBase):
                    del ts_graph
                    del secondVicunaCompileInput
                    gc.collect()
+
                    print(
                        "[DEBUG] successfully generated second vicuna linalg mlir"
                    )
@@ -1633,7 +1652,7 @@ class UnshardedVicuna(VicunaBase):
                        str(second_module)
                    )
                    if self.cache_vicunas:
-                        with open(f"second_{self.precision}.mlir", "w") as f:
+                        with open(second_model_path, "w+") as f:
                            f.write(second_module)
                        print("Finished writing IR after dynamic")

@@ -1641,10 +1660,12 @@ class UnshardedVicuna(VicunaBase):
                    first_module,
                    second_module,
                    self.vicuna_mlir_path,
-                    self.model_name,
                )
                del first_module, second_module

+        print(self.device)
+        if "rocm" in self.device:
+            self.device = "rocm"
        shark_module = SharkInference(
            mlir_module=combined_module,
            device=self.device,
@@ -1659,6 +1680,7 @@ class UnshardedVicuna(VicunaBase):
                "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
            ]
            + self.extra_args,
+            debug=self.debug,
        )
        print("Saved vic vmfb at ", str(path))
        shark_module.load_module(path)
@@ -1726,7 +1748,6 @@ class UnshardedVicuna(VicunaBase):
            yield detok, ""

        res_str = self.decode_tokens(res_tokens)
-        # print(f"[DEBUG] final output : \n{res_str}")
        yield res_str, "formatted"

    def autocomplete(self, prompt):
--- a/apps/language_models/shark_llama_cli.spec
+++ b/apps/language_models/shark_llama_cli.spec
@@ -0,0 +1,94 @@
+# -*- mode: python ; coding: utf-8 -*-
+from PyInstaller.utils.hooks import collect_data_files
+from PyInstaller.utils.hooks import collect_submodules
+from PyInstaller.utils.hooks import copy_metadata
+# PyInstaller spec that builds the 'shark_llama_cli' console executable from scripts/vicuna.py.
+import sys ; sys.setrecursionlimit(sys.getrecursionlimit() * 5)  # raise the recursion limit to 5x its current value
+# Data files and package metadata to bundle; the list is passed to Analysis(datas=...) below.
+datas = []
+datas += collect_data_files('torch')
+datas += copy_metadata('torch')
+datas += copy_metadata('tqdm')
+datas += copy_metadata('regex')
+datas += copy_metadata('requests')
+datas += copy_metadata('packaging')
+datas += copy_metadata('filelock')
+datas += copy_metadata('numpy')
+datas += copy_metadata('tokenizers')
+datas += copy_metadata('importlib_metadata')
+datas += copy_metadata('torch-mlir')
+datas += copy_metadata('omegaconf')
+datas += copy_metadata('safetensors')
+datas += copy_metadata('huggingface-hub')
+datas += copy_metadata('sentencepiece')
+datas += copy_metadata("pyyaml")
+datas += collect_data_files("tokenizers")
+datas += collect_data_files("tiktoken")
+datas += collect_data_files("accelerate")
+datas += collect_data_files('diffusers')
+datas += collect_data_files('transformers')
+datas += collect_data_files('opencv-python')  # NOTE(review): this is a distribution name, not the import name (cv2) — confirm it collects anything
+datas += collect_data_files('pytorch_lightning')
+datas += collect_data_files('skimage')
+datas += collect_data_files('gradio')
+datas += collect_data_files('gradio_client')
+datas += collect_data_files('iree')
+datas += collect_data_files('google-cloud-storage')  # NOTE(review): distribution name with hyphens — confirm against the importable package name
+datas += collect_data_files('py-cpuinfo')  # NOTE(review): likely redundant with the "cpuinfo" entry below — confirm
+datas += collect_data_files("shark", include_py_files=True)  # include_py_files=True also bundles the .py sources
+datas += collect_data_files("timm", include_py_files=True)
+datas += collect_data_files("tqdm")
+datas += collect_data_files("tkinter")
+datas += collect_data_files("webview")
+datas += collect_data_files("sentencepiece")
+datas += collect_data_files("jsonschema")
+datas += collect_data_files("jsonschema_specifications")
+datas += collect_data_files("cpuinfo")
+datas += collect_data_files("langchain")
+# No extra binaries are listed; the empty list is passed to Analysis(binaries=...).
+binaries = []
+
+block_cipher = None  # passed as cipher= to both Analysis and PYZ below
+# Module names handed to Analysis(hiddenimports=...).
+hiddenimports = ['shark', 'shark.shark_inference', 'apps']
+hiddenimports += [x for x in collect_submodules("skimage") if "tests" not in x]  # skip submodules whose name contains "tests"
+hiddenimports += [x for x in collect_submodules("iree") if "tests" not in x]  # same "tests" filter for iree
+# Analyze the entry script with the datas/binaries/hiddenimports collected above.
+a = Analysis(
+    ['scripts/vicuna.py'],  # entry script, relative to pathex
+    pathex=['.'],
+    binaries=binaries,
+    datas=datas,
+    hiddenimports=hiddenimports,
+    hookspath=[],
+    hooksconfig={},
+    runtime_hooks=[],
+    excludes=[],
+    win_no_prefer_redirects=False,
+    win_private_assemblies=False,
+    cipher=block_cipher,
+    noarchive=False,
+)
+pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
+# Build the executable; binaries, zipfiles and datas are passed to EXE directly (no COLLECT step in this spec).
+exe = EXE(
+    pyz,
+    a.scripts,
+    a.binaries,
+    a.zipfiles,
+    a.datas,
+    [],
+    name='shark_llama_cli',  # name of the produced executable
+    debug=False,
+    bootloader_ignore_signals=False,
+    strip=False,
+    upx=True,
+    upx_exclude=[],
+    runtime_tmpdir=None,
+    console=True,  # console (CLI) application
+    disable_windowed_traceback=False,
+    argv_emulation=False,
+    target_arch=None,
+    codesign_identity=None,
+    entitlements_file=None,
+)
--- a/apps/language_models/src/model_wrappers/vicuna4.py
+++ b/apps/language_models/src/model_wrappers/vicuna4.py
@@ -57,8 +57,6 @@ from shark.shark_importer import get_f16_inputs
 from shark.shark_importer import import_with_fx
 from shark.shark_inference import SharkInference

-from brevitas_examples.llm.llm_quant.quantize import quantize_model
-from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
 from transformers.models.llama.configuration_llama import LlamaConfig
 from transformers.models.llama.modeling_llama import (
    LlamaDecoderLayer,
--- a/apps/language_models/src/model_wrappers/vicuna_model.py
+++ b/apps/language_models/src/model_wrappers/vicuna_model.py
@@ -1,9 +1,6 @@
 import torch
 from transformers import AutoModelForCausalLM

-from brevitas_examples.llm.llm_quant.quantize import quantize_model
-from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
-

 class FirstVicuna(torch.nn.Module):
    def __init__(
@@ -21,7 +18,13 @@ class FirstVicuna(torch.nn.Module):
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **kwargs
        )
+        print(f"[DEBUG] model_path : {model_path}")
        if precision in ["int4", "int8"]:
+            from brevitas_examples.llm.llm_quant.quantize import quantize_model
+            from brevitas_examples.llm.llm_quant.run_utils import (
+                get_model_impl,
+            )
+
            print("First Vicuna applying weight quantization..")
            weight_bit_width = 4 if precision == "int4" else 8
            quantize_model(
@@ -64,7 +67,13 @@ class SecondVicuna7B(torch.nn.Module):
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **kwargs
        )
+        print(f"[DEBUG] model_path : {model_path}")
        if precision in ["int4", "int8"]:
+            from brevitas_examples.llm.llm_quant.quantize import quantize_model
+            from brevitas_examples.llm.llm_quant.run_utils import (
+                get_model_impl,
+            )
+
            print("Second Vicuna applying weight quantization..")
            weight_bit_width = 4 if precision == "int4" else 8
            quantize_model(
@@ -148,8 +157,6 @@ class SecondVicuna7B(torch.nn.Module):
        i63,
        i64,
    ):
-        # input_ids = input_tuple[0]
-        # input_tuple = torch.unbind(pkv, dim=0)
        token = i0
        past_key_values = (
            (i1, i2),
@@ -294,7 +301,7 @@ class SecondVicuna13B(torch.nn.Module):
    def __init__(
        self,
        model_path,
-        precision="fp32",
+        precision="int8",
        weight_group_size=128,
        model_name="vicuna",
        hf_auth_token: str = None,
@@ -307,6 +314,11 @@ class SecondVicuna13B(torch.nn.Module):
            model_path, low_cpu_mem_usage=True, **kwargs
        )
        if precision in ["int4", "int8"]:
+            from brevitas_examples.llm.llm_quant.quantize import quantize_model
+            from brevitas_examples.llm.llm_quant.run_utils import (
+                get_model_impl,
+            )
+
            print("Second Vicuna applying weight quantization..")
            weight_bit_width = 4 if precision == "int4" else 8
            quantize_model(
@@ -406,8 +418,6 @@ class SecondVicuna13B(torch.nn.Module):
        i79,
        i80,
    ):
-        # input_ids = input_tuple[0]
-        # input_tuple = torch.unbind(pkv, dim=0)
        token = i0
        past_key_values = (
            (i1, i2),
@@ -580,6 +590,540 @@ class SecondVicuna13B(torch.nn.Module):
        return tuple(return_vals)


+class SecondVicuna70B(torch.nn.Module):
+    def __init__(
+        self,
+        model_path,  # forwarded to AutoModelForCausalLM.from_pretrained
+        precision="fp32",  # "int4" / "int8" trigger weight quantization below
+        weight_group_size=128,  # forwarded to quantize_model as weight_group_size
+        model_name="vicuna",  # only checked for the substring "llama2"
+        hf_auth_token: str = None,  # passed as use_auth_token for llama2 models
+    ):
+        super().__init__()
+        kwargs = {"torch_dtype": torch.float32}  # the model is always loaded as float32
+        if "llama2" in model_name:
+            kwargs["use_auth_token"] = hf_auth_token  # token is forwarded only for llama2 model names
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, **kwargs
+        )
+        print(f"[DEBUG] model_path : {model_path}")
+        if precision in ["int4", "int8"]:  # any other precision leaves the loaded model untouched
+            from brevitas_examples.llm.llm_quant.quantize import quantize_model  # imported here so brevitas is only needed on the quantized path
+            from brevitas_examples.llm.llm_quant.run_utils import (
+                get_model_impl,
+            )
+            # Quantize the weights of the layers returned by get_model_impl(self.model).
+            print("Second Vicuna applying weight quantization..")
+            weight_bit_width = 4 if precision == "int4" else 8  # 4 bits for "int4", 8 bits for "int8"
+            quantize_model(
+                get_model_impl(self.model).layers,
+                dtype=torch.float16,  # NOTE(review): model was loaded as float32 above — confirm float16 is intended
+                weight_bit_width=weight_bit_width,
+                weight_param_method="stats",
+                weight_scale_precision="float",
+                weight_quant_type="asym",
+                weight_quant_granularity="per_group",
+                weight_group_size=weight_group_size,
+                quantize_weight_zero_point=False,
+            )
+            print("Weight quantization applied.")
+
+    def forward(
+        self,
+        i0,
+        i1,
+        i2,
+        i3,
+        i4,
+        i5,
+        i6,
+        i7,
+        i8,
+        i9,
+        i10,
+        i11,
+        i12,
+        i13,
+        i14,
+        i15,
+        i16,
+        i17,
+        i18,
+        i19,
+        i20,
+        i21,
+        i22,
+        i23,
+        i24,
+        i25,
+        i26,
+        i27,
+        i28,
+        i29,
+        i30,
+        i31,
+        i32,
+        i33,
+        i34,
+        i35,
+        i36,
+        i37,
+        i38,
+        i39,
+        i40,
+        i41,
+        i42,
+        i43,
+        i44,
+        i45,
+        i46,
+        i47,
+        i48,
+        i49,
+        i50,
+        i51,
+        i52,
+        i53,
+        i54,
+        i55,
+        i56,
+        i57,
+        i58,
+        i59,
+        i60,
+        i61,
+        i62,
+        i63,
+        i64,
+        i65,
+        i66,
+        i67,
+        i68,
+        i69,
+        i70,
+        i71,
+        i72,
+        i73,
+        i74,
+        i75,
+        i76,
+        i77,
+        i78,
+        i79,
+        i80,
+        i81,
+        i82,
+        i83,
+        i84,
+        i85,
+        i86,
+        i87,
+        i88,
+        i89,
+        i90,
+        i91,
+        i92,
+        i93,
+        i94,
+        i95,
+        i96,
+        i97,
+        i98,
+        i99,
+        i100,
+        i101,
+        i102,
+        i103,
+        i104,
+        i105,
+        i106,
+        i107,
+        i108,
+        i109,
+        i110,
+        i111,
+        i112,
+        i113,
+        i114,
+        i115,
+        i116,
+        i117,
+        i118,
+        i119,
+        i120,
+        i121,
+        i122,
+        i123,
+        i124,
+        i125,
+        i126,
+        i127,
+        i128,
+        i129,
+        i130,
+        i131,
+        i132,
+        i133,
+        i134,
+        i135,
+        i136,
+        i137,
+        i138,
+        i139,
+        i140,
+        i141,
+        i142,
+        i143,
+        i144,
+        i145,
+        i146,
+        i147,
+        i148,
+        i149,
+        i150,
+        i151,
+        i152,
+        i153,
+        i154,
+        i155,
+        i156,
+        i157,
+        i158,
+        i159,
+        i160,
+    ):
+        token = i0
+        past_key_values = (
+            (i1, i2),
+            (
+                i3,
+                i4,
+            ),
+            (
+                i5,
+                i6,
+            ),
+            (
+                i7,
+                i8,
+            ),
+            (
+                i9,
+                i10,
+            ),
+            (
+                i11,
+                i12,
+            ),
+            (
+                i13,
+                i14,
+            ),
+            (
+                i15,
+                i16,
+            ),
+            (
+                i17,
+                i18,
+            ),
+            (
+                i19,
+                i20,
+            ),
+            (
+                i21,
+                i22,
+            ),
+            (
+                i23,
+                i24,
+            ),
+            (
+                i25,
+                i26,
+            ),
+            (
+                i27,
+                i28,
+            ),
+            (
+                i29,
+                i30,
+            ),
+            (
+                i31,
+                i32,
+            ),
+            (
+                i33,
+                i34,
+            ),
+            (
+                i35,
+                i36,
+            ),
+            (
+                i37,
+                i38,
+            ),
+            (
+                i39,
+                i40,
+            ),
+            (
+                i41,
+                i42,
+            ),
+            (
+                i43,
+                i44,
+            ),
+            (
+                i45,
+                i46,
+            ),
+            (
+                i47,
+                i48,
+            ),
+            (
+                i49,
+                i50,
+            ),
+            (
+                i51,
+                i52,
+            ),
+            (
+                i53,
+                i54,
+            ),
+            (
+                i55,
+                i56,
+            ),
+            (
+                i57,
+                i58,
+            ),
+            (
+                i59,
+                i60,
+            ),
+            (
+                i61,
+                i62,
+            ),
+            (
+                i63,
+                i64,
+            ),
+            (
+                i65,
+                i66,
+            ),
+            (
+                i67,
+                i68,
+            ),
+            (
+                i69,
+                i70,
+            ),
+            (
+                i71,
+                i72,
+            ),
+            (
+                i73,
+                i74,
+            ),
+            (
+                i75,
+                i76,
+            ),
+            (
+                i77,
+                i78,
+            ),
+            (
+                i79,
+                i80,
+            ),
+            (
+                i81,
+                i82,
+            ),
+            (
+                i83,
+                i84,
+            ),
+            (
+                i85,
+                i86,
+            ),
+            (
+                i87,
+                i88,
+            ),
+            (
+                i89,
+                i90,
+            ),
+            (
+                i91,
+                i92,
+            ),
+            (
+                i93,
+                i94,
+            ),
+            (
+                i95,
+                i96,
+            ),
+            (
+                i97,
+                i98,
+            ),
+            (
+                i99,
+                i100,
+            ),
+            (
+                i101,
+                i102,
+            ),
+            (
+                i103,
+                i104,
+            ),
+            (
+                i105,
+                i106,
+            ),
+            (
+                i107,
+                i108,
+            ),
+            (
+                i109,
+                i110,
+            ),
+            (
+                i111,
+                i112,
+            ),
+            (
+                i113,
+                i114,
+            ),
+            (
+                i115,
+                i116,
+            ),
+            (
+                i117,
+                i118,
+            ),
+            (
+                i119,
+                i120,
+            ),
+            (
+                i121,
+                i122,
+            ),
+            (
+                i123,
+                i124,
+            ),
+            (
+                i125,
+                i126,
+            ),
+            (
+                i127,
+                i128,
+            ),
+            (
+                i129,
+                i130,
+            ),
+            (
+                i131,
+                i132,
+            ),
+            (
+                i133,
+                i134,
+            ),
+            (
+                i135,
+                i136,
+            ),
+            (
+                i137,
+                i138,
+            ),
+            (
+                i139,
+                i140,
+            ),
+            (
+                i141,
+                i142,
+            ),
+            (
+                i143,
+                i144,
+            ),
+            (
+                i145,
+                i146,
+            ),
+            (
+                i147,
+                i148,
+            ),
+            (
+                i149,
+                i150,
+            ),
+            (
+                i151,
+                i152,
+            ),
+            (
+                i153,
+                i154,
+            ),
+            (
+                i155,
+                i156,
+            ),
+            (
+                i157,
+                i158,
+            ),
+            (
+                i159,
+                i160,
+            ),
+        )
+        op = self.model(
+            input_ids=token, use_cache=True, past_key_values=past_key_values
+        )
+        return_vals = []
+        return_vals.append(op.logits)
+        temp_past_key_values = op.past_key_values
+        for item in temp_past_key_values:
+            return_vals.append(item[0])
+            return_vals.append(item[1])
+        return tuple(return_vals)
+
+
 class CombinedModel(torch.nn.Module):
    def __init__(
        self,
--- a/apps/language_models/src/pipelines/SharkLLMBase.py
+++ b/apps/language_models/src/pipelines/SharkLLMBase.py
@@ -3,7 +3,10 @@ from abc import ABC, abstractmethod

 class SharkLLMBase(ABC):
    def __init__(
-        self, model_name, hf_model_path=None, max_num_tokens=512
+        self,
+        model_name,
+        hf_model_path=None,
+        max_num_tokens=512,
    ) -> None:
        self.model_name = model_name
        self.hf_model_path = hf_model_path
--- a/apps/language_models/src/pipelines/falcon_pipeline.py
+++ b/apps/language_models/src/pipelines/falcon_pipeline.py
@@ -71,6 +71,7 @@ class Falcon(SharkLLMBase):
        precision="fp32",
        falcon_mlir_path=None,
        falcon_vmfb_path=None,
+        debug=False,
    ) -> None:
        super().__init__(model_name, hf_model_path, max_num_tokens)
        self.max_padding_length = 100
@@ -78,6 +79,7 @@ class Falcon(SharkLLMBase):
        self.precision = precision
        self.falcon_vmfb_path = falcon_vmfb_path
        self.falcon_mlir_path = falcon_mlir_path
+        self.debug = debug
        self.tokenizer = self.get_tokenizer()
        self.shark_model = self.compile()
        self.src_model = self.get_src_model()
@@ -208,6 +210,7 @@ class Falcon(SharkLLMBase):
                "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
                "--iree-spirv-index-bits=64",
            ],
+            debug=self.debug,
        )
        print("Saved falcon vmfb at ", str(path))
        shark_module.load_module(path)
--- a/apps/language_models/src/pipelines/minigpt4_pipeline.py
+++ b/apps/language_models/src/pipelines/minigpt4_pipeline.py
@@ -178,7 +178,7 @@ def load_vmfb(extended_model_name, device, mlir_dialect, extra_args=[]):


 def compile_module(
-    shark_module, extended_model_name, generate_vmfb, extra_args=[]
+    shark_module, extended_model_name, generate_vmfb, extra_args=[], debug=False,
 ):
    if generate_vmfb:
        vmfb_path = os.path.join(os.getcwd(), extended_model_name + ".vmfb")
@@ -190,7 +190,7 @@ def compile_module(
                "No vmfb found. Compiling and saving to {}".format(vmfb_path)
            )
            path = shark_module.save_module(
-                os.getcwd(), extended_model_name, extra_args
+                os.getcwd(), extended_model_name, extra_args, debug=debug
            )
            shark_module.load_module(path, extra_args=extra_args)
    else:
@@ -199,7 +199,7 @@ def compile_module(


 def compile_int_precision(
-    model, inputs, precision, device, generate_vmfb, extended_model_name
+    model, inputs, precision, device, generate_vmfb, extended_model_name, debug=False
 ):
    torchscript_module = import_with_fx(
        model,
@@ -219,7 +219,7 @@ def compile_int_precision(
    print(f"[DEBUG] converting torch to linalg")
    run_pipeline_with_repro_report(
        mlir_module,
-        "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+        "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
        description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
    )
    from contextlib import redirect_stdout
@@ -251,6 +251,7 @@ def compile_int_precision(
            extended_model_name=extended_model_name,
            generate_vmfb=generate_vmfb,
            extra_args=extra_args,
+            debug=debug,
        ),
        bytecode,
    )
@@ -294,6 +295,7 @@ def shark_compile_through_fx_int(
        device,
        generate_or_load_vmfb,
        extended_model_name,
+        debug,
    )
    extra_args = [
        "--iree-hal-dump-executable-sources-to=ies",
--- a/apps/language_models/src/pipelines/stablelm_pipeline.py
+++ b/apps/language_models/src/pipelines/stablelm_pipeline.py
@@ -32,11 +32,13 @@ class SharkStableLM(SharkLLMBase):
        max_num_tokens=512,
        device="cuda",
        precision="fp32",
+        debug=False,
    ) -> None:
        super().__init__(model_name, hf_model_path, max_num_tokens)
        self.max_sequence_len = 256
        self.device = device
        self.precision = precision
+        self.debug = debug
        self.tokenizer = self.get_tokenizer()
        self.shark_model = self.compile()

@@ -111,7 +113,7 @@ class SharkStableLM(SharkLLMBase):
        shark_module.compile()

        path = shark_module.save_module(
-            vmfb_path.parent.absolute(), vmfb_path.stem
+            vmfb_path.parent.absolute(), vmfb_path.stem, debug=self.debug
        )
        print("Saved vmfb at ", str(path))

--- a/apps/stable_diffusion/shark_studio_imports.py
+++ b/apps/stable_diffusion/shark_studio_imports.py
@@ -74,8 +74,11 @@ datas += [
 # hidden imports for pyinstaller
 hiddenimports = ["shark", "shark.shark_inference", "apps"]
 hiddenimports += [x for x in collect_submodules("skimage") if "tests" not in x]
+blacklist = ["tests", "convert"]
 hiddenimports += [
-    x for x in collect_submodules("transformers") if "tests" not in x
+    x
+    for x in collect_submodules("transformers")
+    if not any(kw in x for kw in blacklist)
 ]
 hiddenimports += [x for x in collect_submodules("iree") if "tests" not in x]
 hiddenimports += ["iree._runtime", "iree._runtime_libs"]
--- a/apps/stable_diffusion/src/utils/stable_args.py
+++ b/apps/stable_diffusion/src/utils/stable_args.py
@@ -570,6 +570,14 @@ p.add_argument(
    "in shark importer. Does nothing if import_mlir is false (the default).",
 )

+p.add_argument(
+    "--compile_debug",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Flag to toggle debug assert/verify flags for imported IR in the "
+    "iree-compiler. Default to false.",
+)
+
 p.add_argument(
    "--iree_constant_folding",
    default=True,
--- a/apps/stable_diffusion/src/utils/utils.py
+++ b/apps/stable_diffusion/src/utils/utils.py
@@ -25,7 +25,7 @@ from shark.iree_utils.vulkan_utils import (
    get_iree_vulkan_runtime_flags,
 )
 from shark.iree_utils.metal_utils import get_metal_target_triple
-from shark.iree_utils.gpu_utils import get_cuda_sm_cc
+from shark.iree_utils.gpu_utils import get_cuda_sm_cc, get_iree_rocm_args
 from apps.stable_diffusion.src.utils.stable_args import args
 from apps.stable_diffusion.src.utils.resources import opt_flags
 from apps.stable_diffusion.src.utils.sd_annotation import sd_model_annotation
@@ -78,7 +78,7 @@ def _compile_module(shark_module, model_name, extra_args=[]):
                    )
                )
            path = shark_module.save_module(
-                os.getcwd(), model_name, extra_args
+                os.getcwd(), model_name, extra_args, debug=args.compile_debug
            )
            shark_module.load_module(path, extra_args=extra_args)
    else:
@@ -476,6 +476,8 @@ def get_available_devices():
    available_devices.extend(metal_devices)
    cuda_devices = get_devices_by_name("cuda")
    available_devices.extend(cuda_devices)
+    rocm_devices = get_devices_by_name("rocm")
+    available_devices.extend(rocm_devices)
    cpu_device = get_devices_by_name("cpu-sync")
    available_devices.extend(cpu_device)
    cpu_device = get_devices_by_name("cpu-task")
@@ -499,7 +501,10 @@ def get_opt_flags(model, precision="fp16"):
        iree_flags.append(
            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
        )
-
+    if "rocm" in args.device:
+        rocm_args = get_iree_rocm_args()
+        iree_flags.extend(rocm_args)
+        print(iree_flags)
    if args.iree_constant_folding == False:
        iree_flags.append("--iree-opt-const-expr-hoisting=False")
        iree_flags.append(
--- a/apps/stable_diffusion/web/ui/stablelm_ui.py
+++ b/apps/stable_diffusion/web/ui/stablelm_ui.py
@@ -194,6 +194,18 @@ def chat(
        from apps.language_models.scripts.vicuna import UnshardedVicuna
        from apps.stable_diffusion.src import args

+        if vicuna_model == 0:
+            if "cuda" in device:
+                device = "cuda"
+            elif "sync" in device:
+                device = "cpu-sync"
+            elif "task" in device:
+                device = "cpu-task"
+            elif "vulkan" in device:
+                device = "vulkan"
+            elif "rocm" in device:
+                device = "rocm"
+
        if new_model_vmfb_key != model_vmfb_key:
            model_vmfb_key = new_model_vmfb_key
            max_toks = 128 if model_name == "codegen" else 512
--- a/process_skipfiles.py
+++ b/process_skipfiles.py
@@ -7,22 +7,13 @@ import fileinput
 from pathlib import Path

 # Temporary workaround for transformers/__init__.py.
-path_to_stdhooks = Path(
-    get_python_lib() + "/_pyinstaller_hooks_contrib/hooks/stdhooks"
-)
 path_to_transformers_hook = Path(
-    str(path_to_stdhooks) + "hook-transformers.py"
+    get_python_lib()
+    + "/_pyinstaller_hooks_contrib/hooks/stdhooks/hook-transformers.py"
 )
 if path_to_transformers_hook.is_file():
    pass
 else:
-    if not path_to_stdhooks.is_dir():
-        import os
-
-        print(
-            f"Path to pyinstaller stdhooks not found. Please check your pyinstaller packages at {path_to_stdhooks}."
-        )
-        os.mkdir(path_to_stdhooks)
    with open(path_to_transformers_hook, "w") as f:
        f.write("module_collection_mode = 'pyz+py'")

--- a/shark/iree_utils/_common.py
+++ b/shark/iree_utils/_common.py
@@ -52,6 +52,8 @@ def iree_device_map(device):
    )
    if len(uri_parts) == 1:
        return iree_driver
+    elif "rocm" in uri_parts:
+        return "rocm"
    else:
        return f"{iree_driver}://{uri_parts[1]}"

@@ -63,7 +65,6 @@ def get_supported_device_list():
 _IREE_DEVICE_MAP = {
    "cpu": "local-task",
    "cpu-task": "local-task",
-    "AMD-AIE": "local-task",
    "cpu-sync": "local-sync",
    "cuda": "cuda",
    "vulkan": "vulkan",
@@ -82,7 +83,6 @@ def iree_target_map(device):
 _IREE_TARGET_MAP = {
    "cpu": "llvm-cpu",
    "cpu-task": "llvm-cpu",
-    "AMD-AIE": "llvm-cpu",
    "cpu-sync": "llvm-cpu",
    "cuda": "cuda",
    "vulkan": "vulkan",
@@ -121,7 +121,10 @@ def check_device_drivers(device):
        return False
    elif device == "rocm":
        try:
-            subprocess.check_output("rocminfo")
+            if sys.platform == "win32":
+                subprocess.check_output("hipinfo")
+            else:
+                subprocess.check_output("rocminfo")
        except Exception:
            return True

--- a/shark/iree_utils/benchmark_utils.py
+++ b/shark/iree_utils/benchmark_utils.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import iree._runtime.scripts.iree_benchmark_module as benchmark_module
 from shark.iree_utils._common import run_cmd, iree_device_map
 from shark.iree_utils.cpu_utils import get_cpu_count
 import numpy as np
@@ -102,15 +101,13 @@ def build_benchmark_args_non_tensor_input(
    and whether it is training or not.
    Outputs: string that execute benchmark-module on target model.
    """
-    path = benchmark_module.__path__[0]
+    path = os.path.join(os.environ["VIRTUAL_ENV"], "bin")
    if platform.system() == "Windows":
-        benchmarker_path = os.path.join(
-            path, "..", "..", "iree-benchmark-module.exe"
-        )
+        benchmarker_path = os.path.join(path, "iree-benchmark-module.exe")
+        time_extractor = None
    else:
-        benchmarker_path = os.path.join(
-            path, "..", "..", "iree-benchmark-module"
-        )
+        benchmarker_path = os.path.join(path, "iree-benchmark-module")
+        time_extractor = "| awk 'END{{print $2 $3}}'"
    benchmark_cl = [benchmarker_path, f"--module={input_file}"]
    # TODO: The function named can be passed as one of the args.
    if function_name:
@@ -135,7 +132,7 @@ def run_benchmark_module(benchmark_cl):
    benchmark_path = benchmark_cl[0]
    assert os.path.exists(
        benchmark_path
-    ), "Cannot find benchmark_module, Please contact SHARK maintainer on discord."
+    ), "Cannot find iree_benchmark_module, Please contact SHARK maintainer on discord."
    bench_stdout, bench_stderr = run_cmd(" ".join(benchmark_cl))
    try:
        regex_split = re.compile("(\d+[.]*\d*)(  *)([a-zA-Z]+)")
--- a/shark/iree_utils/compile_utils.py
+++ b/shark/iree_utils/compile_utils.py
@@ -92,13 +92,27 @@ def get_iree_frontend_args(frontend):


 # Common args to be used given any frontend or device.
-def get_iree_common_args():
-    return [
+def get_iree_common_args(debug=False):  # debug picks the assertion/verify flag pair below
+    common_args = [
         "--iree-stream-resource-max-allocation-size=4294967295",
         "--iree-vm-bytecode-module-strip-source-map=true",
         "--iree-util-zero-fill-elided-attrs",
-        "--iree-opt-strip-assertions=true",
     ]
+    if debug == True:  # equality, not truthiness: a string like "False" takes the else branch
+        common_args.extend(
+            [
+                "--iree-opt-strip-assertions=false",
+                "--verify=true",
+            ]
+        )
+    else:
+        common_args.extend(
+            [
+                "--iree-opt-strip-assertions=true",
+                "--verify=false",
+            ]
+        )
+    return common_args


 # Args that are suitable only for certain models or groups of models.
@@ -277,12 +291,13 @@ def compile_module_to_flatbuffer(
    model_config_path,
    extra_args,
    model_name="None",
+    debug=False,
 ):
    # Setup Compile arguments wrt to frontends.
    input_type = ""
    args = get_iree_frontend_args(frontend)
    args += get_iree_device_args(device, extra_args)
-    args += get_iree_common_args()
+    args += get_iree_common_args(debug=debug)
    args += get_model_specific_args()
    args += extra_args

@@ -342,7 +357,8 @@ def load_vmfb_using_mmap(
    flatbuffer_blob_or_path, device: str, device_idx: int = None
 ):
    print(f"Loading module {flatbuffer_blob_or_path}...")
-
+    if "rocm" in device:
+        device = "rocm"
    with DetailLogger(timeout=2.5) as dl:
        # First get configs.
        if device_idx is not None:
@@ -409,10 +425,11 @@ def get_iree_compiled_module(
    extra_args: list = [],
    device_idx: int = None,
    mmap: bool = False,
+    debug: bool = False,
 ):
    """Given a module returns the compiled .vmfb and configs"""
    flatbuffer_blob = compile_module_to_flatbuffer(
-        module, device, frontend, model_config_path, extra_args
+        module, device, frontend, model_config_path, extra_args, debug
    )
    temp_file_to_unlink = None
    # TODO: Currently mmap=True control flow path has been switched off for mmap.
@@ -468,10 +485,11 @@ def export_iree_module_to_vmfb(
    model_config_path: str = None,
    module_name: str = None,
    extra_args: list = [],
+    debug: bool = False,
 ):
    # Compiles the module given specs and saves it as .vmfb file.
    flatbuffer_blob = compile_module_to_flatbuffer(
-        module, device, mlir_dialect, model_config_path, extra_args
+        module, device, mlir_dialect, model_config_path, extra_args, debug
    )
    if module_name is None:
        device_name = (
--- a/shark/iree_utils/gpu_utils.py
+++ b/shark/iree_utils/gpu_utils.py
@@ -17,6 +17,7 @@
 import functools
 import iree.runtime as ireert
 import ctypes
+import sys
 from shark.parser import shark_args


@@ -42,21 +43,51 @@ def get_iree_gpu_args():
@functools.cache
 def get_iree_rocm_args():
     ireert.flags.FUNCTION_INPUT_VALIDATION = False
-    # get arch from rocminfo.
+    # get arch from hipinfo (Windows) or rocminfo (other platforms).
+    import os
     import re
     import subprocess

-    rocm_arch = re.match(
-        r".*(gfx\w+)",
-        subprocess.check_output(
-            "rocminfo | grep -i 'gfx'", shell=True, text=True
-        ),
-    ).group(1)
-    print(f"Found rocm arch {rocm_arch}...")
+    if sys.platform == "win32":
+        if "HIP_PATH" in os.environ:
+            rocm_path = os.environ["HIP_PATH"]
+            print(f"Found a ROCm installation at {rocm_path}.")
+        else:
+            print("Failed to find HIP_PATH. Defaulting to C:\\AMD\\ROCM\\5.5")
+            rocm_path = "C:\\AMD\\ROCM\\5.5"
+    else:
+        if "ROCM_PATH" in os.environ:
+            rocm_path = os.environ["ROCM_PATH"]
+            print(f"Found a ROCm installation at {rocm_path}.")
+        else:
+            print("Failed to find ROCM_PATH. Defaulting to /opt/rocm")
+            rocm_path = "/opt/rocm/"
+
+    try:
+        if sys.platform == "win32":
+            rocm_arch = re.search(
+                r"gfx\d{3,}",
+                subprocess.check_output("hipinfo", shell=True, text=True),
+            ).group(0)
+        else:
+            rocm_arch = re.match(
+                r".*(gfx\w+)",
+                subprocess.check_output(
+                    "rocminfo | grep -i 'gfx'", shell=True, text=True
+                ),
+            ).group(1)
+        print(f"Found rocm arch {rocm_arch}...")
+    except Exception:  # best-effort probe; let KeyboardInterrupt/SystemExit propagate
+        print(
+            "Failed to find ROCm architecture from hipinfo / rocminfo. Defaulting to gfx1100."
+        )
+        rocm_arch = "gfx1100"
+
+    bc_path = os.path.join(rocm_path, "amdgcn", "bitcode")
     return [
         f"--iree-rocm-target-chip={rocm_arch}",
         "--iree-rocm-link-bc=true",
-        "--iree-rocm-bc-dir=/opt/rocm/amdgcn/bitcode",
+        f"--iree-rocm-bc-dir={bc_path}",
     ]


--- a/shark/shark_compile.py
+++ b/shark/shark_compile.py
@@ -115,7 +115,7 @@ def compile_int_precision(
    print(f"[DEBUG] converting torch to linalg")
    run_pipeline_with_repro_report(
        mlir_module,
-        "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+        "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
        description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
    )
    from contextlib import redirect_stdout
--- a/shark/shark_importer.py
+++ b/shark/shark_importer.py
@@ -509,22 +509,6 @@ def import_with_fx(
    from torch.fx.experimental.proxy_tensor import make_fx
    from torch._decomp import get_decompositions
    from typing import List
-    from brevitas_examples.llm.llm_quant.export import (
-        block_quant_layer_level_manager,
-    )
-    from brevitas_examples.llm.llm_quant.export import (
-        brevitas_layer_export_mode,
-    )
-    from brevitas_examples.llm.llm_quant.sharded_mlir_group_export import (
-        LinearWeightBlockQuantHandlerFwd,
-    )
-    from brevitas_examples.llm.llm_quant.export import replace_call_fn_target
-    from brevitas_examples.llm.llm_quant.sharded_mlir_group_export import (
-        matmul_rhs_group_quant_placeholder,
-    )
-    from brevitas.backport.fx.experimental.proxy_tensor import (
-        make_fx as brevitas_make_fx,
-    )

    golden_values = None
    if debug:
@@ -598,6 +582,25 @@ def import_with_fx(
        torch.ops.aten.masked_fill.Scalar,
    ]
    if precision in ["int4", "int8"]:
+        from brevitas_examples.llm.llm_quant.export import (
+            block_quant_layer_level_manager,
+        )
+        from brevitas_examples.llm.llm_quant.export import (
+            brevitas_layer_export_mode,
+        )
+        from brevitas_examples.llm.llm_quant.sharded_mlir_group_export import (
+            LinearWeightBlockQuantHandlerFwd,
+        )
+        from brevitas_examples.llm.llm_quant.export import (
+            replace_call_fn_target,
+        )
+        from brevitas_examples.llm.llm_quant.sharded_mlir_group_export import (
+            matmul_rhs_group_quant_placeholder,
+        )
+        from brevitas.backport.fx.experimental.proxy_tensor import (
+            make_fx as brevitas_make_fx,
+        )
+
        export_context_manager = brevitas_layer_export_mode
        export_class = block_quant_layer_level_manager(
            export_handlers=[LinearWeightBlockQuantHandlerFwd]
--- a/shark/shark_inference.py
+++ b/shark/shark_inference.py
@@ -192,7 +192,9 @@ class SharkInference:

    # TODO: Instead of passing directory and having names decided by the module
    # , user may want to save the module with manual names.
-    def save_module(self, dir=os.getcwd(), module_name=None, extra_args=[]):
+    def save_module(
+        self, dir=os.getcwd(), module_name=None, extra_args=[], debug=False
+    ):
        return export_iree_module_to_vmfb(
            self.mlir_module,
            self.device,
@@ -200,6 +202,7 @@ class SharkInference:
            self.mlir_dialect,
            module_name=module_name,
            extra_args=extra_args,
+            debug=debug,
        )

    # load and return the module.
--- a/tank/examples/opt/opt_causallm.py
+++ b/tank/examples/opt/opt_causallm.py
@@ -59,7 +59,7 @@ def create_module(model_name, tokenizer, device):
    )

    vmfb_name = f"{OPT_FS_NAME}_causallm_{MAX_SEQUENCE_LENGTH}_torch_{device}"
-    shark_module.save_module(module_name=vmfb_name)
+    shark_module.save_module(module_name=vmfb_name, debug=False)
    vmfb_path = vmfb_name + ".vmfb"
    return vmfb_path
Author	SHA1	Message	Date
jinchen62	3601dc7c3b	Fix llama2 13b combined ir (#1803 )	2023-08-28 11:34:44 -07:00
Daniel Garvey	671881cf87	Llama2 70b (#1783 ) * llama2 70b IR gen * fix IR sec llama2 + debug * llama270b --------- Co-authored-by: PhaneeshB <b.phaneesh@gmail.com>	2023-08-25 23:04:28 -07:00
Gaurav Shukla	4e9be6be59	[chatbot] Add debug as class attribute (#1799 ) Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-08-25 21:46:29 -07:00
Ean Garvey	9c8cbaf498	Add support for ROCM (Windows) in Studio + compile utils (#1770 ) * WIP: MSVC ROCM support for SHARK Studio * Make get_iree_rocm_args platform-agnostic. * Update stable_args.py * Update rocm arg handling in SD utils * Guard quantization imports. Co-authored-by: jam https://github.com/jammm	2023-08-25 20:56:05 -07:00
Ean Garvey	9e348a114e	Revert changes process_skipfiles.py (#1798 ) Keeps a small typo fix but reverts the rest of changes to this file from `450c231171`	2023-08-25 15:31:49 -07:00
jinchen62	51f90a4d56	Update conversion passes for brevitas quant op (#1795 )	2023-08-25 17:28:07 -05:00
Abhishek Varma	310d5d0a49	Fix llama2 13b crashing + add spec file for CLI execution of Llama (#1797 ) * [Llama2] Add a fix for Llama2 13B downloading/crashing -- This commit fixes downloading/crashing of llama2 13B on wrong .mlir file. -- Also adds support for downloading vmfb from shark_tank in CLI. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com> * [llama2] Add a spec file to run Llama/Vicuna CLI exe -- This commit adds a spec file to run Llama/Vicuna CLI exe. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com> --------- Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-08-25 09:36:09 -05:00
Ean Garvey	9697981004	Pipe through a debug option to iree compile utils. (#1796 ) * Update compile_utils.py * Pipe through a flag to toggle debug options in compile utils. * Update SharkLLMBase.py	2023-08-25 07:11:11 -07:00