Compare commits


17 Commits

Author SHA1 Message Date
Ean Garvey
ca609afb6a Update README.md (#1830) 2023-09-14 10:33:57 -05:00
Gaurav Shukla
11bdce9790 [flags] Fix vulkan runtime flags as vma is dropped from iree (#1831) 2023-09-14 08:58:59 -05:00
Ean Garvey
684943a4a6 (SD) Fix tokenizers imports in pyinstaller builds. (#1828)
* Fix tokenizers metadata.

* (SD) Disable VAE lowering configs (rdna3) and add versioned tunings.

* Update sd_annotation.py

* (SD) Add cv2 to spec.

* Update stencil pipeline with the new img2img arg.
2023-09-12 12:23:48 -05:00
PhaneeshB
b817bb8455 add roles for llama2 2023-09-12 10:59:28 +05:30
Ean Garvey
780f520f02 Fix vk.target_env extensions and remove redundant SD imports. (#1826)
* Remove redundant IREE runtime imports.

* Fix vulkan target env extensions.
2023-09-11 13:42:52 -05:00
Dom
c61b6f8d65 Code refactoring (#1817)
* use join

* fix bug

* further code optimizations

---------

Co-authored-by: Daniel Garvey <34486624+dan-garvey@users.noreply.github.com>
2023-09-11 11:30:56 -05:00
Abhishek Varma
c854208d49 [Llama2] Prefetch llama2 tokenizer configs (#1824)
-- This commit prefetches llama2 tokenizer configs from shark_tank.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-09-08 11:29:54 -07:00
Gaurav Shukla
c5dcfc1f13 [vicuna] Exit when mlir is not present in shark tank (#1825)
Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-09-08 10:30:29 -07:00
Abhishek Varma
bde63ee8ae Add logging feature in WebUI (#1821) 2023-09-08 05:48:05 -07:00
Vivek Khandelwal
9681d494eb Update decomp list and shark trainer for DLRM 2023-09-06 21:24:50 +05:30
Gaurav Shukla
ede6bf83e2 [vicuna] Disabling the IR generation path
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-09-06 20:13:17 +05:30
Ean Garvey
2c2693fb7d Fix torchvision versioning in Linux importer setup. (#1809) 2023-09-05 12:57:03 -05:00
Vivek Khandelwal
1d31b2b2c6 Fix StableHLO Compilation flag 2023-09-05 21:32:33 +05:30
Gaurav Shukla
d2f64eefa3 [chatbot] Remove few outdated models from list (#1814) 2023-09-04 09:26:32 -07:00
Abhishek Varma
87ae14b6ff [SD] Add sdpfa decomposition + update IREE flag
-- This commit adds Scaled Dot Product Flash Attention's decomposition
   in shark_importer.
-- It also updates `iree-flow-enable-data-tiling` to `iree-opt-data-tiling`.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-09-04 18:03:53 +05:30
Phaneesh Barwaria
1ccafa1fc1 fix llama2-70b rewrite tensor dim 2023-09-01 17:27:06 +05:30
jinchen62
4c3d8a0a7f Enable downloading vmfb/mlir for webui (#1807) 2023-08-31 11:05:47 -07:00
18 changed files with 417 additions and 504 deletions

.gitignore vendored (+3)
View File

@@ -196,3 +196,6 @@ db_dir_UserData
# Embeded browser cache and other
apps/stable_diffusion/web/EBWebView/
# Llama2 tokenizer configs
llama2_tokenizer_configs/

View File

@@ -10,7 +10,7 @@ High Performance Machine Learning Distribution
<summary>Prerequisites - Drivers </summary>
#### Install your Windows hardware drivers
-* [AMD RDNA Users] Download the latest driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-23-2-1).
+* [AMD RDNA Users] Download the latest driver (23.2.1 is the oldest supported) [here](https://www.amd.com/en/support).
* [macOS Users] Download and install the 1.3.216 Vulkan SDK from [here](https://sdk.lunarg.com/sdk/download/1.3.216.0/mac/vulkansdk-macos-1.3.216.0.dmg). Newer versions of the SDK will not work.
* [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)

View File

@@ -7,6 +7,7 @@ from pathlib import Path
from tqdm import tqdm
from typing import List, Tuple
import subprocess
import sys
import torch
import torch_mlir
@@ -413,8 +414,7 @@ class VicunaBase(SharkLLMBase):
_past_key_values = torch.tensor(output[1:])
_token = torch.argmax(_logits[:, -1, :], dim=1)
-skip_sp_tok = True if self.model_name == "codegen" else False
-_detok = self.tokenizer.decode(_token, skip_special_tokens=skip_sp_tok)
+_detok = self.tokenizer.decode(_token, skip_special_tokens=False)
ret_dict = {
"token": _token,
"detok": _detok,
@@ -465,17 +465,11 @@ class ShardedVicuna(VicunaBase):
kwargs = {
"use_auth_token": "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"
}
-if self.model_name == "codegen":
-tokenizer = AutoTokenizer.from_pretrained(
-self.hf_model_path,
-trust_remote_code=True,
-)
-else:
-tokenizer = AutoTokenizer.from_pretrained(
-self.hf_model_path,
-use_fast=False,
-**kwargs,
-)
+tokenizer = AutoTokenizer.from_pretrained(
+self.hf_model_path,
+use_fast=False,
+**kwargs,
+)
return tokenizer
def get_src_model(self):
@@ -1244,10 +1238,6 @@ class UnshardedVicuna(VicunaBase):
max_num_tokens,
extra_args_cmd=extra_args_cmd,
)
if "llama2" in self.model_name and hf_auth_token == None:
raise ValueError(
"HF auth token required. Pass it using --hf_auth_token flag."
)
self.hf_auth_token = hf_auth_token
if self.model_name == "llama2_7b":
self.hf_model_path = "meta-llama/Llama-2-7b-chat-hf"
@@ -1272,7 +1262,7 @@ class UnshardedVicuna(VicunaBase):
self.vicuna_vmfb_path = self.get_model_path(suffix="vmfb")
self.tokenizer = self.get_tokenizer()
self.cache_vicunas = cache_vicunas
-self.compile(download_vmfb)
+self.compile()
def get_model_path(self, suffix="mlir"):
safe_device = self.device.split("-")[0]
@@ -1283,18 +1273,21 @@ class UnshardedVicuna(VicunaBase):
)
def get_tokenizer(self):
kwargs = {"use_auth_token": self.hf_auth_token}
-if self.model_name == "codegen":
-tokenizer = AutoTokenizer.from_pretrained(
-self.hf_model_path,
-trust_remote_code=True,
-)
-else:
-tokenizer = AutoTokenizer.from_pretrained(
-self.hf_model_path,
-use_fast=False,
-**kwargs,
+local_tokenizer_path = Path(Path.cwd(), "llama2_tokenizer_configs")
+local_tokenizer_path.mkdir(parents=True, exist_ok=True)
+tokenizer_files_to_download = [
+"config.json",
+"special_tokens_map.json",
+"tokenizer.model",
+"tokenizer_config.json",
+]
+for tokenizer_file in tokenizer_files_to_download:
+download_public_file(
+f"gs://shark_tank/llama2_tokenizer/{tokenizer_file}",
+Path(local_tokenizer_path, tokenizer_file),
+single_file=True,
+)
+tokenizer = AutoTokenizer.from_pretrained(str(local_tokenizer_path))
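(Note: AutoTokenizer.from_pretrained accepts a local directory, so the prefetched configs remove the need for an HF auth token when loading the llama2 tokenizer.)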
return tokenizer
def get_src_model(self):
@@ -1369,7 +1362,7 @@ class UnshardedVicuna(VicunaBase):
if "llama2_13b" in self.model_name:
pkv_tensor_shape = "tensor<1x40x?x128x"
elif "llama2_70b" in self.model_name:
pkv_tensor_shape = "tensor<1x60x?x128x"
pkv_tensor_shape = "tensor<1x8x?x128x"
else:
pkv_tensor_shape = "tensor<1x32x?x128x"
if self.precision in ["fp16", "int4", "int8"]:
@@ -1404,13 +1397,13 @@ class UnshardedVicuna(VicunaBase):
return "\n".join(new_lines)
-def compile(self, download_vmfb=False):
+def compile(self):
# Testing : DO NOT Download Vmfbs if not found. Modify later
# download vmfbs for A100
-print(
-f"Looking into gs://shark_tank/{self.model_name}/unsharded/vmfb/{self.vicuna_vmfb_path.name}"
-)
-if not self.vicuna_vmfb_path.exists() and download_vmfb:
+if not self.vicuna_vmfb_path.exists() and self.download_vmfb:
+print(
+f"Looking into gs://shark_tank/{self.model_name}/unsharded/vmfb/{self.vicuna_vmfb_path.name}"
+)
download_public_file(
f"gs://shark_tank/{self.model_name}/unsharded/vmfb/{self.vicuna_vmfb_path.name}",
self.vicuna_vmfb_path.absolute(),
@@ -1423,245 +1416,240 @@ class UnshardedVicuna(VicunaBase):
print(f"[DEBUG] vmfb found at {self.vicuna_vmfb_path.absolute()}")
return
print(f"[DEBUG] vmfb not found at {self.vicuna_vmfb_path.absolute()}")
if self.vicuna_mlir_path.exists():
print(f"[DEBUG] mlir found at {self.vicuna_mlir_path.absolute()}")
with open(self.vicuna_mlir_path, "rb") as f:
combined_module = f.read()
else:
print(
f"[DEBUG] mlir not found at {self.vicuna_mlir_path.absolute()}"
)
mlir_generated = False
if self.load_mlir_from_shark_tank:
# download MLIR from shark tank
for suffix in ["mlirbc", "mlir"]:
self.vicuna_mlir_path = self.get_model_path(suffix)
download_public_file(
f"gs://shark_tank/{self.model_name}/unsharded/mlir/{self.vicuna_mlir_path.name}",
self.vicuna_mlir_path.absolute(),
single_file=True,
)
if self.vicuna_mlir_path.exists():
with open(self.vicuna_mlir_path, "rb") as f:
combined_module = f.read()
mlir_generated = True
break
self.vicuna_mlir_path = self.get_model_path("mlir")
if not mlir_generated:
print(
f"[DEBUG] failed to download {self.vicuna_mlir_path.name} from shark tank"
)
print(f"[DEBUG] vmfb not found")
mlir_generated = False
for suffix in ["mlirbc", "mlir"]:
self.vicuna_mlir_path = self.get_model_path(suffix)
if not self.vicuna_mlir_path.exists() and self.load_mlir_from_shark_tank:
print(
f"Looking into gs://shark_tank/{self.model_name}/unsharded/mlir/{self.vicuna_mlir_path.name}"
)
download_public_file(
f"gs://shark_tank/{self.model_name}/unsharded/mlir/{self.vicuna_mlir_path.name}",
self.vicuna_mlir_path.absolute(),
single_file=True,
)
if self.vicuna_mlir_path.exists():
print(f"[DEBUG] mlir found at {self.vicuna_mlir_path.absolute()}")
with open(self.vicuna_mlir_path, "rb") as f:
combined_module = f.read()
mlir_generated = True
break
if not mlir_generated:
print("[DEBUG] generating mlir on device")
# Select a compilation prompt such that the resulting input_ids
# from the model's tokenizer has shape [1, 19]
if self.model_name == "codegen":
compilation_prompt = "def hello_world():\n print('Hello World')\n print('Hello World')"
else:
compilation_prompt = "".join(["0" for _ in range(17)])
if not mlir_generated:
print(f"[DEBUG] mlir not found")
# Disabling this path of IR generation for now as it is broken.
print("Please check if the mlir file is present at the shark tank. Exiting.")
self.shark_model = None
sys.exit()
return
first_model_path = f"first_{self.model_name}_{self.precision}.mlir"
if Path(first_model_path).exists():
print(f"loading {first_model_path}")
with open(Path(first_model_path), "r") as f:
first_module = f.read()
print("[DEBUG] generating mlir on device")
# Select a compilation prompt such that the resulting input_ids
# from the model's tokenizer has shape [1, 19]
compilation_prompt = "".join(["0" for _ in range(17)])
first_model_path = f"first_{self.model_name}_{self.precision}.mlir"
if Path(first_model_path).exists():
print(f"loading {first_model_path}")
with open(Path(first_model_path), "r") as f:
first_module = f.read()
else:
# generate first vicuna
compilation_input_ids = self.tokenizer(
compilation_prompt,
return_tensors="pt",
).input_ids
compilation_input_ids = torch.tensor(
compilation_input_ids
).reshape([1, 19])
firstVicunaCompileInput = (compilation_input_ids,)
model = FirstVicuna(
self.hf_model_path,
self.precision,
self.weight_group_size,
self.model_name,
self.hf_auth_token,
)
print(f"[DEBUG] generating torchscript graph")
is_f16 = self.precision in ["fp16", "int4"]
ts_graph = import_with_fx(
model,
firstVicunaCompileInput,
is_f16=is_f16,
precision=self.precision,
f16_input_mask=[False, False],
mlir_type="torchscript",
)
del model
firstVicunaCompileInput = list(firstVicunaCompileInput)
firstVicunaCompileInput[
0
] = torch_mlir.TensorPlaceholder.like(
firstVicunaCompileInput[0], dynamic_axes=[1]
)
firstVicunaCompileInput = tuple(firstVicunaCompileInput)
first_module = None
print(f"[DEBUG] generating torch mlir")
if self.precision in ["int4", "int8"]:
first_module = torch_mlir.compile(
ts_graph,
[*firstVicunaCompileInput],
output_type=torch_mlir.OutputType.TORCH,
backend_legal_ops=["quant.matmul_rhs_group_quant"],
extra_library=brevitas_matmul_rhs_group_quant_library,
use_tracing=False,
verbose=False,
)
print(f"[DEBUG] converting torch to linalg")
run_pipeline_with_repro_report(
first_module,
"builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
)
else:
# generate first vicuna
compilation_input_ids = self.tokenizer(
compilation_prompt,
return_tensors="pt",
).input_ids
compilation_input_ids = torch.tensor(
compilation_input_ids
).reshape([1, 19])
firstVicunaCompileInput = (compilation_input_ids,)
model = FirstVicuna(
first_module = torch_mlir.compile(
ts_graph,
[*firstVicunaCompileInput],
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
del ts_graph
del firstVicunaCompileInput
gc.collect()
print(
"[DEBUG] successfully generated first vicuna linalg mlir"
)
first_module = self.write_in_dynamic_inputs0(
str(first_module), dynamic_input_size=19
)
if self.cache_vicunas:
with open(first_model_path, "w+") as f:
f.write(first_module)
print("Finished writing IR after dynamic")
print(f"[DEBUG] Starting generation of second llama")
second_model_path = f"second_{self.model_name}_{self.precision}.mlir"
if Path(second_model_path).exists():
print(f"loading {second_model_path}")
with open(Path(second_model_path), "r") as f:
second_module = f.read()
else:
# generate second vicuna
compilation_input_ids = torch.zeros(
[1, 1], dtype=torch.int64
)
if self.model_name == "llama2_13b":
dim1 = 40
total_tuple = 80
elif self.model_name == "llama2_70b":
dim1 = 8
total_tuple = 160
else:
dim1 = 32
total_tuple = 64
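(Note: dim1 is the per-layer KV-head count of each llama2 variant, and total_tuple is 2 x num_layers, since every layer contributes one K and one V cache tensor.)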
pkv = tuple(
(torch.zeros([1, dim1, 19, 128], dtype=torch.float32))
for _ in range(total_tuple)
)
secondVicunaCompileInput = (compilation_input_ids,) + pkv
if self.model_name == "llama2_13b":
model = SecondVicuna13B(
self.hf_model_path,
self.precision,
self.weight_group_size,
self.model_name,
self.hf_auth_token,
)
print(f"[DEBUG] generating torchscript graph")
is_f16 = self.precision in ["fp16", "int4"]
ts_graph = import_with_fx(
model,
firstVicunaCompileInput,
is_f16=is_f16,
precision=self.precision,
f16_input_mask=[False, False],
mlir_type="torchscript",
elif self.model_name == "llama2_70b":
model = SecondVicuna70B(
self.hf_model_path,
self.precision,
self.weight_group_size,
self.model_name,
self.hf_auth_token,
)
del model
firstVicunaCompileInput = list(firstVicunaCompileInput)
firstVicunaCompileInput[
0
] = torch_mlir.TensorPlaceholder.like(
firstVicunaCompileInput[0], dynamic_axes=[1]
)
firstVicunaCompileInput = tuple(firstVicunaCompileInput)
first_module = None
print(f"[DEBUG] generating torch mlir")
if self.precision in ["int4", "int8"]:
first_module = torch_mlir.compile(
ts_graph,
[*firstVicunaCompileInput],
output_type=torch_mlir.OutputType.TORCH,
backend_legal_ops=["quant.matmul_rhs_group_quant"],
extra_library=brevitas_matmul_rhs_group_quant_library,
use_tracing=False,
verbose=False,
)
print(f"[DEBUG] converting torch to linalg")
run_pipeline_with_repro_report(
first_module,
"builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
)
else:
first_module = torch_mlir.compile(
ts_graph,
[*firstVicunaCompileInput],
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
del ts_graph
del firstVicunaCompileInput
gc.collect()
print(
"[DEBUG] successfully generated first vicuna linalg mlir"
)
first_module = self.write_in_dynamic_inputs0(
str(first_module), dynamic_input_size=19
)
if self.cache_vicunas:
with open(first_model_path, "w+") as f:
f.write(first_module)
print("Finished writing IR after dynamic")
print(f"[DEBUG] Starting generation of second llama")
second_model_path = f"second_{self.model_name}_{self.precision}.mlir"
if Path(second_model_path).exists():
print(f"loading {second_model_path}")
with open(Path(second_model_path), "r") as f:
second_module = f.read()
else:
# generate second vicuna
compilation_input_ids = torch.zeros(
[1, 1], dtype=torch.int64
model = SecondVicuna7B(
self.hf_model_path,
self.precision,
self.weight_group_size,
self.model_name,
self.hf_auth_token,
)
if self.model_name == "llama2_13b":
dim1 = 40
total_tuple = 80
elif self.model_name == "llama2_70b":
dim1 = 8
total_tuple = 160
else:
dim1 = 32
total_tuple = 64
pkv = tuple(
(torch.zeros([1, dim1, 19, 128], dtype=torch.float32))
for _ in range(total_tuple)
)
secondVicunaCompileInput = (compilation_input_ids,) + pkv
if self.model_name == "llama2_13b":
model = SecondVicuna13B(
self.hf_model_path,
self.precision,
self.weight_group_size,
self.model_name,
self.hf_auth_token,
)
elif self.model_name == "llama2_70b":
model = SecondVicuna70B(
self.hf_model_path,
self.precision,
self.weight_group_size,
self.model_name,
self.hf_auth_token,
)
else:
model = SecondVicuna7B(
self.hf_model_path,
self.precision,
self.weight_group_size,
self.model_name,
self.hf_auth_token,
)
print(f"[DEBUG] generating torchscript graph")
is_f16 = self.precision in ["fp16", "int4"]
ts_graph = import_with_fx(
model,
secondVicunaCompileInput,
is_f16=is_f16,
precision=self.precision,
f16_input_mask=[False] + [True] * total_tuple,
mlir_type="torchscript",
)
del model
if self.precision in ["fp16", "int4"]:
secondVicunaCompileInput = get_f16_inputs(
secondVicunaCompileInput,
True,
f16_input_mask=[False] + [True] * total_tuple,
)
secondVicunaCompileInput = list(secondVicunaCompileInput)
for i in range(len(secondVicunaCompileInput)):
if i != 0:
secondVicunaCompileInput[i] = torch_mlir.TensorPlaceholder.like(
secondVicunaCompileInput[i], dynamic_axes=[2]
)
secondVicunaCompileInput = tuple(secondVicunaCompileInput)
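(Note: dynamic_axes=[2] marks the cached-sequence dimension of each past_key_value tensor as dynamic, matching the tensor<1xNx?x128x shapes rewritten above.)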
print(f"[DEBUG] generating torch mlir")
if self.precision in ["int4", "int8"]:
second_module = torch_mlir.compile(
ts_graph,
[*secondVicunaCompileInput],
output_type=torch_mlir.OutputType.TORCH,
backend_legal_ops=["quant.matmul_rhs_group_quant"],
extra_library=brevitas_matmul_rhs_group_quant_library,
use_tracing=False,
verbose=False,
)
print(f"[DEBUG] converting torch to linalg")
run_pipeline_with_repro_report(
second_module,
"builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
)
else:
second_module = torch_mlir.compile(
ts_graph,
[*secondVicunaCompileInput],
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
del ts_graph
del secondVicunaCompileInput
gc.collect()
print(
"[DEBUG] successfully generated second vicuna linalg mlir"
)
second_module = self.write_in_dynamic_inputs1(
str(second_module)
)
if self.cache_vicunas:
with open(second_model_path, "w+") as f:
f.write(second_module)
print("Finished writing IR after dynamic")
combined_module = self.combine_mlir_scripts(
first_module,
second_module,
self.vicuna_mlir_path,
print(f"[DEBUG] generating torchscript graph")
is_f16 = self.precision in ["fp16", "int4"]
ts_graph = import_with_fx(
model,
secondVicunaCompileInput,
is_f16=is_f16,
precision=self.precision,
f16_input_mask=[False] + [True] * total_tuple,
mlir_type="torchscript",
)
del first_module, second_module
del model
if self.precision in ["fp16", "int4"]:
secondVicunaCompileInput = get_f16_inputs(
secondVicunaCompileInput,
True,
f16_input_mask=[False] + [True] * total_tuple,
)
secondVicunaCompileInput = list(secondVicunaCompileInput)
for i in range(len(secondVicunaCompileInput)):
if i != 0:
secondVicunaCompileInput[i] = torch_mlir.TensorPlaceholder.like(
secondVicunaCompileInput[i], dynamic_axes=[2]
)
secondVicunaCompileInput = tuple(secondVicunaCompileInput)
print(f"[DEBUG] generating torch mlir")
if self.precision in ["int4", "int8"]:
second_module = torch_mlir.compile(
ts_graph,
[*secondVicunaCompileInput],
output_type=torch_mlir.OutputType.TORCH,
backend_legal_ops=["quant.matmul_rhs_group_quant"],
extra_library=brevitas_matmul_rhs_group_quant_library,
use_tracing=False,
verbose=False,
)
print(f"[DEBUG] converting torch to linalg")
run_pipeline_with_repro_report(
second_module,
"builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
)
else:
second_module = torch_mlir.compile(
ts_graph,
[*secondVicunaCompileInput],
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
del ts_graph
del secondVicunaCompileInput
gc.collect()
print(
"[DEBUG] successfully generated second vicuna linalg mlir"
)
second_module = self.write_in_dynamic_inputs1(
str(second_module)
)
if self.cache_vicunas:
with open(second_model_path, "w+") as f:
f.write(second_module)
print("Finished writing IR after dynamic")
combined_module = self.combine_mlir_scripts(
first_module,
second_module,
self.vicuna_mlir_path,
)
del first_module, second_module
print(self.device)
if "rocm" in self.device:
@@ -1691,9 +1679,8 @@ class UnshardedVicuna(VicunaBase):
if type(res_tokens[i]) != int:
res_tokens[i] = int(res_tokens[i][0])
-skip_sp_tok = True if self.model_name == "codegen" else False
res_str = self.tokenizer.decode(
-res_tokens, skip_special_tokens=skip_sp_tok
+res_tokens, skip_special_tokens=False
)
return res_str
@@ -1736,7 +1723,7 @@ class UnshardedVicuna(VicunaBase):
pkv = generated_token_op["past_key_values"]
detok = generated_token_op["detok"]
-if token == 2 and self.model_name != "codegen":
+if token == 2:
break
res_tokens.append(token)
if detok == "<0x0A>":
@@ -1784,47 +1771,37 @@ start_message = {
"explain why instead of answering something not correct. If you don't know the "
"answer to a question, please don't share false information."
),
"StableLM": (
"<|SYSTEM|># StableLM Tuned (Alpha version)"
"\n- StableLM is a helpful and harmless open-source AI language model "
"developed by StabilityAI."
"\n- StableLM is excited to be able to help the user, but will refuse "
"to do anything that could be considered harmful to the user."
"\n- StableLM is more than just an information source, StableLM is also "
"able to write poetry, short stories, and make jokes."
"\n- StableLM will refuse to participate in anything that "
"could harm a human."
),
"vicuna": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"vicuna4": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"vicuna1p3": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"codegen": "",
}
def create_prompt(model_name, history):
global start_message
system_message = start_message[model_name]
conversation = "".join(
[
"".join(["<|USER|>" + item[0], "<|ASSISTANT|>" + item[1]])
for item in history
]
)
msg = system_message + conversation
msg = msg.strip()
if "llama2" in model_name:
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
conversation = "".join(
[
f"{B_INST} {item[0].strip()} {E_INST} {item[1].strip()} "
for item in history[1:]
]
)
msg = f"{B_INST} {B_SYS} {system_message} {E_SYS} {history[0][0]} {E_INST} {history[0][1]} {conversation}"
else:
conversation = "".join(
[
"".join(["<|USER|>" + item[0], "<|ASSISTANT|>" + item[1]])
for item in history
]
)
msg = system_message + conversation
msg = msg.strip()
return msg
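
For reference, a minimal standalone sketch of the llama2 prompt layout that create_prompt builds above; the system message and two-turn history here are hypothetical, not code from this repo:

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
system_message = "You are a helpful assistant."
history = [("Hi there", "Hello! How can I help?"), ("Tell me a joke", "")]
# Later turns each get their own [INST] ... [/INST] pair; the system block
# is folded into the first turn's instruction.
conversation = "".join(
    f"{B_INST} {user.strip()} {E_INST} {bot.strip()} " for user, bot in history[1:]
)
msg = (
    f"{B_INST} {B_SYS} {system_message} {E_SYS} "
    f"{history[0][0]} {E_INST} {history[0][1]} {conversation}"
)
print(msg)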

View File

@@ -15,8 +15,8 @@ pathex = [
# datafiles for pyinstaller
datas = []
datas += collect_data_files("torch")
datas += copy_metadata("torch")
datas += copy_metadata("tokenizers")
datas += copy_metadata("tqdm")
datas += copy_metadata("regex")
datas += copy_metadata("requests")
@@ -31,18 +31,17 @@ datas += copy_metadata("Pillow")
datas += copy_metadata("sentencepiece")
datas += copy_metadata("pyyaml")
datas += copy_metadata("huggingface-hub")
datas += collect_data_files("torch")
datas += collect_data_files("tokenizers")
datas += collect_data_files("tiktoken")
datas += collect_data_files("accelerate")
datas += collect_data_files("diffusers")
datas += collect_data_files("transformers")
datas += collect_data_files("pytorch_lightning")
datas += collect_data_files("opencv_python")
datas += collect_data_files("skimage")
datas += collect_data_files("gradio")
datas += collect_data_files("gradio_client")
datas += collect_data_files("iree")
datas += collect_data_files("google_cloud_storage")
datas += collect_data_files("shark", include_py_files=True)
datas += collect_data_files("timm", include_py_files=True)
datas += collect_data_files("tqdm")
@@ -53,6 +52,7 @@ datas += collect_data_files("jsonschema")
datas += collect_data_files("jsonschema_specifications")
datas += collect_data_files("cpuinfo")
datas += collect_data_files("langchain")
datas += collect_data_files("cv2")
datas += [
("src/utils/resources/prompts.json", "resources"),
("src/utils/resources/model_db.json", "resources"),
@@ -81,4 +81,4 @@ hiddenimports += [
if not any(kw in x for kw in blacklist)
]
hiddenimports += [x for x in collect_submodules("iree") if "tests" not in x]
hiddenimports += ["iree._runtime", "iree._runtime_libs"]
hiddenimports += ["iree._runtime", "iree.compiler._mlir_libs._mlir.ir"]

View File

@@ -273,6 +273,7 @@ class StencilPipeline(StableDiffusionPipeline):
cpu_scheduling,
max_embeddings_multiples,
use_stencil,
+resample_type,
):
# Control Embedding check & conversion
# TODO: 1. Change `num_images_per_prompt`.

View File

@@ -158,9 +158,9 @@ def load_lower_configs(base_model_id=None):
f"{spec}.json"
)
-full_gs_url = config_bucket + config_name
lowering_config_dir = os.path.join(WORKDIR, "configs", config_name)
print("Loading lowering config file from ", lowering_config_dir)
+full_gs_url = config_bucket + config_name
download_public_file(full_gs_url, lowering_config_dir, True)
return lowering_config_dir
@@ -281,13 +281,9 @@ def sd_model_annotation(mlir_model, model_name, base_model_id=None):
if "rdna2" not in args.iree_vulkan_target_triple.split("-")[0]:
use_winograd = True
winograd_config_dir = load_winograd_configs()
-winograd_model = annotate_with_winograd(
+tuned_model = annotate_with_winograd(
mlir_model, winograd_config_dir, model_name
)
-lowering_config_dir = load_lower_configs(base_model_id)
-tuned_model = annotate_with_lower_configs(
-winograd_model, lowering_config_dir, model_name, use_winograd
-)
else:
tuned_model = mlir_model
else:

View File

@@ -633,6 +633,13 @@ p.add_argument(
help="Flag for enabling rest API.",
)
+p.add_argument(
+"--debug",
+default=False,
+action=argparse.BooleanOptionalAction,
+help="Flag for enabling debugging log in WebUI.",
+)
p.add_argument(
"--output_gallery",
default=True,

View File

@@ -1,6 +1,7 @@
from multiprocessing import Process, freeze_support
import os
import sys
+import logging
if sys.platform == "darwin":
# import before IREE to avoid torch-MLIR library issues
@@ -41,6 +42,8 @@ def launch_app(address):
if __name__ == "__main__":
+if args.debug:
+logging.basicConfig(level=logging.DEBUG)
# required to do multiprocessing in a pyinstaller freeze
freeze_support()
if args.api or "api" in args.ui.split(","):

View File

@@ -109,7 +109,7 @@ with gr.Blocks() as minigpt4_web:
gr.Markdown(description)
with gr.Row():
-with gr.Column(scale=0.5):
+with gr.Column():
image = gr.Image(type="pil")
upload_button = gr.Button(
value="Upload & Start Chat",

View File

@@ -26,11 +26,7 @@ model_map = {
"llama2_7b": "meta-llama/Llama-2-7b-chat-hf",
"llama2_13b": "meta-llama/Llama-2-13b-chat-hf",
"llama2_70b": "meta-llama/Llama-2-70b-chat-hf",
"codegen": "Salesforce/codegen25-7b-multi",
"vicuna1p3": "lmsys/vicuna-7b-v1.3",
"vicuna": "TheBloke/vicuna-7B-1.1-HF",
"vicuna4": "TheBloke/vicuna-7B-1.1-HF",
"StableLM": "stabilityai/stablelm-tuned-alpha-3b",
}
# NOTE: Each `model_name` should have its own start message
@@ -62,61 +58,39 @@ start_message = {
"explain why instead of answering something not correct. If you don't know the "
"answer to a question, please don't share false information."
),
"StableLM": (
"<|SYSTEM|># StableLM Tuned (Alpha version)"
"\n- StableLM is a helpful and harmless open-source AI language model "
"developed by StabilityAI."
"\n- StableLM is excited to be able to help the user, but will refuse "
"to do anything that could be considered harmful to the user."
"\n- StableLM is more than just an information source, StableLM is also "
"able to write poetry, short stories, and make jokes."
"\n- StableLM will refuse to participate in anything that "
"could harm a human."
),
"vicuna": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"vicuna4": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"vicuna1p3": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"codegen": "",
}
def create_prompt(model_name, history):
system_message = start_message[model_name]
-if model_name in [
-"StableLM",
-"vicuna",
-"vicuna4",
-"vicuna1p3",
-"llama2_7b",
-"llama2_13b",
-"llama2_70b",
-]:
+if "llama2" in model_name:
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
conversation = "".join(
[f"{B_INST} {item[0]} {E_INST} {item[1]} " for item in history[1:]]
)
msg = f"{B_INST} {B_SYS} {system_message} {E_SYS} {history[0][0]} {E_INST} {history[0][1]} {conversation}"
elif model_name in ["vicuna"]:
conversation = "".join(
[
"".join(["<|USER|>" + item[0], "<|ASSISTANT|>" + item[1]])
for item in history
]
)
msg = system_message + conversation
msg = msg.strip()
else:
conversation = "".join(
["".join([item[0], item[1]]) for item in history]
)
msg = system_message + conversation
msg = msg.strip()
msg = system_message + conversation
msg = msg.strip()
return msg
@@ -160,14 +134,15 @@ def chat(
model,
device,
precision,
+download_vmfb,
config_file,
cli=False,
progress=gr.Progress(),
):
global past_key_values
global model_vmfb_key
global vicuna_model
model_name, model_path = list(map(str.strip, model.split("=>")))
if "cuda" in device:
device = "cuda"
@@ -177,136 +152,73 @@ def chat(
device = "cpu-task"
elif "vulkan" in device:
device = "vulkan"
elif "rocm" in device:
device = "rocm"
else:
print("unrecognized device")
from apps.language_models.scripts.vicuna import ShardedVicuna
from apps.language_models.scripts.vicuna import UnshardedVicuna
from apps.stable_diffusion.src import args
new_model_vmfb_key = f"{model_name}#{model_path}#{device}#{precision}"
if model_name in [
"vicuna",
"vicuna4",
"vicuna1p3",
"codegen",
"llama2_7b",
"llama2_13b",
"llama2_70b",
]:
from apps.language_models.scripts.vicuna import ShardedVicuna
from apps.language_models.scripts.vicuna import UnshardedVicuna
from apps.stable_diffusion.src import args
if vicuna_model == 0:
if "cuda" in device:
device = "cuda"
elif "sync" in device:
device = "cpu-sync"
elif "task" in device:
device = "cpu-task"
elif "vulkan" in device:
device = "vulkan"
elif "rocm" in device:
device = "rocm"
if new_model_vmfb_key != model_vmfb_key:
model_vmfb_key = new_model_vmfb_key
max_toks = 128 if model_name == "codegen" else 512
# get iree flags that need to be overridden, from commandline args
_extra_args = []
# vulkan target triple
if args.iree_vulkan_target_triple != "":
_extra_args.append(
f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
)
if model_name == "vicuna4":
vicuna_model = ShardedVicuna(
model_name,
hf_model_path=model_path,
device=device,
precision=precision,
max_num_tokens=max_toks,
compressed=True,
extra_args_cmd=_extra_args,
)
else:
# if config_file is None:
vicuna_model = UnshardedVicuna(
model_name,
hf_model_path=model_path,
hf_auth_token=args.hf_auth_token,
device=device,
precision=precision,
max_num_tokens=max_toks,
extra_args_cmd=_extra_args,
)
# else:
# if config_file is not None:
# config_file = open(config_file)
# config_json = json.load(config_file)
# config_file.close()
# else:
# config_json = get_default_config()
# vicuna_model = ShardedVicuna(
# model_name,
# device=device,
# precision=precision,
# config_json=config_json,
# )
prompt = create_prompt(model_name, history)
partial_text = ""
count = 0
start_time = time.time()
for text, msg in progress.tqdm(
vicuna_model.generate(prompt, cli=cli),
desc="generating response",
):
count += 1
if "formatted" in msg:
history[-1][1] = text
end_time = time.time()
tokens_per_sec = count / (end_time - start_time)
yield history, str(
format(tokens_per_sec, ".2f")
) + " tokens/sec"
else:
partial_text += text + " "
history[-1][1] = partial_text
yield history, ""
return history, ""
# else Model is StableLM
global sharkModel
from apps.language_models.src.pipelines.stablelm_pipeline import (
SharkStableLM,
)
if new_model_vmfb_key != model_vmfb_key:
model_vmfb_key = new_model_vmfb_key
# max_new_tokens=512
shark_slm = SharkStableLM(
model_name
) # pass elements from UI as required
max_toks = 128 if model_name == "codegen" else 512
# get iree flags that need to be overridden, from commandline args
_extra_args = []
# vulkan target triple
if args.iree_vulkan_target_triple != "":
_extra_args.append(
f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
)
if model_name == "vicuna4":
vicuna_model = ShardedVicuna(
model_name,
hf_model_path=model_path,
device=device,
precision=precision,
max_num_tokens=max_toks,
compressed=True,
extra_args_cmd=_extra_args,
)
else:
# if config_file is None:
vicuna_model = UnshardedVicuna(
model_name,
hf_model_path=model_path,
hf_auth_token=args.hf_auth_token,
device=device,
precision=precision,
max_num_tokens=max_toks,
download_vmfb=download_vmfb,
load_mlir_from_shark_tank=True,
extra_args_cmd=_extra_args,
)
# Construct the input message string for the model by concatenating the
# current system message and conversation history
if len(curr_system_message.split()) > 160:
print("clearing context")
prompt = create_prompt(model_name, history)
generate_kwargs = dict(prompt=prompt)
words_list = shark_slm.generate(**generate_kwargs)
partial_text = ""
for new_text in words_list:
partial_text += new_text
history[-1][1] = partial_text
# Yield an empty string to clean up the message textbox and the updated
# conversation history
yield history
return words_list
count = 0
start_time = time.time()
for text, msg in progress.tqdm(
vicuna_model.generate(prompt, cli=cli),
desc="generating response",
):
count += 1
if "formatted" in msg:
history[-1][1] = text
end_time = time.time()
tokens_per_sec = count / (end_time - start_time)
yield history, str(format(tokens_per_sec, ".2f")) + " tokens/sec"
else:
partial_text += text + " "
history[-1][1] = partial_text
yield history, ""
return history, ""
def llm_chat_api(InputData: dict):
@@ -360,6 +272,8 @@ def llm_chat_api(InputData: dict):
device=device,
precision=precision,
max_num_tokens=max_toks,
+download_vmfb=True,
+load_mlir_from_shark_tank=True,
)
# TODO: add role dict for different models
@@ -422,7 +336,7 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
)
model = gr.Dropdown(
label="Select Model",
-value=model_choices[4],
+value=model_choices[0],
choices=model_choices,
)
supported_devices = available_devices
@@ -430,15 +344,14 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
# show cpu-task device first in list for chatbot
supported_devices = supported_devices[-1:] + supported_devices[:-1]
supported_devices = [x for x in supported_devices if "sync" not in x]
-# print(supported_devices)
-devices = gr.Dropdown(
+device = gr.Dropdown(
label="Device",
value=supported_devices[0]
if enabled
else "Only CUDA Supported for now",
choices=supported_devices,
interactive=enabled,
-# multiselect=True,
+# multiselect=True,
)
precision = gr.Radio(
label="Precision",
@@ -450,7 +363,13 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
],
visible=True,
)
-tokens_time = gr.Textbox(label="Tokens generated per second")
+with gr.Column():
+download_vmfb = gr.Checkbox(
+label="Download vmfb from Shark tank if available",
+value=True,
+interactive=True,
+)
+tokens_time = gr.Textbox(label="Tokens generated per second")
with gr.Row(visible=False):
with gr.Group():
@@ -485,7 +404,15 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
).then(
fn=chat,
-inputs=[system_msg, chatbot, model, devices, precision, config_file],
+inputs=[
+system_msg,
+chatbot,
+model,
+device,
+precision,
+download_vmfb,
+config_file,
+],
outputs=[chatbot, tokens_time],
queue=True,
)
@@ -493,7 +420,15 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
).then(
fn=chat,
-inputs=[system_msg, chatbot, model, devices, precision, config_file],
+inputs=[
+system_msg,
+chatbot,
+model,
+device,
+precision,
+download_vmfb,
+config_file,
+],
outputs=[chatbot, tokens_time],
queue=True,
)

View File

@@ -130,14 +130,13 @@ fi
$PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f https://download.pytorch.org/whl/nightly/cpu/
if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
if [[ $(uname -s) = 'Linux' && ! -z "${IMPORTER}" ]]; then
T_VER=$($PYTHON -m pip show torch | grep Version)
-TORCH_VERSION=${T_VER:9:17}
+T_VER_MIN=${T_VER:14:12}
TV_VER=$($PYTHON -m pip show torchvision | grep Version)
-TV_VERSION=${TV_VER:9:18}
-$PYTHON -m pip uninstall -y torch torchvision
-$PYTHON -m pip install -U --pre --no-warn-conflicts triton
-$PYTHON -m pip install --no-deps https://download.pytorch.org/whl/nightly/cu118/torch-${TORCH_VERSION}%2Bcu118-cp311-cp311-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu118/torchvision-${TV_VERSION}%2Bcu118-cp311-cp311-linux_x86_64.whl
+TV_VER_MAJ=${TV_VER:9:6}
+$PYTHON -m pip uninstall -y torchvision
+$PYTHON -m pip install torchvision==${TV_VER_MAJ}${T_VER_MIN} --no-deps -f https://download.pytorch.org/whl/nightly/cpu/torchvision/
if [ $? -eq 0 ];then
echo "Successfully Installed torch + cu118."
else

View File

@@ -46,7 +46,7 @@ def get_iree_device_args(device, extra_args=[]):
if device_uri[0] == "cpu":
from shark.iree_utils.cpu_utils import get_iree_cpu_args
-data_tiling_flag = ["--iree-flow-enable-data-tiling"]
+data_tiling_flag = ["--iree-opt-data-tiling"]
u_kernel_flag = ["--iree-llvmcpu-enable-microkernels"]
stack_size_flag = ["--iree-llvmcpu-stack-allocation-limit=256000"]
@@ -84,7 +84,7 @@ def get_iree_frontend_args(frontend):
elif frontend in ["tensorflow", "tf", "mhlo", "stablehlo"]:
return [
"--iree-llvmcpu-target-cpu-features=host",
"--iree-flow-demote-i64-to-i32",
"--iree-input-demote-i64-to-i32",
]
else:
# Frontend not found.
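
These renames track IREE's flag reorganization; a sketch of how the new spellings would appear on a direct iree-compile invocation (the file names are hypothetical, and iree-compile must be on PATH):

import subprocess

flags = [
    "--iree-opt-data-tiling",          # formerly --iree-flow-enable-data-tiling
    "--iree-input-demote-i64-to-i32",  # formerly --iree-flow-demote-i64-to-i32
]
subprocess.run(
    ["iree-compile", "model.mlir", "--iree-hal-target-backends=llvm-cpu",
     "-o", "model.vmfb", *flags],
    check=True,
)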

View File

@@ -57,11 +57,8 @@ def get_version(triple):
@functools.cache
def get_extensions(triple):
def make_ext_list(ext_list):
res = ""
for e in ext_list:
res += e + ", "
res = f"[{res[:-2]}]"
return res
res = ", ".join(ext_list)
return f"[{res}]"
arch, product, os = triple
if arch == "m1":

View File

@@ -178,9 +178,7 @@ def get_iree_vulkan_args(device_num=0, extra_args=[]):
@functools.cache
def get_iree_vulkan_runtime_flags():
vulkan_runtime_flags = [
f"--vulkan_large_heap_block_size={shark_args.vulkan_large_heap_block_size}",
f"--vulkan_validation_layers={'true' if shark_args.vulkan_validation_layers else 'false'}",
f"--vulkan_vma_allocator={'true' if shark_args.vulkan_vma_allocator else 'false'}",
]
return vulkan_runtime_flags
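
A minimal sketch of how such runtime flags are consumed, assuming iree.runtime's Python flag-parsing API; with VMA dropped from IREE, only the validation-layer flag survives here:

import iree.runtime as ireert

# Flags must be parsed before the first runtime config/device is created.
ireert.flags.parse_flags("--vulkan_validation_layers=false")
config = ireert.Config("vulkan")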

View File

@@ -133,13 +133,6 @@ parser.add_argument(
help="Profiles vulkan device and collects the .rdc info.",
)
-parser.add_argument(
-"--vulkan_large_heap_block_size",
-default="2073741824",
-help="Flag for setting VMA preferredLargeHeapBlockSize for "
-"vulkan device, default is 4G.",
-)
parser.add_argument(
"--vulkan_validation_layers",
default=False,
@@ -147,11 +140,4 @@ parser.add_argument(
help="Flag for disabling vulkan validation layers when benchmarking.",
)
-parser.add_argument(
-"--vulkan_vma_allocator",
-default=False,
-action=argparse.BooleanOptionalAction,
-help="Flag for enabling / disabling Vulkan VMA Allocator.",
-)
shark_args, unknown = parser.parse_known_args()
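
The flags that remain use argparse.BooleanOptionalAction (Python 3.9+), which generates paired on/off spellings automatically:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--vulkan_validation_layers",
    default=False,
    action=argparse.BooleanOptionalAction,
)
print(parser.parse_args(["--vulkan_validation_layers"]))
# Namespace(vulkan_validation_layers=True)
print(parser.parse_args(["--no-vulkan_validation_layers"]))
# Namespace(vulkan_validation_layers=False)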

View File

@@ -580,6 +580,9 @@ def import_with_fx(
torch.ops.aten.native_layer_norm,
torch.ops.aten.masked_fill.Tensor,
torch.ops.aten.masked_fill.Scalar,
+torch.ops.aten._scaled_dot_product_flash_attention.default,
+torch.ops.aten.index_add,
+torch.ops.aten.index_add_,
]
if precision in ["int4", "int8"]:
from brevitas_examples.llm.llm_quant.export import (
@@ -680,5 +683,5 @@ def import_with_fx(
)
return mlir_module, func_name
-mlir_module, func_name = mlir_importer.import_mlir()
+mlir_module, func_name = mlir_importer.import_mlir(mlir_type=mlir_type)
return mlir_module, func_name
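
For context on the decomposition list above, a hedged sketch (not the importer's exact code) of how torch's decomposition table rewrites the fused SDPA op into primitive ops during FX tracing, so the downstream torch-mlir lowering never sees the fused kernel:

import torch
from torch._decomp import get_decompositions
from torch.fx.experimental.proxy_tensor import make_fx

class Attn(torch.nn.Module):
    def forward(self, q, k, v):
        return torch.nn.functional.scaled_dot_product_attention(q, k, v)

decomp_table = get_decompositions(
    [torch.ops.aten._scaled_dot_product_flash_attention.default]
)
q = torch.rand(1, 4, 8, 16)
# Tracing with the table: any flash-attention call hit during tracing is
# expanded into matmul/softmax primitives in the resulting FX graph.
fx_g = make_fx(Attn(), decomposition_table=decomp_table)(q, q, q)
print(fx_g.graph)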

View File

@@ -69,7 +69,7 @@ class SharkTrainer:
self.frontend = frontend
# Training function is needed in the case of torch_fn.
-def compile(self, training_fn=None, extra_args=[]):
+def compile(self, training_fn=None, mlir_type="linalg", extra_args=[]):
if self.frontend in ["torch", "pytorch"]:
packed_inputs = (
dict(self.model.named_parameters()),
@@ -77,7 +77,12 @@ class SharkTrainer:
tuple(self.input),
)
mlir_module, func_name = import_with_fx(
-training_fn, packed_inputs, False, [], training=True
+training_fn,
+packed_inputs,
+False,
+[],
+training=True,
+mlir_type=mlir_type,
)
self.shark_runner = SharkRunner(
mlir_module,

View File

@@ -287,6 +287,9 @@ class SharkModuleTester:
repro_path = os.path.join("reproducers", self.tmp_prefix, "*")
bashCommand = f"gsutil cp -r {repro_path} gs://shark-public/builder/repro_artifacts/{self.ci_sha}/{self.tmp_prefix}/"
+print(
+f"Uploading reproducer {repro_path} to gs://shark-public/builder/repro_artifacts/{self.ci_sha}/{self.tmp_prefix}/"
+)
process = subprocess.run(bashCommand.split())
def postprocess_outputs(self, golden_out, result):