Switch most compile flows to use ireec.compile_file. (#1863 )

* Switch most compile flows to use ireec.compile_file. * re-add input type to compile_str path. * Check if mlir_module exists before checking if it's a path or pyobject. * Fix some save_dir cases
Remove tf dependencies from importer path. (#1874 )
2026-04-20 03:00:34 -04:00 · 2023-10-06 23:04:43 -05:00 · 2023-10-06 12:27:12 -07:00 · 2023-10-06 10:03:07 -07:00 · 2023-10-06 11:34:49 -05:00 · 2023-10-05 20:15:21 -07:00
69 changed files with 3381 additions and 2910 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -193,3 +193,9 @@ stencil_annotator/
 # For DocuChat
 apps/language_models/langchain/user_path/
 db_dir_UserData
+
+# Embeded browser cache and other
+apps/stable_diffusion/web/EBWebView/
+
+# Llama2 tokenizer configs
+llama2_tokenizer_configs/
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ High Performance Machine Learning Distribution
  <summary>Prerequisites - Drivers </summary>
  
 #### Install your Windows hardware drivers
-* [AMD RDNA Users] Download the latest driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-23-2-1).
+* [AMD RDNA Users] Download the latest driver (23.2.1 is the oldest supported) [here](https://www.amd.com/en/support).
 * [macOS Users] Download and install the 1.3.216 Vulkan SDK from [here](https://sdk.lunarg.com/sdk/download/1.3.216.0/mac/vulkansdk-macos-1.3.216.0.dmg). Newer versions of the SDK will not work. 
 * [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)
  
--- a/apps/language_models/langchain/h2oai_pipeline.py
+++ b/apps/language_models/langchain/h2oai_pipeline.py
@@ -20,7 +20,7 @@ import gc
 from pathlib import Path
 from shark.shark_inference import SharkInference
 from shark.shark_downloader import download_public_file
-from shark.shark_importer import import_with_fx
+from shark.shark_importer import import_with_fx, save_mlir
 from apps.stable_diffusion.src import args

 # Brevitas
@@ -237,7 +237,7 @@ class H2OGPTSHARKModel(torch.nn.Module):
            print(f"[DEBUG] converting torch to linalg")
            run_pipeline_with_repro_report(
                module,
-                "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+                "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
                description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
            )
        else:
@@ -256,6 +256,11 @@ class H2OGPTSHARKModel(torch.nn.Module):
        bytecode = bytecode_stream.getvalue()
        del module

+        bytecode = save_mlir(
+            bytecode,
+            model_name=f"h2ogpt_{precision}",
+            frontend="torch",
+        )
        return bytecode

    def forward(self, input_ids, attention_mask):
--- a/apps/language_models/scripts/llama_ir_conversion_utils.py
+++ b/apps/language_models/scripts/llama_ir_conversion_utils.py
@@ -0,0 +1,442 @@
+from pathlib import Path
+import argparse
+from argparse import RawTextHelpFormatter
+import re, gc
+
+"""
+    This script can be used as a standalone utility to convert IRs to dynamic + combine them.
+    Following are the various ways this script can be used :-
+        a. To convert a single Linalg IR to dynamic IR:
+            --dynamic --first_ir_path=<PATH TO FIRST IR>
+        b. To convert two Linalg IRs to dynamic IR:
+            --dynamic --first_ir_path=<PATH TO SECOND IR> --first_ir_path=<PATH TO SECOND IR>
+        c. To combine two Linalg IRs into one:
+            --combine --first_ir_path=<PATH TO FIRST IR> --second_ir_path=<PATH TO SECOND IR>
+        d. To convert both IRs into dynamic as well as combine the IRs:
+            --dynamic --combine --first_ir_path=<PATH TO FIRST IR> --second_ir_path=<PATH TO SECOND IR>
+
+    NOTE: For dynamic you'll also need to provide the following set of flags:-
+           i. For First Llama : --dynamic_input_size (DEFAULT: 19)
+          ii. For Second Llama: --model_name (DEFAULT: llama2_7b)
+                                --precision (DEFAULT: 'int4')
+          You may use --save_dynamic to also save the dynamic IR in option d above.
+          Else for option a. and b. the dynamic IR(s) will get saved by default.
+"""
+
+
+def combine_mlir_scripts(
+    first_vicuna_mlir,
+    second_vicuna_mlir,
+    output_name,
+    return_ir=True,
+):
+    print(f"[DEBUG] combining first and second mlir")
+    print(f"[DEBUG] output_name = {output_name}")
+    maps1 = []
+    maps2 = []
+    constants = set()
+    f1 = []
+    f2 = []
+
+    print(f"[DEBUG] processing first vicuna mlir")
+    first_vicuna_mlir = first_vicuna_mlir.splitlines()
+    while first_vicuna_mlir:
+        line = first_vicuna_mlir.pop(0)
+        if re.search("#map\d*\s*=", line):
+            maps1.append(line)
+        elif re.search("arith.constant", line):
+            constants.add(line)
+        elif not re.search("module", line):
+            line = re.sub("forward", "first_vicuna_forward", line)
+            f1.append(line)
+    f1 = f1[:-1]
+    del first_vicuna_mlir
+    gc.collect()
+
+    for i, map_line in enumerate(maps1):
+        map_var = map_line.split(" ")[0]
+        map_line = re.sub(f"{map_var}(?!\d)", map_var + "_0", map_line)
+        maps1[i] = map_line
+        f1 = [
+            re.sub(f"{map_var}(?!\d)", map_var + "_0", func_line)
+            for func_line in f1
+        ]
+
+    print(f"[DEBUG] processing second vicuna mlir")
+    second_vicuna_mlir = second_vicuna_mlir.splitlines()
+    while second_vicuna_mlir:
+        line = second_vicuna_mlir.pop(0)
+        if re.search("#map\d*\s*=", line):
+            maps2.append(line)
+        elif "global_seed" in line:
+            continue
+        elif re.search("arith.constant", line):
+            constants.add(line)
+        elif not re.search("module", line):
+            line = re.sub("forward", "second_vicuna_forward", line)
+            f2.append(line)
+    f2 = f2[:-1]
+    del second_vicuna_mlir
+    gc.collect()
+
+    for i, map_line in enumerate(maps2):
+        map_var = map_line.split(" ")[0]
+        map_line = re.sub(f"{map_var}(?!\d)", map_var + "_1", map_line)
+        maps2[i] = map_line
+        f2 = [
+            re.sub(f"{map_var}(?!\d)", map_var + "_1", func_line)
+            for func_line in f2
+        ]
+
+    module_start = 'module attributes {torch.debug_module_name = "_lambda"} {'
+    module_end = "}"
+
+    global_vars = []
+    vnames = []
+    global_var_loading1 = []
+    global_var_loading2 = []
+
+    print(f"[DEBUG] processing constants")
+    counter = 0
+    constants = list(constants)
+    while constants:
+        constant = constants.pop(0)
+        vname, vbody = constant.split("=")
+        vname = re.sub("%", "", vname)
+        vname = vname.strip()
+        vbody = re.sub("arith.constant", "", vbody)
+        vbody = vbody.strip()
+        if len(vbody.split(":")) < 2:
+            print(constant)
+        vdtype = vbody.split(":")[-1].strip()
+        fixed_vdtype = vdtype
+        if "c1_i64" in vname:
+            print(constant)
+            counter += 1
+        if counter == 2:
+            counter = 0
+            print("detected duplicate")
+            continue
+        vnames.append(vname)
+        if "true" not in vname:
+            global_vars.append(
+                f"ml_program.global private @{vname}({vbody}) : {fixed_vdtype}"
+            )
+            global_var_loading1.append(
+                f"\t\t%{vname} = ml_program.global_load_const @{vname} : {fixed_vdtype}"
+            )
+            global_var_loading2.append(
+                f"\t\t%{vname} = ml_program.global_load_const @{vname} : {fixed_vdtype}"
+            )
+        else:
+            global_vars.append(
+                f"ml_program.global private @{vname}({vbody}) : i1"
+            )
+            global_var_loading1.append(
+                f"\t\t%{vname} = ml_program.global_load_const @{vname} : i1"
+            )
+            global_var_loading2.append(
+                f"\t\t%{vname} = ml_program.global_load_const @{vname} : i1"
+            )
+
+    new_f1, new_f2 = [], []
+
+    print(f"[DEBUG] processing f1")
+    for line in f1:
+        if "func.func" in line:
+            new_f1.append(line)
+            for global_var in global_var_loading1:
+                new_f1.append(global_var)
+        else:
+            new_f1.append(line)
+
+    print(f"[DEBUG] processing f2")
+    for line in f2:
+        if "func.func" in line:
+            new_f2.append(line)
+            for global_var in global_var_loading2:
+                if (
+                    "c20_i64 = arith.addi %dim_i64, %c1_i64 : i64"
+                    in global_var
+                ):
+                    print(global_var)
+                new_f2.append(global_var)
+        else:
+            new_f2.append(line)
+
+    f1 = new_f1
+    f2 = new_f2
+
+    del new_f1
+    del new_f2
+    gc.collect()
+
+    print(
+        [
+            "c20_i64 = arith.addi %dim_i64, %c1_i64 : i64" in x
+            for x in [maps1, maps2, global_vars, f1, f2]
+        ]
+    )
+
+    # doing it this way rather than assembling the whole string
+    # to prevent OOM with 64GiB RAM when encoding the file.
+
+    print(f"[DEBUG] Saving mlir to {output_name}")
+    with open(output_name, "w+") as f_:
+        f_.writelines(line + "\n" for line in maps1)
+        f_.writelines(line + "\n" for line in maps2)
+        f_.writelines(line + "\n" for line in [module_start])
+        f_.writelines(line + "\n" for line in global_vars)
+        f_.writelines(line + "\n" for line in f1)
+        f_.writelines(line + "\n" for line in f2)
+        f_.writelines(line + "\n" for line in [module_end])
+
+    del maps1
+    del maps2
+    del module_start
+    del global_vars
+    del f1
+    del f2
+    del module_end
+    gc.collect()
+
+    if return_ir:
+        print(f"[DEBUG] Reading combined mlir back in")
+        with open(output_name, "rb") as f:
+            return f.read()
+
+
+def write_in_dynamic_inputs0(module, dynamic_input_size):
+    print("[DEBUG] writing dynamic inputs to first vicuna")
+    # Current solution for ensuring mlir files support dynamic inputs
+    # TODO: find a more elegant way to implement this
+    new_lines = []
+    module = module.splitlines()
+    while module:
+        line = module.pop(0)
+        line = re.sub(f"{dynamic_input_size}x", "?x", line)
+        if "?x" in line:
+            line = re.sub("tensor.empty\(\)", "tensor.empty(%dim)", line)
+        line = re.sub(f" {dynamic_input_size},", " %dim,", line)
+        if "tensor.empty" in line and "?x?" in line:
+            line = re.sub(
+                "tensor.empty\(%dim\)", "tensor.empty(%dim, %dim)", line
+            )
+        if "arith.cmpi" in line:
+            line = re.sub(f"c{dynamic_input_size}", "dim", line)
+        if "%0 = tensor.empty(%dim) : tensor<?xi64>" in line:
+            new_lines.append("%dim = tensor.dim %arg0, %c1 : tensor<1x?xi64>")
+        if "%dim = tensor.dim %arg0, %c1 : tensor<1x?xi64>" in line:
+            continue
+
+        new_lines.append(line)
+    return "\n".join(new_lines)
+
+
+def write_in_dynamic_inputs1(module, model_name, precision):
+    print("[DEBUG] writing dynamic inputs to second vicuna")
+
+    def remove_constant_dim(line):
+        if "c19_i64" in line:
+            line = re.sub("c19_i64", "dim_i64", line)
+        if "19x" in line:
+            line = re.sub("19x", "?x", line)
+            line = re.sub("tensor.empty\(\)", "tensor.empty(%dim)", line)
+        if "tensor.empty" in line and "?x?" in line:
+            line = re.sub(
+                "tensor.empty\(%dim\)",
+                "tensor.empty(%dim, %dim)",
+                line,
+            )
+        if "arith.cmpi" in line:
+            line = re.sub("c19", "dim", line)
+        if " 19," in line:
+            line = re.sub(" 19,", " %dim,", line)
+        if "x20x" in line or "<20x" in line:
+            line = re.sub("20x", "?x", line)
+            line = re.sub("tensor.empty\(\)", "tensor.empty(%dimp1)", line)
+        if " 20," in line:
+            line = re.sub(" 20,", " %dimp1,", line)
+        return line
+
+    module = module.splitlines()
+    new_lines = []
+
+    # Using a while loop and the pop method to avoid creating a copy of module
+    if "llama2_13b" in model_name:
+        pkv_tensor_shape = "tensor<1x40x?x128x"
+    elif "llama2_70b" in model_name:
+        pkv_tensor_shape = "tensor<1x8x?x128x"
+    else:
+        pkv_tensor_shape = "tensor<1x32x?x128x"
+    if precision in ["fp16", "int4", "int8"]:
+        pkv_tensor_shape += "f16>"
+    else:
+        pkv_tensor_shape += "f32>"
+
+    while module:
+        line = module.pop(0)
+        if "%c19_i64 = arith.constant 19 : i64" in line:
+            new_lines.append("%c2 = arith.constant 2 : index")
+            new_lines.append(
+                f"%dim_4_int = tensor.dim %arg1, %c2 : {pkv_tensor_shape}"
+            )
+            new_lines.append(
+                "%dim_i64 = arith.index_cast %dim_4_int : index to i64"
+            )
+            continue
+        if "%c2 = arith.constant 2 : index" in line:
+            continue
+        if "%c20_i64 = arith.constant 20 : i64" in line:
+            new_lines.append("%c1_i64 = arith.constant 1 : i64")
+            new_lines.append("%c20_i64 = arith.addi %dim_i64, %c1_i64 : i64")
+            new_lines.append(
+                "%dimp1 = arith.index_cast %c20_i64 : i64 to index"
+            )
+            continue
+        line = remove_constant_dim(line)
+        new_lines.append(line)
+
+    return "\n".join(new_lines)
+
+
+def save_dynamic_ir(ir_to_save, output_file):
+    if not ir_to_save:
+        return
+    # We only get string output from the dynamic conversion utility.
+    from contextlib import redirect_stdout
+
+    with open(output_file, "w") as f:
+        with redirect_stdout(f):
+            print(ir_to_save)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="llama ir utility",
+        description="\tThis script can be used as a standalone utility to convert IRs to dynamic + combine them.\n"
+        + "\tFollowing are the various ways this script can be used :-\n"
+        + "\t\ta. To convert a single Linalg IR to dynamic IR:\n"
+        + "\t\t\t--dynamic --first_ir_path=<PATH TO FIRST IR>\n"
+        + "\t\tb. To convert two Linalg IRs to dynamic IR:\n"
+        + "\t\t\t--dynamic --first_ir_path=<PATH TO SECOND IR> --first_ir_path=<PATH TO SECOND IR>\n"
+        + "\t\tc. To combine two Linalg IRs into one:\n"
+        + "\t\t\t--combine --first_ir_path=<PATH TO FIRST IR> --second_ir_path=<PATH TO SECOND IR>\n"
+        + "\t\td. To convert both IRs into dynamic as well as combine the IRs:\n"
+        + "\t\t\t--dynamic --combine --first_ir_path=<PATH TO FIRST IR> --second_ir_path=<PATH TO SECOND IR>\n\n"
+        + "\tNOTE: For dynamic you'll also need to provide the following set of flags:-\n"
+        + "\t\t i. For First Llama : --dynamic_input_size (DEFAULT: 19)\n"
+        + "\t\tii. For Second Llama: --model_name (DEFAULT: llama2_7b)\n"
+        + "\t\t\t--precision (DEFAULT: 'int4')\n"
+        + "\t      You may use --save_dynamic to also save the dynamic IR in option d above.\n"
+        + "\t      Else for option a. and b. the dynamic IR(s) will get saved by default.\n",
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "--precision",
+        "-p",
+        default="int4",
+        choices=["fp32", "fp16", "int8", "int4"],
+        help="Precision of the concerned IR",
+    )
+    parser.add_argument(
+        "--model_name",
+        type=str,
+        default="llama2_7b",
+        choices=["vicuna", "llama2_7b", "llama2_13b", "llama2_70b"],
+        help="Specify which model to run.",
+    )
+    parser.add_argument(
+        "--first_ir_path",
+        default=None,
+        help="path to first llama mlir file",
+    )
+    parser.add_argument(
+        "--second_ir_path",
+        default=None,
+        help="path to second llama mlir file",
+    )
+    parser.add_argument(
+        "--dynamic_input_size",
+        type=int,
+        default=19,
+        help="Specify the static input size to replace with dynamic dim.",
+    )
+    parser.add_argument(
+        "--dynamic",
+        default=False,
+        action=argparse.BooleanOptionalAction,
+        help="Converts the IR(s) to dynamic",
+    )
+    parser.add_argument(
+        "--save_dynamic",
+        default=False,
+        action=argparse.BooleanOptionalAction,
+        help="Save the individual IR(s) after converting to dynamic",
+    )
+    parser.add_argument(
+        "--combine",
+        default=False,
+        action=argparse.BooleanOptionalAction,
+        help="Converts the IR(s) to dynamic",
+    )
+
+    args, unknown = parser.parse_known_args()
+
+    dynamic = args.dynamic
+    combine = args.combine
+    assert (
+        dynamic or combine
+    ), "neither `dynamic` nor `combine` flag is turned on"
+    first_ir_path = args.first_ir_path
+    second_ir_path = args.second_ir_path
+    assert first_ir_path or second_ir_path, "no input ir has been provided"
+    if combine:
+        assert (
+            first_ir_path and second_ir_path
+        ), "you will need to provide both IRs to combine"
+    precision = args.precision
+    model_name = args.model_name
+    dynamic_input_size = args.dynamic_input_size
+    save_dynamic = args.save_dynamic
+
+    print(f"Dynamic conversion utility is turned {'ON' if dynamic else 'OFF'}")
+    print(f"Combining IR utility is turned {'ON' if combine else 'OFF'}")
+
+    if dynamic and not combine:
+        save_dynamic = True
+
+    first_ir = None
+    first_dynamic_ir_name = None
+    second_ir = None
+    second_dynamic_ir_name = None
+    if first_ir_path:
+        first_dynamic_ir_name = f"{Path(first_ir_path).stem}_dynamic"
+        with open(first_ir_path, "r") as f:
+            first_ir = f.read()
+    if second_ir_path:
+        second_dynamic_ir_name = f"{Path(second_ir_path).stem}_dynamic"
+        with open(second_ir_path, "r") as f:
+            second_ir = f.read()
+    if dynamic:
+        first_ir = (
+            write_in_dynamic_inputs0(first_ir, dynamic_input_size)
+            if first_ir
+            else None
+        )
+        second_ir = (
+            write_in_dynamic_inputs1(second_ir, model_name, precision)
+            if second_ir
+            else None
+        )
+        if save_dynamic:
+            save_dynamic_ir(first_ir, f"{first_dynamic_ir_name}.mlir")
+            save_dynamic_ir(second_ir, f"{second_dynamic_ir_name}.mlir")
+
+    if combine:
+        combine_mlir_scripts(
+            first_ir,
+            second_ir,
+            f"{model_name}_{precision}.mlir",
+            return_ir=False,
+        )
--- a/apps/language_models/scripts/stablelm.py
+++ b/apps/language_models/scripts/stablelm.py
@@ -46,6 +46,7 @@ def compile_stableLM(
    model_vmfb_name,
    device="cuda",
    precision="fp32",
+    debug=False,
 ):
    from shark.shark_inference import SharkInference

@@ -92,7 +93,7 @@ def compile_stableLM(
    shark_module.compile()

    path = shark_module.save_module(
-        vmfb_path.parent.absolute(), vmfb_path.stem
+        vmfb_path.parent.absolute(), vmfb_path.stem, debug=debug
    )
    print("Saved vmfb at ", str(path))

--- a/apps/language_models/scripts/vicuna.py
+++ b/apps/language_models/scripts/vicuna.py
--- a/apps/language_models/shark_llama_cli.spec
+++ b/apps/language_models/shark_llama_cli.spec
@@ -0,0 +1,94 @@
+# -*- mode: python ; coding: utf-8 -*-
+from PyInstaller.utils.hooks import collect_data_files
+from PyInstaller.utils.hooks import collect_submodules
+from PyInstaller.utils.hooks import copy_metadata
+
+import sys ; sys.setrecursionlimit(sys.getrecursionlimit() * 5)
+
+datas = []
+datas += collect_data_files('torch')
+datas += copy_metadata('torch')
+datas += copy_metadata('tqdm')
+datas += copy_metadata('regex')
+datas += copy_metadata('requests')
+datas += copy_metadata('packaging')
+datas += copy_metadata('filelock')
+datas += copy_metadata('numpy')
+datas += copy_metadata('tokenizers')
+datas += copy_metadata('importlib_metadata')
+datas += copy_metadata('torch-mlir')
+datas += copy_metadata('omegaconf')
+datas += copy_metadata('safetensors')
+datas += copy_metadata('huggingface-hub')
+datas += copy_metadata('sentencepiece')
+datas += copy_metadata("pyyaml")
+datas += collect_data_files("tokenizers")
+datas += collect_data_files("tiktoken")
+datas += collect_data_files("accelerate")
+datas += collect_data_files('diffusers')
+datas += collect_data_files('transformers')
+datas += collect_data_files('opencv-python')
+datas += collect_data_files('pytorch_lightning')
+datas += collect_data_files('skimage')
+datas += collect_data_files('gradio')
+datas += collect_data_files('gradio_client')
+datas += collect_data_files('iree')
+datas += collect_data_files('google-cloud-storage')
+datas += collect_data_files('py-cpuinfo')
+datas += collect_data_files("shark", include_py_files=True)
+datas += collect_data_files("timm", include_py_files=True)
+datas += collect_data_files("tqdm")
+datas += collect_data_files("tkinter")
+datas += collect_data_files("webview")
+datas += collect_data_files("sentencepiece")
+datas += collect_data_files("jsonschema")
+datas += collect_data_files("jsonschema_specifications")
+datas += collect_data_files("cpuinfo")
+datas += collect_data_files("langchain")
+
+binaries = []
+
+block_cipher = None
+
+hiddenimports = ['shark', 'shark.shark_inference', 'apps']
+hiddenimports += [x for x in collect_submodules("skimage") if "tests" not in x]
+hiddenimports += [x for x in collect_submodules("iree") if "tests" not in x]
+
+a = Analysis(
+    ['scripts/vicuna.py'],
+    pathex=['.'],
+    binaries=binaries,
+    datas=datas,
+    hiddenimports=hiddenimports,
+    hookspath=[],
+    hooksconfig={},
+    runtime_hooks=[],
+    excludes=[],
+    win_no_prefer_redirects=False,
+    win_private_assemblies=False,
+    cipher=block_cipher,
+    noarchive=False,
+)
+pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
+
+exe = EXE(
+    pyz,
+    a.scripts,
+    a.binaries,
+    a.zipfiles,
+    a.datas,
+    [],
+    name='shark_llama_cli',
+    debug=False,
+    bootloader_ignore_signals=False,
+    strip=False,
+    upx=True,
+    upx_exclude=[],
+    runtime_tmpdir=None,
+    console=True,
+    disable_windowed_traceback=False,
+    argv_emulation=False,
+    target_arch=None,
+    codesign_identity=None,
+    entitlements_file=None,
+)
--- a/apps/language_models/src/model_wrappers/vicuna4.py
+++ b/apps/language_models/src/model_wrappers/vicuna4.py
@@ -47,18 +47,15 @@ from apps.language_models.src.model_wrappers.vicuna_sharded_model import (
 )
 from apps.language_models.src.model_wrappers.vicuna_model import (
    FirstVicuna,
-    SecondVicuna,
+    SecondVicuna7B,
 )
 from apps.language_models.utils import (
    get_vmfb_from_path,
 )
 from shark.shark_downloader import download_public_file
 from shark.shark_importer import get_f16_inputs
-from shark.shark_importer import import_with_fx
 from shark.shark_inference import SharkInference

-from brevitas_examples.llm.llm_quant.quantize import quantize_model
-from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
 from transformers.models.llama.configuration_llama import LlamaConfig
 from transformers.models.llama.modeling_llama import (
    LlamaDecoderLayer,
--- a/apps/language_models/src/model_wrappers/vicuna_model.py
+++ b/apps/language_models/src/model_wrappers/vicuna_model.py
@@ -1,15 +1,13 @@
 import torch
 from transformers import AutoModelForCausalLM

-from brevitas_examples.llm.llm_quant.quantize import quantize_model
-from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
-

 class FirstVicuna(torch.nn.Module):
    def __init__(
        self,
        model_path,
        precision="fp32",
+        accumulates="fp32",
        weight_group_size=128,
        model_name="vicuna",
        hf_auth_token: str = None,
@@ -18,15 +16,24 @@ class FirstVicuna(torch.nn.Module):
        kwargs = {"torch_dtype": torch.float32}
        if "llama2" in model_name:
            kwargs["use_auth_token"] = hf_auth_token
+        self.accumulates = (
+            torch.float32 if accumulates == "fp32" else torch.float16
+        )
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **kwargs
        )
+        print(f"[DEBUG] model_path : {model_path}")
        if precision in ["int4", "int8"]:
+            from brevitas_examples.llm.llm_quant.quantize import quantize_model
+            from brevitas_examples.llm.llm_quant.run_utils import (
+                get_model_impl,
+            )
+
            print("First Vicuna applying weight quantization..")
            weight_bit_width = 4 if precision == "int4" else 8
            quantize_model(
                get_model_impl(self.model).layers,
-                dtype=torch.float32,
+                dtype=self.accumulates,
                weight_bit_width=weight_bit_width,
                weight_param_method="stats",
                weight_scale_precision="float",
@@ -40,7 +47,9 @@ class FirstVicuna(torch.nn.Module):
    def forward(self, input_ids):
        op = self.model(input_ids=input_ids, use_cache=True)
        return_vals = []
-        return_vals.append(op.logits)
+        token = torch.argmax(op.logits[:, -1, :], dim=1)
+        return_vals.append(token)
+
        temp_past_key_values = op.past_key_values
        for item in temp_past_key_values:
            return_vals.append(item[0])
@@ -48,11 +57,12 @@ class FirstVicuna(torch.nn.Module):
        return tuple(return_vals)


-class SecondVicuna(torch.nn.Module):
+class SecondVicuna7B(torch.nn.Module):
    def __init__(
        self,
        model_path,
        precision="fp32",
+        accumulates="fp32",
        weight_group_size=128,
        model_name="vicuna",
        hf_auth_token: str = None,
@@ -64,12 +74,21 @@ class SecondVicuna(torch.nn.Module):
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **kwargs
        )
+        self.accumulates = (
+            torch.float32 if accumulates == "fp32" else torch.float16
+        )
+        print(f"[DEBUG] model_path : {model_path}")
        if precision in ["int4", "int8"]:
+            from brevitas_examples.llm.llm_quant.quantize import quantize_model
+            from brevitas_examples.llm.llm_quant.run_utils import (
+                get_model_impl,
+            )
+
            print("Second Vicuna applying weight quantization..")
            weight_bit_width = 4 if precision == "int4" else 8
            quantize_model(
                get_model_impl(self.model).layers,
-                dtype=torch.float32,
+                dtype=self.accumulates,
                weight_bit_width=weight_bit_width,
                weight_param_method="stats",
                weight_scale_precision="float",
@@ -148,8 +167,6 @@ class SecondVicuna(torch.nn.Module):
        i63,
        i64,
    ):
-        # input_ids = input_tuple[0]
-        # input_tuple = torch.unbind(pkv, dim=0)
        token = i0
        past_key_values = (
            (i1, i2),
@@ -282,6 +299,842 @@ class SecondVicuna(torch.nn.Module):
            input_ids=token, use_cache=True, past_key_values=past_key_values
        )
        return_vals = []
+        token = torch.argmax(op.logits[:, -1, :], dim=1)
+        return_vals.append(token)
+        temp_past_key_values = op.past_key_values
+        for item in temp_past_key_values:
+            return_vals.append(item[0])
+            return_vals.append(item[1])
+        return tuple(return_vals)
+
+
+class SecondVicuna13B(torch.nn.Module):
+    def __init__(
+        self,
+        model_path,
+        precision="int8",
+        accumulates="fp32",
+        weight_group_size=128,
+        model_name="vicuna",
+        hf_auth_token: str = None,
+    ):
+        super().__init__()
+        kwargs = {"torch_dtype": torch.float32}
+        if "llama2" in model_name:
+            kwargs["use_auth_token"] = hf_auth_token
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, **kwargs
+        )
+        self.accumulates = (
+            torch.float32 if accumulates == "fp32" else torch.float16
+        )
+        if precision in ["int4", "int8"]:
+            from brevitas_examples.llm.llm_quant.quantize import quantize_model
+            from brevitas_examples.llm.llm_quant.run_utils import (
+                get_model_impl,
+            )
+
+            print("Second Vicuna applying weight quantization..")
+            weight_bit_width = 4 if precision == "int4" else 8
+            quantize_model(
+                get_model_impl(self.model).layers,
+                dtype=self.accumulates,
+                weight_bit_width=weight_bit_width,
+                weight_param_method="stats",
+                weight_scale_precision="float",
+                weight_quant_type="asym",
+                weight_quant_granularity="per_group",
+                weight_group_size=weight_group_size,
+                quantize_weight_zero_point=False,
+            )
+            print("Weight quantization applied.")
+
+    def forward(
+        self,
+        i0,
+        i1,
+        i2,
+        i3,
+        i4,
+        i5,
+        i6,
+        i7,
+        i8,
+        i9,
+        i10,
+        i11,
+        i12,
+        i13,
+        i14,
+        i15,
+        i16,
+        i17,
+        i18,
+        i19,
+        i20,
+        i21,
+        i22,
+        i23,
+        i24,
+        i25,
+        i26,
+        i27,
+        i28,
+        i29,
+        i30,
+        i31,
+        i32,
+        i33,
+        i34,
+        i35,
+        i36,
+        i37,
+        i38,
+        i39,
+        i40,
+        i41,
+        i42,
+        i43,
+        i44,
+        i45,
+        i46,
+        i47,
+        i48,
+        i49,
+        i50,
+        i51,
+        i52,
+        i53,
+        i54,
+        i55,
+        i56,
+        i57,
+        i58,
+        i59,
+        i60,
+        i61,
+        i62,
+        i63,
+        i64,
+        i65,
+        i66,
+        i67,
+        i68,
+        i69,
+        i70,
+        i71,
+        i72,
+        i73,
+        i74,
+        i75,
+        i76,
+        i77,
+        i78,
+        i79,
+        i80,
+    ):
+        token = i0
+        past_key_values = (
+            (i1, i2),
+            (
+                i3,
+                i4,
+            ),
+            (
+                i5,
+                i6,
+            ),
+            (
+                i7,
+                i8,
+            ),
+            (
+                i9,
+                i10,
+            ),
+            (
+                i11,
+                i12,
+            ),
+            (
+                i13,
+                i14,
+            ),
+            (
+                i15,
+                i16,
+            ),
+            (
+                i17,
+                i18,
+            ),
+            (
+                i19,
+                i20,
+            ),
+            (
+                i21,
+                i22,
+            ),
+            (
+                i23,
+                i24,
+            ),
+            (
+                i25,
+                i26,
+            ),
+            (
+                i27,
+                i28,
+            ),
+            (
+                i29,
+                i30,
+            ),
+            (
+                i31,
+                i32,
+            ),
+            (
+                i33,
+                i34,
+            ),
+            (
+                i35,
+                i36,
+            ),
+            (
+                i37,
+                i38,
+            ),
+            (
+                i39,
+                i40,
+            ),
+            (
+                i41,
+                i42,
+            ),
+            (
+                i43,
+                i44,
+            ),
+            (
+                i45,
+                i46,
+            ),
+            (
+                i47,
+                i48,
+            ),
+            (
+                i49,
+                i50,
+            ),
+            (
+                i51,
+                i52,
+            ),
+            (
+                i53,
+                i54,
+            ),
+            (
+                i55,
+                i56,
+            ),
+            (
+                i57,
+                i58,
+            ),
+            (
+                i59,
+                i60,
+            ),
+            (
+                i61,
+                i62,
+            ),
+            (
+                i63,
+                i64,
+            ),
+            (
+                i65,
+                i66,
+            ),
+            (
+                i67,
+                i68,
+            ),
+            (
+                i69,
+                i70,
+            ),
+            (
+                i71,
+                i72,
+            ),
+            (
+                i73,
+                i74,
+            ),
+            (
+                i75,
+                i76,
+            ),
+            (
+                i77,
+                i78,
+            ),
+            (
+                i79,
+                i80,
+            ),
+        )
+        op = self.model(
+            input_ids=token, use_cache=True, past_key_values=past_key_values
+        )
+        return_vals = []
+        return_vals.append(op.logits)
+        temp_past_key_values = op.past_key_values
+        for item in temp_past_key_values:
+            return_vals.append(item[0])
+            return_vals.append(item[1])
+        return tuple(return_vals)
+
+
+class SecondVicuna70B(torch.nn.Module):
+    def __init__(
+        self,
+        model_path,
+        precision="fp32",
+        accumulates="fp32",
+        weight_group_size=128,
+        model_name="vicuna",
+        hf_auth_token: str = None,
+    ):
+        super().__init__()
+        kwargs = {"torch_dtype": torch.float32}
+        if "llama2" in model_name:
+            kwargs["use_auth_token"] = hf_auth_token
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, **kwargs
+        )
+        self.accumulates = (
+            torch.float32 if accumulates == "fp32" else torch.float16
+        )
+        print(f"[DEBUG] model_path : {model_path}")
+        if precision in ["int4", "int8"]:
+            from brevitas_examples.llm.llm_quant.quantize import quantize_model
+            from brevitas_examples.llm.llm_quant.run_utils import (
+                get_model_impl,
+            )
+
+            print("Second Vicuna applying weight quantization..")
+            weight_bit_width = 4 if precision == "int4" else 8
+            quantize_model(
+                get_model_impl(self.model).layers,
+                dtype=self.accumulates,
+                weight_bit_width=weight_bit_width,
+                weight_param_method="stats",
+                weight_scale_precision="float",
+                weight_quant_type="asym",
+                weight_quant_granularity="per_group",
+                weight_group_size=weight_group_size,
+                quantize_weight_zero_point=False,
+            )
+            print("Weight quantization applied.")
+
+    def forward(
+        self,
+        i0,
+        i1,
+        i2,
+        i3,
+        i4,
+        i5,
+        i6,
+        i7,
+        i8,
+        i9,
+        i10,
+        i11,
+        i12,
+        i13,
+        i14,
+        i15,
+        i16,
+        i17,
+        i18,
+        i19,
+        i20,
+        i21,
+        i22,
+        i23,
+        i24,
+        i25,
+        i26,
+        i27,
+        i28,
+        i29,
+        i30,
+        i31,
+        i32,
+        i33,
+        i34,
+        i35,
+        i36,
+        i37,
+        i38,
+        i39,
+        i40,
+        i41,
+        i42,
+        i43,
+        i44,
+        i45,
+        i46,
+        i47,
+        i48,
+        i49,
+        i50,
+        i51,
+        i52,
+        i53,
+        i54,
+        i55,
+        i56,
+        i57,
+        i58,
+        i59,
+        i60,
+        i61,
+        i62,
+        i63,
+        i64,
+        i65,
+        i66,
+        i67,
+        i68,
+        i69,
+        i70,
+        i71,
+        i72,
+        i73,
+        i74,
+        i75,
+        i76,
+        i77,
+        i78,
+        i79,
+        i80,
+        i81,
+        i82,
+        i83,
+        i84,
+        i85,
+        i86,
+        i87,
+        i88,
+        i89,
+        i90,
+        i91,
+        i92,
+        i93,
+        i94,
+        i95,
+        i96,
+        i97,
+        i98,
+        i99,
+        i100,
+        i101,
+        i102,
+        i103,
+        i104,
+        i105,
+        i106,
+        i107,
+        i108,
+        i109,
+        i110,
+        i111,
+        i112,
+        i113,
+        i114,
+        i115,
+        i116,
+        i117,
+        i118,
+        i119,
+        i120,
+        i121,
+        i122,
+        i123,
+        i124,
+        i125,
+        i126,
+        i127,
+        i128,
+        i129,
+        i130,
+        i131,
+        i132,
+        i133,
+        i134,
+        i135,
+        i136,
+        i137,
+        i138,
+        i139,
+        i140,
+        i141,
+        i142,
+        i143,
+        i144,
+        i145,
+        i146,
+        i147,
+        i148,
+        i149,
+        i150,
+        i151,
+        i152,
+        i153,
+        i154,
+        i155,
+        i156,
+        i157,
+        i158,
+        i159,
+        i160,
+    ):
+        token = i0
+        past_key_values = (
+            (i1, i2),
+            (
+                i3,
+                i4,
+            ),
+            (
+                i5,
+                i6,
+            ),
+            (
+                i7,
+                i8,
+            ),
+            (
+                i9,
+                i10,
+            ),
+            (
+                i11,
+                i12,
+            ),
+            (
+                i13,
+                i14,
+            ),
+            (
+                i15,
+                i16,
+            ),
+            (
+                i17,
+                i18,
+            ),
+            (
+                i19,
+                i20,
+            ),
+            (
+                i21,
+                i22,
+            ),
+            (
+                i23,
+                i24,
+            ),
+            (
+                i25,
+                i26,
+            ),
+            (
+                i27,
+                i28,
+            ),
+            (
+                i29,
+                i30,
+            ),
+            (
+                i31,
+                i32,
+            ),
+            (
+                i33,
+                i34,
+            ),
+            (
+                i35,
+                i36,
+            ),
+            (
+                i37,
+                i38,
+            ),
+            (
+                i39,
+                i40,
+            ),
+            (
+                i41,
+                i42,
+            ),
+            (
+                i43,
+                i44,
+            ),
+            (
+                i45,
+                i46,
+            ),
+            (
+                i47,
+                i48,
+            ),
+            (
+                i49,
+                i50,
+            ),
+            (
+                i51,
+                i52,
+            ),
+            (
+                i53,
+                i54,
+            ),
+            (
+                i55,
+                i56,
+            ),
+            (
+                i57,
+                i58,
+            ),
+            (
+                i59,
+                i60,
+            ),
+            (
+                i61,
+                i62,
+            ),
+            (
+                i63,
+                i64,
+            ),
+            (
+                i65,
+                i66,
+            ),
+            (
+                i67,
+                i68,
+            ),
+            (
+                i69,
+                i70,
+            ),
+            (
+                i71,
+                i72,
+            ),
+            (
+                i73,
+                i74,
+            ),
+            (
+                i75,
+                i76,
+            ),
+            (
+                i77,
+                i78,
+            ),
+            (
+                i79,
+                i80,
+            ),
+            (
+                i81,
+                i82,
+            ),
+            (
+                i83,
+                i84,
+            ),
+            (
+                i85,
+                i86,
+            ),
+            (
+                i87,
+                i88,
+            ),
+            (
+                i89,
+                i90,
+            ),
+            (
+                i91,
+                i92,
+            ),
+            (
+                i93,
+                i94,
+            ),
+            (
+                i95,
+                i96,
+            ),
+            (
+                i97,
+                i98,
+            ),
+            (
+                i99,
+                i100,
+            ),
+            (
+                i101,
+                i102,
+            ),
+            (
+                i103,
+                i104,
+            ),
+            (
+                i105,
+                i106,
+            ),
+            (
+                i107,
+                i108,
+            ),
+            (
+                i109,
+                i110,
+            ),
+            (
+                i111,
+                i112,
+            ),
+            (
+                i113,
+                i114,
+            ),
+            (
+                i115,
+                i116,
+            ),
+            (
+                i117,
+                i118,
+            ),
+            (
+                i119,
+                i120,
+            ),
+            (
+                i121,
+                i122,
+            ),
+            (
+                i123,
+                i124,
+            ),
+            (
+                i125,
+                i126,
+            ),
+            (
+                i127,
+                i128,
+            ),
+            (
+                i129,
+                i130,
+            ),
+            (
+                i131,
+                i132,
+            ),
+            (
+                i133,
+                i134,
+            ),
+            (
+                i135,
+                i136,
+            ),
+            (
+                i137,
+                i138,
+            ),
+            (
+                i139,
+                i140,
+            ),
+            (
+                i141,
+                i142,
+            ),
+            (
+                i143,
+                i144,
+            ),
+            (
+                i145,
+                i146,
+            ),
+            (
+                i147,
+                i148,
+            ),
+            (
+                i149,
+                i150,
+            ),
+            (
+                i151,
+                i152,
+            ),
+            (
+                i153,
+                i154,
+            ),
+            (
+                i155,
+                i156,
+            ),
+            (
+                i157,
+                i158,
+            ),
+            (
+                i159,
+                i160,
+            ),
+        )
+        op = self.model(
+            input_ids=token, use_cache=True, past_key_values=past_key_values
+        )
+        return_vals = []
        return_vals.append(op.logits)
        temp_past_key_values = op.past_key_values
        for item in temp_past_key_values:
@@ -298,7 +1151,8 @@ class CombinedModel(torch.nn.Module):
    ):
        super().__init__()
        self.first_vicuna = FirstVicuna(first_vicuna_model_path)
-        self.second_vicuna = SecondVicuna(second_vicuna_model_path)
+        # NOT using this path for 13B currently, hence using `SecondVicuna7B`.
+        self.second_vicuna = SecondVicuna7B(second_vicuna_model_path)

    def forward(self, input_ids):
        first_output = self.first_vicuna(input_ids=input_ids)
--- a/apps/language_models/src/pipelines/SharkLLMBase.py
+++ b/apps/language_models/src/pipelines/SharkLLMBase.py
@@ -3,7 +3,10 @@ from abc import ABC, abstractmethod

 class SharkLLMBase(ABC):
    def __init__(
-        self, model_name, hf_model_path=None, max_num_tokens=512
+        self,
+        model_name,
+        hf_model_path=None,
+        max_num_tokens=512,
    ) -> None:
        self.model_name = model_name
        self.hf_model_path = hf_model_path
--- a/apps/language_models/src/pipelines/falcon_pipeline.py
+++ b/apps/language_models/src/pipelines/falcon_pipeline.py
@@ -7,7 +7,7 @@ from io import BytesIO
 from pathlib import Path
 from contextlib import redirect_stdout
 from shark.shark_downloader import download_public_file
-from shark.shark_importer import import_with_fx
+from shark.shark_importer import import_with_fx, save_mlir
 from shark.shark_inference import SharkInference
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from transformers.generation import (
@@ -28,7 +28,9 @@ parser = argparse.ArgumentParser(
    description="runs a falcon model",
 )

-parser.add_argument("--falcon_variant_to_use", default="7b", help="7b, 40b")
+parser.add_argument(
+    "--falcon_variant_to_use", default="7b", help="7b, 40b, 180b"
+)
 parser.add_argument(
    "--precision", "-p", default="fp16", help="fp32, fp16, int8, int4"
 )
@@ -49,7 +51,7 @@ parser.add_argument(
 )
 parser.add_argument(
    "--load_mlir_from_shark_tank",
-    default=False,
+    default=True,
    action=argparse.BooleanOptionalAction,
    help="download precompile mlir from shark tank",
 )
@@ -59,32 +61,52 @@ parser.add_argument(
    action=argparse.BooleanOptionalAction,
    help="Run model in cli mode",
 )
+parser.add_argument(
+    "--hf_auth_token",
+    type=str,
+    default=None,
+    help="Specify your own huggingface authentication token for falcon-180B model.",
+)


 class Falcon(SharkLLMBase):
    def __init__(
        self,
        model_name,
-        hf_model_path,
+        hf_model_path="tiiuae/falcon-7b-instruct",
+        hf_auth_token: str = None,
        max_num_tokens=150,
        device="cuda",
        precision="fp32",
        falcon_mlir_path=None,
        falcon_vmfb_path=None,
+        debug=False,
    ) -> None:
        super().__init__(model_name, hf_model_path, max_num_tokens)
+        print("hf_model_path: ", self.hf_model_path)
+
+        if "180b" in self.model_name and hf_auth_token == None:
+            raise ValueError(
+                """ HF auth token required for falcon-180b. Pass it using
+                --hf_auth_token flag. You can ask for the access to the model
+                here: https://huggingface.co/tiiuae/falcon-180B-chat."""
+            )
+        self.hf_auth_token = hf_auth_token
        self.max_padding_length = 100
        self.device = device
        self.precision = precision
        self.falcon_vmfb_path = falcon_vmfb_path
        self.falcon_mlir_path = falcon_mlir_path
+        self.debug = debug
        self.tokenizer = self.get_tokenizer()
-        self.shark_model = self.compile()
        self.src_model = self.get_src_model()
+        self.shark_model = self.compile()

    def get_tokenizer(self):
        tokenizer = AutoTokenizer.from_pretrained(
-            self.hf_model_path, trust_remote_code=True
+            self.hf_model_path,
+            trust_remote_code=True,
+            token=self.hf_auth_token,
        )
        tokenizer.padding_side = "left"
        tokenizer.pad_token_id = 11
@@ -92,13 +114,18 @@ class Falcon(SharkLLMBase):

    def get_src_model(self):
        print("Loading src model: ", self.model_name)
-        kwargs = {"torch_dtype": torch.float, "trust_remote_code": True}
+        kwargs = {
+            "torch_dtype": torch.float,
+            "trust_remote_code": True,
+            "token": self.hf_auth_token,
+            "device_map": "cpu" if args.device == "cpu" else "cuda:0",
+        }
        falcon_model = AutoModelForCausalLM.from_pretrained(
            self.hf_model_path, **kwargs
        )
        return falcon_model

-    def compile_falcon(self):
+    def compile(self):
        if args.use_precompiled_model:
            if not self.falcon_vmfb_path.exists():
                # Downloading VMFB from shark_tank
@@ -120,37 +147,37 @@ class Falcon(SharkLLMBase):
            if vmfb is not None:
                return vmfb

-        print(
-            f"[DEBUG] vmfb not found at {self.falcon_vmfb_path.absolute()}. Trying to work with"
-            f"[DEBUG] mlir path { self.falcon_mlir_path} {'exists' if self.falcon_mlir_path.exists() else 'does not exist'}"
-        )
+        print(f"[DEBUG] vmfb not found at {self.falcon_vmfb_path.absolute()}")
        if self.falcon_mlir_path.exists():
+            print(f"[DEBUG] mlir found at {self.falcon_mlir_path.absolute()}")
            with open(self.falcon_mlir_path, "rb") as f:
                bytecode = f.read()
        else:
            mlir_generated = False
-            # Downloading MLIR from shark_tank
-            download_public_file(
-                "gs://shark_tank/falcon/"
-                + "falcon_"
-                + args.falcon_variant_to_use
-                + "_"
-                + self.precision
-                + ".mlir",
-                self.falcon_mlir_path.absolute(),
-                single_file=True,
+            print(
+                f"[DEBUG] mlir not found at {self.falcon_mlir_path.absolute()}"
            )
-            if self.falcon_mlir_path.exists():
-                with open(self.falcon_mlir_path, "rb") as f:
-                    bytecode = f.read()
-                mlir_generated = True
-            else:
-                raise ValueError(
-                    f"MLIR not found at {self.falcon_mlir_path.absolute()}"
-                    " after downloading! Please check path and try again"
+            if args.load_mlir_from_shark_tank:
+                # Downloading MLIR from shark_tank
+                print(f"[DEBUG] Trying to download mlir from shark_tank")
+                download_public_file(
+                    "gs://shark_tank/falcon/"
+                    + "falcon_"
+                    + args.falcon_variant_to_use
+                    + "_"
+                    + self.precision
+                    + ".mlir",
+                    self.falcon_mlir_path.absolute(),
+                    single_file=True,
                )
+                if self.falcon_mlir_path.exists():
+                    print(
+                        f"[DEBUG] mlir found at {self.falcon_mlir_path.absolute()}"
+                    )
+                    mlir_generated = True

            if not mlir_generated:
+                print(f"[DEBUG] generating MLIR locally")
                compilation_input_ids = torch.randint(
                    low=1, high=10000, size=(1, 100)
                )
@@ -170,6 +197,7 @@ class Falcon(SharkLLMBase):
                    is_f16=self.precision == "fp16",
                    f16_input_mask=[False, False],
                    mlir_type="torchscript",
+                    is_gptq=self.precision == "int4",
                )
                del model
                print(f"[DEBUG] generating torch mlir")
@@ -189,35 +217,32 @@ class Falcon(SharkLLMBase):
                bytecode = bytecode_stream.getvalue()
                del module

-                print(f"[DEBUG] writing mlir to file")
-                with open(f"{self.model_name}.mlir", "wb") as f_:
-                    with redirect_stdout(f_):
-                        print(module.operation.get_asm())
+                f_ = open(self.falcon_mlir_path, "wb")
+                f_.write(bytecode)
+                print("Saved falcon mlir at ", str(self.falcon_mlir_path))
                f_.close()
+                del bytecode

        shark_module = SharkInference(
-            mlir_module=bytecode, device=self.device, mlir_dialect="linalg"
+            mlir_module=self.falcon_mlir_path,
+            device=self.device,
+            mlir_dialect="linalg",
        )
        path = shark_module.save_module(
-            self.falcon_vmfb_path.parent.absolute(),
+            self.falcon_vmfb_path,
            self.falcon_vmfb_path.stem,
            extra_args=[
-                "--iree-hal-dump-executable-sources-to=ies",
                "--iree-vm-target-truncate-unsupported-floats",
                "--iree-codegen-check-ir-before-llvm-conversion=false",
                "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
-                "--iree-spirv-index-bits=64",
            ],
+            debug=self.debug,
        )
        print("Saved falcon vmfb at ", str(path))
        shark_module.load_module(path)

        return shark_module

-    def compile(self):
-        falcon_shark_model = self.compile_falcon()
-        return falcon_shark_model
-
    def generate(self, prompt):
        model_inputs = self.tokenizer(
            prompt,
@@ -466,11 +491,26 @@ if __name__ == "__main__":
        else Path(args.falcon_vmfb_path)
    )

+    if args.precision == "int4":
+        if args.falcon_variant_to_use == "180b":
+            hf_model_path_value = "TheBloke/Falcon-180B-Chat-GPTQ"
+        else:
+            hf_model_path_value = (
+                "TheBloke/falcon-"
+                + args.falcon_variant_to_use
+                + "-instruct-GPTQ"
+            )
+    else:
+        if args.falcon_variant_to_use == "180b":
+            hf_model_path_value = "tiiuae/falcon-180B-chat"
+        else:
+            hf_model_path_value = (
+                "tiiuae/falcon-" + args.falcon_variant_to_use + "-instruct"
+            )
+
    falcon = Falcon(
-        "falcon_" + args.falcon_variant_to_use,
-        hf_model_path="tiiuae/falcon-"
-        + args.falcon_variant_to_use
-        + "-instruct",
+        model_name="falcon_" + args.falcon_variant_to_use,
+        hf_model_path=hf_model_path_value,
        device=args.device,
        precision=args.precision,
        falcon_mlir_path=falcon_mlir_path,
@@ -497,7 +537,11 @@ if __name__ == "__main__":
            prompt = input("Please enter the prompt text: ")
        print("\nPrompt Text: ", prompt)

-        res_str = falcon.generate(prompt)
+        prompt_template = f"""A helpful assistant who helps the user with any questions asked.
+        User: {prompt}
+        Assistant:"""
+
+        res_str = falcon.generate(prompt_template)
        torch.cuda.empty_cache()
        gc.collect()
        print(
--- a/apps/language_models/src/pipelines/minigpt4_pipeline.py
+++ b/apps/language_models/src/pipelines/minigpt4_pipeline.py
@@ -126,7 +126,7 @@ def is_url(input_url):
 import os
 import tempfile
 from shark.shark_inference import SharkInference
-from shark.shark_importer import import_with_fx
+from shark.shark_importer import import_with_fx, save_mlir
 import torch
 import torch_mlir
 from torch_mlir.compiler_utils import run_pipeline_with_repro_report
@@ -178,7 +178,7 @@ def load_vmfb(extended_model_name, device, mlir_dialect, extra_args=[]):


 def compile_module(
-    shark_module, extended_model_name, generate_vmfb, extra_args=[]
+    shark_module, extended_model_name, generate_vmfb, extra_args=[], debug=False,
 ):
    if generate_vmfb:
        vmfb_path = os.path.join(os.getcwd(), extended_model_name + ".vmfb")
@@ -190,7 +190,7 @@ def compile_module(
                "No vmfb found. Compiling and saving to {}".format(vmfb_path)
            )
            path = shark_module.save_module(
-                os.getcwd(), extended_model_name, extra_args
+                os.getcwd(), extended_model_name, extra_args, debug=debug
            )
            shark_module.load_module(path, extra_args=extra_args)
    else:
@@ -199,7 +199,7 @@ def compile_module(


 def compile_int_precision(
-    model, inputs, precision, device, generate_vmfb, extended_model_name
+    model, inputs, precision, device, generate_vmfb, extended_model_name, debug=False
 ):
    torchscript_module = import_with_fx(
        model,
@@ -219,7 +219,7 @@ def compile_int_precision(
    print(f"[DEBUG] converting torch to linalg")
    run_pipeline_with_repro_report(
        mlir_module,
-        "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+        "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
        description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
    )
    from contextlib import redirect_stdout
@@ -235,6 +235,12 @@ def compile_int_precision(
    mlir_module = BytesIO(mlir_module)
    bytecode = mlir_module.read()
    print(f"Elided IR written for {extended_model_name}")
+    bytecode = save_mlir(
+        bytecode,
+        model_name=extended_model_name,
+        frontend="torch",
+        dir=os.getcwd(),
+    )
    return bytecode
    shark_module = SharkInference(
        mlir_module=bytecode, device=device, mlir_dialect="tm_tensor"
@@ -251,6 +257,7 @@ def compile_int_precision(
            extended_model_name=extended_model_name,
            generate_vmfb=generate_vmfb,
            extra_args=extra_args,
+            debug=debug,
        ),
        bytecode,
    )
@@ -294,6 +301,7 @@ def shark_compile_through_fx_int(
        device,
        generate_or_load_vmfb,
        extended_model_name,
+        debug,
    )
    extra_args = [
        "--iree-hal-dump-executable-sources-to=ies",
--- a/apps/language_models/src/pipelines/stablelm_pipeline.py
+++ b/apps/language_models/src/pipelines/stablelm_pipeline.py
@@ -32,11 +32,13 @@ class SharkStableLM(SharkLLMBase):
        max_num_tokens=512,
        device="cuda",
        precision="fp32",
+        debug="False",
    ) -> None:
        super().__init__(model_name, hf_model_path, max_num_tokens)
        self.max_sequence_len = 256
        self.device = device
        self.precision = precision
+        self.debug = debug
        self.tokenizer = self.get_tokenizer()
        self.shark_model = self.compile()

@@ -111,7 +113,7 @@ class SharkStableLM(SharkLLMBase):
        shark_module.compile()

        path = shark_module.save_module(
-            vmfb_path.parent.absolute(), vmfb_path.stem
+            vmfb_path.parent.absolute(), vmfb_path.stem, debug=self.debug
        )
        print("Saved vmfb at ", str(path))

--- a/apps/language_models/utils.py
+++ b/apps/language_models/utils.py
@@ -8,7 +8,7 @@ from shark.shark_downloader import download_public_file

 # expects a Path / str as arg
 # returns None if path not found or SharkInference module
-def get_vmfb_from_path(vmfb_path, device, mlir_dialect):
+def get_vmfb_from_path(vmfb_path, device, mlir_dialect, device_id=None):
    if not isinstance(vmfb_path, Path):
        vmfb_path = Path(vmfb_path)

@@ -20,7 +20,7 @@ def get_vmfb_from_path(vmfb_path, device, mlir_dialect):
    print("Loading vmfb from: ", vmfb_path)
    print("Device from get_vmfb_from_path - ", device)
    shark_module = SharkInference(
-        None, device=device, mlir_dialect=mlir_dialect
+        None, device=device, mlir_dialect=mlir_dialect, device_idx=device_id
    )
    shark_module.load_module(vmfb_path)
    print("Successfully loaded vmfb")
@@ -28,7 +28,13 @@ def get_vmfb_from_path(vmfb_path, device, mlir_dialect):


 def get_vmfb_from_config(
-    shark_container, model, precision, device, vmfb_path, padding=None
+    shark_container,
+    model,
+    precision,
+    device,
+    vmfb_path,
+    padding=None,
+    device_id=None,
 ):
    vmfb_url = (
        f"gs://shark_tank/{shark_container}/{model}_{precision}_{device}"
@@ -37,4 +43,6 @@ def get_vmfb_from_config(
        vmfb_url = vmfb_url + f"_{padding}"
    vmfb_url = vmfb_url + ".vmfb"
    download_public_file(vmfb_url, vmfb_path.absolute(), single_file=True)
-    return get_vmfb_from_path(vmfb_path, device, "tm_tensor")
+    return get_vmfb_from_path(
+        vmfb_path, device, "tm_tensor", device_id=device_id
+    )
--- a/apps/stable_diffusion/profiling_with_iree.md
+++ b/apps/stable_diffusion/profiling_with_iree.md
@@ -7,16 +7,16 @@ Compile Commands FP32/FP16:

 ```shell
 Vulkan AMD: 
-iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
+iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna2-unknown-linux /path/to/input/mlir -o /path/to/output/vmfb

 #  add --mlir-print-debuginfo --mlir-print-op-on-diagnostic=true for debug
 #  use –iree-input-type=auto or "mhlo_legacy" or "stablehlo" for TF models

 CUDA NVIDIA:
-iree-compile --iree-input-type=none --iree-hal-target-backends=cuda --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
+iree-compile --iree-input-type=none --iree-hal-target-backends=cuda /path/to/input/mlir -o /path/to/output/vmfb

 CPU:
-iree-compile --iree-input-type=none --iree-hal-target-backends=llvm-cpu  --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
+iree-compile --iree-input-type=none --iree-hal-target-backends=llvm-cpu /path/to/input/mlir -o /path/to/output/vmfb
 ```


--- a/apps/stable_diffusion/scripts/train_lora_word.py
+++ b/apps/stable_diffusion/scripts/train_lora_word.py
@@ -34,7 +34,7 @@ from PIL import Image
 from tqdm.auto import tqdm
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 from diffusers.loaders import AttnProcsLayers
-from diffusers.models.cross_attention import LoRACrossAttnProcessor
+from diffusers.models.attention_processor import LoRAXFormersAttnProcessor

 import torch_mlir
 from torch_mlir.dynamo import make_simple_dynamo_backend
@@ -287,7 +287,7 @@ def lora_train(
                block_id = int(name[len("down_blocks.")])
                hidden_size = unet.config.block_out_channels[block_id]

-            lora_attn_procs[name] = LoRACrossAttnProcessor(
+            lora_attn_procs[name] = LoRAXFormersAttnProcessor(
                hidden_size=hidden_size,
                cross_attention_dim=cross_attention_dim,
            )
--- a/apps/stable_diffusion/shark_studio_imports.py
+++ b/apps/stable_diffusion/shark_studio_imports.py
@@ -15,8 +15,8 @@ pathex = [

 # datafiles for pyinstaller
 datas = []
-datas += collect_data_files("torch")
 datas += copy_metadata("torch")
+datas += copy_metadata("tokenizers")
 datas += copy_metadata("tqdm")
 datas += copy_metadata("regex")
 datas += copy_metadata("requests")
@@ -31,20 +31,20 @@ datas += copy_metadata("Pillow")
 datas += copy_metadata("sentencepiece")
 datas += copy_metadata("pyyaml")
 datas += copy_metadata("huggingface-hub")
+datas += collect_data_files("torch")
 datas += collect_data_files("tokenizers")
 datas += collect_data_files("tiktoken")
 datas += collect_data_files("accelerate")
 datas += collect_data_files("diffusers")
 datas += collect_data_files("transformers")
 datas += collect_data_files("pytorch_lightning")
-datas += collect_data_files("opencv_python")
 datas += collect_data_files("skimage")
 datas += collect_data_files("gradio")
 datas += collect_data_files("gradio_client")
 datas += collect_data_files("iree")
-datas += collect_data_files("google_cloud_storage")
 datas += collect_data_files("shark", include_py_files=True)
 datas += collect_data_files("timm", include_py_files=True)
+datas += collect_data_files("tqdm")
 datas += collect_data_files("tkinter")
 datas += collect_data_files("webview")
 datas += collect_data_files("sentencepiece")
@@ -52,6 +52,7 @@ datas += collect_data_files("jsonschema")
 datas += collect_data_files("jsonschema_specifications")
 datas += collect_data_files("cpuinfo")
 datas += collect_data_files("langchain")
+datas += collect_data_files("cv2")
 datas += [
    ("src/utils/resources/prompts.json", "resources"),
    ("src/utils/resources/model_db.json", "resources"),
@@ -74,7 +75,13 @@ datas += [
 hiddenimports = ["shark", "shark.shark_inference", "apps"]
 hiddenimports += [x for x in collect_submodules("skimage") if "tests" not in x]
 hiddenimports += [
-    x for x in collect_submodules("transformers") if "tests" not in x
+    x for x in collect_submodules("diffusers") if "tests" not in x
+]
+blacklist = ["tests", "convert"]
+hiddenimports += [
+    x
+    for x in collect_submodules("transformers")
+    if not any(kw in x for kw in blacklist)
 ]
 hiddenimports += [x for x in collect_submodules("iree") if "tests" not in x]
-hiddenimports += ["iree._runtime", "iree._runtime_libs"]
+hiddenimports += ["iree._runtime", "iree.compiler._mlir_libs._mlir.ir"]
--- a/apps/stable_diffusion/src/models/model_wrappers.py
+++ b/apps/stable_diffusion/src/models/model_wrappers.py
@@ -177,9 +177,11 @@ class SharkifyStableDiffusionModel:
            "unet",
            "unet512",
            "stencil_unet",
+            "stencil_unet_512",
            "vae",
            "vae_encode",
            "stencil_adaptor",
+            "stencil_adaptor_512",
        ]
        index = 0
        for model in sub_model_list:
@@ -339,7 +341,7 @@ class SharkifyStableDiffusionModel:
        )
        return shark_vae, vae_mlir

-    def get_controlled_unet(self):
+    def get_controlled_unet(self, use_large=False):
        class ControlledUnetModel(torch.nn.Module):
            def __init__(
                self,
@@ -415,6 +417,16 @@ class SharkifyStableDiffusionModel:
        is_f16 = True if self.precision == "fp16" else False

        inputs = tuple(self.inputs["unet"])
+        model_name = "stencil_unet"
+        if use_large:
+            pad = (0, 0) * (len(inputs[2].shape) - 2)
+            pad = pad + (0, 512 - inputs[2].shape[1])
+            inputs = (
+                inputs[:2]
+                + (torch.nn.functional.pad(inputs[2], pad),)
+                + inputs[3:]
+            )
+            model_name = "stencil_unet_512"
        input_mask = [
            True,
            True,
@@ -437,19 +449,19 @@ class SharkifyStableDiffusionModel:
        shark_controlled_unet, controlled_unet_mlir = compile_through_fx(
            unet,
            inputs,
-            extended_model_name=self.model_name["stencil_unet"],
+            extended_model_name=self.model_name[model_name],
            is_f16=is_f16,
            f16_input_mask=input_mask,
            use_tuned=self.use_tuned,
            extra_args=get_opt_flags("unet", precision=self.precision),
            base_model_id=self.base_model_id,
-            model_name="stencil_unet",
+            model_name=model_name,
            precision=self.precision,
            return_mlir=self.return_mlir,
        )
        return shark_controlled_unet, controlled_unet_mlir

-    def get_control_net(self):
+    def get_control_net(self, use_large=False):
        class StencilControlNetModel(torch.nn.Module):
            def __init__(
                self, model_id=self.use_stencil, low_cpu_mem_usage=False
@@ -497,17 +509,34 @@ class SharkifyStableDiffusionModel:
        is_f16 = True if self.precision == "fp16" else False

        inputs = tuple(self.inputs["stencil_adaptor"])
+        if use_large:
+            pad = (0, 0) * (len(inputs[2].shape) - 2)
+            pad = pad + (0, 512 - inputs[2].shape[1])
+            inputs = (
+                inputs[0],
+                inputs[1],
+                torch.nn.functional.pad(inputs[2], pad),
+                inputs[3],
+            )
+            save_dir = os.path.join(
+                self.sharktank_dir, self.model_name["stencil_adaptor_512"]
+            )
+        else:
+            save_dir = os.path.join(
+                self.sharktank_dir, self.model_name["stencil_adaptor"]
+            )
        input_mask = [True, True, True, True]
+        model_name = "stencil_adaptor" if use_large else "stencil_adaptor_512"
        shark_cnet, cnet_mlir = compile_through_fx(
            scnet,
            inputs,
-            extended_model_name=self.model_name["stencil_adaptor"],
+            extended_model_name=self.model_name[model_name],
            is_f16=is_f16,
            f16_input_mask=input_mask,
            use_tuned=self.use_tuned,
            extra_args=get_opt_flags("unet", precision=self.precision),
            base_model_id=self.base_model_id,
-            model_name="stencil_adaptor",
+            model_name=model_name,
            precision=self.precision,
            return_mlir=self.return_mlir,
        )
@@ -681,8 +710,11 @@ class SharkifyStableDiffusionModel:
                return self.text_encoder(input)[0]

        clip_model = CLIPText(low_cpu_mem_usage=self.low_cpu_mem_usage)
-        save_dir = os.path.join(self.sharktank_dir, self.model_name["clip"])
+        save_dir = ""
        if self.debug:
+            save_dir = os.path.join(
+                self.sharktank_dir, self.model_name["clip"]
+            )
            os.makedirs(
                save_dir,
                exist_ok=True,
@@ -748,7 +780,7 @@ class SharkifyStableDiffusionModel:
            else:
                return self.get_unet(use_large=use_large)
        else:
-            return self.get_controlled_unet()
+            return self.get_controlled_unet(use_large=use_large)

    def vae_encode(self):
        try:
@@ -847,12 +879,14 @@ class SharkifyStableDiffusionModel:
        except Exception as e:
            sys.exit(e)

-    def controlnet(self):
+    def controlnet(self, use_large=False):
        try:
            self.inputs["stencil_adaptor"] = self.get_input_info_for(
                base_models["stencil_adaptor"]
            )
-            compiled_stencil_adaptor, controlnet_mlir = self.get_control_net()
+            compiled_stencil_adaptor, controlnet_mlir = self.get_control_net(
+                use_large=use_large
+            )

            check_compilation(compiled_stencil_adaptor, "Stencil")
            if self.return_mlir:
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_img2img.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_img2img.py
@@ -84,13 +84,35 @@ class Image2ImagePipeline(StableDiffusionPipeline):
        num_inference_steps,
        strength,
        dtype,
+        resample_type,
    ):
        # Pre process image -> get image encoded -> process latents

        # TODO: process with variable HxW combos

-        # Pre process image
-        image = image.resize((width, height))
+        # Pre-process image
+        if resample_type == "Lanczos":
+            resample_type = Image.LANCZOS
+        elif resample_type == "Nearest Neighbor":
+            resample_type = Image.NEAREST
+        elif resample_type == "Bilinear":
+            resample_type = Image.BILINEAR
+        elif resample_type == "Bicubic":
+            resample_type = Image.BICUBIC
+        elif resample_type == "Adaptive":
+            resample_type = Image.ADAPTIVE
+        elif resample_type == "Antialias":
+            resample_type = Image.ANTIALIAS
+        elif resample_type == "Box":
+            resample_type = Image.BOX
+        elif resample_type == "Affine":
+            resample_type = Image.AFFINE
+        elif resample_type == "Cubic":
+            resample_type = Image.CUBIC
+        else:  # Fallback to Lanczos
+            resample_type = Image.LANCZOS
+
+        image = image.resize((width, height), resample=resample_type)
        image_arr = np.stack([np.array(i) for i in (image,)], axis=0)
        image_arr = image_arr / 255.0
        image_arr = torch.from_numpy(image_arr).permute(0, 3, 1, 2).to(dtype)
@@ -147,6 +169,7 @@ class Image2ImagePipeline(StableDiffusionPipeline):
        cpu_scheduling,
        max_embeddings_multiples,
        use_stencil,
+        resample_type,
    ):
        # prompts and negative prompts must be a list.
        if isinstance(prompts, str):
@@ -186,6 +209,7 @@ class Image2ImagePipeline(StableDiffusionPipeline):
            num_inference_steps=num_inference_steps,
            strength=strength,
            dtype=dtype,
+            resample_type=resample_type,
        )

        # Get Image latents
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_stencil.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_stencil.py
@@ -58,6 +58,7 @@ class StencilPipeline(StableDiffusionPipeline):
    ):
        super().__init__(scheduler, sd_model, import_mlir, use_lora, ondemand)
        self.controlnet = None
+        self.controlnet_512 = None

    def load_controlnet(self):
        if self.controlnet is not None:
@@ -68,6 +69,15 @@ class StencilPipeline(StableDiffusionPipeline):
        del self.controlnet
        self.controlnet = None

+    def load_controlnet_512(self):
+        if self.controlnet_512 is not None:
+            return
+        self.controlnet_512 = self.sd_model.controlnet(use_large=True)
+
+    def unload_controlnet_512(self):
+        del self.controlnet_512
+        self.controlnet_512 = None
+
    def prepare_latents(
        self,
        batch_size,
@@ -111,8 +121,12 @@ class StencilPipeline(StableDiffusionPipeline):
        latent_history = [latents]
        text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
        text_embeddings_numpy = text_embeddings.detach().numpy()
-        self.load_unet()
-        self.load_controlnet()
+        if text_embeddings.shape[1] <= self.model_max_length:
+            self.load_unet()
+            self.load_controlnet()
+        else:
+            self.load_unet_512()
+            self.load_controlnet_512()
        for i, t in tqdm(enumerate(total_timesteps)):
            step_start_time = time.time()
            timestep = torch.tensor([t]).to(dtype)
@@ -135,43 +149,82 @@ class StencilPipeline(StableDiffusionPipeline):
                ).to(dtype)
            else:
                latent_model_input_1 = latent_model_input
-            control = self.controlnet(
-                "forward",
-                (
-                    latent_model_input_1,
-                    timestep,
-                    text_embeddings,
-                    controlnet_hint,
-                ),
-                send_to_host=False,
-            )
+            if text_embeddings.shape[1] <= self.model_max_length:
+                control = self.controlnet(
+                    "forward",
+                    (
+                        latent_model_input_1,
+                        timestep,
+                        text_embeddings,
+                        controlnet_hint,
+                    ),
+                    send_to_host=False,
+                )
+            else:
+                control = self.controlnet_512(
+                    "forward",
+                    (
+                        latent_model_input_1,
+                        timestep,
+                        text_embeddings,
+                        controlnet_hint,
+                    ),
+                    send_to_host=False,
+                )
            timestep = timestep.detach().numpy()
            # Profiling Unet.
            profile_device = start_profiling(file_path="unet.rdc")
            # TODO: Pass `control` as it is to Unet. Same as TODO mentioned in model_wrappers.py.
-            noise_pred = self.unet(
-                "forward",
-                (
-                    latent_model_input,
-                    timestep,
-                    text_embeddings_numpy,
-                    guidance_scale,
-                    control[0],
-                    control[1],
-                    control[2],
-                    control[3],
-                    control[4],
-                    control[5],
-                    control[6],
-                    control[7],
-                    control[8],
-                    control[9],
-                    control[10],
-                    control[11],
-                    control[12],
-                ),
-                send_to_host=False,
-            )
+
+            if text_embeddings.shape[1] <= self.model_max_length:
+                noise_pred = self.unet(
+                    "forward",
+                    (
+                        latent_model_input,
+                        timestep,
+                        text_embeddings_numpy,
+                        guidance_scale,
+                        control[0],
+                        control[1],
+                        control[2],
+                        control[3],
+                        control[4],
+                        control[5],
+                        control[6],
+                        control[7],
+                        control[8],
+                        control[9],
+                        control[10],
+                        control[11],
+                        control[12],
+                    ),
+                    send_to_host=False,
+                )
+            else:
+                print(self.unet_512)
+                noise_pred = self.unet_512(
+                    "forward",
+                    (
+                        latent_model_input,
+                        timestep,
+                        text_embeddings_numpy,
+                        guidance_scale,
+                        control[0],
+                        control[1],
+                        control[2],
+                        control[3],
+                        control[4],
+                        control[5],
+                        control[6],
+                        control[7],
+                        control[8],
+                        control[9],
+                        control[10],
+                        control[11],
+                        control[12],
+                    ),
+                    send_to_host=False,
+                )
            end_profiling(profile_device)

            if cpu_scheduling:
@@ -191,7 +244,9 @@ class StencilPipeline(StableDiffusionPipeline):

        if self.ondemand:
            self.unload_unet()
+            self.unload_unet_512()
            self.unload_controlnet()
+            self.unload_controlnet_512()
        avg_step_time = step_time_sum / len(total_timesteps)
        self.log += f"\nAverage step time: {avg_step_time}ms/it"

@@ -218,6 +273,7 @@ class StencilPipeline(StableDiffusionPipeline):
        cpu_scheduling,
        max_embeddings_multiples,
        use_stencil,
+        resample_type,
    ):
        # Control Embedding check & conversion
        # TODO: 1. Change `num_images_per_prompt`.
--- a/apps/stable_diffusion/src/utils/sd_annotation.py
+++ b/apps/stable_diffusion/src/utils/sd_annotation.py
@@ -158,9 +158,9 @@ def load_lower_configs(base_model_id=None):
                f"{spec}.json"
            )

-    full_gs_url = config_bucket + config_name
    lowering_config_dir = os.path.join(WORKDIR, "configs", config_name)
    print("Loading lowering config file from ", lowering_config_dir)
+    full_gs_url = config_bucket + config_name
    download_public_file(full_gs_url, lowering_config_dir, True)
    return lowering_config_dir

@@ -281,13 +281,9 @@ def sd_model_annotation(mlir_model, model_name, base_model_id=None):
        if "rdna2" not in args.iree_vulkan_target_triple.split("-")[0]:
            use_winograd = True
            winograd_config_dir = load_winograd_configs()
-            winograd_model = annotate_with_winograd(
+            tuned_model = annotate_with_winograd(
                mlir_model, winograd_config_dir, model_name
            )
-            lowering_config_dir = load_lower_configs(base_model_id)
-            tuned_model = annotate_with_lower_configs(
-                winograd_model, lowering_config_dir, model_name, use_winograd
-            )
        else:
            tuned_model = mlir_model
    else:
--- a/apps/stable_diffusion/src/utils/stable_args.py
+++ b/apps/stable_diffusion/src/utils/stable_args.py
@@ -132,6 +132,57 @@ p.add_argument(
    "img2img.",
 )

+p.add_argument(
+    "--use_hiresfix",
+    type=bool,
+    default=False,
+    help="Use Hires Fix to do higher resolution images, while trying to "
+    "avoid the issues that come with it. This is accomplished by first "
+    "generating an image using txt2img, then running it through img2img.",
+)
+
+p.add_argument(
+    "--hiresfix_height",
+    type=int,
+    default=768,
+    choices=range(128, 769, 8),
+    help="The height of the Hires Fix image.",
+)
+
+p.add_argument(
+    "--hiresfix_width",
+    type=int,
+    default=768,
+    choices=range(128, 769, 8),
+    help="The width of the Hires Fix image.",
+)
+
+p.add_argument(
+    "--hiresfix_strength",
+    type=float,
+    default=0.6,
+    help="The denoising strength to apply for the Hires Fix.",
+)
+
+p.add_argument(
+    "--resample_type",
+    type=str,
+    default="Nearest Neighbor",
+    choices=[
+        "Lanczos",
+        "Nearest Neighbor",
+        "Bilinear",
+        "Bicubic",
+        "Adaptive",
+        "Antialias",
+        "Box",
+        "Affine",
+        "Cubic",
+    ],
+    help="The resample type to use when resizing an image before being run "
+    "through stable diffusion.",
+)
+
 ##############################################################################
 # Stable Diffusion Training Params
 ##############################################################################
@@ -407,6 +458,14 @@ p.add_argument(
    help="Specify your own huggingface authentication tokens for models like Llama2.",
 )

+p.add_argument(
+    "--device_allocator_heap_key",
+    type=str,
+    default="",
+    help="Specify heap key for device caching allocator."
+    "Expected form: max_allocation_size;max_allocation_capacity;max_free_allocation_count"
+    "Example: --device_allocator_heap_key='*;1gib' (will limit caching on device to 1 gigabyte)",
+)
 ##############################################################################
 # IREE - Vulkan supported flags
 ##############################################################################
@@ -519,6 +578,14 @@ p.add_argument(
    "in shark importer. Does nothing if import_mlir is false (the default).",
 )

+p.add_argument(
+    "--compile_debug",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Flag to toggle debug assert/verify flags for imported IR in the"
+    "iree-compiler. Default to false.",
+)
+
 p.add_argument(
    "--iree_constant_folding",
    default=True,
@@ -574,6 +641,13 @@ p.add_argument(
    help="Flag for enabling rest API.",
 )

+p.add_argument(
+    "--debug",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Flag for enabling debugging log in WebUI.",
+)
+
 p.add_argument(
    "--output_gallery",
    default=True,
--- a/apps/stable_diffusion/src/utils/utils.py
+++ b/apps/stable_diffusion/src/utils/utils.py
@@ -18,14 +18,14 @@ import tempfile
 import torch
 from safetensors.torch import load_file
 from shark.shark_inference import SharkInference
-from shark.shark_importer import import_with_fx
+from shark.shark_importer import import_with_fx, save_mlir
 from shark.iree_utils.vulkan_utils import (
    set_iree_vulkan_runtime_flags,
    get_vulkan_target_triple,
    get_iree_vulkan_runtime_flags,
 )
 from shark.iree_utils.metal_utils import get_metal_target_triple
-from shark.iree_utils.gpu_utils import get_cuda_sm_cc
+from shark.iree_utils.gpu_utils import get_cuda_sm_cc, get_iree_rocm_args
 from apps.stable_diffusion.src.utils.stable_args import args
 from apps.stable_diffusion.src.utils.resources import opt_flags
 from apps.stable_diffusion.src.utils.sd_annotation import sd_model_annotation
@@ -78,7 +78,7 @@ def _compile_module(shark_module, model_name, extra_args=[]):
                    )
                )
            path = shark_module.save_module(
-                os.getcwd(), model_name, extra_args
+                os.getcwd(), model_name, extra_args, debug=args.compile_debug
            )
            shark_module.load_module(path, extra_args=extra_args)
    else:
@@ -154,8 +154,8 @@ def compile_through_fx(
        f16_input_mask=f16_input_mask,
        debug=debug,
        model_name=extended_model_name,
-        save_dir=save_dir,
    )
+
    if use_tuned:
        if "vae" in extended_model_name.split("_")[0]:
            args.annotation_model = "vae"
@@ -168,6 +168,14 @@ def compile_through_fx(
            mlir_module, extended_model_name, base_model_id
        )

+    if not os.path.isdir(save_dir):
+        save_dir = ""
+
+    mlir_module = save_mlir(
+        mlir_module,
+        model_name=extended_model_name,
+        dir=save_dir,
+    )
    shark_module = SharkInference(
        mlir_module,
        device=args.device if device is None else device,
@@ -179,17 +187,22 @@ def compile_through_fx(
            mlir_module,
        )

-    del mlir_module
    gc.collect()


 def set_iree_runtime_flags():
+    # TODO: This function should be device-agnostic and piped properly
+    # to general runtime driver init.
    vulkan_runtime_flags = get_iree_vulkan_runtime_flags()
    if args.enable_rgp:
        vulkan_runtime_flags += [
            f"--enable_rgp=true",
            f"--vulkan_debug_utils=true",
        ]
+    if args.device_allocator_heap_key:
+        vulkan_runtime_flags += [
+            f"--device_allocator=caching:device_local={args.device_allocator_heap_key}",
+        ]
    set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)


@@ -470,12 +483,25 @@ def get_available_devices():
    set_iree_runtime_flags()

    available_devices = []
-    vulkan_devices = get_devices_by_name("vulkan")
+    from shark.iree_utils.vulkan_utils import (
+        get_all_vulkan_devices,
+    )
+
+    vulkaninfo_list = get_all_vulkan_devices()
+    vulkan_devices = []
+    id = 0
+    for device in vulkaninfo_list:
+        vulkan_devices.append(f"{device.strip()} => vulkan://{id}")
+        id += 1
+    if id != 0:
+        print(f"vulkan devices are available.")
    available_devices.extend(vulkan_devices)
    metal_devices = get_devices_by_name("metal")
    available_devices.extend(metal_devices)
    cuda_devices = get_devices_by_name("cuda")
    available_devices.extend(cuda_devices)
+    rocm_devices = get_devices_by_name("rocm")
+    available_devices.extend(rocm_devices)
    cpu_device = get_devices_by_name("cpu-sync")
    available_devices.extend(cpu_device)
    cpu_device = get_devices_by_name("cpu-task")
@@ -499,7 +525,10 @@ def get_opt_flags(model, precision="fp16"):
        iree_flags.append(
            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
        )
-
+    if "rocm" in args.device:
+        rocm_args = get_iree_rocm_args()
+        iree_flags.extend(rocm_args)
+        print(iree_flags)
    if args.iree_constant_folding == False:
        iree_flags.append("--iree-opt-const-expr-hoisting=False")
        iree_flags.append(
@@ -572,7 +601,7 @@ def preprocessCKPT(custom_weights, is_inpaint=False):
    )
    num_in_channels = 9 if is_inpaint else 4
    pipe = download_from_original_stable_diffusion_ckpt(
-        checkpoint_path=custom_weights,
+        checkpoint_path_or_dict=custom_weights,
        extract_ema=extract_ema,
        from_safetensors=from_safetensors,
        num_in_channels=num_in_channels,
@@ -822,6 +851,8 @@ def clear_all():
    elif os.name == "unix":
        shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"))
        shutil.rmtree(os.path.join(home, ".local/shark_tank"))
+    if args.local_tank_cache != "":
+        shutil.rmtree(args.local_tank_cache)


 def get_generated_imgs_path() -> Path:
--- a/apps/stable_diffusion/web/index.py
+++ b/apps/stable_diffusion/web/index.py
@@ -1,6 +1,7 @@
 from multiprocessing import Process, freeze_support
 import os
 import sys
+import logging

 if sys.platform == "darwin":
    # import before IREE to avoid torch-MLIR library issues
@@ -41,6 +42,8 @@ def launch_app(address):


 if __name__ == "__main__":
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
    # required to do multiprocessing in a pyinstaller freeze
    freeze_support()
    if args.api or "api" in args.ui.split(","):
@@ -153,9 +156,9 @@ if __name__ == "__main__":
        upscaler_sendto_img2img,
        upscaler_sendto_inpaint,
        upscaler_sendto_outpaint,
-        lora_train_web,
-        model_web,
-        model_config_web,
+        #  lora_train_web,
+        #  model_web,
+        #  model_config_web,
        hf_models,
        modelmanager_sendto_txt2img,
        modelmanager_sendto_img2img,
@@ -247,16 +250,16 @@ if __name__ == "__main__":
                        upscaler_status,
                    ]
                )
-            with gr.TabItem(label="Model Manager", id=6):
-                model_web.render()
-            with gr.TabItem(label="LoRA Training (Experimental)", id=7):
-                lora_train_web.render()
-            with gr.TabItem(label="Chat Bot (Experimental)", id=8):
+            #  with gr.TabItem(label="Model Manager", id=6):
+            #      model_web.render()
+            #  with gr.TabItem(label="LoRA Training (Experimental)", id=7):
+            #      lora_train_web.render()
+            with gr.TabItem(label="Chat Bot", id=8):
                stablelm_chat.render()
-            with gr.TabItem(
-                label="Generate Sharding Config (Experimental)", id=9
-            ):
-                model_config_web.render()
+            #  with gr.TabItem(
+            #      label="Generate Sharding Config (Experimental)", id=9
+            #  ):
+            #      model_config_web.render()
            with gr.TabItem(label="MultiModal (Experimental)", id=10):
                minigpt4_web.render()
            # with gr.TabItem(label="DocuChat Upload", id=11):
--- a/apps/stable_diffusion/web/ui/img2img_ui.py
+++ b/apps/stable_diffusion/web/ui/img2img_ui.py
@@ -3,6 +3,7 @@ import torch
 import time
 import gradio as gr
 import PIL
+from math import ceil
 from PIL import Image
 import base64
 from io import BytesIO
@@ -67,6 +68,7 @@ def img2img_inf(
    lora_hf_id: str,
    ondemand: bool,
    repeatable_seeds: bool,
+    resample_type: str,
 ):
    from apps.stable_diffusion.web.ui.utils import (
        get_custom_model_pathfile,
@@ -245,7 +247,7 @@ def img2img_inf(
            batch_size,
            height,
            width,
-            steps,
+            ceil(steps / strength),
            strength,
            guidance_scale,
            seeds[current_batch],
@@ -255,6 +257,7 @@ def img2img_inf(
            cpu_scheduling,
            args.max_embeddings_multiples,
            use_stencil=use_stencil,
+            resample_type=resample_type,
        )
        total_time = time.time() - start_time
        text_output = get_generation_text_info(
@@ -348,6 +351,7 @@ def img2img_api(
        lora_hf_id="",
        ondemand=False,
        repeatable_seeds=False,
+        resample_type="Lanczos",
    )

    # Converts generator type to subscriptable
@@ -432,7 +436,7 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                        lines=2,
                        elem_id="negative_prompt_box",
                    )
-
+                # TODO: make this import image prompt info if it exists
                img2img_init_image = gr.Image(
                    label="Input Image",
                    source="upload",
@@ -550,15 +554,6 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                        width = gr.Slider(
                            384, 768, value=args.width, step=8, label="Width"
                        )
-                        precision = gr.Radio(
-                            label="Precision",
-                            value=args.precision,
-                            choices=[
-                                "fp16",
-                                "fp32",
-                            ],
-                            visible=True,
-                        )
                        max_length = gr.Radio(
                            label="Max Length",
                            value=args.max_length,
@@ -581,11 +576,35 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                                step=0.01,
                                label="Denoising Strength",
                            )
+                            resample_type = gr.Dropdown(
+                                value=args.resample_type,
+                                choices=[
+                                    "Lanczos",
+                                    "Nearest Neighbor",
+                                    "Bilinear",
+                                    "Bicubic",
+                                    "Adaptive",
+                                    "Antialias",
+                                    "Box",
+                                    "Affine",
+                                    "Cubic",
+                                ],
+                                label="Resample Type",
+                            )
                        ondemand = gr.Checkbox(
                            value=args.ondemand,
                            label="Low VRAM",
                            interactive=True,
                        )
+                        precision = gr.Radio(
+                            label="Precision",
+                            value=args.precision,
+                            choices=[
+                                "fp16",
+                                "fp32",
+                            ],
+                            visible=True,
+                        )
                    with gr.Row():
                        with gr.Column(scale=3):
                            guidance_scale = gr.Slider(
@@ -695,6 +714,7 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                lora_hf_id,
                ondemand,
                repeatable_seeds,
+                resample_type,
            ],
            outputs=[img2img_gallery, std_output, img2img_status],
            show_progress="minimal" if args.progress_bar else "none",
--- a/apps/stable_diffusion/web/ui/minigpt4_ui.py
+++ b/apps/stable_diffusion/web/ui/minigpt4_ui.py
@@ -109,7 +109,7 @@ with gr.Blocks() as minigpt4_web:
    gr.Markdown(description)

    with gr.Row():
-        with gr.Column(scale=0.5):
+        with gr.Column():
            image = gr.Image(type="pil")
            upload_button = gr.Button(
                value="Upload & Start Chat",
--- a/apps/stable_diffusion/web/ui/stablelm_ui.py
+++ b/apps/stable_diffusion/web/ui/stablelm_ui.py
@@ -8,7 +8,7 @@ from transformers import (
 from apps.stable_diffusion.web.ui.utils import available_devices
 from datetime import datetime as dt
 import json
-import time
+import sys


 def user(message, history):
@@ -24,12 +24,9 @@ past_key_values = None

 model_map = {
    "llama2_7b": "meta-llama/Llama-2-7b-chat-hf",
+    "llama2_13b": "meta-llama/Llama-2-13b-chat-hf",
    "llama2_70b": "meta-llama/Llama-2-70b-chat-hf",
-    "codegen": "Salesforce/codegen25-7b-multi",
-    "vicuna1p3": "lmsys/vicuna-7b-v1.3",
    "vicuna": "TheBloke/vicuna-7B-1.1-HF",
-    "vicuna4": "TheBloke/vicuna-7B-1.1-HF",
-    "StableLM": "stabilityai/stablelm-tuned-alpha-3b",
 }

 # NOTE: Each `model_name` should have its own start message
@@ -43,6 +40,15 @@ start_message = {
        "explain why instead of answering something not correct. If you don't know the "
        "answer to a question, please don't share false information."
    ),
+    "llama2_13b": (
+        "System: You are a helpful, respectful and honest assistant. Always answer "
+        "as helpfully as possible, while being safe.  Your answers should not "
+        "include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal "
+        "content. Please ensure that your responses are socially unbiased and positive "
+        "in nature. If a question does not make any sense, or is not factually coherent, "
+        "explain why instead of answering something not correct. If you don't know the "
+        "answer to a question, please don't share false information."
+    ),
    "llama2_70b": (
        "System: You are a helpful, respectful and honest assistant. Always answer "
        "as helpfully as possible, while being safe.  Your answers should not "
@@ -52,60 +58,39 @@ start_message = {
        "explain why instead of answering something not correct. If you don't know the "
        "answer to a question, please don't share false information."
    ),
-    "StableLM": (
-        "<|SYSTEM|># StableLM Tuned (Alpha version)"
-        "\n- StableLM is a helpful and harmless open-source AI language model "
-        "developed by StabilityAI."
-        "\n- StableLM is excited to be able to help the user, but will refuse "
-        "to do anything that could be considered harmful to the user."
-        "\n- StableLM is more than just an information source, StableLM is also "
-        "able to write poetry, short stories, and make jokes."
-        "\n- StableLM will refuse to participate in anything that "
-        "could harm a human."
-    ),
    "vicuna": (
        "A chat between a curious user and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's "
        "questions.\n"
    ),
-    "vicuna4": (
-        "A chat between a curious user and an artificial intelligence assistant. "
-        "The assistant gives helpful, detailed, and polite answers to the user's "
-        "questions.\n"
-    ),
-    "vicuna1p3": (
-        "A chat between a curious user and an artificial intelligence assistant. "
-        "The assistant gives helpful, detailed, and polite answers to the user's "
-        "questions.\n"
-    ),
-    "codegen": "",
 }


 def create_prompt(model_name, history):
    system_message = start_message[model_name]

-    if model_name in [
-        "StableLM",
-        "vicuna",
-        "vicuna4",
-        "vicuna1p3",
-        "llama2_7b",
-        "llama2_70b",
-    ]:
+    if "llama2" in model_name:
+        B_INST, E_INST = "[INST]", "[/INST]"
+        B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+        conversation = "".join(
+            [f"{B_INST} {item[0]} {E_INST} {item[1]} " for item in history[1:]]
+        )
+        msg = f"{B_INST} {B_SYS} {system_message} {E_SYS} {history[0][0]} {E_INST} {history[0][1]} {conversation}"
+    elif model_name in ["vicuna"]:
        conversation = "".join(
            [
                "".join(["<|USER|>" + item[0], "<|ASSISTANT|>" + item[1]])
                for item in history
            ]
        )
+        msg = system_message + conversation
+        msg = msg.strip()
    else:
        conversation = "".join(
            ["".join([item[0], item[1]]) for item in history]
        )
-
-    msg = system_message + conversation
-    msg = msg.strip()
+        msg = system_message + conversation
+        msg = msg.strip()
    return msg


@@ -149,14 +134,16 @@ def chat(
    model,
    device,
    precision,
+    download_vmfb,
    config_file,
    cli=False,
    progress=gr.Progress(),
 ):
    global past_key_values
    global model_vmfb_key
-
    global vicuna_model
+
+    device_id = None
    model_name, model_path = list(map(str.strip, model.split("=>")))
    if "cuda" in device:
        device = "cuda"
@@ -165,124 +152,125 @@ def chat(
    elif "task" in device:
        device = "cpu-task"
    elif "vulkan" in device:
+        device_id = int(device.split("://")[1])
        device = "vulkan"
+    elif "rocm" in device:
+        device = "rocm"
    else:
        print("unrecognized device")

-    new_model_vmfb_key = f"{model_name}#{model_path}#{device}#{precision}"
-    if model_name in [
-        "vicuna",
-        "vicuna4",
-        "vicuna1p3",
-        "codegen",
-        "llama2_7b",
-        "llama2_70b",
-    ]:
-        from apps.language_models.scripts.vicuna import ShardedVicuna
-        from apps.language_models.scripts.vicuna import UnshardedVicuna
-        from apps.stable_diffusion.src import args
+    from apps.language_models.scripts.vicuna import ShardedVicuna
+    from apps.language_models.scripts.vicuna import UnshardedVicuna
+    from apps.stable_diffusion.src import args

-        if new_model_vmfb_key != model_vmfb_key:
-            model_vmfb_key = new_model_vmfb_key
-            max_toks = 128 if model_name == "codegen" else 512
-
-            # get iree flags that need to be overridden, from commandline args
-            _extra_args = []
-            # vulkan target triple
-            if args.iree_vulkan_target_triple != "":
-                _extra_args.append(
-                    f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
-                )
-
-            if model_name == "vicuna4":
-                vicuna_model = ShardedVicuna(
-                    model_name,
-                    hf_model_path=model_path,
-                    device=device,
-                    precision=precision,
-                    max_num_tokens=max_toks,
-                    compressed=True,
-                    extra_args_cmd=_extra_args,
-                )
-            else:
-                #  if config_file is None:
-                vicuna_model = UnshardedVicuna(
-                    model_name,
-                    hf_model_path=model_path,
-                    hf_auth_token=args.hf_auth_token,
-                    device=device,
-                    precision=precision,
-                    max_num_tokens=max_toks,
-                    extra_args_cmd=_extra_args,
-                )
-                #  else:
-                #      if config_file is not None:
-                #          config_file = open(config_file)
-                #          config_json = json.load(config_file)
-                #          config_file.close()
-                #      else:
-                #          config_json = get_default_config()
-                #      vicuna_model = ShardedVicuna(
-                #          model_name,
-                #          device=device,
-                #          precision=precision,
-                #          config_json=config_json,
-                #      )
-
-        prompt = create_prompt(model_name, history)
-
-        partial_text = ""
-        count = 0
-        start_time = time.time()
-        for text, msg in progress.tqdm(
-            vicuna_model.generate(prompt, cli=cli),
-            desc="generating response",
-        ):
-            count += 1
-            if "formatted" in msg:
-                history[-1][1] = text
-                end_time = time.time()
-                tokens_per_sec = count / (end_time - start_time)
-                yield history, str(
-                    format(tokens_per_sec, ".2f")
-                ) + " tokens/sec"
-            else:
-                partial_text += text + " "
-                history[-1][1] = partial_text
-                yield history, ""
-
-        return history, ""
-
-    # else Model is StableLM
-    global sharkModel
-    from apps.language_models.src.pipelines.stablelm_pipeline import (
-        SharkStableLM,
-    )
-
-    if new_model_vmfb_key != model_vmfb_key:
+    new_model_vmfb_key = f"{model_name}#{model_path}#{device}#{device_id}#{precision}#{download_vmfb}"
+    if vicuna_model is None or new_model_vmfb_key != model_vmfb_key:
        model_vmfb_key = new_model_vmfb_key
-        # max_new_tokens=512
-        shark_slm = SharkStableLM(
-            model_name
-        )  # pass elements from UI as required
+        max_toks = 128 if model_name == "codegen" else 512
+
+        # get iree flags that need to be overridden, from commandline args
+        _extra_args = []
+        # vulkan target triple
+        vulkan_target_triple = args.iree_vulkan_target_triple
+        from shark.iree_utils.vulkan_utils import (
+            get_all_vulkan_devices,
+            get_vulkan_target_triple,
+        )
+
+        if device == "vulkan":
+            vulkaninfo_list = get_all_vulkan_devices()
+            if vulkan_target_triple == "":
+                # We already have the device_id extracted via WebUI, so we directly use
+                # that to find the target triple.
+                vulkan_target_triple = get_vulkan_target_triple(
+                    vulkaninfo_list[device_id]
+                )
+            _extra_args.append(
+                f"-iree-vulkan-target-triple={vulkan_target_triple}"
+            )
+            if "rdna" in vulkan_target_triple:
+                flags_to_add = [
+                    "--iree-spirv-index-bits=64",
+                ]
+                _extra_args = _extra_args + flags_to_add
+
+            if device_id is None:
+                id = 0
+                for device in vulkaninfo_list:
+                    target_triple = get_vulkan_target_triple(
+                        vulkaninfo_list[id]
+                    )
+                    if target_triple == vulkan_target_triple:
+                        device_id = id
+                        break
+                    id += 1
+
+                assert (
+                    device_id
+                ), f"no vulkan hardware for target-triple '{vulkan_target_triple}' exists"
+
+        print(f"Will use target triple : {vulkan_target_triple}")
+
+        if model_name == "vicuna4":
+            vicuna_model = ShardedVicuna(
+                model_name,
+                hf_model_path=model_path,
+                device=device,
+                precision=precision,
+                max_num_tokens=max_toks,
+                compressed=True,
+                extra_args_cmd=_extra_args,
+            )
+        else:
+            #  if config_file is None:
+            vicuna_model = UnshardedVicuna(
+                model_name,
+                hf_model_path=model_path,
+                hf_auth_token=args.hf_auth_token,
+                device=device,
+                vulkan_target_triple=vulkan_target_triple,
+                precision=precision,
+                max_num_tokens=max_toks,
+                download_vmfb=download_vmfb,
+                load_mlir_from_shark_tank=True,
+                extra_args_cmd=_extra_args,
+                device_id=device_id,
+            )
+
+    if vicuna_model is None:
+        sys.exit("Unable to instantiate the model object, exiting.")

-    # Construct the input message string for the model by concatenating the
-    # current system message and conversation history
-    if len(curr_system_message.split()) > 160:
-        print("clearing context")
    prompt = create_prompt(model_name, history)
-    generate_kwargs = dict(prompt=prompt)
-
-    words_list = shark_slm.generate(**generate_kwargs)

    partial_text = ""
-    for new_text in words_list:
-        partial_text += new_text
-        history[-1][1] = partial_text
-        # Yield an empty string to clean up the message textbox and the updated
-        # conversation history
-        yield history
-    return words_list
+    token_count = 0
+    total_time_ms = 0.001  # In order to avoid divide by zero error
+    prefill_time = 0
+    is_first = True
+    for text, msg, exec_time in progress.tqdm(
+        vicuna_model.generate(prompt, cli=cli),
+        desc="generating response",
+    ):
+        if msg is None:
+            if is_first:
+                prefill_time = exec_time
+                is_first = False
+            else:
+                total_time_ms += exec_time
+                token_count += 1
+            partial_text += text + " "
+            history[-1][1] = partial_text
+            yield history, f"Prefill: {prefill_time:.2f}"
+        elif "formatted" in msg:
+            history[-1][1] = text
+            tokens_per_sec = (token_count / total_time_ms) * 1000
+            yield history, f"Prefill: {prefill_time:.2f} seconds\n Decode: {tokens_per_sec:.2f} tokens/sec"
+        else:
+            sys.exit(
+                "unexpected message from the vicuna generate call, exiting."
+            )
+
+    return history, ""


 def llm_chat_api(InputData: dict):
@@ -318,6 +306,7 @@ def llm_chat_api(InputData: dict):
        UnshardedVicuna,
    )

+    device_id = None
    if vicuna_model == 0:
        if "cuda" in device:
            device = "cuda"
@@ -326,6 +315,7 @@ def llm_chat_api(InputData: dict):
        elif "task" in device:
            device = "cpu-task"
        elif "vulkan" in device:
+            device_id = int(device.split("://")[1])
            device = "vulkan"
        else:
            print("unrecognized device")
@@ -336,6 +326,9 @@ def llm_chat_api(InputData: dict):
            device=device,
            precision=precision,
            max_num_tokens=max_toks,
+            download_vmfb=True,
+            load_mlir_from_shark_tank=True,
+            device_id=device_id,
        )

    # TODO: add role dict for different models
@@ -398,7 +391,7 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
        )
        model = gr.Dropdown(
            label="Select Model",
-            value=model_choices[4],
+            value=model_choices[0],
            choices=model_choices,
        )
        supported_devices = available_devices
@@ -406,27 +399,31 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
        # show cpu-task device first in list for chatbot
        supported_devices = supported_devices[-1:] + supported_devices[:-1]
        supported_devices = [x for x in supported_devices if "sync" not in x]
-        #  print(supported_devices)
-        devices = gr.Dropdown(
+        device = gr.Dropdown(
            label="Device",
            value=supported_devices[0]
            if enabled
            else "Only CUDA Supported for now",
            choices=supported_devices,
            interactive=enabled,
-            #  multiselect=True,
+            # multiselect=True,
        )
        precision = gr.Radio(
            label="Precision",
-            value="int8",
+            value="int4",
            choices=[
                "int4",
                "int8",
                "fp16",
            ],
-            visible=True,
+            visible=False,
        )
        tokens_time = gr.Textbox(label="Tokens generated per second")
+        download_vmfb = gr.Checkbox(
+            label="Download vmfb from Shark tank if available",
+            value=True,
+            interactive=True,
+        )

    with gr.Row(visible=False):
        with gr.Group():
@@ -458,19 +455,45 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
    )

    submit_event = msg.submit(
-        fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
+        fn=user,
+        inputs=[msg, chatbot],
+        outputs=[msg, chatbot],
+        show_progress=False,
+        queue=False,
    ).then(
        fn=chat,
-        inputs=[system_msg, chatbot, model, devices, precision, config_file],
+        inputs=[
+            system_msg,
+            chatbot,
+            model,
+            device,
+            precision,
+            download_vmfb,
+            config_file,
+        ],
        outputs=[chatbot, tokens_time],
+        show_progress=False,
        queue=True,
    )
    submit_click_event = submit.click(
-        fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
+        fn=user,
+        inputs=[msg, chatbot],
+        outputs=[msg, chatbot],
+        show_progress=False,
+        queue=False,
    ).then(
        fn=chat,
-        inputs=[system_msg, chatbot, model, devices, precision, config_file],
+        inputs=[
+            system_msg,
+            chatbot,
+            model,
+            device,
+            precision,
+            download_vmfb,
+            config_file,
+        ],
        outputs=[chatbot, tokens_time],
+        show_progress=False,
        queue=True,
    )
    stop.click(
--- a/apps/stable_diffusion/web/ui/txt2img_ui.py
+++ b/apps/stable_diffusion/web/ui/txt2img_ui.py
@@ -4,6 +4,7 @@ import time
 import sys
 import gradio as gr
 from PIL import Image
+from math import ceil
 import base64
 from io import BytesIO
 from fastapi.exceptions import HTTPException
@@ -26,6 +27,7 @@ from apps.stable_diffusion.src import (
    utils,
    save_output_img,
    prompt_examples,
+    Image2ImagePipeline,
 )
 from apps.stable_diffusion.src.utils import (
    get_generated_imgs_path,
@@ -62,6 +64,11 @@ def txt2img_inf(
    lora_hf_id: str,
    ondemand: bool,
    repeatable_seeds: bool,
+    use_hiresfix: bool,
+    hiresfix_height: int,
+    hiresfix_width: int,
+    hiresfix_strength: float,
+    resample_type: str,
 ):
    from apps.stable_diffusion.web.ui.utils import (
        get_custom_model_pathfile,
@@ -200,6 +207,81 @@ def txt2img_inf(
            cpu_scheduling,
            args.max_embeddings_multiples,
        )
+        # TODO: allow user to save original image
+        # TODO: add option to let user keep both pipelines loaded, and unload
+        #  either at will
+        # TODO: add custom step value slider
+        # TODO: add option to use secondary model for the img2img pass
+        if use_hiresfix is True:
+            new_config_obj = Config(
+                "img2img",
+                args.hf_model_id,
+                args.ckpt_loc,
+                args.custom_vae,
+                precision,
+                1,
+                max_length,
+                height,
+                width,
+                device,
+                use_lora=args.use_lora,
+                use_stencil="None",
+                ondemand=ondemand,
+            )
+
+            global_obj.clear_cache()
+            global_obj.set_cfg_obj(new_config_obj)
+            set_init_device_flags()
+            model_id = (
+                args.hf_model_id
+                if args.hf_model_id
+                else "stabilityai/stable-diffusion-2-1-base"
+            )
+            global_obj.set_schedulers(get_schedulers(model_id))
+            scheduler_obj = global_obj.get_scheduler(args.scheduler)
+
+            global_obj.set_sd_obj(
+                Image2ImagePipeline.from_pretrained(
+                    scheduler_obj,
+                    args.import_mlir,
+                    args.hf_model_id,
+                    args.ckpt_loc,
+                    args.custom_vae,
+                    args.precision,
+                    args.max_length,
+                    1,
+                    hiresfix_height,
+                    hiresfix_width,
+                    args.use_base_vae,
+                    args.use_tuned,
+                    low_cpu_mem_usage=args.low_cpu_mem_usage,
+                    debug=args.import_debug if args.import_mlir else False,
+                    use_lora=args.use_lora,
+                    ondemand=args.ondemand,
+                )
+            )
+
+            global_obj.set_sd_scheduler(args.scheduler)
+
+            out_imgs = global_obj.get_sd_obj().generate_images(
+                prompt,
+                negative_prompt,
+                out_imgs[0],
+                batch_size,
+                hiresfix_height,
+                hiresfix_width,
+                ceil(steps / hiresfix_strength),
+                hiresfix_strength,
+                guidance_scale,
+                seeds[current_batch],
+                args.max_length,
+                dtype,
+                args.use_base_vae,
+                cpu_scheduling,
+                args.max_embeddings_multiples,
+                use_stencil="None",
+                resample_type=resample_type,
+            )
        total_time = time.time() - start_time
        text_output = get_generation_text_info(
            seeds[: current_batch + 1], device
@@ -271,6 +353,11 @@ def txt2img_api(
        lora_hf_id="",
        ondemand=False,
        repeatable_seeds=False,
+        use_hiresfix=False,
+        hiresfix_height=512,
+        hiresfix_width=512,
+        hiresfix_strength=0.6,
+        resample_type="Nearest Neighbor",
    )

    # Convert Generator to Subscriptable
@@ -460,6 +547,49 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                            label="Low VRAM",
                            interactive=True,
                        )
+                    with gr.Group():
+                        with gr.Row():
+                            use_hiresfix = gr.Checkbox(
+                                value=args.use_hiresfix,
+                                label="Use Hires Fix",
+                                interactive=True,
+                            )
+                            resample_type = gr.Dropdown(
+                                value=args.resample_type,
+                                choices=[
+                                    "Lanczos",
+                                    "Nearest Neighbor",
+                                    "Bilinear",
+                                    "Bicubic",
+                                    "Adaptive",
+                                    "Antialias",
+                                    "Box",
+                                    "Affine",
+                                    "Cubic",
+                                ],
+                                label="Resample Type",
+                            )
+                        hiresfix_height = gr.Slider(
+                            384,
+                            768,
+                            value=args.hiresfix_height,
+                            step=8,
+                            label="Hires Fix Height",
+                        )
+                        hiresfix_width = gr.Slider(
+                            384,
+                            768,
+                            value=args.hiresfix_width,
+                            step=8,
+                            label="Hires Fix Width",
+                        )
+                        hiresfix_strength = gr.Slider(
+                            0,
+                            1,
+                            value=args.hiresfix_strength,
+                            step=0.01,
+                            label="Hires Fix Denoising Strength",
+                        )
                    with gr.Row():
                        with gr.Column(scale=3):
                            batch_count = gr.Slider(
@@ -495,16 +625,6 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                        value=available_devices[0],
                        choices=available_devices,
                    )
-                with gr.Row():
-                    random_seed = gr.Button("Randomize Seed")
-                    random_seed.click(
-                        lambda: -1,
-                        inputs=[],
-                        outputs=[seed],
-                        queue=False,
-                    )
-                    stop_batch = gr.Button("Stop Batch")
-                    stable_diffusion = gr.Button("Generate Image(s)")
                with gr.Accordion(label="Prompt Examples!", open=False):
                    ex = gr.Examples(
                        examples=prompt_examples,
@@ -530,6 +650,18 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                        show_label=False,
                    )
                    txt2img_status = gr.Textbox(visible=False)
+                with gr.Row():
+                    stable_diffusion = gr.Button("Generate Image(s)")
+                    random_seed = gr.Button("Randomize Seed")
+                    random_seed.click(
+                        lambda: -1,
+                        inputs=[],
+                        outputs=[seed],
+                        queue=False,
+                    )
+                    stop_batch = gr.Button("Stop Batch")
+                with gr.Row():
+                    blank_thing_for_row = None
                with gr.Row():
                    txt2img_sendto_img2img = gr.Button(value="SendTo Img2Img")
                    txt2img_sendto_inpaint = gr.Button(value="SendTo Inpaint")
@@ -565,6 +697,11 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                lora_hf_id,
                ondemand,
                repeatable_seeds,
+                use_hiresfix,
+                hiresfix_height,
+                hiresfix_width,
+                hiresfix_strength,
+                resample_type,
            ],
            outputs=[txt2img_gallery, std_output, txt2img_status],
            show_progress="minimal" if args.progress_bar else "none",
--- a/apps/stable_diffusion/web/ui/utils.py
+++ b/apps/stable_diffusion/web/ui/utils.py
@@ -25,7 +25,7 @@ class Config:
    device: str
    use_lora: str
    use_stencil: str
-    ondemand: str
+    ondemand: str  # should this be expecting a bool instead?


 custom_model_filetypes = (
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -40,7 +40,7 @@ cmake --build build/
 *Prepare the model*
 ```bash
 wget https://storage.googleapis.com/shark_tank/latest/resnet50_tf/resnet50_tf.mlir
-iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvmcpu-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 resnet50_tf.mlir -o resnet50_tf.vmfb
+iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvmcpu-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux  resnet50_tf.mlir -o resnet50_tf.vmfb
 ```
 *Prepare the input*

@@ -65,18 +65,18 @@ A tool for benchmarking other models is built and can be invoked with a command
 see `./build/vulkan_gui/iree-vulkan-gui --help` for an explanation on the function input. For example, stable diffusion unet can be tested with the following commands:
 ```bash
 wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/stable_diff_tf.mlir
-iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 stable_diff_tf.mlir -o stable_diff_tf.vmfb
+iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux  stable_diff_tf.mlir -o stable_diff_tf.vmfb
 ./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=2x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32
 ```
 VAE and Autoencoder are also available
 ```bash
 # VAE
 wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/vae_tf/vae.mlir
-iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 vae.mlir -o vae.vmfb
+iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux  vae.mlir -o vae.vmfb
 ./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=1x4x64x64xf32

 # CLIP Autoencoder
 wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/clip_tf/clip_autoencoder.mlir
-iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 clip_autoencoder.mlir -o clip_autoencoder.vmfb
+iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux  clip_autoencoder.mlir -o clip_autoencoder.vmfb
 ./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=1x77xi32 --function_input=1x77xi32
 ```
--- a/docs/shark_iree_profiling.md
+++ b/docs/shark_iree_profiling.md
@@ -55,7 +55,7 @@ The command line for compilation will start something like this, where the `-` n
 The `-o output_filename.vmfb` flag can be used to specify the location to save the compiled vmfb. Note that a dump of the
 dispatches that can be compiled + run in isolation can be generated by adding `--iree-hal-dump-executable-benchmarks-to=/some/directory`. Say, if they are in the `benchmarks` directory, the following compile/run commands would work for Vulkan on RDNA3.
 ```
-iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna3-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.mlir -o benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb
+iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna3-unknown-linux  benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.mlir -o benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb

 iree-benchmark-module --module=benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb --function=forward --device=vulkan
 ```
--- a/inference/CMakeLists.txt
+++ b/inference/CMakeLists.txt
@@ -1,192 +0,0 @@
-# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-cmake_minimum_required(VERSION 3.17)
-
-project(sharkbackend LANGUAGES C CXX)
-
-#
-# Options
-#
-
-option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON)
-option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)
-
-set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
-set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
-set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")
-
-if(NOT CMAKE_BUILD_TYPE)
-  set(CMAKE_BUILD_TYPE Release)
-endif()
-
-#
-# Dependencies
-#
-# FetchContent requires us to include the transitive closure of all
-# repos that we depend on so that we can override the tags.
-#
-include(FetchContent)
-
-FetchContent_Declare(
-  repo-common
-  GIT_REPOSITORY https://github.com/triton-inference-server/common.git
-  GIT_TAG ${TRITON_COMMON_REPO_TAG}
-  GIT_SHALLOW ON
-)
-FetchContent_Declare(
-  repo-core
-  GIT_REPOSITORY https://github.com/triton-inference-server/core.git
-  GIT_TAG ${TRITON_CORE_REPO_TAG}
-  GIT_SHALLOW ON
-)
-FetchContent_Declare(
-  repo-backend
-  GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
-  GIT_TAG ${TRITON_BACKEND_REPO_TAG}
-  GIT_SHALLOW ON
-)
-FetchContent_MakeAvailable(repo-common repo-core repo-backend)
-
-#
-# The backend must be built into a shared library. Use an ldscript to
-# hide all symbols except for the TRITONBACKEND API.
-#
-configure_file(src/libtriton_dshark.ldscript libtriton_dshark.ldscript COPYONLY)
-
-add_library(
-  triton-dshark-backend SHARED
-  src/dshark.cc
-  #src/dshark_driver_module.c
-)
-
-add_library(
-  SharkBackend::triton-dshark-backend ALIAS triton-dshark-backend
-)
-
-target_include_directories(
-  triton-dshark-backend
-  PRIVATE
-    ${CMAKE_CURRENT_SOURCE_DIR}/src
-)
-
-list(APPEND CMAKE_MODULE_PATH "${PROJECT_BINARY_DIR}/lib/cmake/mlir")
-
-add_subdirectory(thirdparty/srt EXCLUDE_FROM_ALL)
-
-target_link_libraries(triton-dshark-backend PRIVATE iree_base_base
-  iree_hal_hal
-  iree_hal_cuda_cuda
-  iree_hal_cuda_registration_registration
-  iree_hal_vmvx_registration_registration
-  iree_hal_dylib_registration_registration
-  iree_modules_hal_hal
-  iree_vm_vm
-  iree_vm_bytecode_module
-  iree_hal_local_loaders_system_library_loader
-  iree_hal_local_loaders_vmvx_module_loader
-  )
-
-target_compile_features(triton-dshark-backend PRIVATE cxx_std_11)
-
-
-target_link_libraries(
-  triton-dshark-backend
-  PRIVATE
-    triton-core-serverapi   # from repo-core
-    triton-core-backendapi  # from repo-core
-    triton-core-serverstub  # from repo-core
-    triton-backend-utils    # from repo-backend
-)
-
-if(WIN32)
-  set_target_properties(
-    triton-dshark-backend PROPERTIES
-    POSITION_INDEPENDENT_CODE ON
-    OUTPUT_NAME triton_dshark
-  )
-else()
-  set_target_properties(
-    triton-dshark-backend PROPERTIES
-    POSITION_INDEPENDENT_CODE ON
-    OUTPUT_NAME triton_dshark
-    LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_dshark.ldscript
-    LINK_FLAGS "-Wl,--version-script libtriton_dshark.ldscript"
-  )
-endif()
-
-
-
-#
-# Install
-#
-include(GNUInstallDirs)
-set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/SharkBackend)
-
-install(
-  TARGETS
-    triton-dshark-backend
-  EXPORT
-    triton-dshark-backend-targets
-  LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/dshark
-  RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/dshark
-)
-
-install(
-  EXPORT
-    triton-dshark-backend-targets
-  FILE
-    SharkBackendTargets.cmake
-  NAMESPACE
-    SharkBackend::
-  DESTINATION
-    ${INSTALL_CONFIGDIR}
-)
-
-include(CMakePackageConfigHelpers)
-configure_package_config_file(
-  ${CMAKE_CURRENT_LIST_DIR}/cmake/SharkBackendConfig.cmake.in
-  ${CMAKE_CURRENT_BINARY_DIR}/SharkBackendConfig.cmake
-  INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
-)
-
-install(
-  FILES
-  ${CMAKE_CURRENT_BINARY_DIR}/SharkBackendConfig.cmake
-  DESTINATION ${INSTALL_CONFIGDIR}
-)
-
-#
-# Export from build tree
-#
-export(
-  EXPORT triton-dshark-backend-targets
-  FILE ${CMAKE_CURRENT_BINARY_DIR}/SharkBackendTargets.cmake
-  NAMESPACE SharkBackend::
-)
-
-export(PACKAGE SharkBackend)
-
--- a/inference/README.md
+++ b/inference/README.md
@@ -1,100 +0,0 @@
-# SHARK Triton Backend
-
-The triton backend for shark.
-
-# Build
-
-Install SHARK
-
-```
-git clone https://github.com/nod-ai/SHARK.git
-# skip above step if dshark is already installed
-cd SHARK/inference
-```
-
-install dependancies
-
-```
-apt-get install patchelf rapidjson-dev python3-dev
-git submodule update --init
-```
-
-update the submodules of iree
-
-```
-cd thirdparty/srt
-git submodule update --init
-```
-
-Next, make the backend and install it
-
-```
-cd ../..
-mkdir build && cd build
-cmake -DTRITON_ENABLE_GPU=ON \
-DIREE_HAL_DRIVER_CUDA=ON \
-DIREE_TARGET_BACKEND_CUDA=ON \
-DMLIR_ENABLE_CUDA_RUNNER=ON \
-DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \
-DTRITON_BACKEND_REPO_TAG=r22.02 \
-DTRITON_CORE_REPO_TAG=r22.02 \
-DTRITON_COMMON_REPO_TAG=r22.02 ..
-make install
-```
-
-# Incorporating into Triton
-
-There are much more in depth explenations for the following steps in triton's documentation:
-https://github.com/triton-inference-server/server/blob/main/docs/compose.md#triton-with-unsupported-and-custom-backends
-
-There should be a file at /build/install/backends/dshark/libtriton_dshark.so.  You will need to copy it into your triton server image.  
-More documentation is in the link above, but to create the docker image, you need to run the compose.py command in the triton-backend server repo
-
-
-To first build your image, clone the tritonserver repo.
-
-```
-git clone https://github.com/triton-inference-server/server.git
-```
-
-then run `compose.py` to build a docker compose file 
-```
-cd server
-python3 compose.py --repoagent checksum --dry-run
-```
-
-Because dshark is a third party backend, you will need to manually modify the `Dockerfile.compose` to include the dshark backend.  To do this, in the Dockerfile.compose file produced, copy this line.
-the dshark backend will be located in the build folder from earlier under `/build/install/backends`
-
-```
-COPY /path/to/build/install/backends/dshark /opt/tritonserver/backends/dshark
-```
-
-Next run 
-```
-docker build -t tritonserver_custom -f Dockerfile.compose .
-docker run -it --gpus=1 --net=host -v/path/to/model_repos:/models  tritonserver_custom:latest tritonserver --model-repository=/models
-```
-
-where `path/to/model_repos` is where you are storing the models you want to run
-
-if your not using gpus, omit `--gpus=1`
-
-```
-docker run -it  --net=host -v/path/to/model_repos:/models  tritonserver_custom:latest tritonserver --model-repository=/models
-```
-
-# Setting up a model
-
-to include a model in your backend, add a directory with your model name to your model repository directory.  examples of models can be seen here: https://github.com/triton-inference-server/backend/tree/main/examples/model_repos/minimal_models
-
-make sure to adjust the input correctly in the config.pbtxt file, and save a vmfb file under 1/model.vmfb
-
-# CUDA
-
-if you're having issues with cuda, make sure your correct drivers are installed, and that `nvidia-smi` works, and also make sure that the nvcc compiler is on the path.
-
-
-
-
-
--- a/inference/cmake/SharkBackendConfig.cmake.in
+++ b/inference/cmake/SharkBackendConfig.cmake.in
@@ -1,39 +0,0 @@
-# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-include(CMakeFindDependencyMacro)
-
-get_filename_component(
-  SHARKBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
-)
-
-list(APPEND CMAKE_MODULE_PATH ${SHARKBACKEND_CMAKE_DIR})
-
-if(NOT TARGET SharkBackend::triton-dshark-backend)
-  include("${SHARKBACKEND_CMAKE_DIR}/SharkBackendTargets.cmake")
-endif()
-
-set(SHARKBACKEND_LIBRARIES SharkBackend::triton-dshark-backend)
--- a/inference/src/dshark.cc
+++ b/inference/src/dshark.cc
--- a/inference/src/libtriton_dshark.ldscript
+++ b/inference/src/libtriton_dshark.ldscript
@@ -1,30 +0,0 @@
-# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-{
-  global:
-    TRITONBACKEND_*;
-  local: *;
-};
--- a/inference/thirdparty/shark-runtime
+++ b/inference/thirdparty/shark-runtime
--- a/process_skipfiles.py
+++ b/process_skipfiles.py
@@ -6,15 +6,15 @@ from distutils.sysconfig import get_python_lib
 import fileinput
 from pathlib import Path

-# Temorary workaround for transformers/__init__.py.
-path_to_tranformers_hook = Path(
+# Temporary workaround for transformers/__init__.py.
+path_to_transformers_hook = Path(
    get_python_lib()
    + "/_pyinstaller_hooks_contrib/hooks/stdhooks/hook-transformers.py"
 )
-if path_to_tranformers_hook.is_file():
+if path_to_transformers_hook.is_file():
    pass
 else:
-    with open(path_to_tranformers_hook, "w") as f:
+    with open(path_to_transformers_hook, "w") as f:
        f.write("module_collection_mode = 'pyz+py'")

 path_to_skipfiles = Path(get_python_lib() + "/torch/_dynamo/skipfiles.py")
--- a/requirements-importer-macos.txt
+++ b/requirements-importer-macos.txt
@@ -8,19 +8,8 @@ torchvision
 tqdm

 #iree-compiler  | iree-runtime should already be installed
-#these dont work ok osx
-#iree-tools-tflite
-#iree-tools-xla
-#iree-tools-tf

-# TensorFlow and JAX.
-gin-config
-tensorflow-macos
-tensorflow-metal
-#tf-models-nightly
-#tensorflow-text-nightly
 transformers
-tensorflow-probability
 #jax[cpu]

 # tflitehub dependencies.
--- a/requirements-importer.txt
+++ b/requirements-importer.txt
@@ -9,23 +9,13 @@ tabulate
 tqdm

 #iree-compiler  | iree-runtime should already be installed
-iree-tools-tflite
 iree-tools-xla
-iree-tools-tf

-# TensorFlow and JAX.
+# Modelling and JAX.
 gin-config
-tf-nightly
-keras
-#tf-models-nightly
-#tensorflow-text-nightly
 transformers
 diffusers
-#tensorflow-probability
 #jax[cpu]
-
-
-# tflitehub dependencies.
 Pillow

 # Testing and support.
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,13 +18,14 @@ Pillow
 parameterized

 # Add transformers, diffusers and scipy since it most commonly used
+tokenizers==0.13.3
 transformers
-diffusers==0.19.3
+diffusers
 #accelerate is now required for diffusers import from ckpt.
 accelerate
 scipy
 ftfy
-gradio
+gradio==3.44.3
 altair
 omegaconf
 # 0.3.2 doesn't have binaries for arm64
@@ -46,4 +47,4 @@ pefile
 pyinstaller

 # vicuna quantization
-brevitas @ git+https://github.com/Xilinx/brevitas.git@dev
+brevitas @ git+https://github.com/Xilinx/brevitas.git@56edf56a3115d5ac04f19837b388fd7d3b1ff7ea
--- a/setup_venv.sh
+++ b/setup_venv.sh
@@ -130,14 +130,13 @@ fi

 $PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f https://download.pytorch.org/whl/nightly/cpu/

-if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
+if [[ $(uname -s) = 'Linux' && ! -z "${IMPORTER}" ]]; then
  T_VER=$($PYTHON -m pip show torch | grep Version)
-  TORCH_VERSION=${T_VER:9:17}
+  T_VER_MIN=${T_VER:14:12}
  TV_VER=$($PYTHON -m pip show torchvision | grep Version)
-  TV_VERSION=${TV_VER:9:18}
-  $PYTHON -m pip uninstall -y torch torchvision
-  $PYTHON -m pip install -U --pre --no-warn-conflicts triton
-  $PYTHON -m pip install --no-deps https://download.pytorch.org/whl/nightly/cu118/torch-${TORCH_VERSION}%2Bcu118-cp311-cp311-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu118/torchvision-${TV_VERSION}%2Bcu118-cp311-cp311-linux_x86_64.whl
+  TV_VER_MAJ=${TV_VER:9:6}
+  $PYTHON -m pip uninstall -y torchvision
+  $PYTHON -m pip install torchvision==${TV_VER_MAJ}${T_VER_MIN} --no-deps -f https://download.pytorch.org/whl/nightly/cpu/torchvision/
  if [ $? -eq 0 ];then
    echo "Successfully Installed torch + cu118."
  else
@@ -146,7 +145,7 @@ if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
 fi

 if [[ -z "${NO_BREVITAS}" ]]; then
-  $PYTHON -m pip install git+https://github.com/Xilinx/brevitas.git@llm
+  $PYTHON -m pip install git+https://github.com/Xilinx/brevitas.git@dev
 fi

 if [[ -z "${CONDA_PREFIX}" && "$SKIP_VENV" != "1" ]]; then
--- a/shark/iree_utils/_common.py
+++ b/shark/iree_utils/_common.py
@@ -52,6 +52,8 @@ def iree_device_map(device):
    )
    if len(uri_parts) == 1:
        return iree_driver
+    elif "rocm" in uri_parts:
+        return "rocm"
    else:
        return f"{iree_driver}://{uri_parts[1]}"

@@ -63,7 +65,6 @@ def get_supported_device_list():
 _IREE_DEVICE_MAP = {
    "cpu": "local-task",
    "cpu-task": "local-task",
-    "AMD-AIE": "local-task",
    "cpu-sync": "local-sync",
    "cuda": "cuda",
    "vulkan": "vulkan",
@@ -82,7 +83,6 @@ def iree_target_map(device):
 _IREE_TARGET_MAP = {
    "cpu": "llvm-cpu",
    "cpu-task": "llvm-cpu",
-    "AMD-AIE": "llvm-cpu",
    "cpu-sync": "llvm-cpu",
    "cuda": "cuda",
    "vulkan": "vulkan",
@@ -121,7 +121,10 @@ def check_device_drivers(device):
        return False
    elif device == "rocm":
        try:
-            subprocess.check_output("rocminfo")
+            if sys.platform == "win32":
+                subprocess.check_output("hipinfo")
+            else:
+                subprocess.check_output("rocminfo")
        except Exception:
            return True

--- a/shark/iree_utils/benchmark_utils.py
+++ b/shark/iree_utils/benchmark_utils.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import iree._runtime.scripts.iree_benchmark_module as benchmark_module
 from shark.iree_utils._common import run_cmd, iree_device_map
 from shark.iree_utils.cpu_utils import get_cpu_count
 import numpy as np
@@ -102,15 +101,13 @@ def build_benchmark_args_non_tensor_input(
    and whether it is training or not.
    Outputs: string that execute benchmark-module on target model.
    """
-    path = benchmark_module.__path__[0]
+    path = os.path.join(os.environ["VIRTUAL_ENV"], "bin")
    if platform.system() == "Windows":
-        benchmarker_path = os.path.join(
-            path, "..", "..", "iree-benchmark-module.exe"
-        )
+        benchmarker_path = os.path.join(path, "iree-benchmark-module.exe")
+        time_extractor = None
    else:
-        benchmarker_path = os.path.join(
-            path, "..", "..", "iree-benchmark-module"
-        )
+        benchmarker_path = os.path.join(path, "iree-benchmark-module")
+        time_extractor = "| awk 'END{{print $2 $3}}'"
    benchmark_cl = [benchmarker_path, f"--module={input_file}"]
    # TODO: The function named can be passed as one of the args.
    if function_name:
@@ -135,7 +132,7 @@ def run_benchmark_module(benchmark_cl):
    benchmark_path = benchmark_cl[0]
    assert os.path.exists(
        benchmark_path
-    ), "Cannot find benchmark_module, Please contact SHARK maintainer on discord."
+    ), "Cannot find iree_benchmark_module, Please contact SHARK maintainer on discord."
    bench_stdout, bench_stderr = run_cmd(" ".join(benchmark_cl))
    try:
        regex_split = re.compile("(\d+[.]*\d*)(  *)([a-zA-Z]+)")
--- a/shark/iree_utils/compile_utils.py
+++ b/shark/iree_utils/compile_utils.py
@@ -46,7 +46,7 @@ def get_iree_device_args(device, extra_args=[]):
    if device_uri[0] == "cpu":
        from shark.iree_utils.cpu_utils import get_iree_cpu_args

-        data_tiling_flag = ["--iree-flow-enable-data-tiling"]
+        data_tiling_flag = ["--iree-opt-data-tiling"]
        u_kernel_flag = ["--iree-llvmcpu-enable-microkernels"]
        stack_size_flag = ["--iree-llvmcpu-stack-allocation-limit=256000"]

@@ -84,7 +84,7 @@ def get_iree_frontend_args(frontend):
    elif frontend in ["tensorflow", "tf", "mhlo", "stablehlo"]:
        return [
            "--iree-llvmcpu-target-cpu-features=host",
-            "--iree-flow-demote-i64-to-i32",
+            "--iree-input-demote-i64-to-i32",
        ]
    else:
        # Frontend not found.
@@ -92,14 +92,27 @@ def get_iree_frontend_args(frontend):


 # Common args to be used given any frontend or device.
-def get_iree_common_args():
-    return [
-        "--iree-stream-resource-index-bits=64",
+def get_iree_common_args(debug=False):
+    common_args = [
        "--iree-stream-resource-max-allocation-size=4294967295",
-        "--iree-vm-target-index-bits=64",
        "--iree-vm-bytecode-module-strip-source-map=true",
        "--iree-util-zero-fill-elided-attrs",
    ]
+    if debug == True:
+        common_args.extend(
+            [
+                "--iree-opt-strip-assertions=false",
+                "--verify=true",
+            ]
+        )
+    else:
+        common_args.extend(
+            [
+                "--iree-opt-strip-assertions=true",
+                "--verify=false",
+            ]
+        )
+    return common_args


 # Args that are suitable only for certain models or groups of models.
@@ -278,14 +291,17 @@ def compile_module_to_flatbuffer(
    model_config_path,
    extra_args,
    model_name="None",
+    debug=False,
+    compile_str=False,
 ):
    # Setup Compile arguments wrt to frontends.
-    input_type = ""
+    input_type = "auto"
    args = get_iree_frontend_args(frontend)
    args += get_iree_device_args(device, extra_args)
-    args += get_iree_common_args()
+    args += get_iree_common_args(debug=debug)
    args += get_model_specific_args()
    args += extra_args
+    args += shark_args.additional_compile_args

    if frontend in ["tensorflow", "tf"]:
        input_type = "auto"
@@ -296,10 +312,7 @@ def compile_module_to_flatbuffer(
    elif frontend in ["tm_tensor"]:
        input_type = ireec.InputType.TM_TENSOR

-    # TODO: make it simpler.
-    # Compile according to the input type, else just try compiling.
-    if input_type != "":
-        # Currently for MHLO/TOSA.
+    if compile_str:
        flatbuffer_blob = ireec.compile_str(
            module,
            target_backends=[iree_target_map(device)],
@@ -307,9 +320,10 @@ def compile_module_to_flatbuffer(
            input_type=input_type,
        )
    else:
-        # Currently for Torch.
-        flatbuffer_blob = ireec.compile_str(
+        assert os.path.isfile(module)
+        flatbuffer_blob = ireec.compile_file(
            module,
+            input_type=input_type,
            target_backends=[iree_target_map(device)],
            extra_args=args,
        )
@@ -343,7 +357,8 @@ def load_vmfb_using_mmap(
    flatbuffer_blob_or_path, device: str, device_idx: int = None
 ):
    print(f"Loading module {flatbuffer_blob_or_path}...")
-
+    if "rocm" in device:
+        device = "rocm"
    with DetailLogger(timeout=2.5) as dl:
        # First get configs.
        if device_idx is not None:
@@ -388,6 +403,11 @@ def load_vmfb_using_mmap(
            dl.log(f"mmap {flatbuffer_blob_or_path}")
            ctx = ireert.SystemContext(config=config)
            dl.log(f"ireert.SystemContext created")
+            if "vulkan" in device:
+                # Vulkan pipeline creation consumes significant amount of time.
+                print(
+                    "\tCompiling Vulkan shaders. This may take a few minutes."
+                )
            ctx.add_vm_module(mmaped_vmfb)
            dl.log(f"module initialized")
            mmaped_vmfb = getattr(ctx.modules, mmaped_vmfb.name)
@@ -410,10 +430,18 @@ def get_iree_compiled_module(
    extra_args: list = [],
    device_idx: int = None,
    mmap: bool = False,
+    debug: bool = False,
+    compile_str: bool = False,
 ):
    """Given a module returns the compiled .vmfb and configs"""
    flatbuffer_blob = compile_module_to_flatbuffer(
-        module, device, frontend, model_config_path, extra_args
+        module,
+        device,
+        frontend,
+        model_config_path,
+        extra_args,
+        debug,
+        compile_str,
    )
    temp_file_to_unlink = None
    # TODO: Currently mmap=True control flow path has been switched off for mmap.
@@ -469,10 +497,18 @@ def export_iree_module_to_vmfb(
    model_config_path: str = None,
    module_name: str = None,
    extra_args: list = [],
+    debug: bool = False,
+    compile_str: bool = False,
 ):
    # Compiles the module given specs and saves it as .vmfb file.
    flatbuffer_blob = compile_module_to_flatbuffer(
-        module, device, mlir_dialect, model_config_path, extra_args
+        module,
+        device,
+        mlir_dialect,
+        model_config_path,
+        extra_args,
+        debug,
+        compile_str,
    )
    if module_name is None:
        device_name = (
@@ -480,9 +516,9 @@ def export_iree_module_to_vmfb(
        )
        module_name = f"{mlir_dialect}_{device_name}"
    filename = os.path.join(directory, module_name + ".vmfb")
-    print(f"Saved vmfb in {filename}.")
    with open(filename, "wb") as f:
        f.write(flatbuffer_blob)
+    print(f"Saved vmfb in {filename}.")
    return filename


--- a/shark/iree_utils/gpu_utils.py
+++ b/shark/iree_utils/gpu_utils.py
@@ -17,6 +17,7 @@
 import functools
 import iree.runtime as ireert
 import ctypes
+import sys
 from shark.parser import shark_args


@@ -42,21 +43,51 @@ def get_iree_gpu_args():
@functools.cache
 def get_iree_rocm_args():
    ireert.flags.FUNCTION_INPUT_VALIDATION = False
-    # get arch from rocminfo.
+    # get arch from hipinfo.
+    import os
    import re
    import subprocess

-    rocm_arch = re.match(
-        r".*(gfx\w+)",
-        subprocess.check_output(
-            "rocminfo | grep -i 'gfx'", shell=True, text=True
-        ),
-    ).group(1)
-    print(f"Found rocm arch {rocm_arch}...")
+    if sys.platform == "win32":
+        if "HIP_PATH" in os.environ:
+            rocm_path = os.environ["HIP_PATH"]
+            print(f"Found a ROCm installation at {rocm_path}.")
+        else:
+            print("Failed to find ROCM_PATH. Defaulting to C:\\AMD\\ROCM\\5.5")
+            rocm_path = "C:\\AMD\\ROCM\\5.5"
+    else:
+        if "ROCM_PATH" in os.environ:
+            rocm_path = os.environ["ROCM_PATH"]
+            print(f"Found a ROCm installation at {rocm_path}.")
+        else:
+            print("Failed to find ROCM_PATH. Defaulting to /opt/rocm")
+            rocm_path = "/opt/rocm/"
+
+    try:
+        if sys.platform == "win32":
+            rocm_arch = re.search(
+                r"gfx\d{3,}",
+                subprocess.check_output("hipinfo", shell=True, text=True),
+            ).group(0)
+        else:
+            rocm_arch = re.match(
+                r".*(gfx\w+)",
+                subprocess.check_output(
+                    "rocminfo | grep -i 'gfx'", shell=True, text=True
+                ),
+            ).group(1)
+        print(f"Found rocm arch {rocm_arch}...")
+    except:
+        print(
+            "Failed to find ROCm architecture from hipinfo / rocminfo. Defaulting to gfx1100."
+        )
+        rocm_arch = "gfx1100"
+
+    bc_path = os.path.join(rocm_path, "amdgcn", "bitcode")
    return [
        f"--iree-rocm-target-chip={rocm_arch}",
        "--iree-rocm-link-bc=true",
-        "--iree-rocm-bc-dir=/opt/rocm/amdgcn/bitcode",
+        f"--iree-rocm-bc-dir={bc_path}",
    ]


--- a/shark/iree_utils/vulkan_target_env_utils.py
+++ b/shark/iree_utils/vulkan_target_env_utils.py
@@ -57,11 +57,8 @@ def get_version(triple):
@functools.cache
 def get_extensions(triple):
    def make_ext_list(ext_list):
-        res = ""
-        for e in ext_list:
-            res += e + ", "
-        res = f"[{res[:-2]}]"
-        return res
+        res = ", ".join(ext_list)
+        return f"[{res}]"

    arch, product, os = triple
    if arch == "m1":
@@ -119,7 +116,7 @@ def get_extensions(triple):
    ]

    if get_vendor(triple) == "NVIDIA" or arch == "rdna3":
-        ext.append("VK_NV_cooperative_matrix")
+        ext.append("VK_KHR_cooperative_matrix")
    if get_vendor(triple) == ["NVIDIA", "AMD", "Intel"]:
        ext.append("VK_KHR_shader_integer_dot_product")
    return make_ext_list(ext_list=ext)
@@ -247,7 +244,7 @@ def get_vulkan_target_capabilities(triple):
        if arch == "rdna3":
            # TODO: Get scope value
            cap["coopmatCases"] = [
-                "mSize = 16, nSize = 16, kSize = 16, aType = f16, bType = f16, cType = f16, resultType = f16, scope = #vk.scope<Subgroup>"
+                "mSize = 16, nSize = 16, kSize = 16, aType = f16, bType = f16, cType = f16, resultType = f16, accSat = false, scope = #vk.scope<Subgroup>"
            ]

        if product == "rx5700xt":
@@ -468,9 +465,9 @@ def get_vulkan_target_capabilities(triple):
        cap["variablePointersStorageBuffer"] = True

        cap["coopmatCases"] = [
-            "mSize = 8, nSize = 8, kSize = 32, aType = i8, bType = i8, cType = i32, resultType = i32, scope = #vk.scope<Subgroup>",
-            "mSize = 16, nSize = 16, kSize = 16, aType = f16, bType = f16, cType = f16, resultType = f16, scope = #vk.scope<Subgroup>",
-            "mSize = 16, nSize = 16, kSize = 16, aType = f16, bType = f16, cType = f32, resultType = f32, scope = #vk.scope<Subgroup>",
+            "mSize = 8, nSize = 8, kSize = 32, aType = i8, bType = i8, cType = i32, resultType = i32, accSat = false, scope = #vk.scope<Subgroup>",
+            "mSize = 16, nSize = 16, kSize = 16, aType = f16, bType = f16, cType = f16, resultType = f16, accSat = false, scope = #vk.scope<Subgroup>",
+            "mSize = 16, nSize = 16, kSize = 16, aType = f16, bType = f16, cType = f32, resultType = f32, accSat = false, scope = #vk.scope<Subgroup>",
        ]

    elif arch == "adreno":
@@ -531,7 +528,7 @@ def get_vulkan_target_capabilities(triple):
                cmc = ""
                for case in v:
                    cmc += f"#vk.coop_matrix_props<{case}>, "
-                res += f"cooperativeMatrixPropertiesNV = [{cmc[:-2]}], "
+                res += f"cooperativeMatrixPropertiesKHR = [{cmc[:-2]}], "
            else:
                res += f"{k} = {get_comma_sep_str(v)}, "
        else:
--- a/shark/iree_utils/vulkan_utils.py
+++ b/shark/iree_utils/vulkan_utils.py
@@ -23,11 +23,19 @@ from shark.iree_utils.vulkan_target_env_utils import get_vulkan_target_env_flag
 from shark.parser import shark_args


+@functools.cache
+def get_all_vulkan_devices():
+    from iree.runtime import get_driver
+
+    driver = get_driver("vulkan")
+    device_list_src = driver.query_available_devices()
+    device_list_src.sort(key=lambda d: d["path"])
+    return [d["name"] for d in device_list_src]
+
+
@functools.cache
 def get_vulkan_device_name(device_num=0):
-    vulkaninfo_dump, _ = run_cmd("vulkaninfo")
-    vulkaninfo_dump = vulkaninfo_dump.split(linesep)
-    vulkaninfo_list = [s.strip() for s in vulkaninfo_dump if "deviceName" in s]
+    vulkaninfo_list = get_all_vulkan_devices()
    if len(vulkaninfo_list) == 0:
        raise ValueError("No device name found in VulkanInfo!")
    if len(vulkaninfo_list) > 1:
@@ -111,6 +119,8 @@ def get_vulkan_target_triple(device_name):
    # Windows: AMD Radeon RX 7900 XTX
    elif all(x in device_name for x in ("RX", "7900")):
        triple = f"rdna3-7900-{system_os}"
+    elif all(x in device_name for x in ("Radeon", "780M")):
+        triple = f"rdna3-780m-{system_os}"
    elif all(x in device_name for x in ("AMD", "PRO", "W7900")):
        triple = f"rdna3-w7900-{system_os}"
    elif any(x in device_name for x in ("AMD", "Radeon")):
@@ -178,9 +188,7 @@ def get_iree_vulkan_args(device_num=0, extra_args=[]):
@functools.cache
 def get_iree_vulkan_runtime_flags():
    vulkan_runtime_flags = [
-        f"--vulkan_large_heap_block_size={shark_args.vulkan_large_heap_block_size}",
        f"--vulkan_validation_layers={'true' if shark_args.vulkan_validation_layers else 'false'}",
-        f"--vulkan_vma_allocator={'true' if shark_args.vulkan_vma_allocator else 'false'}",
    ]
    return vulkan_runtime_flags

--- a/shark/parser.py
+++ b/shark/parser.py
@@ -14,8 +14,21 @@

 import argparse
 import os
+import shlex
 import subprocess

+
+class SplitStrToListAction(argparse.Action):
+    def __init__(self, option_strings, dest, *args, **kwargs):
+        super(SplitStrToListAction, self).__init__(
+            option_strings=option_strings, dest=dest, *args, **kwargs
+        )
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        del parser, option_string
+        setattr(namespace, self.dest, shlex.split(values[0]))
+
+
 parser = argparse.ArgumentParser(description="SHARK runner.")

 parser.add_argument(
@@ -24,6 +37,13 @@ parser.add_argument(
    default="cpu",
    help="Device on which shark_runner runs. options are cpu, cuda, and vulkan",
 )
+parser.add_argument(
+    "--additional_compile_args",
+    default=list(),
+    nargs=1,
+    action=SplitStrToListAction,
+    help="Additional arguments to pass to the compiler. These are appended as the last arguments.",
+)
 parser.add_argument(
    "--enable_tf32",
    type=bool,
@@ -133,13 +153,6 @@ parser.add_argument(
    help="Profiles vulkan device and collects the .rdc info.",
 )

-parser.add_argument(
-    "--vulkan_large_heap_block_size",
-    default="2073741824",
-    help="Flag for setting VMA preferredLargeHeapBlockSize for "
-    "vulkan device, default is 4G.",
-)
-
 parser.add_argument(
    "--vulkan_validation_layers",
    default=False,
@@ -147,11 +160,4 @@ parser.add_argument(
    help="Flag for disabling vulkan validation layers when benchmarking.",
 )

-parser.add_argument(
-    "--vulkan_vma_allocator",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="Flag for enabling / disabling Vulkan VMA Allocator.",
-)
-
 shark_args, unknown = parser.parse_known_args()
--- a/shark/shark_benchmark_runner.py
+++ b/shark/shark_benchmark_runner.py
@@ -84,6 +84,13 @@ class SharkBenchmarkRunner(SharkRunner):
        self.extra_args = extra_args
        self.import_args = {}
        self.temp_file_to_unlink = None
+        if not os.path.isfile(mlir_module):
+            print(
+                "Warning: Initializing SharkRunner with a mlir string/bytecode object will duplicate the model in RAM at compile time. To avoid this, initialize SharkInference with a path to a MLIR module on your hard disk instead."
+            )
+            self.compile_str = True
+        else:
+            self.compile_str = False
        SharkRunner.__init__(
            self,
            mlir_module,
@@ -98,6 +105,7 @@ class SharkBenchmarkRunner(SharkRunner):
            ".",
            self.mlir_dialect,
            extra_args=self.extra_args,
+            compile_str=self.compile_str,
        )
        params = load_flatbuffer(
            self.vmfb_file,
--- a/shark/shark_compile.py
+++ b/shark/shark_compile.py
@@ -1,7 +1,7 @@
 import os
 import tempfile
 from shark.shark_inference import SharkInference
-from shark.shark_importer import import_with_fx
+from shark.shark_importer import import_with_fx, save_mlir
 import torch
 import torch_mlir
 from torch_mlir.compiler_utils import run_pipeline_with_repro_report
@@ -115,7 +115,7 @@ def compile_int_precision(
    print(f"[DEBUG] converting torch to linalg")
    run_pipeline_with_repro_report(
        mlir_module,
-        "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+        "builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
        description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
    )
    from contextlib import redirect_stdout
@@ -130,10 +130,17 @@ def compile_int_precision(
    mlir_module = mlir_module.encode("UTF-8")
    mlir_module = BytesIO(mlir_module)
    bytecode = mlir_module.read()
+    bytecode_path = os.path.join(
+        os.getcwd(), f"{extended_model_name}_linalg.mlirbc"
+    )
+    with open(bytecode_path, "wb") as f:
+        f.write(bytecode)
+    del bytecode
+    del mlir_module
    print(f"Elided IR written for {extended_model_name}")
-    return bytecode
+    return bytecode_path
    shark_module = SharkInference(
-        mlir_module=bytecode, device=device, mlir_dialect="tm_tensor"
+        mlir_module=bytecode_path, device=device, mlir_dialect="tm_tensor"
    )
    extra_args = [
        "--iree-hal-dump-executable-sources-to=ies",
@@ -148,7 +155,7 @@ def compile_int_precision(
            generate_vmfb=generate_vmfb,
            extra_args=extra_args,
        ),
-        bytecode,
+        bytecode_path,
    )


@@ -201,7 +208,7 @@ def shark_compile_through_fx(
        ]
    else:
        (
-            mlir_module,
+            bytecode,
            _,
        ) = import_with_fx(
            model=model,
@@ -212,6 +219,11 @@ def shark_compile_through_fx(
            model_name=extended_model_name,
            save_dir=save_dir,
        )
+        mlir_module = save_mlir(
+            mlir_module=bytecode,
+            model_name=extended_model_name,
+            mlir_dialect=mlir_dialect,
+        )

    shark_module = SharkInference(
        mlir_module,
--- a/shark/shark_downloader.py
+++ b/shark/shark_downloader.py
@@ -275,11 +275,11 @@ def download_model(
    model_dir = os.path.join(WORKDIR, model_dir_name)
    tuned_str = "" if tuned is None else "_" + tuned
    suffix = f"{dyn_str}_{frontend}{tuned_str}.mlir"
-    filename = os.path.join(model_dir, model_name + suffix)
+    mlir_filename = os.path.join(model_dir, model_name + suffix)
    print(
-        f"Verifying that model artifacts were downloaded successfully to {filename}..."
+        f"Verifying that model artifacts were downloaded successfully to {mlir_filename}..."
    )
-    if not os.path.exists(filename):
+    if not os.path.exists(mlir_filename):
        from tank.generate_sharktank import gen_shark_files

        print(
@@ -287,13 +287,11 @@ def download_model(
        )
        gen_shark_files(model_name, frontend, WORKDIR, import_args)

-    assert os.path.exists(filename), f"MLIR not found at {filename}"
-    with open(filename, mode="rb") as f:
-        mlir_file = f.read()
+    assert os.path.exists(mlir_filename), f"MLIR not found at {mlir_filename}"
    function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
    inputs = np.load(os.path.join(model_dir, "inputs.npz"))
    golden_out = np.load(os.path.join(model_dir, "golden_out.npz"))

    inputs_tuple = tuple([inputs[key] for key in inputs])
    golden_out_tuple = tuple([golden_out[key] for key in golden_out])
-    return mlir_file, function_name, inputs_tuple, golden_out_tuple
+    return mlir_filename, function_name, inputs_tuple, golden_out_tuple
--- a/shark/shark_eager/shark_eager.py
+++ b/shark/shark_eager/shark_eager.py
@@ -1,6 +1,6 @@
 from typing import Any, Dict, List, Tuple
 from collections import defaultdict
-from shark.shark_importer import import_with_fx
+from shark.shark_importer import import_with_fx, save_mlir
 import torchvision.models as models
 import copy
 import io
@@ -20,10 +20,16 @@ def shark_backend(fx_g: torch.fx.GraphModule, inputs, device: str = "cpu"):
    bytecode_stream = io.BytesIO()
    mlir_module.operation.write_bytecode(bytecode_stream)
    bytecode = bytecode_stream.getvalue()
+    bytecode_path = save_mlir(
+        bytecode,
+        model_name="shark_eager_module",
+        frontend="torch",
+        mlir_dialect="tm_tensor",
+    )
    from shark.shark_inference import SharkInference

    shark_module = SharkInference(
-        mlir_module=bytecode,
+        mlir_module=bytecode_path,
        device=device,
        mlir_dialect="tm_tensor",
    )
--- a/shark/shark_generate_model_config.py
+++ b/shark/shark_generate_model_config.py
@@ -3,8 +3,8 @@ import json
 import numpy as np

 import torch_mlir
-from iree.compiler import compile_str
-from shark.shark_importer import import_with_fx, get_f16_inputs
+from iree.compiler import compile_file
+from shark.shark_importer import import_with_fx, get_f16_inputs, save_mlir


 class GenerateConfigFile:
@@ -54,9 +54,15 @@ class GenerateConfigFile:
            verbose=False,
        )
        module = module.operation.get_asm(large_elements_limit=4)
+        module_file = save_mlir(
+            module,
+            model_name="module_pre_split",
+            frontend="torch",
+            mlir_dialect="linalg",
+        )
        compiled_module_str = str(
-            compile_str(
-                str(module),
+            compile_file(
+                module_file,
                target_backends=[backend],
                extra_args=[
                    "--compile-to=flow",
@@ -138,7 +144,7 @@ if __name__ == "__main__":
    firstVicunaCompileInput = (compilation_input_ids,)
    from apps.language_models.src.model_wrappers.vicuna_model import (
        FirstVicuna,
-        SecondVicuna,
+        SecondVicuna7B,
        CombinedModel,
    )

--- a/shark/shark_importer.py
+++ b/shark/shark_importer.py
@@ -451,6 +451,65 @@ def transform_fx(fx_g, quantized=False):
    fx_g.graph.lint()


+def gptq_transforms(fx_g):
+    import torch
+
+    for node in fx_g.graph.nodes:
+        if node.op == "call_function":
+            if node.target in [
+                torch.ops.aten.arange,
+                torch.ops.aten.empty,
+                torch.ops.aten.ones,
+                torch.ops.aten._to_copy,
+            ]:
+                if node.kwargs.get("device") == torch.device(device="cuda:0"):
+                    updated_kwargs = node.kwargs.copy()
+                    updated_kwargs["device"] = torch.device(device="cpu")
+                    node.kwargs = updated_kwargs
+
+            if node.target in [
+                torch.ops.aten._to_copy,
+            ]:
+                if node.kwargs.get("dtype") == torch.bfloat16:
+                    updated_kwargs = node.kwargs.copy()
+                    updated_kwargs["dtype"] = torch.float16
+                    node.kwargs = updated_kwargs
+
+            # Inputs of aten.native_layer_norm should be upcasted to fp32.
+            if node.target in [torch.ops.aten.native_layer_norm]:
+                with fx_g.graph.inserting_before(node):
+                    new_node_arg0 = fx_g.graph.call_function(
+                        torch.ops.prims.convert_element_type,
+                        args=(node.args[0], torch.float32),
+                        kwargs={},
+                    )
+                    node.args = (
+                        new_node_arg0,
+                        node.args[1],
+                        node.args[2],
+                        node.args[3],
+                        node.args[4],
+                    )
+
+            # Downcasting the result of native_layer_norm back to fp16.
+            if node.name.startswith("getitem"):
+                with fx_g.graph.inserting_before(node):
+                    if node.args[0].target in [
+                        torch.ops.aten.native_layer_norm
+                    ]:
+                        new_node = fx_g.graph.call_function(
+                            torch.ops.aten._to_copy,
+                            args=(node,),
+                            kwargs={"dtype": torch.float32},
+                        )
+                        node.append(new_node)
+                        node.replace_all_uses_with(new_node)
+                        new_node.args = (node,)
+                        new_node.kwargs = {"dtype": torch.float32}
+
+    fx_g.graph.lint()
+
+
 # Doesn't replace the None type.
 def change_fx_graph_return_to_tuple(fx_g):
    for node in fx_g.graph.nodes:
@@ -504,27 +563,12 @@ def import_with_fx(
    is_dynamic=False,
    tracing_required=False,
    precision="fp32",
+    is_gptq=False,
 ):
    import torch
    from torch.fx.experimental.proxy_tensor import make_fx
    from torch._decomp import get_decompositions
    from typing import List
-    from brevitas_examples.llm.llm_quant.export import (
-        block_quant_layer_level_manager,
-    )
-    from brevitas_examples.llm.llm_quant.export import (
-        brevitas_layer_export_mode,
-    )
-    from brevitas_examples.llm.llm_quant.sharded_mlir_group_export import (
-        LinearWeightBlockQuantHandlerFwd,
-    )
-    from brevitas_examples.llm.llm_quant.export import replace_call_fn_target
-    from brevitas_examples.llm.llm_quant.sharded_mlir_group_export import (
-        matmul_rhs_group_quant_placeholder,
-    )
-    from brevitas.backport.fx.experimental.proxy_tensor import (
-        make_fx as brevitas_make_fx,
-    )

    golden_values = None
    if debug:
@@ -596,8 +640,30 @@ def import_with_fx(
        torch.ops.aten.native_layer_norm,
        torch.ops.aten.masked_fill.Tensor,
        torch.ops.aten.masked_fill.Scalar,
+        torch.ops.aten._scaled_dot_product_flash_attention.default,
+        torch.ops.aten.index_add,
+        torch.ops.aten.index_add_,
    ]
-    if precision in ["int4", "int8"]:
+    if precision in ["int4", "int8"] and not is_gptq:
+        from brevitas_examples.llm.llm_quant.export import (
+            block_quant_layer_level_manager,
+        )
+        from brevitas_examples.llm.llm_quant.export import (
+            brevitas_layer_export_mode,
+        )
+        from brevitas_examples.llm.llm_quant.sharded_mlir_group_export import (
+            LinearWeightBlockQuantHandlerFwd,
+        )
+        from brevitas_examples.llm.llm_quant.export import (
+            replace_call_fn_target,
+        )
+        from brevitas_examples.llm.llm_quant.sharded_mlir_group_export import (
+            matmul_rhs_group_quant_placeholder,
+        )
+        from brevitas.backport.fx.experimental.proxy_tensor import (
+            make_fx as brevitas_make_fx,
+        )
+
        export_context_manager = brevitas_layer_export_mode
        export_class = block_quant_layer_level_manager(
            export_handlers=[LinearWeightBlockQuantHandlerFwd]
@@ -647,6 +713,10 @@ def import_with_fx(
        add_upcast(fx_g)
        fx_g.recompile()

+    if is_gptq:
+        gptq_transforms(fx_g)
+        fx_g.recompile()
+
    if mlir_type == "fx":
        return fx_g

@@ -677,5 +747,27 @@ def import_with_fx(
        )
        return mlir_module, func_name

-    mlir_module, func_name = mlir_importer.import_mlir()
+    mlir_module, func_name = mlir_importer.import_mlir(mlir_type=mlir_type)
    return mlir_module, func_name
+
+
+# Saves a .mlir module python object to the directory 'dir' with 'model_name' and returns a path to the saved file.
+def save_mlir(
+    mlir_module,
+    model_name,
+    mlir_dialect="linalg",
+    frontend="torch",
+    dir=tempfile.gettempdir(),
+):
+    model_name_mlir = (
+        model_name + "_" + frontend + "_" + mlir_dialect + ".mlir"
+    )
+    if dir == "":
+        dir = tempfile.gettempdir()
+    mlir_path = os.path.join(dir, model_name_mlir)
+    print(f"saving {model_name_mlir} to {dir}")
+    if frontend == "torch":
+        with open(mlir_path, "wb") as mlir_file:
+            mlir_file.write(mlir_module)
+
+    return mlir_path
--- a/shark/shark_inference.py
+++ b/shark/shark_inference.py
@@ -39,7 +39,7 @@ class SharkInference:
    Attributes
    ----------
    mlir_module : str
-        mlir_module represented in string; modules from torch-mlir are serialized in bytecode format.
+        mlir_module or path represented in string; modules from torch-mlir are serialized in bytecode format.
    device : str
        device to execute the mlir_module on.
        currently supports cpu, cuda, vulkan, and metal backends.
@@ -65,7 +65,7 @@ class SharkInference:

    def __init__(
        self,
-        mlir_module: bytes,
+        mlir_module,
        device: str = "none",
        mlir_dialect: str = "linalg",
        is_benchmark: bool = False,
@@ -75,6 +75,14 @@ class SharkInference:
        mmap: bool = True,
    ):
        self.mlir_module = mlir_module
+        if mlir_module is not None:
+            if mlir_module and not os.path.isfile(mlir_module):
+                print(
+                    "Warning: Initializing SharkInference with a mlir string/bytecode object will duplicate the model in RAM at compile time. To avoid this, initialize SharkInference with a path to a MLIR module on your hard disk instead."
+                )
+                self.compile_str = True
+            else:
+                self.compile_str = False
        self.device = shark_args.device if device == "none" else device
        self.mlir_dialect = mlir_dialect
        self.is_benchmark = is_benchmark
@@ -192,7 +200,9 @@ class SharkInference:

    # TODO: Instead of passing directory and having names decided by the module
    # , user may want to save the module with manual names.
-    def save_module(self, dir=os.getcwd(), module_name=None, extra_args=[]):
+    def save_module(
+        self, dir=os.getcwd(), module_name=None, extra_args=[], debug=False
+    ):
        return export_iree_module_to_vmfb(
            self.mlir_module,
            self.device,
@@ -200,6 +210,8 @@ class SharkInference:
            self.mlir_dialect,
            module_name=module_name,
            extra_args=extra_args,
+            debug=debug,
+            compile_str=self.compile_str,
        )

    # load and return the module.
--- a/shark/shark_runner.py
+++ b/shark/shark_runner.py
@@ -45,7 +45,7 @@ class SharkRunner:
    Attributes
    ----------
    mlir_module : str
-        mlir_module represented in string.
+        mlir_module path, string, or bytecode.
    device : str
        device to execute the mlir_module on.
        currently supports cpu, cuda, vulkan, and metal backends.
@@ -74,6 +74,14 @@ class SharkRunner:
        device_idx: int = None,
    ):
        self.mlir_module = mlir_module
+        if self.mlir_module is not None:
+            if not os.path.isfile(mlir_module):
+                print(
+                    "Warning: Initializing SharkRunner with a mlir string/bytecode object will duplicate the model in RAM at compile time. To avoid this, initialize SharkInference with a path to a MLIR module on your hard disk instead."
+                )
+                self.compile_str = True
+            else:
+                self.compile_str = False
        self.device = shark_args.device if device == "none" else device
        self.mlir_dialect = mlir_dialect
        self.extra_args = extra_args
@@ -91,6 +99,7 @@ class SharkRunner:
                self.mlir_dialect,
                extra_args=self.extra_args,
                device_idx=self.device_idx,
+                compile_str=self.compile_str,
            )
            self.iree_compilation_module = params["vmfb"]
            self.iree_config = params["config"]
--- a/shark/shark_trainer.py
+++ b/shark/shark_trainer.py
@@ -15,7 +15,7 @@
 from shark.parser import shark_args
 from shark.shark_runner import SharkRunner
 from shark.backward_makefx import MakeFxModule
-from shark.shark_importer import import_with_fx
+from shark.shark_importer import import_with_fx, save_mlir
 import numpy as np
 from tqdm import tqdm
 import sys
@@ -69,7 +69,7 @@ class SharkTrainer:
            self.frontend = frontend

    # Training function is needed in the case of torch_fn.
-    def compile(self, training_fn=None, extra_args=[]):
+    def compile(self, training_fn=None, mlir_type="linalg", extra_args=[]):
        if self.frontend in ["torch", "pytorch"]:
            packed_inputs = (
                dict(self.model.named_parameters()),
@@ -77,7 +77,18 @@ class SharkTrainer:
                tuple(self.input),
            )
            mlir_module, func_name = import_with_fx(
-                training_fn, packed_inputs, False, [], training=True
+                training_fn,
+                packed_inputs,
+                False,
+                [],
+                training=True,
+                mlir_type=mlir_type,
+            )
+            mlir_module = save_mlir(
+                mlir_module,
+                model_name="shark_model",
+                frontend="torch",
+                mlir_dialect=mlir_type,
            )
            self.shark_runner = SharkRunner(
                mlir_module,
--- a/tank/all_models.csv
+++ b/tank/all_models.csv
@@ -1,24 +1,6 @@
-resnet50,stablehlo,tf,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
-albert-base-v2,stablehlo,tf,1e-2,1e-2,default,None,False,False,False,"",""
-roberta-base,stablehlo,tf,1e-02,1e-3,default,nhcw-nhwc,True,True,True,"","macos"
-bert-base-uncased,stablehlo,tf,1e-2,1e-3,default,None,False,False,False,"","enabled_windows"
-camembert-base,stablehlo,tf,1e-2,1e-3,default,None,True,True,True,"",""
-dbmdz/convbert-base-turkish-cased,stablehlo,tf,1e-2,1e-3,default,nhcw-nhwc,True,True,False,"https://github.com/iree-org/iree/issues/9971",""
-distilbert-base-uncased,stablehlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
-facebook/convnext-tiny-224,stablehlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,True,True,False,"https://github.com/nod-ai/SHARK/issues/311 & https://github.com/nod-ai/SHARK/issues/342","macos"
-funnel-transformer/small,stablehlo,tf,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/201",""
-google/electra-small-discriminator,stablehlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
-google/mobilebert-uncased,stablehlo,tf,1e-2,1e-3,default,None,True,False,False,"Fails during iree-compile","macos"
-google/vit-base-patch16-224,stablehlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,False,False,False,"",""
-microsoft/MiniLM-L12-H384-uncased,stablehlo,tf,1e-2,1e-3,tf_hf,None,True,False,False,"Fails during iree-compile.",""
-microsoft/layoutlm-base-uncased,stablehlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
-microsoft/mpnet-base,stablehlo,tf,1e-2,1e-2,default,None,True,True,True,"",""
-alexnet,linalg,torch,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/879",""
-bert-base-cased,linalg,torch,1e-2,1e-3,default,None,False,True,False,"",""
 bert-base-uncased,linalg,torch,1e-2,1e-3,default,None,False,True,False,"",""
 bert-base-uncased_fp16,linalg,torch,1e-1,1e-1,default,None,True,True,True,"",""
 bert-large-uncased,linalg,torch,1e-2,1e-3,default,None,False,True,False,"",""
-bert-large-uncased,stablehlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
 facebook/deit-small-distilled-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"Fails during iree-compile.",""
 google/vit-base-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/311",""
 microsoft/beit-base-patch16-224-pt22k-ft22k,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/390","macos"
@@ -32,14 +14,8 @@ resnet50,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
 resnet50_fp16,linalg,torch,1e-2,1e-2,default,nhcw-nhwc/img2col,True,True,True,"Numerics issues, awaiting cuda-independent fp16 integration",""
 squeezenet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
 wide_resnet50_2,linalg,torch,1e-2,1e-3,default,nhcw-nhwc/img2col,True,False,False,"","macos"
-efficientnet-v2-s,stablehlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
 mnasnet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,True,True,True,"","macos"
 efficientnet_b0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,True,True,True,"https://github.com/nod-ai/SHARK/issues/1487","macos"
 efficientnet_b7,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,True,True,True,"https://github.com/nod-ai/SHARK/issues/1487","macos"
-efficientnet_b0,stablehlo,tf,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"",""
-efficientnet_b7,stablehlo,tf,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"Fails on MacOS builder, VK device lost","macos"
-gpt2,stablehlo,tf,1e-2,1e-3,default,None,True,False,False,"","macos"
 t5-base,linalg,torch,1e-2,1e-3,default,None,True,True,True,"Inputs for seq2seq models in torch currently unsupported.","macos"
-t5-base,stablehlo,tf,1e-2,1e-3,default,None,False,False,False,"","macos"
 t5-large,linalg,torch,1e-2,1e-3,default,None,True,True,True,"Inputs for seq2seq models in torch currently unsupported","macos"
-t5-large,stablehlo,tf,1e-2,1e-3,default,None,False,False,False,"","macos"
--- a/tank/examples/bert_tf/bert_large_run.py
+++ b/tank/examples/bert_tf/bert_large_run.py
@@ -85,8 +85,6 @@ if __name__ == "__main__":
    args = [
        "--iree-llvmcpu-target-cpu-features=host",
        "--iree-mhlo-demote-i64-to-i32=false",
-        "--iree-stream-resource-index-bits=64",
-        "--iree-vm-target-index-bits=64",
    ]
    backend_config = "dylib"
    # backend = "cuda"
--- a/tank/examples/opt/README.md
+++ b/tank/examples/opt/README.md
@@ -1,3 +1,26 @@
-# Running Different OPT Variants
+# Run OPT for sentence completion through SHARK

-To run different sizes of OPT, change the string `OPT_MODEL` string in `opt_torch_test.py`. The default is 350m parameters. 66b cases also exist in the file, simply uncomment the test cases.
+From base SHARK directory, follow instructions to set up a virtual environment with SHARK. (`./setup_venv.sh` or `./setup_venv.ps1`)
+Then, you may run opt_causallm.py to get a very simple sentence completion application running through SHARK
+```
+python opt_causallm.py
+```
+
+# Run OPT performance comparison on SHARK vs. PyTorch
+
+```
+python opt_perf_comparison.py --max-seq-len=512 --model-name=facebook/opt-1.3b \
+        --platform=shark
+```
+Any OPT model from huggingface should work with this script, and you can choose between `--platform=shark` or `--platform=huggingface` to generate benchmarks of OPT inference on SHARK / PyTorch. 
+
+# Run a small suite of OPT models through the benchmark script
+
+```
+python opt_perf_comparison_batch.py
+```
+This script will run benchmarks from a suite of OPT configurations:
+- Sequence Lengths: 32, 128, 256, 512
+- Parameter Counts: 125m, 350m, 1.3b
+
+note: Most of these scripts are written for use on CPU, as perf comparisons against pytorch can be problematic across platforms otherwise.
--- a/tank/examples/opt/opt_causallm.py
+++ b/tank/examples/opt/opt_causallm.py
@@ -36,9 +36,7 @@ def create_module(model_name, tokenizer, device):

    mlir_path = f"./{OPT_FS_NAME}_causallm_{MAX_SEQUENCE_LENGTH}_torch.mlir"
    if os.path.isfile(mlir_path):
-        with open(mlir_path, "r") as f:
-            model_mlir = f.read()
-        print(f"Loaded .mlir from {mlir_path}")
+        print(f"Found .mlir from {mlir_path}")
    else:
        (model_mlir, func_name) = import_with_fx(
            model=opt_model,
@@ -50,16 +48,17 @@ def create_module(model_name, tokenizer, device):
        with open(mlir_path, "w") as f:
            f.write(model_mlir)
        print(f"Saved mlir at {mlir_path}")
+        del model_mlir

    shark_module = SharkInference(
-        model_mlir,
+        mlir_path,
        device=device,
        mlir_dialect="tm_tensor",
        is_benchmark=False,
    )

    vmfb_name = f"{OPT_FS_NAME}_causallm_{MAX_SEQUENCE_LENGTH}_torch_{device}"
-    shark_module.save_module(module_name=vmfb_name)
+    shark_module.save_module(module_name=vmfb_name, debug=False)
    vmfb_path = vmfb_name + ".vmfb"
    return vmfb_path

--- a/tank/examples/opt/opt_causallm_torch_test.py
+++ b/tank/examples/opt/opt_causallm_torch_test.py
@@ -6,7 +6,7 @@ import numpy as np
 from shark_opt_wrapper import OPTForCausalLMModel
 from shark.iree_utils._common import check_device_drivers, device_driver_info
 from shark.shark_inference import SharkInference
-from shark.shark_importer import import_with_fx
+from shark.shark_importer import import_with_fx, save_mlir
 from transformers import AutoTokenizer, OPTForCausalLM

 OPT_MODEL = "facebook/opt-1.3b"
@@ -57,9 +57,10 @@ class OPTModuleTester:
        with open(mlir_path, "w") as f:
            f.write(mlir_module)
        print(f"Saved mlir at {mlir_path}")
+        del mlir_module

        shark_module = SharkInference(
-            mlir_module,
+            mlir_path,
            device=device,
            mlir_dialect="tm_tensor",
            is_benchmark=self.benchmark,
--- a/tank/examples/opt/opt_perf_comparison.py
+++ b/tank/examples/opt/opt_perf_comparison.py
@@ -1,18 +1,45 @@
+"""
+Script for comparing OPT model performance between SHARK and Huggingface
+PyTorch.
+
+Usage Example:
+
+python opt_perf_comparison.py --max-seq-len=32 --model-name=facebook/opt-125m \
+        --platform=shark
+
+python opt_perf_comparison.py --max-seq-len=512 --model-name=facebook/opt-1.3b \
+        --platform=shark
+
+See parse_args() below for command line argument usage.
+"""
+
+import argparse
 import collections
 import json
-import time
 import os
+import psutil
+import time
+from typing import Tuple

 from shark.shark_inference import SharkInference
 from shark.shark_importer import import_with_fx
 from transformers import AutoTokenizer, OPTForCausalLM
 from shark_opt_wrapper import OPTForCausalLMModel

-MODEL_NAME = "facebook/opt-1.3b"
-OPT_MODELNAME = "opt-1.3b"
-OPT_FS_NAME = "opt_1-3b"
-MAX_SEQUENCE_LENGTH = 512
 DEVICE = "cpu"
+PLATFORM_SHARK = "shark"
+PLATFORM_HUGGINGFACE = "huggingface"
+
+# Dict keys for reports.
+REPORT_PLATFORM = "platform"
+REPORT_MODEL_NAME = "model"
+REPORT_MAX_SEQ_LEN = "max_seq_len"
+REPORT_LOAD_TIME = "load_time_sec"
+REPORT_RUN_TIME = "run_time_sec"
+REPORT_LOAD_PHYSICAL_MEMORY_MB = "load_physical_MB"
+REPORT_LOAD_VIRTUAL_MEMORY_MB = "load_virtual_MB"
+REPORT_RUN_PHYSICAL_MEMORY_MB = "run_physical_MB"
+REPORT_RUN_VIRTUAL_MEMORY_MB = "run_virtual_MB"

 PROMPTS = [
    "What is the meaning of life?",
@@ -30,15 +57,27 @@ PROMPTS = [
 ModelWrapper = collections.namedtuple("ModelWrapper", ["model", "tokenizer"])


-def create_vmfb_module(model_name, tokenizer, device):
-    opt_base_model = OPTForCausalLM.from_pretrained("facebook/" + model_name)
+def get_memory_info():
+    pid = os.getpid()
+    process = psutil.Process(pid)
+    return process.memory_info()
+
+
+def create_vmfb_module(
+    model_name: str,
+    tokenizer,
+    device: str,
+    max_seq_len: int,
+    recompile_shark: bool,
+):
+    opt_base_model = OPTForCausalLM.from_pretrained(model_name)
    opt_base_model.eval()
    opt_model = OPTForCausalLMModel(opt_base_model)
    encoded_inputs = tokenizer(
-        "What is the meaning of life?",
+        PROMPTS[0],
        padding="max_length",
        truncation=True,
-        max_length=MAX_SEQUENCE_LENGTH,
+        max_length=max_seq_len,
        return_tensors="pt",
    )
    inputs = (
@@ -48,8 +87,16 @@ def create_vmfb_module(model_name, tokenizer, device):
    # np.save("model_inputs_0.npy", inputs[0])
    # np.save("model_inputs_1.npy", inputs[1])

-    mlir_path = f"./{OPT_FS_NAME}_causallm_{MAX_SEQUENCE_LENGTH}_torch.mlir"
-    if os.path.isfile(mlir_path):
+    opt_fs_name = get_opt_fs_name(model_name)
+    mlir_path = f"./{opt_fs_name}_causallm_{max_seq_len}_torch.mlir"
+    # If MLIR has already been loaded and recompilation is not requested, use
+    # the loaded MLIR file.
+    has_mlir = os.path.isfile(mlir_path)
+    # The purpose of recompile_shark is to measure compilation time; the
+    # compilation time can be correctly measured only when MLIR has already been
+    # loaded.
+    assert not recompile_shark or has_mlir
+    if has_mlir:
        with open(mlir_path, "r") as f:
            model_mlir = f.read()
        print(f"Loaded .mlir from {mlir_path}")
@@ -58,7 +105,7 @@ def create_vmfb_module(model_name, tokenizer, device):
            model=opt_model,
            inputs=inputs,
            is_f16=False,
-            model_name=OPT_FS_NAME,
+            model_name=opt_fs_name,
            return_str=True,
        )
        with open(mlir_path, "w") as f:
@@ -72,18 +119,25 @@ def create_vmfb_module(model_name, tokenizer, device):
        is_benchmark=False,
    )

-    vmfb_name = f"{OPT_FS_NAME}_causallm_{MAX_SEQUENCE_LENGTH}_torch_{DEVICE}_tiled_ukernels"
+    vmfb_name = (
+        f"{opt_fs_name}_causallm_{max_seq_len}_torch_{DEVICE}_tiled_ukernels"
+    )
    shark_module.save_module(module_name=vmfb_name)
    vmfb_path = vmfb_name + ".vmfb"
    return vmfb_path


-def load_shark_model() -> ModelWrapper:
-    vmfb_name = f"{OPT_FS_NAME}_causallm_{MAX_SEQUENCE_LENGTH}_torch_{DEVICE}_tiled_ukernels.vmfb"
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
-    if not os.path.isfile(vmfb_name):
+def load_shark_model(
+    model_name: str, max_seq_len: int, recompile_shark: bool
+) -> ModelWrapper:
+    opt_fs_name = get_opt_fs_name(model_name)
+    vmfb_name = f"{opt_fs_name}_causallm_{max_seq_len}_torch_{DEVICE}_tiled_ukernels.vmfb"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
+    if recompile_shark or not os.path.isfile(vmfb_name):
        print(f"vmfb not found. compiling and saving to {vmfb_name}")
-        create_vmfb_module(OPT_MODELNAME, tokenizer, DEVICE)
+        create_vmfb_module(
+            model_name, tokenizer, DEVICE, max_seq_len, recompile_shark
+        )
    shark_module = SharkInference(mlir_module=None, device="cpu-task")
    shark_module.load_module(vmfb_name)
    return ModelWrapper(model=shark_module, tokenizer=tokenizer)
@@ -94,20 +148,10 @@ def run_shark_model(model_wrapper: ModelWrapper, tokens):
    return model_wrapper.model("forward", tokens)


-def run_shark():
-    model_wrapper = load_shark_model()
-
-    prompt = "What is the meaning of life?"
-    logits = run_shark_model(model_wrapper, prompt)
-
-    # Print output logits to validate vs. pytorch + base transformers
-    print(logits[0])
-
-
-def load_huggingface_model() -> ModelWrapper:
+def load_huggingface_model(model_name: str) -> ModelWrapper:
    return ModelWrapper(
-        model=OPTForCausalLM.from_pretrained(MODEL_NAME),
-        tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME),
+        model=OPTForCausalLM.from_pretrained(model_name),
+        tokenizer=AutoTokenizer.from_pretrained(model_name),
    )


@@ -117,47 +161,71 @@ def run_huggingface_model(model_wrapper: ModelWrapper, tokens):
    )


-def run_huggingface():
-    model_wrapper = load_huggingface_model()
-    prompt = "What is the meaning of life?"
-    logits = run_huggingface_model(model_wrapper, prompt)
-
-    print(logits[0])
-
-
 def save_json(data, filename):
    with open(filename, "w") as file:
        json.dump(data, file)


-def collect_huggingface_logits():
+def collect_huggingface_logits(
+    model_name: str, max_seq_len: int, to_save_json: bool
+) -> Tuple[float, float]:
+    # Load
    t0 = time.time()
-    model_wrapper = load_huggingface_model()
-    print("--- Took {} seconds to load Huggingface.".format(time.time() - t0))
+    model_wrapper = load_huggingface_model(model_name)
+    load_time = time.time() - t0
+    print("--- Took {} seconds to load Huggingface.".format(load_time))
+    load_memory_info = get_memory_info()
+
    results = []
    tokenized_prompts = []
    for prompt in PROMPTS:
        tokens = model_wrapper.tokenizer(
            prompt,
            padding="max_length",
-            max_length=MAX_SEQUENCE_LENGTH,
+            max_length=max_seq_len,
            truncation=True,
            return_tensors="pt",
        )
        tokenized_prompts.append(tokens)
+
+    # Run
    t0 = time.time()
    for idx, tokens in enumerate(tokenized_prompts):
        print("prompt: {}".format(PROMPTS[idx]))
        logits = run_huggingface_model(model_wrapper, tokens)
-        results.append([PROMPTS[idx], logits[0].tolist()])
-    print("--- Took {} seconds to run Huggingface.".format(time.time() - t0))
-    save_json(results, "/tmp/huggingface.json")
+        if to_save_json:
+            results.append([PROMPTS[idx], logits[0].tolist()])
+    run_time = time.time() - t0
+    print("--- Took {} seconds to run Huggingface.".format(run_time))
+    if to_save_json:
+        save_json(results, "/tmp/huggingface.json")
+    run_memory_info = get_memory_info()
+    return {
+        REPORT_PLATFORM: PLATFORM_HUGGINGFACE,
+        REPORT_MODEL_NAME: model_name,
+        REPORT_MAX_SEQ_LEN: max_seq_len,
+        REPORT_LOAD_TIME: load_time,
+        REPORT_RUN_TIME: run_time / len(PROMPTS),
+        REPORT_LOAD_PHYSICAL_MEMORY_MB: load_memory_info.rss >> 20,
+        REPORT_LOAD_VIRTUAL_MEMORY_MB: load_memory_info.vms >> 20,
+        REPORT_RUN_PHYSICAL_MEMORY_MB: run_memory_info.rss >> 20,
+        REPORT_RUN_VIRTUAL_MEMORY_MB: run_memory_info.vms >> 20,
+    }


-def collect_shark_logits():
+def collect_shark_logits(
+    model_name: str,
+    max_seq_len: int,
+    recompile_shark: bool,
+    to_save_json: bool,
+) -> Tuple[float, float]:
+    # Load
    t0 = time.time()
-    model_wrapper = load_shark_model()
-    print("--- Took {} seconds to load Shark.".format(time.time() - t0))
+    model_wrapper = load_shark_model(model_name, max_seq_len, recompile_shark)
+    load_time = time.time() - t0
+    print("--- Took {} seconds to load Shark.".format(load_time))
+    load_memory_info = get_memory_info()
+
    results = []
    tokenized_prompts = []
    for prompt in PROMPTS:
@@ -165,7 +233,7 @@ def collect_shark_logits():
            prompt,
            padding="max_length",
            truncation=True,
-            max_length=MAX_SEQUENCE_LENGTH,
+            max_length=max_seq_len,
            return_tensors="pt",
        )
        inputs = (
@@ -173,16 +241,100 @@ def collect_shark_logits():
            tokens["attention_mask"],
        )
        tokenized_prompts.append(inputs)
+
+    # Run
    t0 = time.time()
    for idx, tokens in enumerate(tokenized_prompts):
        print("prompt: {}".format(PROMPTS[idx]))
        logits = run_shark_model(model_wrapper, tokens)
        lst = [e.tolist() for e in logits]
-        results.append([PROMPTS[idx], lst])
-    print("--- Took {} seconds to run Shark.".format(time.time() - t0))
-    save_json(results, "/tmp/shark.json")
+        if to_save_json:
+            results.append([PROMPTS[idx], lst])
+    run_time = time.time() - t0
+    print("--- Took {} seconds to run Shark.".format(run_time))
+    if to_save_json:
+        save_json(results, "/tmp/shark.json")
+    platform_postfix = "-compile" if recompile_shark else "-precompiled"
+    run_memory_info = get_memory_info()
+    return {
+        REPORT_PLATFORM: PLATFORM_SHARK + platform_postfix,
+        REPORT_MODEL_NAME: model_name,
+        REPORT_MAX_SEQ_LEN: max_seq_len,
+        REPORT_LOAD_TIME: load_time,
+        REPORT_RUN_TIME: run_time / len(PROMPTS),
+        REPORT_LOAD_PHYSICAL_MEMORY_MB: load_memory_info.rss >> 20,
+        REPORT_LOAD_VIRTUAL_MEMORY_MB: load_memory_info.vms >> 20,
+        REPORT_RUN_PHYSICAL_MEMORY_MB: run_memory_info.rss >> 20,
+        REPORT_RUN_VIRTUAL_MEMORY_MB: run_memory_info.vms >> 20,
+    }
+
+
+def get_opt_fs_name(model_name: str) -> str:
+    """Cleanses the model name ino a file system-friendly name.
+
+    Example: get_opt_fs_name('facebook/opt-1.3b') == 'opt_1-3b'
+    """
+    slash_split = model_name.split("/")
+    assert 1 <= len(slash_split) <= 2, "There should be at most one slash."
+    model_name = slash_split[-1]
+    for src_pattern, dest_pattern in (("-", "_"), (".", "-")):
+        model_name = model_name.replace(src_pattern, dest_pattern)
+    return model_name
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--save-json",
+        help="If set, saves output JSON.",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+    )
+    parser.add_argument(
+        "--max-seq-len", help="Max sequence length", type=int, default=32
+    )
+    parser.add_argument(
+        "--model-name",
+        help="Model name",
+        type=str,
+        choices=[
+            "facebook/opt-125m",
+            "facebook/opt-350m",
+            "facebook/opt-1.3b",
+            "facebook/opt-6.7b",
+        ],
+        default="facebook/opt-1.3b",
+    )
+    parser.add_argument(
+        "--recompile-shark",
+        help="If set, recompiles MLIR",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+    )
+    parser.add_argument(
+        "--platform",
+        help="Either shark or huggingface",
+        type=str,
+        choices=[PLATFORM_SHARK, PLATFORM_HUGGINGFACE],
+        default=PLATFORM_SHARK,
+    )
+    args = parser.parse_args()
+    print("args={}".format(args))
+    return args


 if __name__ == "__main__":
-    collect_shark_logits()
-    collect_huggingface_logits()
+    args = parse_args()
+    if args.platform == PLATFORM_SHARK:
+        shark_report = collect_shark_logits(
+            args.model_name,
+            args.max_seq_len,
+            args.recompile_shark,
+            args.save_json,
+        )
+        print("# Summary: {}".format(json.dumps(shark_report)))
+    else:
+        huggingface_report = collect_huggingface_logits(
+            args.model_name, args.max_seq_len, args.save_json
+        )
+        print("# Summary: {}".format(json.dumps(huggingface_report)))
--- a/tank/examples/opt/opt_perf_comparison_batch.py
+++ b/tank/examples/opt/opt_perf_comparison_batch.py
@@ -0,0 +1,30 @@
+"""
+Script for running opt_perf_comparison.py in batch with a series of arguments.
+
+Usage: python opt_perf_comparison_batch.py
+"""
+
+from typing import Iterable, List
+import shlex
+import subprocess
+
+
+def make_commands() -> Iterable[List[str]]:
+    command = shlex.split("python opt_perf_comparison.py --no-save-json")
+    max_seq_lens = [32, 128, 256, 512]
+    model_names = ["facebook/opt-" + e for e in ["125m", "350m", "1.3b"]]
+    for max_seq_len in max_seq_lens:
+        for model_name in model_names:
+            yield command + [
+                f"--max-seq-len={max_seq_len}",
+                f"--model-name={model_name}",
+            ]
+
+
+def main():
+    for command in make_commands():
+        result = subprocess.run(command, check=True)
+
+
+if __name__ == "__main__":
+    main()
--- a/tank/examples/opt/shark_hf_base_opt.py
+++ b/tank/examples/opt/shark_hf_base_opt.py
@@ -2,7 +2,7 @@ import os
 import torch
 from transformers import AutoTokenizer, OPTForCausalLM
 from shark.shark_inference import SharkInference
-from shark.shark_importer import import_with_fx
+from shark.shark_importer import import_with_fx, save_mlir
 from shark_opt_wrapper import OPTForCausalLMModel

 model_name = "facebook/opt-1.3b"
@@ -25,11 +25,13 @@ inputs = (
    model=model,
    inputs=inputs,
    is_f16=False,
-    debug=True,
-    model_name=model_name.split("/")[1],
-    save_dir=".",
 )
-
+mlir_module = save_mlir(
+    mlir_module,
+    model_name=model_name.split("/")[1],
+    frontend="torch",
+    mlir_dialect="linalg",
+)
 shark_module = SharkInference(
    mlir_module,
    device="cpu-sync",
--- a/tank/generate_sharktank.py
+++ b/tank/generate_sharktank.py
@@ -36,7 +36,7 @@ def save_torch_model(torch_model_list, local_tank_cache, import_args):
        get_hf_img_cls_model,
        get_fp16_model,
    )
-    from shark.shark_importer import import_with_fx
+    from shark.shark_importer import import_with_fx, save_mlir

    with open(torch_model_list) as csvfile:
        torch_reader = csv.reader(csvfile, delimiter=",")
@@ -130,133 +130,6 @@ def save_torch_model(torch_model_list, local_tank_cache, import_args):
                    )


-def save_tf_model(tf_model_list, local_tank_cache, import_args):
-    from tank.model_utils_tf import (
-        get_causal_image_model,
-        get_masked_lm_model,
-        get_causal_lm_model,
-        get_keras_model,
-        get_TFhf_model,
-        get_tfhf_seq2seq_model,
-    )
-    import os
-
-    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
-    import tensorflow as tf
-
-    visible_default = tf.config.list_physical_devices("GPU")
-    try:
-        tf.config.set_visible_devices([], "GPU")
-        visible_devices = tf.config.get_visible_devices()
-        for device in visible_devices:
-            assert device.device_type != "GPU"
-    except:
-        # Invalid device or cannot modify virtual devices once initialized.
-        pass
-
-    with open(tf_model_list) as csvfile:
-        tf_reader = csv.reader(csvfile, delimiter=",")
-        fields = next(tf_reader)
-        for row in tf_reader:
-            tf_model_name = row[0]
-            model_type = row[1]
-
-            model = None
-            input = None
-            print(f"Generating artifacts for model {tf_model_name}")
-            if model_type == "hf":
-                model, input, _ = get_masked_lm_model(
-                    tf_model_name, import_args
-                )
-            elif model_type == "img":
-                model, input, _ = get_causal_image_model(
-                    tf_model_name, import_args
-                )
-            elif model_type == "keras":
-                model, input, _ = get_keras_model(tf_model_name, import_args)
-            elif model_type == "TFhf":
-                model, input, _ = get_TFhf_model(tf_model_name, import_args)
-            elif model_type == "tfhf_seq2seq":
-                model, input, _ = get_tfhf_seq2seq_model(
-                    tf_model_name, import_args
-                )
-            elif model_type == "hf_causallm":
-                model, input, _ = get_causal_lm_model(
-                    tf_model_name, import_args
-                )
-
-            tf_model_name = tf_model_name.replace("/", "_")
-            if import_args["batch_size"] != 1:
-                tf_model_dir = os.path.join(
-                    local_tank_cache,
-                    str(tf_model_name)
-                    + "_tf"
-                    + f"_BS{str(import_args['batch_size'])}",
-                )
-            else:
-                tf_model_dir = os.path.join(
-                    local_tank_cache, str(tf_model_name) + "_tf"
-                )
-            os.makedirs(tf_model_dir, exist_ok=True)
-            mlir_importer = SharkImporter(
-                model,
-                inputs=input,
-                frontend="tf",
-            )
-            mlir_importer.import_debug(
-                is_dynamic=False,
-                dir=tf_model_dir,
-                model_name=tf_model_name,
-            )
-
-
-def save_tflite_model(tflite_model_list, local_tank_cache, import_args):
-    from shark.tflite_utils import TFLitePreprocessor
-
-    with open(tflite_model_list) as csvfile:
-        tflite_reader = csv.reader(csvfile, delimiter=",")
-        for row in tflite_reader:
-            print("\n")
-            tflite_model_name = row[0]
-            tflite_model_link = row[1]
-            print("tflite_model_name", tflite_model_name)
-            print("tflite_model_link", tflite_model_link)
-            tflite_model_name_dir = os.path.join(
-                local_tank_cache, str(tflite_model_name) + "_tflite"
-            )
-            os.makedirs(tflite_model_name_dir, exist_ok=True)
-            print(f"TMP_TFLITE_MODELNAME_DIR = {tflite_model_name_dir}")
-
-            # Preprocess to get SharkImporter input import_args
-            tflite_preprocessor = TFLitePreprocessor(str(tflite_model_name))
-            raw_model_file_path = tflite_preprocessor.get_raw_model_file()
-            inputs = tflite_preprocessor.get_inputs()
-            tflite_interpreter = tflite_preprocessor.get_interpreter()
-
-            # Use SharkImporter to get SharkInference input import_args
-            my_shark_importer = SharkImporter(
-                module=tflite_interpreter,
-                inputs=inputs,
-                frontend="tflite",
-                raw_model_file=raw_model_file_path,
-            )
-            my_shark_importer.import_debug(
-                dir=tflite_model_name_dir,
-                model_name=tflite_model_name,
-                func_name="main",
-            )
-            mlir_hash = create_hash(
-                os.path.join(
-                    tflite_model_name_dir,
-                    tflite_model_name + "_tflite" + ".mlir",
-                )
-            )
-            np.save(
-                os.path.join(tflite_model_name_dir, "hash"),
-                np.array(mlir_hash),
-            )
-
-
 def check_requirements(frontend):
    import importlib

@@ -265,10 +138,6 @@ def check_requirements(frontend):
        tv_spec = importlib.util.find_spec("torchvision")
        has_pkgs = tv_spec is not None

-    elif frontend in ["tensorflow", "tf"]:
-        tf_spec = importlib.util.find_spec("tensorflow")
-        has_pkgs = tf_spec is not None
-
    return has_pkgs


@@ -287,27 +156,11 @@ def gen_shark_files(modelname, frontend, tank_dir, importer_args):
        torch_model_csv = os.path.join(
            os.path.dirname(__file__), "torch_model_list.csv"
        )
-        tf_model_csv = os.path.join(
-            os.path.dirname(__file__), "tf_model_list.csv"
-        )
        custom_model_csv = tempfile.NamedTemporaryFile(
            dir=os.path.dirname(__file__),
            delete=True,
        )
-        # Create a temporary .csv with only the desired entry.
-        if frontend == "tf":
-            with open(tf_model_csv, mode="r") as src:
-                reader = csv.reader(src)
-                for row in reader:
-                    if row[0] == modelname:
-                        target = row
-            with open(custom_model_csv.name, mode="w") as trg:
-                writer = csv.writer(trg)
-                writer.writerow(["modelname", "src"])
-                writer.writerow(target)
-            save_tf_model(custom_model_csv.name, tank_dir, import_args)
-
-        elif frontend == "torch":
+        if frontend == "torch":
            with open(torch_model_csv, mode="r") as src:
                reader = csv.reader(src)
                for row in reader:
@@ -341,18 +194,6 @@ if __name__ == "__main__":
    #         Please see: https://github.com/nod-ai/SHARK/blob/main/tank/torch_model_list.csv""",
    # )
    # parser.add_argument(
-    #    "--tf_model_csv",
-    #    type=lambda x: is_valid_file(x),
-    #    default="./tank/tf_model_list.csv",
-    #    help="Contains the file with tf model name and args.",
-    # )
-    # parser.add_argument(
-    #    "--tflite_model_csv",
-    #    type=lambda x: is_valid_file(x),
-    #    default="./tank/tflite/tflite_model_list.csv",
-    #    help="Contains the file with tf model name and args.",
-    # )
-    # parser.add_argument(
    #    "--ci_tank_dir",
    #    type=bool,
    #    default=False,
@@ -369,11 +210,5 @@ if __name__ == "__main__":
    torch_model_csv = os.path.join(
        os.path.dirname(__file__), "torch_model_list.csv"
    )
-    tf_model_csv = os.path.join(os.path.dirname(__file__), "tf_model_list.csv")
-    tflite_model_csv = os.path.join(
-        os.path.dirname(__file__), "tflite", "tflite_model_list.csv"
-    )

    save_torch_model(torch_model_csv, WORKDIR, import_args)
-    # save_tf_model(tf_model_csv, WORKDIR, import_args)
-    # save_tflite_model(tflite_model_csv, WORKDIR, import_args)
--- a/tank/test_models.py
+++ b/tank/test_models.py
@@ -287,6 +287,9 @@ class SharkModuleTester:
        repro_path = os.path.join("reproducers", self.tmp_prefix, "*")

        bashCommand = f"gsutil cp -r {repro_path} gs://shark-public/builder/repro_artifacts/{self.ci_sha}/{self.tmp_prefix}/"
+        print(
+            f"Uploading reproducer {repro_path} to gs://shark-public/builder/repro_artifacts/{self.ci_sha}/{self.tmp_prefix}/"
+        )
        process = subprocess.run(bashCommand.split())

    def postprocess_outputs(self, golden_out, result):
--- a/tank/tf_model_list.csv
+++ b/tank/tf_model_list.csv
@@ -1,28 +0,0 @@
-model_name, model_type
-albert-base-v2,hf
-bert-base-uncased,hf
-camembert-base,hf
-dbmdz/convbert-base-turkish-cased,hf
-distilbert-base-uncased,hf
-google/electra-small-discriminator,hf
-funnel-transformer/small,hf
-microsoft/layoutlm-base-uncased,hf
-google/mobilebert-uncased,hf
-microsoft/mpnet-base,hf
-roberta-base,hf
-resnet50,keras
-xlm-roberta-base,hf
-microsoft/MiniLM-L12-H384-uncased,TFhf
-funnel-transformer/small,hf
-microsoft/mpnet-base,hf
-facebook/convnext-tiny-224,img
-google/vit-base-patch16-224,img
-efficientnet-v2-s,keras
-bert-large-uncased,hf
-t5-base,tfhf_seq2seq
-t5-large,tfhf_seq2seq
-efficientnet_b0,keras
-efficientnet_b7,keras
-gpt2,hf_causallm
-t5-base,tfhf_seq2seq
-t5-large,tfhf_seq2seq
Author	SHA1	Message	Date
Ean Garvey	caf6cc5d8f	Switch most compile flows to use ireec.compile_file. (#1863 ) * Switch most compile flows to use ireec.compile_file. * re-add input type to compile_str path. * Check if mlir_module exists before checking if it's a path or pyobject. * Fix some save_dir cases	2023-10-06 23:04:43 -05:00
Ean Garvey	8614a18474	Remove tf dependencies from importer path. (#1874 ) * Remove tf dependencies from import path. * Fix formatting.	2023-10-06 12:27:12 -07:00
Jakub Kuderski	86c1c0c215	Add aggregate statistics to microbenchmark (#1871 ) Print averaged results at the end of all iterations. Increase the default number of iterations to 5. Example: ``` Number of iterations: 5 Prefill: avg. 0.03 s, stddev 0.00 Decode: avg. 43.34 tokens/s, stdev 0.13 ``` Also remove the -2 in the number of generated tokens -- I did not find any evidence we need it.	2023-10-06 10:03:07 -07:00
Daniel Garvey	8bb364bcb8	enforce fp32 accumulates for cpu (#1873 )	2023-10-06 11:34:49 -05:00
Daniel Garvey	7abddd01ec	argmax inside model + brevitas pin (#1872 )	2023-10-05 20:15:21 -07:00
Abhishek Varma	2a451fa0c7	[Llama2] Add a standalone utility for dynamic and combining IRs -- This script adds a standalone utility for converting Llama IRs to dynamic and combining them as well. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-10-05 20:01:06 +05:30
Jakub Kuderski	9c4610b9da	Add microbenchmark mode to vicuna CLI (#1864 ) Add flags to enable a non-internactive mode for microbenchmarking llama models. In this mode, the system and user prompts are specified with CLI flags, and the number of generated tokens and iterations is fixed. Also move the stats below the response and trim any response blankspace.	2023-10-05 00:12:08 -04:00
powderluv	a38cc9d216	Update vulkan_utils.py for Radeon 780m igpu (#1866 )	2023-10-04 20:33:07 -07:00
Jakub Kuderski	1c382449ec	[vulkan] Print note about module load times. NFC. (#1862 ) Print a note ahead of a potentially long inactivity to set the right expectations. Separately, we should add progress to the UI and make this loading faster.	2023-10-03 17:27:27 -04:00
Gaurav Shukla	7cc9b3f8e8	[llama cli] Fix llama cli Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-10-03 20:39:53 +05:30
Gaurav Shukla	e54517e967	[UI] Disable config generator, lora train and model manager (#1858 ) Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-10-02 22:34:40 -07:00
Ean Garvey	326327a799	Collect pipeline submodules for diffusers ckpt preprocessing. (#1859 )	2023-10-03 00:29:28 -04:00
Ean Garvey	785b65c7b0	Add flag for specifying device-local caching allocator heap key. (#1856 )	2023-10-03 00:28:39 -04:00
Sungsoon Cho	0d16c81687	Remove unused import. (#1857 )	2023-10-02 11:36:08 -05:00
Vivek Khandelwal	8dd7850c69	Add Falcon-GPTQ support	2023-10-02 16:39:57 +05:30
Gaurav Shukla	e930ba85b4	[os] Remove os dependency from vmfb naming (#1854 ) Also fixes a small ui issue for chatbot. Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-09-29 12:38:17 -05:00
Gaurav Shukla	cd732e7a38	[chatbot] split execution time to prefill and decode Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-09-29 13:18:03 +05:30
Gaurav Shukla	8e0f8b3227	[ui] Update chatbot UI Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-09-29 13:18:03 +05:30
Gaurav Shukla	b8210ef796	[chatbot] Re-instantiate the chatbot object if device id changes Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-09-29 13:18:03 +05:30
PhaneeshB	94594542a9	remove use of vulkaninfo	2023-09-28 21:57:00 +05:30
Gaurav Shukla	82f833e87d	[vulkan] Update vmfb naming Update vmfb naming for vulkan devices in order to resolve naming conflicts in the presence of multiple vulkan devices. Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-09-28 14:52:11 +05:30
Vivek Khandelwal	c9d6870105	Modify falcon pipeline for 180b support	2023-09-28 12:39:35 +05:30
Jakub Kuderski	4fec03a6cc	[vulkan] Switch from coop matrix NV to KHR (#1848 )	2023-09-27 21:43:37 -04:00
harsh-nod	9a27f51378	Deprecate inference directory This patch removes the inference directory that was no longer being used.	2023-09-27 14:29:00 -07:00
Abhishek Varma	ad1a0f35ff	Fix misdirection while saving vmfb -- Currently SHARK suggests that vmfb has been saved, while that is not the case and no vmfb is generated. This creates a misdirection for IR/vmfbs which are of larger size. -- This commit therefore fixes that misdirection. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-09-27 16:25:29 +05:30
Nelson Sharpe	6773278ec2	Fix checkpoint_path unexpected argument (#1832 )	2023-09-24 14:17:52 -07:00
Abhishek Varma	9a0efffcca	[Llama2] Fix wrong Vulkan device ID + Add Vulkan compile flags -- This commit fixes the wrong Vulkan device being selected during runtime. -- It also adds couple of IREE compilation flags to target specific Vulkan device. -- It also changes the Vulkan device listing to be more in tune with lowering control flow. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-09-22 22:24:18 +05:30
gpetters94	61c6f153d9	Switch to keras-nightly to fix a Linux issue (#1835 )	2023-09-21 12:33:45 -04:00
Phaneesh Barwaria	effd42e8f5	pin gradio to v3.44.3	2023-09-21 17:33:43 +05:30
Sungsoon Cho	b5fbb1a8a0	Rename the func arg save_json to avoid name collision. (#1837 ) * Rename the func arg save_json to avoid name collision. * black formatted.	2023-09-19 17:29:27 -05:00
Quinn Dawkins	ded74d09cd	[vicuna.py] Keep past key values on device (#1836 ) The past key values are only used within the models themselves and can be kept on device. For vulkan int4, this gives 44 tok/s (for the first prompt) and settles at around 26 tok/s on 7900xtx.	2023-09-19 18:17:41 -04:00
Boian Petkantchin	79267931c1	Add argument --additional_compile_args (#1119 ) This allows to pass more arguemnts to the IREE compiler Example: python my-app.py --additional_compile_args="--mlir-pretty-debuginfo --mlir-timing" Co-authored-by: Boian Petkantchin <boian@nod-labs.com>	2023-09-19 11:26:03 -05:00
zjgarvey	9eceba69b7	local_tank_cache included into clear_all (#1833 )	2023-09-18 00:27:23 -05:00
Ean Garvey	ca609afb6a	Update README.md (#1830 )	2023-09-14 10:33:57 -05:00
Gaurav Shukla	11bdce9790	[flags] Fix vulkan runtime flags as vma is dropped from iree (#1831 )	2023-09-14 08:58:59 -05:00
Ean Garvey	684943a4a6	(SD) Fix tokenizers imports in pyinstaller builds. (#1828 ) * Fix tokenizers metadata. * (SD) Disable VAE lowering configs (rdna3) and add versioned tunings. * Update sd_annotation.py * (SD) Add cv2 to spec. * Update stencil pipeline with the new img2img arg.	2023-09-12 12:23:48 -05:00
PhaneeshB	b817bb8455	add roles for llama2	2023-09-12 10:59:28 +05:30
Ean Garvey	780f520f02	Fix vk.target_env extensions and remove redundant SD imports. (#1826 ) * Remove redundant IREE runtime imports. * Fix vulkan target env extensions.	2023-09-11 13:42:52 -05:00
Dom	c61b6f8d65	Code refactoring (#1817 ) * use join * fix bug * further code optimizations --------- Co-authored-by: Daniel Garvey <34486624+dan-garvey@users.noreply.github.com>	2023-09-11 11:30:56 -05:00
Abhishek Varma	c854208d49	[Llama2] Prefetch llama2 tokenizer configs (#1824 ) -- This commit prefetches llama2 tokenizer configs from shark_tank. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-09-08 11:29:54 -07:00
Gaurav Shukla	c5dcfc1f13	[vicuna] Exit when mlir is not present in shark tank (#1825 ) Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-09-08 10:30:29 -07:00
Abhishek Varma	bde63ee8ae	Add logging feature in WebUI (#1821 )	2023-09-08 05:48:05 -07:00
Vivek Khandelwal	9681d494eb	Update decomp list and shark trainer for DLRM	2023-09-06 21:24:50 +05:30
Gaurav Shukla	ede6bf83e2	[vicuna] Disabling the IR generation path Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-09-06 20:13:17 +05:30
Ean Garvey	2c2693fb7d	Fix torchvision versioning in Linux importer setup. (#1809 )	2023-09-05 12:57:03 -05:00
Vivek Khandelwal	1d31b2b2c6	Fix StableHLO Compilation flag	2023-09-05 21:32:33 +05:30
Gaurav Shukla	d2f64eefa3	[chatbot] Remove few outdated models from list (#1814 )	2023-09-04 09:26:32 -07:00
Abhishek Varma	87ae14b6ff	[SD] Add sdpfa decomposition + update IREE flag -- This commit adds Scaled Dot Product Flash Attention's decomposition in shark_importer. -- It also updates `iree-flow-enable-data-tiling` to `iree-opt-data-tiling`. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-09-04 18:03:53 +05:30
Phaneesh Barwaria	1ccafa1fc1	fix llama2-70b rewrite tensor dim	2023-09-01 17:27:06 +05:30
jinchen62	4c3d8a0a7f	Enable downloading vmfb/mlir for webui (#1807 )	2023-08-31 11:05:47 -07:00
jinchen62	3601dc7c3b	Fix llama2 13b combined ir (#1803 )	2023-08-28 11:34:44 -07:00
Daniel Garvey	671881cf87	Llama2 70b (#1783 ) * llama2 70b IR gen * fix IR sec llama2 + debug * llama270b --------- Co-authored-by: PhaneeshB <b.phaneesh@gmail.com>	2023-08-25 23:04:28 -07:00
Gaurav Shukla	4e9be6be59	[chatbot] Add debug as class attribute (#1799 ) Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-08-25 21:46:29 -07:00
Ean Garvey	9c8cbaf498	Add support for ROCM (Windows) in Studio + compile utils (#1770 ) * WIP: MSVC ROCM support for SHARK Studio * Make get_iree_rocm_args platform-agnostic. * Update stable_args.py * Update rocm arg handling in SD utils * Guard quantization imports. Co-authored-by: jam https://github.com/jammm	2023-08-25 20:56:05 -07:00
Ean Garvey	9e348a114e	Revert changes process_skipfiles.py (#1798 ) Keeps a small typo fix but reverts the rest of changes to this file from `450c231171`	2023-08-25 15:31:49 -07:00
jinchen62	51f90a4d56	Update conversion passes for brevitas quant op (#1795 )	2023-08-25 17:28:07 -05:00
Abhishek Varma	310d5d0a49	Fix llama2 13b crashing + add spec file for CLI execution of Llama (#1797 ) * [Llama2] Add a fix for Llama2 13B downloading/crashing -- This commit fixes downloading/crashing of llama2 13B on wrong .mlir file. -- Also adds support for downloading vmfb from shark_tank in CLI. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com> * [llama2] Add a spec file to run Llama/Vicuna CLI exe -- This commit adds a spec file to run Llama/Vicuna CLI exe. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com> --------- Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-08-25 09:36:09 -05:00
Ean Garvey	9697981004	Pipe through a debug option to iree compile utils. (#1796 ) * Update compile_utils.py * Pipe through a flag to toggle debug options in compile utils. * Update SharkLLMBase.py	2023-08-25 07:11:11 -07:00
Ean Garvey	450c231171	Add tokenizers to requirements.txt (#1790 ) * Add tokenizers to requirements and pin version * Update process_skipfiles.py	2023-08-24 19:44:04 -05:00
Ean Garvey	07f6f4a2f7	Add a short README for the OPT examples and small tweaks. (#1793 ) * Small changes to OPT example. * Update opt README. * Add a few modes to batch script. * Update README.md	2023-08-24 17:26:11 -07:00
jinchen62	610813c72f	Add iree flag to strip assertions (#1791 )	2023-08-24 10:51:19 -07:00
Ean Garvey	8e3860c9e6	Remove flags that are default in upstream IREE (#1785 ) * Remove index bits flags now set by default * Update shark_studio_imports.py	2023-08-24 11:57:54 -05:00
xzuyn	e37d6720eb	Add Hires Fix (#1787 ) * improper test hiresfix * add sliders & use `clear_cache` * add resample choices & fix step adjustment * add step adjustment to img2img * add resample options to img2img * simplify hiresfix - import `img2img_inf` from `img2img_ui.py` instead of just copying it into `txt2img_ui.py` * set `hri` to None after using * add more resample types, and don't show output until hiresfix is done * cleaner implementation * ran black * ran black again with jupyter dependencies	2023-08-24 09:01:41 -07:00
Vivek Khandelwal	16160d9a7d	Fix combine mlir script	2023-08-24 19:10:49 +05:30
Sungsoon Cho	79075a1a07	Opt perf (#1786 ) * Define command line args, model-name, max-seq-len, platform, etc. * Add usage example. * Add opt_perf_comparision_batch.py. * Use shlex instead.	2023-08-24 08:33:12 -05:00
Abhishek Varma	db990826d3	Add Llama2 13B int4 fp16 support (#1784 ) Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-08-23 10:00:32 -07:00
gpetters94	7ee3e4ba5d	Add stencil_unet_512 support (#1778 ) This should fix any remaining issues with stencils and long prompts.	2023-08-22 12:23:46 -04:00
Vivek Khandelwal	05889a8fe1	Add LLaMa2-int4-fp16 support (#1782 )	2023-08-22 07:45:50 -07:00
jinchen62	b87efe7686	Fix venv setup for brevitas (#1779 )	2023-08-21 11:58:51 -07:00
gpetters94	82b462de3a	Fix stencils for long prompts (#1777 )	2023-08-19 00:26:51 -07:00
Daniel Garvey	d8f0f7bade	replace public with private (#1776 ) unload footguns	2023-08-18 14:22:46 -07:00
gpetters94	79bd0b84a1	Fix an issue with diffusers>0.19.3 (#1775 )	2023-08-18 14:06:06 -04:00