Enable pytests on Windows (#901 )

Drop old cli and webui (#911 )
move ci sd stuff to apps (#912 )
2026-04-20 03:00:34 -04:00 · 2023-02-01 18:36:41 -06:00 · 2023-02-01 13:13:46 -08:00 · 2023-02-01 12:15:07 -08:00 · 2023-02-01 11:11:58 -08:00 · 2023-02-01 11:09:00 -08:00
110 changed files with 2902 additions and 4149 deletions
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -10,14 +10,14 @@ on:

 jobs:
  windows-build:
-    runs-on: windows-latest
+    runs-on: 7950X
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.10"]

    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
@@ -50,8 +50,12 @@ jobs:
      shell: powershell
      run: |
        ./setup_venv.ps1
-        pyinstaller web/shark_sd.spec
+        pyinstaller .\apps\stable_diffusion\shark_sd.spec
        mv ./dist/shark_sd.exe ./dist/shark_sd_${{ env.package_version_ }}.exe
+        signtool sign /f C:\shark_2023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/shark_sd_${{ env.package_version_ }}.exe
+        pyinstaller .\apps\stable_diffusion\shark_sd_cli.spec
+        mv ./dist/shark_sd_cli.exe ./dist/shark_sd_cli_${{ env.package_version_ }}.exe
+        signtool sign /f C:\shark_2023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/shark_sd_cli_${{ env.package_version_ }}.exe

        
    # GHA windows VM OOMs so disable for now
--- a/.github/workflows/test-models.yml
+++ b/.github/workflows/test-models.yml
@@ -29,7 +29,7 @@ jobs:
    strategy:
      fail-fast: true
      matrix:
-        os: [icelake, a100, MacStudio, ubuntu-latest]
+        os: [7950x, icelake, a100, MacStudio, ubuntu-latest]
        suite: [cpu,cuda,vulkan]
        python-version: ["3.10"]
        include:
@@ -52,13 +52,19 @@ jobs:
            suite: cuda
          - os: a100
            suite: cpu
+          - os: 7950x
+            suite: cpu
+          - os: 7950x
+            suite: cuda

    runs-on: ${{ matrix.os }}

    steps:
    - uses: actions/checkout@v3
+      if: matrix.os != '7950x'
    
    - name: Set Environment Variables
+      if: matrix.os != '7950x'
      run: |
        echo "SHORT_SHA=`git rev-parse --short=4 HEAD`" >> $GITHUB_ENV
        echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
@@ -78,6 +84,9 @@ jobs:
        #cache-dependency-path: |
        #  **/requirements-importer.txt
        #  **/requirements.txt
+    
+    - uses: actions/checkout@v2
+      if: matrix.os == '7950x'
          
    - name: Install dependencies
      if: matrix.suite == 'lint'
@@ -100,9 +109,9 @@ jobs:
      if: matrix.suite == 'cpu'
      run: |
        cd $GITHUB_WORKSPACE
-        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
+        PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --benchmark --ci --ci_sha=${SHORT_SHA} -s --local_tank_cache="/data/anush/shark_cache" tank/test_models.py -k cpu --update_tank
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./shark_tmp/shark_cache" -k cpu
        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv

@@ -112,25 +121,41 @@ jobs:
        cd $GITHUB_WORKSPACE
        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --benchmark --ci --ci_sha=${SHORT_SHA} -s --local_tank_cache="/data/anush/shark_cache" tank/test_models.py -k cuda --update_tank
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./shark_tmp/shark_cache" -k cuda
        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
+        # Disabled due to black image bug
+        # python build_tools/stable_diffusion_testing.py --device=cuda 

    - name: Validate Vulkan Models (MacOS)
      if: matrix.suite == 'vulkan' && matrix.os == 'MacStudio'
      run: |
        cd $GITHUB_WORKSPACE
-        PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
+        PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
        source shark.venv/bin/activate
        export DYLD_LIBRARY_PATH=/usr/local/lib/
        echo $PATH
        pip list | grep -E "torch|iree"
-        pytest -s --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" tank/test_models.py -k vulkan --update_tank
+        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./shark_tmp/shark_cache" -k vulkan

    - name: Validate Vulkan Models (a100)
-      if: matrix.suite == 'vulkan' && matrix.os != 'MacStudio'
+      if: matrix.suite == 'vulkan' && matrix.os == 'a100'
      run: |
        cd $GITHUB_WORKSPACE
        PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --benchmark --ci --ci_sha=${SHORT_SHA} -s --local_tank_cache="/data/anush/shark_cache" tank/test_models.py -k vulkan --update_tank
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./shark_tmp/shark_cache" -k vulkan
+        python build_tools/stable_diffusion_testing.py --device=vulkan
+
+    - name: Validate Vulkan Models (Windows)
+      if: matrix.suite == 'vulkan' && matrix.os == '7950x'
+      run: |
+        ./setup_venv.ps1
+        pytest --benchmark -k vulkan -s
+        type bench_results.csv
+
+    - name: Validate Stable Diffusion Models (Windows)
+      if: matrix.suite == 'vulkan' && matrix.os == '7950x'
+      run: |
+        ./setup_venv.ps1
+        python build_tools/stable_diffusion_testing.py --device=vulkan
--- a/README.md
+++ b/README.md
@@ -45,12 +45,12 @@ source shark.venv/bin/activate

 #### Windows 10/11 Users
 ```powershell
-(shark.venv) PS C:\Users\nod\SHARK> cd web
-(shark.venv) PS C:\Users\nod\SHARK\web> python index.py
+(shark.venv) PS C:\g\shark> cd .\apps\stable_diffusion\web\
+(shark.venv) PS C:\g\shark\apps\stable_diffusion\web> python .\index.py
 ```
-#### Linux Users
+#### Linux / macOS Users
 ```shell
-(shark.venv) > cd web
+(shark.venv) > cd apps/stable_diffusion/web
 (shark.venv) > python index.py
 ```

@@ -65,7 +65,7 @@ source shark.venv/bin/activate

 #### Install your hardware drivers
 * [AMD RDNA Users] Download the latest driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mril-iree)
-* [macOS Users] Download and install the latest Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home)
+* [macOS Users] Download and install the 1.3.216 Vulkan SDK from [here](https://sdk.lunarg.com/sdk/download/1.3.216.0/mac/vulkansdk-macos-1.3.216.0.dmg). Newer versions of the SDK will not work. 
 * [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)

 Other users please ensure you have your latest vendor drivers and Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home) and if you are using vulkan check `vulkaninfo` works in a terminal window
@@ -73,29 +73,25 @@ Other users please ensure you have your latest vendor drivers and Vulkan SDK fro

 #### Windows 10/11 Users
 ```powershell
-(shark.venv) PS C:\g\shark> python .\shark\examples\shark_inference\stable_diffusion\main.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
+(shark.venv) PS C:\g\shark> python .\apps\stable_diffusion\scripts\txt2img.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
 ```

 #### Linux / macOS Users
 ```shell
-python3.10 shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
+python3.10 apps/stable_diffusion/scripts/txt2img.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
 ```

 You can replace `vulkan` with `cpu` to run on your CPU or with `cuda` to run on CUDA devices. If you have multiple vulkan devices you can address them with `--device=vulkan://1` etc

-The output on a 6900XT would like:
+The output on a 7900XTX would like:

 ```shell 
-44it [00:08,  5.14it/s]i = 44 t = 120 (191ms)
-45it [00:08,  5.15it/s]i = 45 t = 100 (191ms)
-46it [00:08,  5.16it/s]i = 46 t = 80 (191ms)
-47it [00:09,  5.16it/s]i = 47 t = 60 (193ms)
-48it [00:09,  5.15it/s]i = 48 t = 40 (195ms)
-49it [00:09,  5.12it/s]i = 49 t = 20 (196ms)
-50it [00:09,  5.14it/s]
-Average step time: 192.8154182434082ms/it
-Total image generation runtime (s): 10.390909433364868
-(shark.venv) PS C:\g\shark>
+Stats for run 0:
+Average step time: 47.19188690185547ms/it
+Clip Inference time (ms) = 109.531
+VAE Inference time (ms): 78.590
+
+Total image generation time: 2.5788655281066895sec
 ```

 Here are some samples generated:
--- a/web/models/init.py
+++ b/web/models/init.py
--- a/apps/stable_diffusion/init.py
+++ b/apps/stable_diffusion/init.py
--- a/shark/examples/shark_inference/stable_diffusion/profiling_with_iree.md
+++ b/shark/examples/shark_inference/stable_diffusion/profiling_with_iree.md
--- a/apps/stable_diffusion/scripts/init.py
+++ b/apps/stable_diffusion/scripts/init.py
@@ -0,0 +1 @@
+from apps.stable_diffusion.scripts.txt2img import txt2img_inf
--- a/apps/stable_diffusion/scripts/img2img.py
+++ b/apps/stable_diffusion/scripts/img2img.py
--- a/apps/stable_diffusion/scripts/telegram_bot.py
+++ b/apps/stable_diffusion/scripts/telegram_bot.py
--- a/apps/stable_diffusion/scripts/txt2img.py
+++ b/apps/stable_diffusion/scripts/txt2img.py
@@ -0,0 +1,274 @@
+import os
+
+os.environ["AMD_ENABLE_LLPC"] = "1"
+
+import json
+import torch
+import re
+import time
+from pathlib import Path
+from PIL import PngImagePlugin
+from datetime import datetime as dt
+from dataclasses import dataclass
+from csv import DictWriter
+from apps.stable_diffusion.src import (
+    args,
+    Text2ImagePipeline,
+    get_schedulers,
+    set_init_device_flags,
+)
+
+
+@dataclass
+class Config:
+    model_id: str
+    ckpt_loc: str
+    precision: str
+    batch_size: int
+    max_length: int
+    height: int
+    width: int
+    device: str
+
+
+# This has to come before importing cache objects
+if args.clear_all:
+    print("CLEARING ALL, EXPECT SEVERAL MINUTES TO RECOMPILE")
+    from glob import glob
+    import shutil
+
+    vmfbs = glob(os.path.join(os.getcwd(), "*.vmfb"))
+    for vmfb in vmfbs:
+        if os.path.exists(vmfb):
+            os.remove(vmfb)
+    home = os.path.expanduser("~")
+    if os.name == "nt":  # Windows
+        appdata = os.getenv("LOCALAPPDATA")
+        shutil.rmtree(os.path.join(appdata, "AMD/VkCache"), ignore_errors=True)
+        shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
+    elif os.name == "unix":
+        shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"))
+        shutil.rmtree(os.path.join(home, ".local/shark_tank"))
+
+
+# save output images and the inputs correspoding to it.
+def save_output_img(output_img):
+    output_path = args.output_dir if args.output_dir else Path.cwd()
+    generated_imgs_path = Path(output_path, "generated_imgs")
+    generated_imgs_path.mkdir(parents=True, exist_ok=True)
+    csv_path = Path(generated_imgs_path, "imgs_details.csv")
+
+    prompt_slice = re.sub("[^a-zA-Z0-9]", "_", args.prompts[0][:15])
+    out_img_name = (
+        f"{prompt_slice}_{args.seed}_{dt.now().strftime('%y%m%d_%H%M%S')}"
+    )
+    out_img_path = Path(generated_imgs_path, f"{out_img_name}.jpg")
+
+    if args.output_img_format == "jpg":
+        out_img_path = Path(generated_imgs_path, f"{out_img_name}.jpg")
+        output_img.save(out_img_path, quality=95, subsampling=0)
+    else:
+        out_img_path = Path(generated_imgs_path, f"{out_img_name}.png")
+        pngInfo = PngImagePlugin.PngInfo()
+
+        if args.write_metadata_to_png:
+            pngInfo.add_text(
+                "parameters",
+                f"{args.prompts[0]}\nNegative prompt: {args.negative_prompts[0]}\nSteps:{args.steps}, Sampler: {args.scheduler}, CFG scale: {args.guidance_scale}, Seed: {args.seed}, Size: {args.width}x{args.height}, Model: {args.hf_model_id}",
+            )
+
+        output_img.save(
+            output_path / f"{out_img_name}.png", "PNG", pnginfo=pngInfo
+        )
+
+        if args.output_img_format not in ["png", "jpg"]:
+            print(
+                f"[ERROR] Format {args.output_img_format} is not supported yet."
+                "Image saved as png instead. Supported formats: png / jpg"
+            )
+
+    new_entry = {
+        "VARIANT": args.hf_model_id,
+        "SCHEDULER": args.scheduler,
+        "PROMPT": args.prompts[0],
+        "NEG_PROMPT": args.negative_prompts[0],
+        "SEED": args.seed,
+        "CFG_SCALE": args.guidance_scale,
+        "PRECISION": args.precision,
+        "STEPS": args.steps,
+        "HEIGHT": args.height,
+        "WIDTH": args.width,
+        "MAX_LENGTH": args.max_length,
+        "OUTPUT": out_img_path,
+    }
+
+    with open(csv_path, "a") as csv_obj:
+        dictwriter_obj = DictWriter(csv_obj, fieldnames=list(new_entry.keys()))
+        dictwriter_obj.writerow(new_entry)
+        csv_obj.close()
+
+    if args.save_metadata_to_json:
+        del new_entry["OUTPUT"]
+        with open(f"{output_path}/{out_img_name}.json", "w") as f:
+            json.dump(new_entry, f, indent=4)
+
+
+txt2img_obj = None
+config_obj = None
+schedulers = None
+
+
+# Exposed to UI.
+def txt2img_inf(
+    prompt: str,
+    negative_prompt: str,
+    height: int,
+    width: int,
+    steps: int,
+    guidance_scale: float,
+    seed: int,
+    batch_size: int,
+    scheduler: str,
+    model_id: str,
+    custom_model_id: str,
+    ckpt_file_obj,
+    precision: str,
+    device: str,
+    max_length: int,
+    save_metadata_to_json: bool,
+    save_metadata_to_png: bool,
+):
+    global txt2img_obj
+    global config_obj
+    global schedulers
+
+    args.prompts = [prompt]
+    args.negative_prompts = [negative_prompt]
+    args.guidance_scale = guidance_scale
+    args.seed = seed
+    args.steps = steps
+    args.scheduler = scheduler
+    args.hf_model_id = custom_model_id if custom_model_id else model_id
+    args.ckpt_loc = ckpt_file_obj.name if ckpt_file_obj else ""
+    args.save_metadata_to_json = save_metadata_to_json
+    args.write_metadata_to_png = save_metadata_to_png
+    dtype = torch.float32 if precision == "fp32" else torch.half
+    cpu_scheduling = not scheduler.startswith("Shark")
+    new_config_obj = Config(
+        args.hf_model_id,
+        args.ckpt_loc,
+        precision,
+        batch_size,
+        max_length,
+        height,
+        width,
+        device,
+    )
+    if config_obj != new_config_obj:
+        config_obj = new_config_obj
+        args.precision = precision
+        args.batch_size = batch_size
+        args.max_length = max_length
+        args.height = height
+        args.width = width
+        args.device = device.split("=>", 1)[1].strip()
+        args.use_tuned = True
+        args.import_mlir = False
+        set_init_device_flags()
+        schedulers = get_schedulers(model_id)
+        scheduler_obj = schedulers[scheduler]
+        txt2img_obj = Text2ImagePipeline.from_pretrained(
+            scheduler_obj,
+            args.import_mlir,
+            args.hf_model_id,
+            args.ckpt_loc,
+            args.precision,
+            args.max_length,
+            args.batch_size,
+            args.height,
+            args.width,
+            args.use_base_vae,
+        )
+    txt2img_obj.scheduler = schedulers[scheduler]
+
+    start_time = time.time()
+    txt2img_obj.log = ""
+    generated_imgs = txt2img_obj.generate_images(
+        prompt,
+        negative_prompt,
+        batch_size,
+        height,
+        width,
+        steps,
+        guidance_scale,
+        seed,
+        args.max_length,
+        dtype,
+        args.use_base_vae,
+        cpu_scheduling,
+    )
+    total_time = time.time() - start_time
+    save_output_img(generated_imgs[0])
+    text_output = f"prompt={args.prompts}"
+    text_output += f"\nnegative prompt={args.negative_prompts}"
+    text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
+    text_output += f"\nscheduler={args.scheduler}, device={device}"
+    text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={args.seed}, size={args.height}x{args.width}"
+    text_output += (
+        f", batch size={args.batch_size}, max_length={args.max_length}"
+    )
+    text_output += txt2img_obj.log
+    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
+
+    return generated_imgs, text_output
+
+
+if __name__ == "__main__":
+    dtype = torch.float32 if args.precision == "fp32" else torch.half
+    cpu_scheduling = not args.scheduler.startswith("Shark")
+    set_init_device_flags()
+    schedulers = get_schedulers(args.hf_model_id)
+    scheduler_obj = schedulers[args.scheduler]
+
+    txt2img_obj = Text2ImagePipeline.from_pretrained(
+        scheduler_obj,
+        args.import_mlir,
+        args.hf_model_id,
+        args.ckpt_loc,
+        args.precision,
+        args.max_length,
+        args.batch_size,
+        args.height,
+        args.width,
+        args.use_base_vae,
+    )
+
+    start_time = time.time()
+    generated_imgs = txt2img_obj.generate_images(
+        args.prompts,
+        args.negative_prompts,
+        args.batch_size,
+        args.height,
+        args.width,
+        args.steps,
+        args.guidance_scale,
+        args.seed,
+        args.max_length,
+        dtype,
+        args.use_base_vae,
+        cpu_scheduling,
+    )
+    total_time = time.time() - start_time
+    text_output = f"prompt={args.prompts}"
+    text_output += f"\nnegative prompt={args.negative_prompts}"
+    text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
+    text_output += f"\nscheduler={args.scheduler}, device={args.device}"
+    text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={args.seed}, size={args.height}x{args.width}"
+    text_output += (
+        f", batch size={args.batch_size}, max_length={args.max_length}"
+    )
+    text_output += txt2img_obj.log
+    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
+
+    save_output_img(generated_imgs[0])
+    print(text_output)
--- a/apps/stable_diffusion/shark_sd.spec
+++ b/apps/stable_diffusion/shark_sd.spec
@@ -19,15 +19,18 @@ datas += copy_metadata('torchvision')
 datas += copy_metadata('torch-mlir')
 datas += copy_metadata('diffusers')
 datas += copy_metadata('transformers')
+datas += copy_metadata('omegaconf')
+datas += copy_metadata('safetensors')
 datas += collect_data_files('gradio')
 datas += collect_data_files('iree')
 datas += collect_data_files('google-cloud-storage')
 datas += collect_data_files('shark')
 datas += [
-         ( 'models/stable_diffusion/resources/prompts.json', 'resources' ),
-         ( 'models/stable_diffusion/resources/model_db.json', 'resources' ),
-         ( 'models/stable_diffusion/resources/model_config.json', 'resources' ),
-         ( 'models/stable_diffusion/logos/*', 'logos' )
+         ( 'src/utils/resources/prompts.json', 'resources' ),
+         ( 'src/utils/resources/model_db.json', 'resources' ),
+         ( 'src/utils/resources/opt_flags.json', 'resources' ),
+         ( 'src/utils/resources/base_model.json', 'resources' ),
+         ( 'web/logos/*', 'logos' )
         ]

 binaries = []
@@ -36,11 +39,11 @@ block_cipher = None


 a = Analysis(
-    ['index.py'],
+    ['web/index.py'],
    pathex=['.'],
    binaries=binaries,
    datas=datas,
-    hiddenimports=['shark', 'shark.*', 'shark.shark_inference', 'shark_inference', 'iree.tools.core', 'gradio'],
+    hiddenimports=['shark', 'shark.*', 'shark.shark_inference', 'shark_inference', 'iree.tools.core', 'gradio', 'apps'],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
--- a/apps/stable_diffusion/shark_sd_cli.spec
+++ b/apps/stable_diffusion/shark_sd_cli.spec
@@ -0,0 +1,77 @@
+# -*- mode: python ; coding: utf-8 -*-
+from PyInstaller.utils.hooks import collect_data_files
+from PyInstaller.utils.hooks import copy_metadata
+
+import sys ; sys.setrecursionlimit(sys.getrecursionlimit() * 5)
+
+datas = []
+datas += collect_data_files('torch')
+datas += copy_metadata('torch')
+datas += copy_metadata('tqdm')
+datas += copy_metadata('regex')
+datas += copy_metadata('requests')
+datas += copy_metadata('packaging')
+datas += copy_metadata('filelock')
+datas += copy_metadata('numpy')
+datas += copy_metadata('tokenizers')
+datas += copy_metadata('importlib_metadata')
+datas += copy_metadata('torchvision')
+datas += copy_metadata('torch-mlir')
+datas += copy_metadata('diffusers')
+datas += copy_metadata('transformers')
+datas += copy_metadata('omegaconf')
+datas += copy_metadata('safetensors')
+datas += collect_data_files('gradio')
+datas += collect_data_files('iree')
+datas += collect_data_files('google-cloud-storage')
+datas += collect_data_files('shark')
+datas += [
+         ( 'src/utils/resources/prompts.json', 'resources' ),
+         ( 'src/utils/resources/model_db.json', 'resources' ),
+         ( 'src/utils/resources/opt_flags.json', 'resources' ),
+         ( 'src/utils/resources/base_model.json', 'resources' ),
+         ]
+
+binaries = []
+
+block_cipher = None
+
+
+a = Analysis(
+    ['scripts/txt2img.py'],
+    pathex=['.'],
+    binaries=binaries,
+    datas=datas,
+    hiddenimports=['shark', 'shark.*', 'shark.shark_inference', 'shark_inference', 'iree.tools.core', 'gradio', 'apps'],
+    hookspath=[],
+    hooksconfig={},
+    runtime_hooks=[],
+    excludes=[],
+    win_no_prefer_redirects=False,
+    win_private_assemblies=False,
+    cipher=block_cipher,
+    noarchive=False,
+)
+pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
+
+exe = EXE(
+    pyz,
+    a.scripts,
+    a.binaries,
+    a.zipfiles,
+    a.datas,
+    [],
+    name='shark_sd_cli',
+    debug=False,
+    bootloader_ignore_signals=False,
+    strip=False,
+    upx=True,
+    upx_exclude=[],
+    runtime_tmpdir=None,
+    console=True,
+    disable_windowed_traceback=False,
+    argv_emulation=False,
+    target_arch=None,
+    codesign_identity=None,
+    entitlements_file=None,
+)
--- a/apps/stable_diffusion/src/init.py
+++ b/apps/stable_diffusion/src/init.py
@@ -0,0 +1,8 @@
+from apps.stable_diffusion.src.utils import (
+    args,
+    set_init_device_flags,
+    prompt_examples,
+    get_available_devices,
+)
+from apps.stable_diffusion.src.pipelines import Text2ImagePipeline
+from apps.stable_diffusion.src.schedulers import get_schedulers
--- a/apps/stable_diffusion/src/models/init.py
+++ b/apps/stable_diffusion/src/models/init.py
@@ -0,0 +1,9 @@
+from apps.stable_diffusion.src.models.model_wrappers import (
+    SharkifyStableDiffusionModel,
+)
+from apps.stable_diffusion.src.models.opt_params import (
+    get_vae,
+    get_unet,
+    get_clip,
+    get_tokenizer,
+)
--- a/apps/stable_diffusion/src/models/model_wrappers.py
+++ b/apps/stable_diffusion/src/models/model_wrappers.py
@@ -0,0 +1,233 @@
+from diffusers import AutoencoderKL, UNet2DConditionModel
+from transformers import CLIPTextModel
+from collections import defaultdict
+import torch
+import sys
+import traceback
+import re
+from apps.stable_diffusion.src.utils import (
+    compile_through_fx,
+    get_opt_flags,
+    base_models,
+    args,
+)
+
+
+# These shapes are parameter dependent.
+def replace_shape_str(shape, max_len, width, height, batch_size):
+    new_shape = []
+    for i in range(len(shape)):
+        if shape[i] == "max_len":
+            new_shape.append(max_len)
+        elif shape[i] == "height":
+            new_shape.append(height)
+        elif shape[i] == "width":
+            new_shape.append(width)
+        elif isinstance(shape[i], str):
+            if "batch_size" in shape[i]:
+                mul_val = int(shape[i].split("*")[0])
+                new_shape.append(batch_size * mul_val)
+        else:
+            new_shape.append(shape[i])
+    return new_shape
+
+
+# Get the input info for various models i.e. "unet", "clip", "vae".
+def get_input_info(model_info, max_len, width, height, batch_size):
+    dtype_config = {"f32": torch.float32, "i64": torch.int64}
+    input_map = defaultdict(list)
+    for k in model_info:
+        for inp in model_info[k]:
+            shape = model_info[k][inp]["shape"]
+            dtype = dtype_config[model_info[k][inp]["dtype"]]
+            tensor = None
+            if isinstance(shape, list):
+                clean_shape = replace_shape_str(
+                    shape, max_len, width, height, batch_size
+                )
+                if dtype == torch.int64:
+                    tensor = torch.randint(1, 3, tuple(clean_shape))
+                else:
+                    tensor = torch.randn(*clean_shape).to(dtype)
+            elif isinstance(shape, int):
+                tensor = torch.tensor(shape).to(dtype)
+            else:
+                sys.exit("shape isn't specified correctly.")
+            input_map[k].append(tensor)
+    return input_map
+
+
+class SharkifyStableDiffusionModel:
+    def __init__(
+        self,
+        model_id: str,
+        custom_weights: str,
+        precision: str,
+        max_len: int = 64,
+        width: int = 512,
+        height: int = 512,
+        batch_size: int = 1,
+        use_base_vae: bool = False,
+    ):
+        self.check_params(max_len, width, height)
+        self.max_len = max_len
+        self.height = height // 8
+        self.width = width // 8
+        self.batch_size = batch_size
+        self.model_id = model_id if custom_weights == "" else custom_weights
+        self.precision = precision
+        self.base_vae = use_base_vae
+        self.model_name = (
+            str(batch_size)
+            + "_"
+            + str(max_len)
+            + "_"
+            + str(height)
+            + "_"
+            + str(width)
+            + "_"
+            + precision
+        )
+        # We need a better naming convention for the .vmfbs because despite
+        # using the custom model variant the .vmfb names remain the same and
+        # it'll always pick up the compiled .vmfb instead of compiling the
+        # custom model.
+        # So, currently, we add `self.model_id` in the `self.model_name` of
+        # .vmfb file.
+        # TODO: Have a better way of naming the vmfbs using self.model_name.
+
+        model_name = re.sub(r"\W+", "_", self.model_id)
+        if model_name[0] == "_":
+            model_name = model_name[1:]
+        self.model_name = self.model_name + "_" + model_name
+
+    def check_params(self, max_len, width, height):
+        if not (max_len >= 32 and max_len <= 77):
+            sys.exit("please specify max_len in the range [32, 77].")
+        if not (width % 8 == 0 and width >= 384):
+            sys.exit("width should be greater than 384 and multiple of 8")
+        if not (height % 8 == 0 and height >= 384):
+            sys.exit("height should be greater than 384 and multiple of 8")
+
+    def get_vae(self):
+        class VaeModel(torch.nn.Module):
+            def __init__(self, model_id=self.model_id, base_vae=self.base_vae):
+                super().__init__()
+                self.vae = AutoencoderKL.from_pretrained(
+                    model_id,
+                    subfolder="vae",
+                )
+                self.base_vae = base_vae
+
+            def forward(self, input):
+                if not self.base_vae:
+                    input = 1 / 0.18215 * input
+                x = self.vae.decode(input, return_dict=False)[0]
+                x = (x / 2 + 0.5).clamp(0, 1)
+                if self.base_vae:
+                    return x
+                x = x * 255.0
+                return x.round()
+
+        vae = VaeModel()
+        inputs = tuple(self.inputs["vae"])
+        is_f16 = True if self.precision == "fp16" else False
+        vae_name = "base_vae" if self.base_vae else "vae"
+        shark_vae = compile_through_fx(
+            vae,
+            inputs,
+            is_f16=is_f16,
+            model_name=vae_name + self.model_name,
+            extra_args=get_opt_flags("vae", precision=self.precision),
+        )
+        return shark_vae
+
+    def get_unet(self):
+        class UnetModel(torch.nn.Module):
+            def __init__(self, model_id=self.model_id):
+                super().__init__()
+                self.unet = UNet2DConditionModel.from_pretrained(
+                    model_id,
+                    subfolder="unet",
+                )
+                self.in_channels = self.unet.in_channels
+                self.train(False)
+
+            def forward(
+                self, latent, timestep, text_embedding, guidance_scale
+            ):
+                # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+                latents = torch.cat([latent] * 2)
+                unet_out = self.unet.forward(
+                    latents, timestep, text_embedding, return_dict=False
+                )[0]
+                noise_pred_uncond, noise_pred_text = unet_out.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (
+                    noise_pred_text - noise_pred_uncond
+                )
+                return noise_pred
+
+        unet = UnetModel()
+        is_f16 = True if self.precision == "fp16" else False
+        inputs = tuple(self.inputs["unet"])
+        input_mask = [True, True, True, False]
+        shark_unet = compile_through_fx(
+            unet,
+            inputs,
+            model_name="unet" + self.model_name,
+            is_f16=is_f16,
+            f16_input_mask=input_mask,
+            extra_args=get_opt_flags("unet", precision=self.precision),
+        )
+        return shark_unet
+
+    def get_clip(self):
+        class CLIPText(torch.nn.Module):
+            def __init__(self, model_id=self.model_id):
+                super().__init__()
+                self.text_encoder = CLIPTextModel.from_pretrained(
+                    model_id,
+                    subfolder="text_encoder",
+                )
+
+            def forward(self, input):
+                return self.text_encoder(input)[0]
+
+        clip_model = CLIPText()
+
+        shark_clip = compile_through_fx(
+            clip_model,
+            tuple(self.inputs["clip"]),
+            model_name="clip" + self.model_name,
+            extra_args=get_opt_flags("clip", precision="fp32"),
+        )
+        return shark_clip
+
+    def __call__(self):
+        for model_id in base_models:
+            self.inputs = get_input_info(
+                base_models[model_id],
+                self.max_len,
+                self.width,
+                self.height,
+                self.batch_size,
+            )
+            try:
+                compiled_clip = self.get_clip()
+                compiled_unet = self.get_unet()
+                compiled_vae = self.get_vae()
+            except Exception as e:
+                if args.enable_stack_trace:
+                    traceback.print_exc()
+                print("Retrying with a different base model configuration")
+                continue
+            # This is done just because in main.py we are basing the choice of tokenizer and scheduler
+            # on `args.hf_model_id`. Since now, we don't maintain 1:1 mapping of variants and the base
+            # model and rely on retrying method to find the input configuration, we should also update
+            # the knowledge of base model id accordingly into `args.hf_model_id`.
+            if args.ckpt_loc != "":
+                args.hf_model_id = model_id
+            return compiled_clip, compiled_unet, compiled_vae
+        sys.exit(
+            "Cannot compile the model. Please use `enable_stack_trace` and create an issue at https://github.com/nod-ai/SHARK/issues"
+        )
--- a/shark/examples/shark_inference/stable_diffusion/opt_params.py
+++ b/shark/examples/shark_inference/stable_diffusion/opt_params.py
@@ -1,17 +1,17 @@
 import sys
-from model_wrappers import (
-    get_base_vae_mlir,
-    get_vae_mlir,
-    get_unet_mlir,
-    get_clip_mlir,
-)
-from resources import models_db
-from stable_args import args
-from utils import get_shark_model
+from transformers import CLIPTokenizer
+from apps.stable_diffusion.src.utils import models_db, args, get_shark_model

-BATCH_SIZE = len(args.prompts)
-if BATCH_SIZE != 1:
-    sys.exit("Only batch size 1 is supported.")
+
+hf_model_variant_map = {
+    "Linaqruf/anything-v3.0": ["anythingv3", "v2_1base"],
+    "dreamlike-art/dreamlike-diffusion-1.0": ["dreamlike", "v2_1base"],
+    "prompthero/openjourney": ["openjourney", "v2_1base"],
+    "wavymulder/Analog-Diffusion": ["analogdiffusion", "v2_1base"],
+    "stabilityai/stable-diffusion-2-1": ["stablediffusion", "v2_1"],
+    "stabilityai/stable-diffusion-2-1-base": ["stablediffusion", "v2_1base"],
+    "CompVis/stable-diffusion-v1-4": ["stablediffusion", "v1_4"],
+}


 def get_params(bucket_key, model_key, model, is_tuned, precision):
@@ -60,50 +60,54 @@ def get_params(bucket_key, model_key, model, is_tuned, precision):


 def get_unet():
+    variant, version = hf_model_variant_map[args.hf_model_id]
    # Tuned model is present only for `fp16` precision.
    is_tuned = "tuned" if args.use_tuned else "untuned"
    if "vulkan" not in args.device and args.use_tuned:
-        bucket_key = f"{args.variant}/{is_tuned}/{args.device}"
-        model_key = f"{args.variant}/{args.version}/unet/{args.precision}/length_{args.max_length}/{is_tuned}/{args.device}"
+        bucket_key = f"{variant}/{is_tuned}/{args.device}"
+        model_key = f"{variant}/{version}/unet/{args.precision}/length_{args.max_length}/{is_tuned}/{args.device}"
    else:
-        bucket_key = f"{args.variant}/{is_tuned}"
-        model_key = f"{args.variant}/{args.version}/unet/{args.precision}/length_{args.max_length}/{is_tuned}"
+        bucket_key = f"{variant}/{is_tuned}"
+        model_key = f"{variant}/{version}/unet/{args.precision}/length_{args.max_length}/{is_tuned}"

    bucket, model_name, iree_flags = get_params(
        bucket_key, model_key, "unet", is_tuned, args.precision
    )
-    if not args.use_tuned and args.import_mlir:
-        return get_unet_mlir(model_name, iree_flags)
    return get_shark_model(bucket, model_name, iree_flags)


 def get_vae():
+    variant, version = hf_model_variant_map[args.hf_model_id]
    # Tuned model is present only for `fp16` precision.
    is_tuned = "tuned" if args.use_tuned else "untuned"
    is_base = "/base" if args.use_base_vae else ""
    if "vulkan" not in args.device and args.use_tuned:
-        bucket_key = f"{args.variant}/{is_tuned}/{args.device}"
-        model_key = f"{args.variant}/{args.version}/vae/{args.precision}/length_77/{is_tuned}{is_base}/{args.device}"
+        bucket_key = f"{variant}/{is_tuned}/{args.device}"
+        model_key = f"{variant}/{version}/vae/{args.precision}/length_77/{is_tuned}{is_base}/{args.device}"
    else:
-        bucket_key = f"{args.variant}/{is_tuned}"
-        model_key = f"{args.variant}/{args.version}/vae/{args.precision}/length_77/{is_tuned}{is_base}"
+        bucket_key = f"{variant}/{is_tuned}"
+        model_key = f"{variant}/{version}/vae/{args.precision}/length_77/{is_tuned}{is_base}"

    bucket, model_name, iree_flags = get_params(
        bucket_key, model_key, "vae", is_tuned, args.precision
    )
-    if not args.use_tuned and args.import_mlir:
-        if args.use_base_vae:
-            return get_base_vae_mlir(model_name, iree_flags)
-        return get_vae_mlir(model_name, iree_flags)
    return get_shark_model(bucket, model_name, iree_flags)


 def get_clip():
-    bucket_key = f"{args.variant}/untuned"
-    model_key = f"{args.variant}/{args.version}/clip/fp32/length_{args.max_length}/untuned"
+    variant, version = hf_model_variant_map[args.hf_model_id]
+    bucket_key = f"{variant}/untuned"
+    model_key = (
+        f"{variant}/{version}/clip/fp32/length_{args.max_length}/untuned"
+    )
    bucket, model_name, iree_flags = get_params(
        bucket_key, model_key, "clip", "untuned", "fp32"
    )
-    if args.import_mlir:
-        return get_clip_mlir(model_name, iree_flags)
    return get_shark_model(bucket, model_name, iree_flags)
+
+
+def get_tokenizer():
+    tokenizer = CLIPTokenizer.from_pretrained(
+        args.hf_model_id, subfolder="tokenizer"
+    )
+    return tokenizer
--- a/apps/stable_diffusion/src/pipelines/init.py
+++ b/apps/stable_diffusion/src/pipelines/init.py
@@ -0,0 +1,3 @@
+from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_txt2img import (
+    Text2ImagePipeline,
+)
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_img2img.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_img2img.py
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_txt2img.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_txt2img.py
@@ -0,0 +1,134 @@
+import torch
+from tqdm.auto import tqdm
+import numpy as np
+from random import randint
+from transformers import CLIPTokenizer
+from typing import Union
+from shark.shark_inference import SharkInference
+from diffusers import (
+    DDIMScheduler,
+    PNDMScheduler,
+    LMSDiscreteScheduler,
+    EulerDiscreteScheduler,
+    EulerAncestralDiscreteScheduler,
+    DPMSolverMultistepScheduler,
+)
+from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
+from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
+    StableDiffusionPipeline,
+)
+
+
+class Text2ImagePipeline(StableDiffusionPipeline):
+    def __init__(
+        self,
+        vae: SharkInference,
+        text_encoder: SharkInference,
+        tokenizer: CLIPTokenizer,
+        unet: SharkInference,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+            SharkEulerDiscreteScheduler,
+        ],
+    ):
+        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
+
+    def prepare_latents(
+        self,
+        batch_size,
+        height,
+        width,
+        generator,
+        num_inference_steps,
+        dtype,
+    ):
+        latents = torch.randn(
+            (
+                batch_size,
+                4,
+                height // 8,
+                width // 8,
+            ),
+            generator=generator,
+            dtype=torch.float32,
+        ).to(dtype)
+
+        self.scheduler.set_timesteps(num_inference_steps)
+        self.scheduler.is_scale_input_called = True
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def generate_images(
+        self,
+        prompts,
+        neg_prompts,
+        batch_size,
+        height,
+        width,
+        num_inference_steps,
+        guidance_scale,
+        seed,
+        max_length,
+        dtype,
+        use_base_vae,
+        cpu_scheduling,
+    ):
+        # prompts and negative prompts must be a list.
+        if isinstance(prompts, str):
+            prompts = [prompts]
+
+        if isinstance(neg_prompts, str):
+            neg_prompts = [neg_prompts]
+
+        prompts = prompts * batch_size
+        neg_prompts = neg_prompts * batch_size
+
+        # seed generator to create the inital latent noise. Also handle out of range seeds.
+        uint32_info = np.iinfo(np.uint32)
+        uint32_min, uint32_max = uint32_info.min, uint32_info.max
+        if seed < uint32_min or seed >= uint32_max:
+            seed = randint(uint32_min, uint32_max)
+        generator = torch.manual_seed(seed)
+
+        # Get initial latents
+        init_latents = self.prepare_latents(
+            batch_size=batch_size,
+            height=height,
+            width=width,
+            generator=generator,
+            num_inference_steps=num_inference_steps,
+            dtype=dtype,
+        )
+
+        # Get text embeddings from prompts
+        text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
+
+        # guidance scale as a float32 tensor.
+        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
+
+        # Get Image latents
+        latents = self.produce_img_latents(
+            latents=init_latents,
+            text_embeddings=text_embeddings,
+            guidance_scale=guidance_scale,
+            total_timesteps=self.scheduler.timesteps,
+            dtype=dtype,
+            cpu_scheduling=cpu_scheduling,
+        )
+
+        # Img latents -> PIL images
+        all_imgs = []
+        for i in tqdm(range(0, latents.shape[0], batch_size)):
+            imgs = self.decode_latents(
+                latents=latents[i : i + batch_size],
+                use_base_vae=use_base_vae,
+                cpu_scheduling=cpu_scheduling,
+            )
+            all_imgs.extend(imgs)
+
+        return all_imgs
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py
@@ -0,0 +1,206 @@
+import torch
+from transformers import CLIPTokenizer
+from PIL import Image
+from tqdm.auto import tqdm
+import time
+from typing import Union
+from diffusers import (
+    DDIMScheduler,
+    PNDMScheduler,
+    LMSDiscreteScheduler,
+    EulerDiscreteScheduler,
+    EulerAncestralDiscreteScheduler,
+    DPMSolverMultistepScheduler,
+)
+from shark.shark_inference import SharkInference
+from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
+from apps.stable_diffusion.src.models import (
+    SharkifyStableDiffusionModel,
+    get_vae,
+    get_clip,
+    get_unet,
+    get_tokenizer,
+)
+from apps.stable_diffusion.src.utils import (
+    start_profiling,
+    end_profiling,
+    preprocessCKPT,
+)
+
+
+class StableDiffusionPipeline:
+    def __init__(
+        self,
+        vae: SharkInference,
+        text_encoder: SharkInference,
+        tokenizer: CLIPTokenizer,
+        unet: SharkInference,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+            SharkEulerDiscreteScheduler,
+        ],
+    ):
+        self.vae = vae
+        self.text_encoder = text_encoder
+        self.tokenizer = tokenizer
+        self.unet = unet
+        self.scheduler = scheduler
+        # TODO: Implement using logging python utility.
+        self.log = ""
+
+    def encode_prompts(self, prompts, neg_prompts, max_length):
+        # Tokenize text and get embeddings
+        text_input = self.tokenizer(
+            prompts,
+            padding="max_length",
+            max_length=max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        # Get unconditional embeddings as well
+        uncond_input = self.tokenizer(
+            neg_prompts,
+            padding="max_length",
+            max_length=max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        text_input = torch.cat([uncond_input.input_ids, text_input.input_ids])
+
+        clip_inf_start = time.time()
+        text_embeddings = self.text_encoder("forward", (text_input,))
+        clip_inf_time = (time.time() - clip_inf_start) * 1000
+        self.log += f"\nClip Inference time (ms) = {clip_inf_time:.3f}"
+
+        return text_embeddings
+
+    def decode_latents(self, latents, use_base_vae, cpu_scheduling):
+        if use_base_vae:
+            latents = 1 / 0.18215 * latents
+
+        latents_numpy = latents
+        if cpu_scheduling:
+            latents_numpy = latents.detach().numpy()
+
+        profile_device = start_profiling(file_path="vae.rdc")
+        vae_start = time.time()
+        images = self.vae("forward", (latents_numpy,))
+        vae_inf_time = (time.time() - vae_start) * 1000
+        end_profiling(profile_device)
+        self.log += f"\nVAE Inference time (ms): {vae_inf_time:.3f}"
+
+        if use_base_vae:
+            images = torch.from_numpy(images)
+            images = (images.detach().cpu() * 255.0).numpy()
+            images = images.round()
+
+        images = torch.from_numpy(images).to(torch.uint8).permute(0, 2, 3, 1)
+        pil_images = [Image.fromarray(image) for image in images.numpy()]
+        return pil_images
+
+    def produce_img_latents(
+        self,
+        latents,
+        text_embeddings,
+        guidance_scale,
+        total_timesteps,
+        dtype,
+        cpu_scheduling,
+        return_all_latents=False,
+    ):
+        step_time_sum = 0
+        latent_history = [latents]
+        text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
+        text_embeddings_numpy = text_embeddings.detach().numpy()
+        for i, t in tqdm(enumerate(total_timesteps)):
+            step_start_time = time.time()
+            timestep = torch.tensor([t]).to(dtype).detach().numpy()
+            latent_model_input = self.scheduler.scale_model_input(latents, t)
+            if cpu_scheduling:
+                latent_model_input = latent_model_input.detach().numpy()
+
+            # Profiling Unet.
+            profile_device = start_profiling(file_path="unet.rdc")
+            noise_pred = self.unet(
+                "forward",
+                (
+                    latent_model_input,
+                    timestep,
+                    text_embeddings_numpy,
+                    guidance_scale,
+                ),
+                send_to_host=False,
+            )
+            end_profiling(profile_device)
+
+            if cpu_scheduling:
+                noise_pred = torch.from_numpy(noise_pred.to_host())
+                latents = self.scheduler.step(
+                    noise_pred, t, latents
+                ).prev_sample
+            else:
+                latents = self.scheduler.step(noise_pred, t, latents)
+
+            latent_history.append(latents)
+            step_time = (time.time() - step_start_time) * 1000
+            #  self.log += (
+            #      f"\nstep = {i} | timestep = {t} | time = {step_time:.2f}ms"
+            #  )
+            step_time_sum += step_time
+
+        avg_step_time = step_time_sum / len(total_timesteps)
+        self.log += f"\nAverage step time: {avg_step_time}ms/it"
+
+        if not return_all_latents:
+            return latents
+        all_latents = torch.cat(latent_history, dim=0)
+        return all_latents
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+            SharkEulerDiscreteScheduler,
+        ],
+        import_mlir: bool,
+        model_id: str,
+        ckpt_loc: str,
+        precision: str,
+        max_length: int,
+        batch_size: int,
+        height: int,
+        width: int,
+        use_base_vae: bool,
+    ):
+        init_kwargs = None
+        if import_mlir:
+            if ckpt_loc:
+                preprocessCKPT()
+            mlir_import = SharkifyStableDiffusionModel(
+                model_id,
+                ckpt_loc,
+                precision,
+                max_len=max_length,
+                batch_size=batch_size,
+                height=height,
+                width=width,
+                use_base_vae=use_base_vae,
+            )
+            clip, unet, vae = mlir_import()
+            return cls(vae, clip, get_tokenizer(), unet, scheduler)
+        return cls(
+            get_vae(), get_clip(), get_tokenizer(), get_unet(), scheduler
+        )
--- a/apps/stable_diffusion/src/schedulers/init.py
+++ b/apps/stable_diffusion/src/schedulers/init.py
@@ -0,0 +1,4 @@
+from apps.stable_diffusion.src.schedulers.sd_schedulers import get_schedulers
+from apps.stable_diffusion.src.schedulers.shark_eulerdiscrete import (
+    SharkEulerDiscreteScheduler,
+)
--- a/apps/stable_diffusion/src/schedulers/sd_schedulers.py
+++ b/apps/stable_diffusion/src/schedulers/sd_schedulers.py
@@ -0,0 +1,51 @@
+from diffusers import (
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+    EulerAncestralDiscreteScheduler,
+)
+from apps.stable_diffusion.src.schedulers.shark_eulerdiscrete import (
+    SharkEulerDiscreteScheduler,
+)
+
+
+def get_schedulers(model_id):
+    schedulers = dict()
+    schedulers["PNDM"] = PNDMScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+    )
+    schedulers["LMSDiscrete"] = LMSDiscreteScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+    )
+    schedulers["DDIM"] = DDIMScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+    )
+    schedulers[
+        "DPMSolverMultistep"
+    ] = DPMSolverMultistepScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+    )
+    schedulers["EulerDiscrete"] = EulerDiscreteScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+    )
+    schedulers[
+        "EulerAncestralDiscrete"
+    ] = EulerAncestralDiscreteScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+    )
+    schedulers[
+        "SharkEulerDiscrete"
+    ] = SharkEulerDiscreteScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+    )
+    schedulers["SharkEulerDiscrete"].compile()
+    return schedulers
--- a/apps/stable_diffusion/src/schedulers/shark_eulerdiscrete.py
+++ b/apps/stable_diffusion/src/schedulers/shark_eulerdiscrete.py
@@ -9,21 +9,13 @@ from diffusers import (
    EulerDiscreteScheduler,
 )
 from diffusers.configuration_utils import register_to_config
-from models.stable_diffusion.utils import compile_through_fx, get_shark_model
-from models.stable_diffusion.stable_args import args
+from apps.stable_diffusion.src.utils import (
+    compile_through_fx,
+    get_shark_model,
+    args,
+)
 import torch

-SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"
-
-model_input = {
-    "euler": {
-        "latent": torch.randn(1, 4, 64, 64),
-        "output": torch.randn(1, 4, 64, 64),
-        "sigma": torch.tensor(1).to(torch.float32),
-        "dt": torch.tensor(1).to(torch.float32),
-    },
-}
-

 class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
    @register_to_config
@@ -46,6 +38,22 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
        )

    def compile(self):
+        SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"
+        BATCH_SIZE = args.batch_size
+
+        model_input = {
+            "euler": {
+                "latent": torch.randn(
+                    BATCH_SIZE, 4, args.height // 8, args.width // 8
+                ),
+                "output": torch.randn(
+                    BATCH_SIZE, 4, args.height // 8, args.width // 8
+                ),
+                "sigma": torch.tensor(1).to(torch.float32),
+                "dt": torch.tensor(1).to(torch.float32),
+            },
+        }
+
        example_latent = model_input["euler"]["latent"]
        example_output = model_input["euler"]["output"]
        if args.precision == "fp16":
@@ -84,7 +92,8 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
            self.scaling_model = compile_through_fx(
                scaling_model,
                (example_latent, example_sigma),
-                model_name="euler_scale_model_input_" + args.precision,
+                model_name=f"euler_scale_model_input_{BATCH_SIZE}_{args.height}_{args.width}"
+                + args.precision,
                extra_args=iree_flags,
            )

@@ -92,7 +101,8 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
            self.step_model = compile_through_fx(
                step_model,
                (example_output, example_sigma, example_latent, example_dt),
-                model_name="euler_step_" + args.precision,
+                model_name=f"euler_step_{BATCH_SIZE}_{args.height}_{args.width}"
+                + args.precision,
                extra_args=iree_flags,
            )
        else:
--- a/apps/stable_diffusion/src/utils/init.py
+++ b/apps/stable_diffusion/src/utils/init.py
@@ -0,0 +1,22 @@
+from apps.stable_diffusion.src.utils.profiler import (
+    start_profiling,
+    end_profiling,
+)
+from apps.stable_diffusion.src.utils.resources import (
+    prompt_examples,
+    models_db,
+    base_models,
+    opt_flags,
+    resource_path,
+)
+from apps.stable_diffusion.src.utils.stable_args import args
+from apps.stable_diffusion.src.utils.utils import (
+    get_shark_model,
+    compile_through_fx,
+    set_iree_runtime_flags,
+    map_device_to_name_path,
+    set_init_device_flags,
+    get_available_devices,
+    get_opt_flags,
+    preprocessCKPT,
+)
--- a/apps/stable_diffusion/src/utils/profiler.py
+++ b/apps/stable_diffusion/src/utils/profiler.py
@@ -0,0 +1,18 @@
+from apps.stable_diffusion.src.utils.stable_args import args
+
+
+# Helper function to profile the vulkan device.
+def start_profiling(file_path="foo.rdc", profiling_mode="queue"):
+    if args.vulkan_debug_utils and "vulkan" in args.device:
+        import iree
+
+        print(f"Profiling and saving to {file_path}.")
+        vulkan_device = iree.runtime.get_device(args.device)
+        vulkan_device.begin_profiling(mode=profiling_mode, file_path=file_path)
+        return vulkan_device
+    return None
+
+
+def end_profiling(device):
+    if device:
+        return device.end_profiling()
--- a/apps/stable_diffusion/src/utils/resources.py
+++ b/apps/stable_diffusion/src/utils/resources.py
@@ -0,0 +1,37 @@
+import os
+import json
+import sys
+
+
+def resource_path(relative_path):
+    """Get absolute path to resource, works for dev and for PyInstaller"""
+    base_path = getattr(
+        sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
+    )
+    return os.path.join(base_path, relative_path)
+
+
+def get_json_file(path):
+    json_var = []
+    loc_json = resource_path(path)
+    if os.path.exists(loc_json):
+        with open(loc_json, encoding="utf-8") as fopen:
+            json_var = json.load(fopen)
+
+    if not json_var:
+        print(f"Unable to fetch {path}")
+
+    return json_var
+
+
+# TODO: This shouldn't be called from here, every time the file imports
+# it will run all the global vars.
+prompt_examples = get_json_file("resources/prompts.json")
+models_db = get_json_file("resources/model_db.json")
+
+# The base_model contains the input configuration for the different
+# models and also helps in providing information for the variants.
+base_models = get_json_file("resources/base_model.json")
+
+# Contains optimization flags for different models.
+opt_flags = get_json_file("resources/opt_flags.json")
--- a/apps/stable_diffusion/src/utils/resources/base_model.json
+++ b/apps/stable_diffusion/src/utils/resources/base_model.json
@@ -0,0 +1,98 @@
+{
+    "stabilityai/stable-diffusion-2-1": {
+        "unet": {
+            "latents": {
+                "shape": [
+                    "1*batch_size",
+                    4,
+                    "height",
+                    "width"
+                ],
+                "dtype": "f32"
+            },
+            "timesteps": {
+                "shape": [
+                    1
+                ],
+                "dtype": "f32"
+            },
+            "embedding": {
+                "shape": [
+                    "2*batch_size",
+                    "max_len",
+                    1024
+                ],
+                "dtype": "f32"
+            },
+            "guidance_scale": {
+                "shape": 2,
+                "dtype": "f32"
+            }
+        },
+        "vae": {
+            "latents" : {
+                "shape" : [
+                    "1*batch_size",4,"height","width"
+                ],
+                "dtype":"f32"
+            }
+        },
+        "clip": {
+            "token" : {
+                "shape" : [
+                    "2*batch_size",
+                    "max_len"
+                ],
+                "dtype":"i64"
+            }
+        }
+    },
+    "CompVis/stable-diffusion-v1-4": {
+        "unet": {
+            "latents": {
+                "shape": [
+                    "1*batch_size",
+                    4,
+                    "height",
+                    "width"
+                ],
+                "dtype": "f32"
+            },
+            "timesteps": {
+                "shape": [
+                    1
+                ],
+                "dtype": "f32"
+            },
+            "embedding": {
+                "shape": [
+                    "2*batch_size",
+                    "max_len",
+                    768
+                ],
+                "dtype": "f32"
+            },
+            "guidance_scale": {
+                "shape": 2,
+                "dtype": "f32"
+            }
+        },
+        "vae": {
+            "latents" : {
+                "shape" : [
+                    "1*batch_size",4,"height","width"
+                ],
+                "dtype":"f32"
+            }
+        },
+        "clip": {
+            "token" : {
+                "shape" : [
+                    "2*batch_size",
+                    "max_len"
+                ],
+                "dtype":"i64"
+            }
+        }
+    }
+}
--- a/apps/stable_diffusion/src/utils/resources/model_config.json
+++ b/apps/stable_diffusion/src/utils/resources/model_config.json
--- a/shark/examples/shark_inference/stable_diffusion/resources/model_db.json
+++ b/shark/examples/shark_inference/stable_diffusion/resources/model_db.json
@@ -5,8 +5,10 @@
    "stablediffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
    "anythingv3/untuned":"gs://shark_tank/sd_anythingv3",
    "anythingv3/tuned":"gs://shark_tank/sd_tuned",
+    "anythingv3/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
    "analogdiffusion/untuned":"gs://shark_tank/sd_analog_diffusion",
    "analogdiffusion/tuned":"gs://shark_tank/sd_tuned",
+    "analogdiffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
    "openjourney/untuned":"gs://shark_tank/sd_openjourney",
    "openjourney/tuned":"gs://shark_tank/sd_tuned",
    "dreamlike/untuned":"gs://shark_tank/sd_dreamlike_diffusion"
@@ -14,14 +16,17 @@
  {
    "stablediffusion/v1_4/unet/fp16/length_77/untuned":"unet_8dec_fp16",
    "stablediffusion/v1_4/unet/fp16/length_77/tuned":"unet_8dec_fp16_tuned",
+    "stablediffusion/v1_4/unet/fp16/length_77/tuned/cuda":"unet_8dec_fp16_cuda_tuned",
    "stablediffusion/v1_4/unet/fp32/length_77/untuned":"unet_1dec_fp32",
    "stablediffusion/v1_4/vae/fp16/length_77/untuned":"vae_19dec_fp16",
    "stablediffusion/v1_4/vae/fp16/length_77/tuned":"vae_19dec_fp16_tuned",
+    "stablediffusion/v1_4/vae/fp16/length_77/tuned/cuda":"vae_19dec_fp16_cuda_tuned",
    "stablediffusion/v1_4/vae/fp16/length_77/untuned/base":"vae_8dec_fp16",
    "stablediffusion/v1_4/vae/fp32/length_77/untuned":"vae_1dec_fp32",
    "stablediffusion/v1_4/clip/fp32/length_77/untuned":"clip_18dec_fp32",
    "stablediffusion/v2_1base/unet/fp16/length_77/untuned":"unet2base_8dec_fp16",
    "stablediffusion/v2_1base/unet/fp16/length_77/tuned":"unet2base_8dec_fp16_tuned_v2",
+    "stablediffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"unet2base_8dec_fp16_cuda_tuned",
    "stablediffusion/v2_1base/unet/fp16/length_64/untuned":"unet_19dec_v2p1base_fp16_64",
    "stablediffusion/v2_1base/unet/fp16/length_64/tuned":"unet_19dec_v2p1base_fp16_64_tuned",
    "stablediffusion/v2_1base/unet/fp16/length_64/tuned/cuda":"unet_19dec_v2p1base_fp16_64_cuda_tuned",
@@ -39,18 +44,22 @@
    "stablediffusion/v2_1/clip/fp32/length_77/untuned":"clip2_18dec_fp32",
    "anythingv3/v2_1base/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
    "anythingv3/v2_1base/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
+    "anythingv3/v2_1base/unet/fp16/length_77/tuned/cuda":"av3_unet_19dec_fp16_cuda_tuned",
    "anythingv3/v2_1base/unet/fp32/length_77/untuned":"av3_unet_19dec_fp32",
    "anythingv3/v2_1base/vae/fp16/length_77/untuned":"av3_vae_19dec_fp16",
    "anythingv3/v2_1base/vae/fp16/length_77/tuned":"av3_vae_19dec_fp16_tuned",
+    "anythingv3/v2_1base/vae/fp16/length_77/tuned/cuda":"av3_vae_19dec_fp16_cuda_tuned",
    "anythingv3/v2_1base/vae/fp16/length_77/untuned/base":"av3_vaebase_22dec_fp16",
    "anythingv3/v2_1base/vae/fp32/length_77/untuned":"av3_vae_19dec_fp32",
    "anythingv3/v2_1base/vae/fp32/length_77/untuned/base":"av3_vaebase_22dec_fp32",
    "anythingv3/v2_1base/clip/fp32/length_77/untuned":"av3_clip_19dec_fp32",
    "analogdiffusion/v2_1base/unet/fp16/length_77/untuned":"ad_unet_19dec_fp16",
    "analogdiffusion/v2_1base/unet/fp16/length_77/tuned":"ad_unet_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"ad_unet_19dec_fp16_cuda_tuned",
    "analogdiffusion/v2_1base/unet/fp32/length_77/untuned":"ad_unet_19dec_fp32",
    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned":"ad_vae_19dec_fp16",
    "analogdiffusion/v2_1base/vae/fp16/length_77/tuned":"ad_vae_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"ad_vae_19dec_fp16_cuda_tuned",
    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned/base":"ad_vaebase_22dec_fp16",
    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned":"ad_vae_19dec_fp32",
    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned/base":"ad_vaebase_22dec_fp32",
--- a/apps/stable_diffusion/src/utils/resources/opt_flags.json
+++ b/apps/stable_diffusion/src/utils/resources/opt_flags.json
@@ -0,0 +1,95 @@
+  {
+    "unet": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": []
+        },
+        "fp32": {
+          "default_compilation_flags": []
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32"
+          ],
+          "specified_compilation_flags": {
+            "cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
+            "default_device": ["--iree-flow-enable-conv-img2col-transform"]
+          }
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "vae": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "clip": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      }
+    }
+  }
--- a/shark/examples/shark_inference/stable_diffusion/resources/prompts.json
+++ b/shark/examples/shark_inference/stable_diffusion/resources/prompts.json
--- a/shark/examples/shark_inference/stable_diffusion/stable_args.py
+++ b/shark/examples/shark_inference/stable_diffusion/stable_args.py
@@ -15,9 +15,10 @@ p = argparse.ArgumentParser(
 ##############################################################################

 p.add_argument(
+    "-p",
    "--prompts",
-    nargs="+",
-    default=["cyberpunk forest by Salvador Dali"],
+    action="append",
+    default=[],
    help="text of which images to be generated.",
 )

@@ -42,6 +43,28 @@ p.add_argument(
    help="the seed to use.",
 )

+p.add_argument(
+    "--batch_size",
+    type=int,
+    default=1,
+    choices=range(1, 4),
+    help="the number of inferences to be made in a single `run`.",
+)
+
+p.add_argument(
+    "--height",
+    type=int,
+    default=512,
+    help="the height of the output image.",
+)
+
+p.add_argument(
+    "--width",
+    type=int,
+    default=512,
+    help="the width of the output image.",
+)
+
 p.add_argument(
    "--guidance_scale",
    type=float,
@@ -64,13 +87,6 @@ p.add_argument(
    "--device", type=str, default="vulkan", help="device to run the model."
 )

-p.add_argument(
-    "--version",
-    type=str,
-    default="v2_1base",
-    help="Specify version of stable diffusion model",
-)
-
 p.add_argument(
    "--precision", type=str, default="fp16", help="precision to run the model."
 )
@@ -110,12 +126,6 @@ p.add_argument(
    help="Do conversion from the VAE output to pixel space on cpu.",
 )

-p.add_argument(
-    "--variant",
-    default="stablediffusion",
-    help="We now support multiple vairants of SD finetuned for different dataset. you can use the following anythingv3, ...",  # TODO add more once supported
-)
-
 p.add_argument(
    "--scheduler",
    type=str,
@@ -123,12 +133,48 @@ p.add_argument(
    help="other supported schedulers are [PNDM, DDIM, LMSDiscrete, EulerDiscrete, DPMSolverMultistep]",
 )

+p.add_argument(
+    "--output_img_format",
+    type=str,
+    default="png",
+    help="specify the format in which output image is save. Supported options: jpg / png",
+)
+
 p.add_argument(
    "--output_dir",
    type=str,
    default=None,
    help="Directory path to save the output images and json",
 )
+
+p.add_argument(
+    "--runs",
+    type=int,
+    default=1,
+    help="number of images to be generated with random seeds in single execution",
+)
+
+p.add_argument(
+    "--ckpt_loc",
+    type=str,
+    default="",
+    help="Path to SD's .ckpt file.",
+)
+
+p.add_argument(
+    "--hf_model_id",
+    type=str,
+    default="stabilityai/stable-diffusion-2-1-base",
+    help="The repo-id of hugging face.",
+)
+
+p.add_argument(
+    "--enable_stack_trace",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Enable showing the stack trace when retrying the base model configuration",
+)
+
 ##############################################################################
 ### IREE - Vulkan supported flags
 ##############################################################################
@@ -224,6 +270,20 @@ p.add_argument(
    help="flag to clear all mlir and vmfb from common locations. Recompiling will take several minutes",
 )

+p.add_argument(
+    "--save_metadata_to_json",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for whether or not to save a generation information json file with the image.",
+)
+
+p.add_argument(
+    "--write_metadata_to_png",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for whether or not to save generation information in PNG chunk text to generated images.",
+)
+
 ##############################################################################
 ### Web UI flags
 ##############################################################################
@@ -235,6 +295,20 @@ p.add_argument(
    help="flag for removing the pregress bar animation during image generation",
 )

+p.add_argument(
+    "--share",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for generating a public URL",
+)
+
+p.add_argument(
+    "--server_port",
+    type=int,
+    default=8080,
+    help="flag for setting server port",
+)
+
 ##############################################################################
 ### SD model auto-annotation flags
 ##############################################################################
@@ -260,4 +334,4 @@ p.add_argument(
    help="Apply Winograd on selected conv ops.",
 )

-args = p.parse_args()
+args, unknown = p.parse_known_args()
--- a/shark/examples/shark_inference/stable_diffusion/utils.py
+++ b/shark/examples/shark_inference/stable_diffusion/utils.py
@@ -1,13 +1,18 @@
 import os
 import torch
 from shark.shark_inference import SharkInference
-from stable_args import args
 from shark.shark_importer import import_with_fx
 from shark.iree_utils.vulkan_utils import (
    set_iree_vulkan_runtime_flags,
    get_vulkan_target_triple,
 )
 from shark.iree_utils.gpu_utils import get_cuda_sm_cc
+from apps.stable_diffusion.src.utils.stable_args import args
+from apps.stable_diffusion.src.utils.resources import opt_flags
+import sys
+from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
+    load_pipeline_from_original_stable_diffusion_ckpt,
+)


 def _compile_module(shark_module, model_name, extra_args=[]):
@@ -62,10 +67,17 @@ def get_shark_model(tank_url, model_name, extra_args=[]):


 # Converts the torch-module into a shark_module.
-def compile_through_fx(model, inputs, model_name, extra_args=[]):
-
-    mlir_module, func_name = import_with_fx(model, inputs)
-
+def compile_through_fx(
+    model,
+    inputs,
+    model_name,
+    is_f16=False,
+    f16_input_mask=None,
+    extra_args=[],
+):
+    mlir_module, func_name = import_with_fx(
+        model, inputs, is_f16, f16_input_mask
+    )
    shark_module = SharkInference(
        mlir_module,
        device=args.device,
@@ -76,7 +88,6 @@ def compile_through_fx(model, inputs, model_name, extra_args=[]):


 def set_iree_runtime_flags():
-
    vulkan_runtime_flags = [
        f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}",
        f"--vulkan_validation_layers={'true' if args.vulkan_validation_layers else 'false'}",
@@ -183,38 +194,58 @@ def set_init_device_flags():
        args.device = "cpu"

    # set max_length based on availability.
-    if args.variant in ["anythingv3", "analogdiffusion", "dreamlike"]:
+    if args.hf_model_id in [
+        "Linaqruf/anything-v3.0",
+        "wavymulder/Analog-Diffusion",
+        "dreamlike-art/dreamlike-diffusion-1.0",
+    ]:
        args.max_length = 77
-    elif args.variant == "openjourney":
+    elif args.hf_model_id == "prompthero/openjourney":
        args.max_length = 64

-    # Use tuned models in the case of stablediffusion/fp16 and rdna3 cards.
+    # Use tuned models in the case of a specific setting.
    if (
-        args.variant in ["openjourney", "dreamlike"]
+        args.hf_model_id
+        in ["prompthero/openjourney", "dreamlike-art/dreamlike-diffusion-1.0"]
        or args.precision != "fp16"
-        or "vulkan" not in args.device
-        or "rdna3" not in args.iree_vulkan_target_triple
    ):
        args.use_tuned = False

-    elif args.use_base_vae and args.variant != "stablediffusion":
+    elif (
+        "vulkan" in args.device
+        and "rdna3" not in args.iree_vulkan_target_triple
+    ):
        args.use_tuned = False

-    # Use tuned model in the case of stablediffusion/fp16 and cuda device sm_80
-    if (
-        args.variant == "stablediffusion"
-        and args.precision == "fp16"
-        and "cuda" in args.device
-        and get_cuda_sm_cc() == "sm_80"
-        and args.version == "v2_1base"
-    ):
-        args.use_tuned = True
+    elif "cuda" in args.device and get_cuda_sm_cc() not in ["sm_80", "sm_89"]:
+        args.use_tuned = False
+
+    elif args.use_base_vae and args.hf_model_id not in [
+        "stabilityai/stable-diffusion-2-1-base",
+        "CompVis/stable-diffusion-v1-4",
+    ]:
+        args.use_tuned = False

    if args.use_tuned:
-        print(f"Using {args.device} tuned models for stablediffusion/fp16.")
+        print(f"Using tuned models for {args.hf_model_id}/fp16/{args.device}.")
    else:
        print("Tuned models are currently not supported for this setting.")

+    # set import_mlir to True for unuploaded models.
+    if args.hf_model_id not in [
+        "Linaqruf/anything-v3.0",
+        "dreamlike-art/dreamlike-diffusion-1.0",
+        "prompthero/openjourney",
+        "wavymulder/Analog-Diffusion",
+        "stabilityai/stable-diffusion-2-1",
+        "stabilityai/stable-diffusion-2-1-base",
+        "CompVis/stable-diffusion-v1-4",
+    ]:
+        args.import_mlir = True
+
+    if args.height != 512 or args.width != 512 or args.batch_size != 1:
+        args.import_mlir = True
+

 # Utility to get list of devices available.
 def get_available_devices():
@@ -230,7 +261,7 @@ def get_available_devices():
            print(f"{driver_name} devices are not available.")
        else:
            for i, device in enumerate(device_list_dict):
-                device_list.append(f"{driver_name}://{i} => {device['name']}")
+                device_list.append(f"{device['name']} => {driver_name}://{i}")
        return device_list

    set_iree_runtime_flags()
@@ -242,3 +273,79 @@ def get_available_devices():
    available_devices.extend(cuda_devices)
    available_devices.append("cpu")
    return available_devices
+
+
+def disk_space_check(path, lim=20):
+    from shutil import disk_usage
+
+    du = disk_usage(path)
+    free = du.free / (1024 * 1024 * 1024)
+    if free <= lim:
+        print(f"[WARNING] Only {free:.2f}GB space available in {path}.")
+
+
+def get_opt_flags(model, precision="fp16"):
+    iree_flags = []
+    is_tuned = "tuned" if args.use_tuned else "untuned"
+    if len(args.iree_vulkan_target_triple) > 0:
+        iree_flags.append(
+            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
+        )
+
+    # Disable bindings fusion to work with moltenVK.
+    if sys.platform == "darwin":
+        iree_flags.append("-iree-stream-fuse-binding=false")
+
+    if "specified_compilation_flags" in opt_flags[model][is_tuned][precision]:
+        device = (
+            args.device
+            if "://" not in args.device
+            else args.device.split("://")[0]
+        )
+        if (
+            device
+            not in opt_flags[model][is_tuned][precision][
+                "specified_compilation_flags"
+            ]
+        ):
+            device = "default_device"
+        iree_flags += opt_flags[model][is_tuned][precision][
+            "specified_compilation_flags"
+        ][device]
+
+    return iree_flags
+
+
+def preprocessCKPT():
+    from pathlib import Path
+
+    path = Path(args.ckpt_loc)
+    diffusers_path = path.parent.absolute()
+    diffusers_directory_name = path.stem
+    complete_path_to_diffusers = diffusers_path / diffusers_directory_name
+    complete_path_to_diffusers.mkdir(parents=True, exist_ok=True)
+    print(
+        "Created directory : ",
+        diffusers_directory_name,
+        " at -> ",
+        diffusers_path,
+    )
+    path_to_diffusers = complete_path_to_diffusers.as_posix()
+    from_safetensors = (
+        True if args.ckpt_loc.lower().endswith(".safetensors") else False
+    )
+    # EMA weights usually yield higher quality images for inference but non-EMA weights have
+    # been yielding better results in our case.
+    # TODO: Add an option `--ema` (`--no-ema`) for users to specify if they want to go for EMA
+    #       weight extraction or not.
+    extract_ema = False
+    print("Loading pipeline from original stable diffusion checkpoint")
+    pipe = load_pipeline_from_original_stable_diffusion_ckpt(
+        checkpoint_path=args.ckpt_loc,
+        extract_ema=extract_ema,
+        from_safetensors=from_safetensors,
+    )
+    pipe.save_pretrained(path_to_diffusers)
+    print("Loading complete")
+    args.ckpt_loc = path_to_diffusers
+    print("Custom model path is : ", args.ckpt_loc)
--- a/shark/examples/shark_inference/stable_diffusion/stable_diffusion_amd.md
+++ b/shark/examples/shark_inference/stable_diffusion/stable_diffusion_amd.md
@@ -12,22 +12,23 @@ If it works well for you, please "star" the following GitHub projects... this is

 *AMD Software: Adrenalin Edition 22.11.1 for MLIR/IREE Driver Version 22.20.29.09 for Windows® 10 and Windows® 11 (Windows Driver Store Version 31.0.12029.9003)*

-First, download this special driver in a folder of your choice. We recommend you keep that driver around since you may need to re-install it later, if Windows Update decides to overwrite it:
+First, for RDNA2 users, download this special driver in a folder of your choice. We recommend you keep the installation files around, since you may need to re-install it later, if Windows Update decides to overwrite it:
 https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mlir-iree

+For RDNA3, the latest driver 23.1.2 supports MLIR/IREE as well: https://www.amd.com/en/support/kb/release-notes/rn-rad-win-23-1-2-kb
+
 KNOWN ISSUES with this special AMD driver:
-* `Windows Update` may (depending how it's configured) automatically install a new official AMD driver that overwrites this IREE-specific driver. If Stable Diffusion used to work, then a few days later, it slows down a lot or produces incorrect results (e.g. black images), this may be the cause. To fix this problem, please check the installed driver's version, and re-install the special driver if needed. (TODO: document how to prevent this `Windows Update` behavior!)
-* Some people using this special driver experience mouse pointer accuracy issues, if you use a larger-than-default mouse pointer. The clicked point isn't centered properly. One possible work-around is to reset the pointer size to "1" in "Change pointer size and color".
+* `Windows Update` may (depending how it's configured) automatically install a new official AMD driver that overwrites this IREE-specific driver. If Stable Diffusion used to work, then a few days later, it slows down a lot or produces incorrect results (e.g. black images), this may be the cause. To fix this problem, please check the installed driver version, and re-install the special driver if needed. (TODO: document how to prevent this `Windows Update` behavior!)
+* Some people using this special driver experience mouse pointer accuracy issues, especially if using a larger-than-default mouse pointer. The clicked point isn't centered properly. One possible work-around is to reset the pointer size to "1" in "Change pointer size and color".

 ## Installation

-Download the latest Windows SHARK SD binary [423 here](https://github.com/nod-ai/SHARK/releases/download/20230101.423/shark_sd_20230101_423.exe) in a folder of your choice. If you want nighly builds you can look for them in the github releases page. Please read carefully the following notes:
+Download the latest Windows SHARK SD binary [469 here](https://github.com/nod-ai/SHARK/releases/download/20230124.469/shark_sd_20230124_469.exe) in a folder of your choice. If you want nighly builds, you can look for them on the GitHub releases page.

 Notes:
-* We recommend that you download this EXE in a new folder, whenever you download a new EXE version. If you download it in the same folder as a previous install, you must delete the old `*.vmfb` files. Those contain Vulkan dispatches compiled from MLIR, that can get outdated if you run multiple EXE from the same folder. You can use `--clean_all` flag once to clean all the old files. 
-* Your browser may warn you about downloading an .exe file
+* We recommend that you download this EXE in a new folder, whenever you download a new EXE version. If you download it in the same folder as a previous install, you must delete the old `*.vmfb` files. Those contain Vulkan dispatches compiled from MLIR which can be outdated if you run a new EXE from the same folder. You can use `--clean_all` flag once to clean all the old files. 
 * If you recently updated the driver or this binary (EXE file), we recommend you:
-  * clear all the local artifacts with `--clean_all` OR 
+  * clear all the local artifacts with `--clear_all` OR 
  * clear the Vulkan shader cache: For Windows users this can be done by clearing the contents of `C:\Users\%username%\AppData\Local\AMD\VkCache\`. On Linux the same cache is typically located at `~/.cache/AMD/VkCache/`.
  * clear the `huggingface` cache. In Windows, this is `C:\Users\%username%\.cache\huggingface`.

@@ -59,9 +60,9 @@ Here are some samples generated:
  <summary>Advanced Installation </summary>


-## Setup your Python VirtualEnvironment and Dependencies
-
-### Windows 10/11 Users
+## Setup your Python Virtual Environment and Dependencies
+<details>
+ <summary> Windows 10/11 Users </summary>

 * Install the latest Python 3.10.x version from [here](https://www.python.org/downloads/windows/)

@@ -78,8 +79,10 @@ git clone https://github.com/nod-ai/SHARK.git
 cd SHARK
 ./setup_venv.ps1 #You can re-run this script to get the latest version
 ```
+</details> 

-### Linux
+ <details>
+  <summary>Linux</summary>

 ```shell
 git clone https://github.com/nod-ai/SHARK.git
@@ -87,53 +90,65 @@ cd SHARK
 ./setup_venv.sh
 source shark.venv/bin/activate
 ```
+ </details>

 ### Run Stable Diffusion on your device - WebUI

-#### Windows 10/11 Users
+<details>
+ <summary>Windows 10/11 Users</summary>
+ 
 ```powershell
-(shark.venv) PS C:\Users\nod\SHARK> cd web
-(shark.venv) PS C:\Users\nod\SHARK\web> python index.py
+(shark.venv) PS C:\g\shark> cd .\apps\stable_diffusion\web\
+(shark.venv) PS C:\g\shark\apps\stable_diffusion\web> python .\index.py
 ```
-#### Linux Users
+ 
+ </details>
+ 
+<details>
+ <summary>Linux Users</summary>
+ 
 ```shell
-(shark.venv) > cd web
+(shark.venv) > cd apps/stable_diffusion/web
 (shark.venv) > python index.py
 ```
-
-
+ 
+</details>

 ### Run Stable Diffusion on your device - Commandline

-#### Windows 10/11 Users
+<details>
+ <summary>Windows 10/11 Users</summary>
+ 
 ```powershell
-(shark.venv) PS C:\g\shark> python .\shark\examples\shark_inference\stable_diffusion\main.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
+(shark.venv) PS C:\g\shark> python .\apps\stable_diffusion\scripts\txt2img.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
 ```
+ 
+  </details>

-#### Linux
+<details>
+ <summary>Linux</summary>
+ 
 ```shell
-python3.10 shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
+python3.10 apps/stable_diffusion/scripts/txt2img.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
 ```
+ 
+  </details>

-The output on a 6900XT would like:
+The output on a 7900XTX would like:

 ```shell 
-44it [00:08,  5.14it/s]i = 44 t = 120 (191ms)
-45it [00:08,  5.15it/s]i = 45 t = 100 (191ms)
-46it [00:08,  5.16it/s]i = 46 t = 80 (191ms)
-47it [00:09,  5.16it/s]i = 47 t = 60 (193ms)
-48it [00:09,  5.15it/s]i = 48 t = 40 (195ms)
-49it [00:09,  5.12it/s]i = 49 t = 20 (196ms)
-50it [00:09,  5.14it/s]
-Average step time: 192.8154182434082ms/it
-Total image generation runtime (s): 10.390909433364868
-(shark.venv) PS C:\g\shark>
+Stats for run 0:
+Average step time: 47.19188690185547ms/it
+Clip Inference time (ms) = 109.531
+VAE Inference time (ms): 78.590
+
+Total image generation time: 2.5788655281066895sec
 ```

-
 For more options to the Stable Diffusion model read [this](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md)
+ 
 </details>
-<details>
+  <details>
  <summary>Discord link</summary>
 Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware. 
 </details>
--- a/shark/examples/shark_inference/stable_diffusion/stable_diffusion_telegram_bot.md
+++ b/shark/examples/shark_inference/stable_diffusion/stable_diffusion_telegram_bot.md
--- a/apps/stable_diffusion/web/css/sd_dark_theme.css
+++ b/apps/stable_diffusion/web/css/sd_dark_theme.css
@@ -0,0 +1,67 @@
+.gradio-container {
+    background-color: black
+}
+
+.container {
+    background-color: black !important;
+    padding-top: 20px !important;
+}
+
+#ui_title {
+    padding: 10px !important;
+}
+
+#top_logo {
+    background-color: transparent;
+    border-radius: 0 !important;
+    border: 0;
+}
+
+#demo_title {
+    background-color: black;
+    border-radius: 0 !important;
+    border: 0;
+    padding-top: 50px;
+    padding-bottom: 0px;
+    width: 460px !important;
+}
+
+#demo_title_outer {
+    border-radius: 0;
+}
+
+#prompt_box_outer div:first-child {
+    border-radius: 0 !important
+}
+
+#prompt_box textarea {
+    background-color: #1d1d1d !important
+}
+
+#prompt_examples {
+    margin: 0 !important
+}
+
+#prompt_examples svg {
+    display: none !important;
+}
+
+.gr-sample-textbox {
+    border-radius: 1rem !important;
+    border-color: rgb(31, 41, 55) !important;
+    border-width: 2px !important;
+}
+
+#ui_body {
+    background-color: #111111 !important;
+    padding: 10px !important;
+    border-radius: 0.5em !important;
+}
+
+#img_result+div {
+    display: none !important;
+}
+
+footer {
+    display: none !important;
+}
--- a/apps/stable_diffusion/web/gradio/img2img_ui.py
+++ b/apps/stable_diffusion/web/gradio/img2img_ui.py
--- a/apps/stable_diffusion/web/gradio/txt2img_ui.py
+++ b/apps/stable_diffusion/web/gradio/txt2img_ui.py
--- a/apps/stable_diffusion/web/index.py
+++ b/apps/stable_diffusion/web/index.py
@@ -0,0 +1,262 @@
+import os
+import sys
+from pathlib import Path
+
+if "AMD_ENABLE_LLPC" not in os.environ:
+    os.environ["AMD_ENABLE_LLPC"] = "1"
+
+if sys.platform == "darwin":
+    os.environ["DYLD_LIBRARY_PATH"] = "/usr/local/lib"
+
+
+def resource_path(relative_path):
+    """Get absolute path to resource, works for dev and for PyInstaller"""
+    base_path = getattr(
+        sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
+    )
+    return os.path.join(base_path, relative_path)
+
+
+import gradio as gr
+from PIL import Image
+from apps.stable_diffusion.src import (
+    prompt_examples,
+    args,
+    get_available_devices,
+)
+from apps.stable_diffusion.scripts import txt2img_inf
+
+nodlogo_loc = resource_path("logos/nod-logo.png")
+sdlogo_loc = resource_path("logos/sd-demo-logo.png")
+
+
+demo_css = resource_path("css/sd_dark_theme.css")
+
+
+with gr.Blocks(title="Stable Diffusion", css=demo_css) as shark_web:
+    with gr.Row(elem_id="ui_title"):
+        nod_logo = Image.open(nodlogo_loc)
+        logo2 = Image.open(sdlogo_loc)
+        with gr.Row():
+            with gr.Column(scale=1, elem_id="demo_title_outer"):
+                gr.Image(
+                    value=nod_logo,
+                    show_label=False,
+                    interactive=False,
+                    elem_id="top_logo",
+                ).style(width=150, height=100)
+            with gr.Column(scale=5, elem_id="demo_title_outer"):
+                gr.Image(
+                    value=logo2,
+                    show_label=False,
+                    interactive=False,
+                    elem_id="demo_title",
+                ).style(width=150, height=100)
+
+    with gr.Row(elem_id="ui_body"):
+        with gr.Row():
+            with gr.Column(scale=1, min_width=600):
+                with gr.Row():
+                    with gr.Group():
+                        model_id = gr.Dropdown(
+                            label="Model ID",
+                            value="stabilityai/stable-diffusion-2-1-base",
+                            choices=[
+                                "Linaqruf/anything-v3.0",
+                                "prompthero/openjourney",
+                                "wavymulder/Analog-Diffusion",
+                                "stabilityai/stable-diffusion-2-1",
+                                "stabilityai/stable-diffusion-2-1-base",
+                                "CompVis/stable-diffusion-v1-4",
+                            ],
+                        )
+                        custom_model_id = gr.Textbox(
+                            placeholder="check here: https://huggingface.co/models eg. runwayml/stable-diffusion-v1-5",
+                            value="",
+                            label="HuggingFace Model ID",
+                        )
+                    with gr.Group():
+                        ckpt_loc = gr.File(
+                            label="Upload checkpoint",
+                            file_types=[".ckpt", ".safetensors"],
+                        )
+
+                with gr.Group(elem_id="prompt_box_outer"):
+                    prompt = gr.Textbox(
+                        label="Prompt",
+                        value="cyberpunk forest by Salvador Dali",
+                        lines=1,
+                        elem_id="prompt_box",
+                    )
+                    negative_prompt = gr.Textbox(
+                        label="Negative Prompt",
+                        value="trees, green",
+                        lines=1,
+                        elem_id="prompt_box",
+                    )
+                with gr.Accordion(label="Advance Options", open=False):
+                    with gr.Row():
+                        scheduler = gr.Dropdown(
+                            label="Scheduler",
+                            value="SharkEulerDiscrete",
+                            choices=[
+                                "DDIM",
+                                "PNDM",
+                                "LMSDiscrete",
+                                "DPMSolverMultistep",
+                                "EulerDiscrete",
+                                "EulerAncestralDiscrete",
+                                "SharkEulerDiscrete",
+                            ],
+                        )
+                        batch_size = gr.Slider(
+                            1, 4, value=1, step=1, label="Number of Images"
+                        )
+                    with gr.Row():
+                        height = gr.Slider(
+                            384, 786, value=512, step=8, label="Height"
+                        )
+                        width = gr.Slider(
+                            384, 786, value=512, step=8, label="Width"
+                        )
+                        precision = gr.Radio(
+                            label="Precision",
+                            value="fp16",
+                            choices=[
+                                "fp16",
+                                "fp32",
+                            ],
+                            visible=False,
+                        )
+                        max_length = gr.Radio(
+                            label="Max Length",
+                            value=64,
+                            choices=[
+                                64,
+                                77,
+                            ],
+                            visible=False,
+                        )
+                    with gr.Row():
+                        steps = gr.Slider(
+                            1, 100, value=50, step=1, label="Steps"
+                        )
+                        guidance_scale = gr.Slider(
+                            0,
+                            50,
+                            value=7.5,
+                            step=0.1,
+                            label="CFG Scale",
+                        )
+                    with gr.Row():
+                        save_metadata_to_png = gr.Checkbox(
+                            label="Save prompt information to PNG",
+                            value=False,
+                            interactive=True,
+                        )
+                        save_metadata_to_json = gr.Checkbox(
+                            label="Save prompt information to JSON file",
+                            value=False,
+                            interactive=True,
+                        )
+                with gr.Row():
+                    seed = gr.Number(value=-1, precision=0, label="Seed")
+                    available_devices = get_available_devices()
+                    device = gr.Dropdown(
+                        label="Device",
+                        value=available_devices[0],
+                        choices=available_devices,
+                    )
+                with gr.Row():
+                    random_seed = gr.Button("Randomize Seed")
+                    random_seed.click(
+                        None,
+                        inputs=[],
+                        outputs=[seed],
+                        _js="() => Math.floor(Math.random() * 4294967295)",
+                    )
+                    stable_diffusion = gr.Button("Generate Image")
+                with gr.Accordion(label="Prompt Examples!", open=False):
+                    ex = gr.Examples(
+                        examples=prompt_examples,
+                        inputs=prompt,
+                        cache_examples=False,
+                        elem_id="prompt_examples",
+                    )
+
+            with gr.Column(scale=1, min_width=600):
+                with gr.Group():
+                    gallery = gr.Gallery(
+                        label="Generated images",
+                        show_label=False,
+                        elem_id="gallery",
+                    ).style(grid=[2], height="auto")
+                    std_output = gr.Textbox(
+                        value="Nothing to show.",
+                        lines=4,
+                        show_label=False,
+                    )
+                output_dir = args.output_dir if args.output_dir else Path.cwd()
+                output_dir = Path(output_dir, "generated_imgs")
+                output_loc = gr.Textbox(
+                    label="Saving Images at",
+                    value=output_dir,
+                    interactive=False,
+                )
+
+        prompt.submit(
+            txt2img_inf,
+            inputs=[
+                prompt,
+                negative_prompt,
+                height,
+                width,
+                steps,
+                guidance_scale,
+                seed,
+                batch_size,
+                scheduler,
+                model_id,
+                custom_model_id,
+                ckpt_loc,
+                precision,
+                device,
+                max_length,
+                save_metadata_to_json,
+                save_metadata_to_png,
+            ],
+            outputs=[gallery, std_output],
+            show_progress=args.progress_bar,
+        )
+        stable_diffusion.click(
+            txt2img_inf,
+            inputs=[
+                prompt,
+                negative_prompt,
+                height,
+                width,
+                steps,
+                guidance_scale,
+                seed,
+                batch_size,
+                scheduler,
+                model_id,
+                custom_model_id,
+                ckpt_loc,
+                precision,
+                device,
+                max_length,
+                save_metadata_to_json,
+                save_metadata_to_png,
+            ],
+            outputs=[gallery, std_output],
+            show_progress=args.progress_bar,
+        )
+
+shark_web.queue()
+shark_web.launch(
+    share=args.share,
+    inbrowser=True,
+    server_name="0.0.0.0",
+    server_port=args.server_port,
+)
--- a/web/models/stable_diffusion/logos/Nod_logo.png
+++ b/web/models/stable_diffusion/logos/Nod_logo.png
--- a/web/models/stable_diffusion/logos/nod-logo.png
+++ b/web/models/stable_diffusion/logos/nod-logo.png
--- a/web/models/stable_diffusion/logos/sd-demo-logo.png
+++ b/web/models/stable_diffusion/logos/sd-demo-logo.png
--- a/build_tools/image_comparison.py
+++ b/build_tools/image_comparison.py
@@ -0,0 +1,45 @@
+import argparse
+from PIL import Image
+import numpy as np
+
+import requests
+import shutil
+import os
+import subprocess
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("-n", "--newfile")
+parser.add_argument(
+    "-g",
+    "--golden_url",
+    default="https://storage.googleapis.com/shark_tank/testdata/cyberpunk_fores_42_0_230119_021148.png",
+)
+
+
+def get_image(url, local_filename):
+    res = requests.get(url, stream=True)
+    if res.status_code == 200:
+        with open(local_filename, "wb") as f:
+            shutil.copyfileobj(res.raw, f)
+
+
+def compare_images(new_filename, golden_filename):
+    new = np.array(Image.open(new_filename)) / 255.0
+    golden = np.array(Image.open(golden_filename)) / 255.0
+    diff = np.abs(new - golden)
+    mean = np.mean(diff)
+    if mean > 0.01:
+        subprocess.run(
+            ["gsutil", "cp", new_filename, "gs://shark_tank/testdata/builder/"]
+        )
+        raise SystemExit("new and golden not close")
+    else:
+        print("SUCCESS")
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    tempfile_name = os.path.join(os.getcwd(), "golden.png")
+    get_image(args.golden_url, tempfile_name)
+    compare_images(args.newfile, tempfile_name)
--- a/build_tools/stable_diff_main_test.sh
+++ b/build_tools/stable_diff_main_test.sh
@@ -0,0 +1,7 @@
+rm -rf ./test_images
+mkdir test_images
+python shark/examples/shark_inference/stable_diffusion/main.py --device=vulkan --output_dir=./test_images --no-load_vmfb --no-use_tuned
+python shark/examples/shark_inference/stable_diffusion/main.py --device=vulkan --output_dir=./test_images --no-load_vmfb --no-use_tuned --beta_models=True
+
+python build_tools/image_comparison.py -n ./test_images/*.png
+exit $?
--- a/build_tools/stable_diffusion_testing.py
+++ b/build_tools/stable_diffusion_testing.py
@@ -0,0 +1,77 @@
+import os
+import subprocess
+from apps.stable_diffusion.src.utils.resources import (
+    get_json_file,
+)
+from shark.shark_downloader import download_public_file
+from image_comparison import compare_images
+import argparse
+from glob import glob
+import shutil
+
+model_config_dicts = get_json_file(
+    os.path.join(
+        os.getcwd(),
+        "apps/stable_diffusion/src/utils/resources/model_config.json",
+    )
+)
+
+
+def test_loop(device="vulkan", beta=False, extra_flags=[]):
+    # Get golden values from tank
+    shutil.rmtree("./test_images", ignore_errors=True)
+    os.mkdir("./test_images")
+    os.mkdir("./test_images/golden")
+    hf_model_names = model_config_dicts[0].values()
+    tuned_options = ["--no-use_tuned"]  #'use_tuned']
+    devices = ["vulkan"]
+    if beta:
+        extra_flags.append("--beta_models=True")
+    for model_name in hf_model_names:
+        for use_tune in tuned_options:
+            command = [
+                "python",
+                "apps/stable_diffusion/scripts/txt2img.py",
+                "--device=" + device,
+                "--output_dir=./test_images/" + model_name,
+                "--hf_model_id=" + model_name,
+                use_tune,
+            ]
+            command += extra_flags
+            generated_image = not subprocess.call(
+                command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
+            )
+            if generated_image:
+                os.makedirs(
+                    "./test_images/golden/" + model_name, exist_ok=True
+                )
+                download_public_file(
+                    "gs://shark_tank/testdata/golden/" + model_name,
+                    "./test_images/golden/" + model_name,
+                )
+                comparison = [
+                    "python",
+                    "build_tools/image_comparison.py",
+                    "--golden_url=gs://shark_tank/testdata/golden/"
+                    + model_name
+                    + "/*.png",
+                    "--newfile=./test_images/" + model_name + "/*.png",
+                ]
+                test_file = glob("./test_images/" + model_name + "/*.png")[0]
+                golden_path = "./test_images/golden/" + model_name + "/*.png"
+                golden_file = glob(golden_path)[0]
+                compare_images(test_file, golden_file)
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("-d", "--device", default="vulkan")
+parser.add_argument(
+    "-b", "--beta", action=argparse.BooleanOptionalAction, default=False
+)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    print(args)
+    test_loop(args.device, args.beta, [])
--- a/dataset/README.md
+++ b/dataset/README.md
@@ -0,0 +1,27 @@
+# Dataset annotation tool
+
+SHARK annotator for adding or modifying prompts of dataset images
+
+## Set up
+
+Activate SHARK Python virtual environment and install additional packages
+```shell
+source ../shark.venv/bin/activate
+pip install -r requirements.txt
+```
+
+## Run annotator
+
+```shell
+python annotation_tool.py
+```
+
+<img width="1280" alt="annotator" src="https://user-images.githubusercontent.com/49575973/214521137-7ef6ae10-7cd8-46e6-b270-b6c0445157f1.png">
+
+* Select a dataset from `Dataset` dropdown list
+* Select an image from `Image` dropdown list
+* Image and the existing prompt will be loaded
+* Select a prompt from `Prompt` dropdown list to modify or "Add new" to add a prompt
+* Click `Save` to save changes, click `Delete` to delete prompt
+* Click `Back` or `Next` to switch image, you could also select other images from `Image`
+* Click `Finish` when finishing annotation or before switching dataset
--- a/dataset/annotation_tool.py
+++ b/dataset/annotation_tool.py
@@ -0,0 +1,247 @@
+import gradio as gr
+import json
+import jsonlines
+import os
+from args import args
+from pathlib import Path
+from PIL import Image
+from utils import get_datasets
+
+
+shark_root = Path(__file__).parent.parent
+demo_css = shark_root.joinpath("web/demo.css").resolve()
+nodlogo_loc = shark_root.joinpath(
+    "web/models/stable_diffusion/logos/nod-logo.png"
+)
+
+
+with gr.Blocks(title="Dataset Annotation Tool", css=demo_css) as shark_web:
+    with gr.Row(elem_id="ui_title"):
+        nod_logo = Image.open(nodlogo_loc)
+        with gr.Column(scale=1, elem_id="demo_title_outer"):
+            gr.Image(
+                value=nod_logo,
+                show_label=False,
+                interactive=False,
+                elem_id="top_logo",
+            ).style(width=150, height=100)
+
+    datasets, images, ds_w_prompts = get_datasets(args.gs_url)
+    prompt_data = dict()
+
+    with gr.Row(elem_id="ui_body"):
+        # TODO: add multiselect dataset, there is a gradio version conflict
+        dataset = gr.Dropdown(label="Dataset", choices=datasets)
+        image_name = gr.Dropdown(label="Image", choices=[])
+
+    with gr.Row(elem_id="ui_body"):
+        # TODO: add ability to search image by typing
+        with gr.Column(scale=1, min_width=600):
+            image = gr.Image(type="filepath").style(height=512)
+
+        with gr.Column(scale=1, min_width=600):
+            prompts = gr.Dropdown(
+                label="Prompts",
+                choices=[],
+            )
+            prompt = gr.Textbox(
+                label="Editor",
+                lines=3,
+            )
+            with gr.Row():
+                save = gr.Button("Save")
+                delete = gr.Button("Delete")
+            with gr.Row():
+                back_image = gr.Button("Back")
+                next_image = gr.Button("Next")
+            finish = gr.Button("Finish")
+
+    def filter_datasets(dataset):
+        if dataset is None:
+            return gr.Dropdown.update(value=None, choices=[])
+
+        # create the dataset dir if doesn't exist and download prompt file
+        dataset_path = str(shark_root) + "/dataset/" + dataset
+        if not os.path.exists(dataset_path):
+            os.mkdir(dataset_path)
+
+        # read prompt jsonlines file
+        prompt_data.clear()
+        if dataset in ds_w_prompts:
+            prompt_gs_path = args.gs_url + "/" + dataset + "/metadata.jsonl"
+            os.system(f'gsutil cp "{prompt_gs_path}" "{dataset_path}"/')
+            with jsonlines.open(dataset_path + "/metadata.jsonl") as reader:
+                for line in reader.iter(type=dict, skip_invalid=True):
+                    prompt_data[line["file_name"]] = (
+                        [line["text"]]
+                        if type(line["text"]) is str
+                        else line["text"]
+                    )
+
+        return gr.Dropdown.update(choices=images[dataset])
+
+    dataset.change(fn=filter_datasets, inputs=dataset, outputs=image_name)
+
+    def display_image(dataset, image_name):
+        if dataset is None or image_name is None:
+            return gr.Image.update(value=None), gr.Dropdown.update(value=None)
+
+        # download and load the image
+        img_gs_path = args.gs_url + "/" + dataset + "/" + image_name
+        img_sub_path = "/".join(image_name.split("/")[:-1])
+        img_dst_path = (
+            str(shark_root) + "/dataset/" + dataset + "/" + img_sub_path + "/"
+        )
+        if not os.path.exists(img_dst_path):
+            os.mkdir(img_dst_path)
+        os.system(f'gsutil cp "{img_gs_path}" "{img_dst_path}"')
+        img = Image.open(img_dst_path + image_name.split("/")[-1])
+
+        if image_name not in prompt_data.keys():
+            prompt_data[image_name] = []
+        prompt_choices = ["Add new"]
+        prompt_choices += prompt_data[image_name]
+        return gr.Image.update(value=img), gr.Dropdown.update(
+            choices=prompt_choices
+        )
+
+    image_name.change(
+        fn=display_image,
+        inputs=[dataset, image_name],
+        outputs=[image, prompts],
+    )
+
+    def edit_prompt(prompts):
+        if prompts == "Add new":
+            return gr.Textbox.update(value=None)
+
+        return gr.Textbox.update(value=prompts)
+
+    prompts.change(fn=edit_prompt, inputs=prompts, outputs=prompt)
+
+    def save_prompt(dataset, image_name, prompts, prompt):
+        if (
+            dataset is None
+            or image_name is None
+            or prompts is None
+            or prompt is None
+        ):
+            return
+
+        if prompts == "Add new":
+            prompt_data[image_name].append(prompt)
+        else:
+            idx = prompt_data[image_name].index(prompts)
+            prompt_data[image_name][idx] = prompt
+
+        prompt_path = (
+            str(shark_root) + "/dataset/" + dataset + "/metadata.jsonl"
+        )
+        # write prompt jsonlines file
+        with open(prompt_path, "w") as f:
+            for key, value in prompt_data.items():
+                if not value:
+                    continue
+                v = value if len(value) > 1 else value[0]
+                f.write(json.dumps({"file_name": key, "text": v}))
+                f.write("\n")
+
+        prompt_choices = ["Add new"]
+        prompt_choices += prompt_data[image_name]
+        return gr.Dropdown.update(choices=prompt_choices, value=None)
+
+    save.click(
+        fn=save_prompt,
+        inputs=[dataset, image_name, prompts, prompt],
+        outputs=prompts,
+    )
+
+    def delete_prompt(dataset, image_name, prompts):
+        if dataset is None or image_name is None or prompts is None:
+            return
+        if prompts == "Add new":
+            return
+
+        prompt_data[image_name].remove(prompts)
+        prompt_path = (
+            str(shark_root) + "/dataset/" + dataset + "/metadata.jsonl"
+        )
+        # write prompt jsonlines file
+        with open(prompt_path, "w") as f:
+            for key, value in prompt_data.items():
+                if not value:
+                    continue
+                v = value if len(value) > 1 else value[0]
+                f.write(json.dumps({"file_name": key, "text": v}))
+                f.write("\n")
+
+        prompt_choices = ["Add new"]
+        prompt_choices += prompt_data[image_name]
+        return gr.Dropdown.update(choices=prompt_choices, value=None)
+
+    delete.click(
+        fn=delete_prompt,
+        inputs=[dataset, image_name, prompts],
+        outputs=prompts,
+    )
+
+    def get_back_image(dataset, image_name):
+        if dataset is None or image_name is None:
+            return
+
+        # remove local image
+        img_path = str(shark_root) + "/dataset/" + dataset + "/" + image_name
+        os.system(f'rm "{img_path}"')
+        # get the index for the back image
+        idx = images[dataset].index(image_name)
+        if idx == 0:
+            return gr.Dropdown.update(value=None)
+
+        return gr.Dropdown.update(value=images[dataset][idx - 1])
+
+    back_image.click(
+        fn=get_back_image, inputs=[dataset, image_name], outputs=image_name
+    )
+
+    def get_next_image(dataset, image_name):
+        if dataset is None or image_name is None:
+            return
+
+        # remove local image
+        img_path = str(shark_root) + "/dataset/" + dataset + "/" + image_name
+        os.system(f'rm "{img_path}"')
+        # get the index for the next image
+        idx = images[dataset].index(image_name)
+        if idx == len(images[dataset]) - 1:
+            return gr.Dropdown.update(value=None)
+
+        return gr.Dropdown.update(value=images[dataset][idx + 1])
+
+    next_image.click(
+        fn=get_next_image, inputs=[dataset, image_name], outputs=image_name
+    )
+
+    def finish_annotation(dataset):
+        if dataset is None:
+            return
+
+        # upload prompt and remove local data
+        dataset_path = str(shark_root) + "/dataset/" + dataset
+        dataset_gs_path = args.gs_url + "/" + dataset + "/"
+        os.system(
+            f'gsutil cp "{dataset_path}/metadata.jsonl" "{dataset_gs_path}"'
+        )
+        os.system(f'rm -rf "{dataset_path}"')
+
+        return gr.Dropdown.update(value=None)
+
+    finish.click(fn=finish_annotation, inputs=dataset, outputs=dataset)
+
+
+if __name__ == "__main__":
+    shark_web.launch(
+        share=args.share,
+        inbrowser=True,
+        server_name="0.0.0.0",
+        server_port=args.server_port,
+    )
--- a/dataset/args.py
+++ b/dataset/args.py
@@ -0,0 +1,34 @@
+import argparse
+
+p = argparse.ArgumentParser(
+    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+
+##############################################################################
+### Dataset Annotator flags
+##############################################################################
+
+p.add_argument(
+    "--gs_url",
+    type=str,
+    required=True,
+    help="URL to datasets in GS bucket",
+)
+
+p.add_argument(
+    "--share",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for generating a public URL",
+)
+
+p.add_argument(
+    "--server_port",
+    type=int,
+    default=8080,
+    help="flag for setting server port",
+)
+
+##############################################################################
+
+args = p.parse_args()
--- a/dataset/requirements.txt
+++ b/dataset/requirements.txt
@@ -0,0 +1,3 @@
+# SHARK Annotator
+gradio==3.15.0
+jsonlines
--- a/dataset/utils.py
+++ b/dataset/utils.py
@@ -0,0 +1,29 @@
+from google.cloud import storage
+
+
+def get_datasets(gs_url):
+    datasets = set()
+    images = dict()
+    ds_w_prompts = []
+
+    storage_client = storage.Client()
+    bucket_name = gs_url.split("/")[2]
+    source_blob_name = "/".join(gs_url.split("/")[3:])
+    blobs = storage_client.list_blobs(bucket_name, prefix=source_blob_name)
+
+    for blob in blobs:
+        dataset_name = blob.name.split("/")[1]
+        if dataset_name == "":
+            continue
+        datasets.add(dataset_name)
+        if dataset_name not in images.keys():
+            images[dataset_name] = []
+
+        # check if image or jsonl
+        file_sub_path = "/".join(blob.name.split("/")[2:])
+        if "/" in file_sub_path:
+            images[dataset_name] += [file_sub_path]
+        elif "metadata.jsonl" in file_sub_path:
+            ds_w_prompts.append(dataset_name)
+
+    return list(datasets), images, ds_w_prompts
--- a/generate_sharktank.py
+++ b/generate_sharktank.py
@@ -14,21 +14,16 @@ import csv
 import argparse
 from shark.shark_importer import SharkImporter
 from shark.parser import shark_args
-import tensorflow as tf
 import subprocess as sp
 import hashlib
 import numpy as np
 from pathlib import Path
-
-visible_default = tf.config.list_physical_devices("GPU")
-try:
-    tf.config.set_visible_devices([], "GPU")
-    visible_devices = tf.config.get_visible_devices()
-    for device in visible_devices:
-        assert device.device_type != "GPU"
-except:
-    # Invalid device or cannot modify virtual devices once initialized.
-    pass
+from apps.stable_diffusion.src.models import (
+    model_wrappers as mw,
+)
+from apps.stable_diffusion.src.utils.stable_args import (
+    args,
+)


 def create_hash(file_name):
@@ -62,6 +57,31 @@ def save_torch_model(torch_model_list):

            model = None
            input = None
+            if model_type == "stable_diffusion":
+                args.use_tuned = False
+                args.import_mlir = True
+                args.use_tuned = False
+                args.local_tank_cache = WORKDIR
+
+                precision_values = ["fp16"]
+                seq_lengths = [64, 77]
+                for precision_value in precision_values:
+                    args.precision = precision_value
+                    for length in seq_lengths:
+                        model = mw.SharkifyStableDiffusionModel(
+                            model_id=torch_model_name,
+                            custom_weights="",
+                            precision=precision_value,
+                            max_len=length,
+                            width=512,
+                            height=512,
+                            use_base_vae=False,
+                            debug=True,
+                            sharktank_dir=WORKDIR,
+                            generate_vmfb=False,
+                        )
+                        model()
+                continue
            if model_type == "vision":
                model, input, _ = get_vision_model(torch_model_name)
            elif model_type == "hf":
@@ -110,6 +130,17 @@ def save_tf_model(tf_model_list):
        get_keras_model,
        get_TFhf_model,
    )
+    import tensorflow as tf
+
+    visible_default = tf.config.list_physical_devices("GPU")
+    try:
+        tf.config.set_visible_devices([], "GPU")
+        visible_devices = tf.config.get_visible_devices()
+        for device in visible_devices:
+            assert device.device_type != "GPU"
+    except:
+        # Invalid device or cannot modify virtual devices once initialized.
+        pass

    with open(tf_model_list) as csvfile:
        tf_reader = csv.reader(csvfile, delimiter=",")
@@ -205,34 +236,35 @@ def is_valid_file(arg):


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--torch_model_csv",
-        type=lambda x: is_valid_file(x),
-        default="./tank/torch_model_list.csv",
-        help="""Contains the file with torch_model name and args.
-             Please see: https://github.com/nod-ai/SHARK/blob/main/tank/torch_model_list.csv""",
-    )
-    parser.add_argument(
-        "--tf_model_csv",
-        type=lambda x: is_valid_file(x),
-        default="./tank/tf_model_list.csv",
-        help="Contains the file with tf model name and args.",
-    )
-    parser.add_argument(
-        "--tflite_model_csv",
-        type=lambda x: is_valid_file(x),
-        default="./tank/tflite/tflite_model_list.csv",
-        help="Contains the file with tf model name and args.",
-    )
-    parser.add_argument(
-        "--ci_tank_dir",
-        type=bool,
-        default=False,
-    )
-    parser.add_argument("--upload", type=bool, default=False)
+    # Note, all of these flags are overridden by the import of args from stable_args.py, flags are duplicated temporarily to preserve functionality
+    # parser = argparse.ArgumentParser()
+    # parser.add_argument(
+    #    "--torch_model_csv",
+    #    type=lambda x: is_valid_file(x),
+    #    default="./tank/torch_model_list.csv",
+    #    help="""Contains the file with torch_model name and args.
+    #         Please see: https://github.com/nod-ai/SHARK/blob/main/tank/torch_model_list.csv""",
+    # )
+    # parser.add_argument(
+    #    "--tf_model_csv",
+    #    type=lambda x: is_valid_file(x),
+    #    default="./tank/tf_model_list.csv",
+    #    help="Contains the file with tf model name and args.",
+    # )
+    # parser.add_argument(
+    #    "--tflite_model_csv",
+    #    type=lambda x: is_valid_file(x),
+    #    default="./tank/tflite/tflite_model_list.csv",
+    #    help="Contains the file with tf model name and args.",
+    # )
+    # parser.add_argument(
+    #    "--ci_tank_dir",
+    #    type=bool,
+    #    default=False,
+    # )
+    # parser.add_argument("--upload", type=bool, default=False)

-    args = parser.parse_args()
+    # old_args = parser.parse_args()

    home = str(Path.home())
    if args.ci_tank_dir == True:
@@ -248,8 +280,3 @@ if __name__ == "__main__":

    if args.tflite_model_csv:
        save_tflite_model(args.tflite_model_csv)
-
-    if args.upload:
-        git_hash = sp.getoutput("git log -1 --format='%h'") + "/"
-        print("uploading files to gs://shark_tank/" + git_hash)
-        os.system(f"gsutil cp -r {WORKDIR}* gs://shark_tank/" + git_hash)
--- a/requirements-importer.txt
+++ b/requirements-importer.txt
@@ -3,7 +3,7 @@

 numpy==1.22.4
 torchvision
-torchtriton
+pytorch-triton
 tabulate

 tqdm
@@ -15,7 +15,7 @@ iree-tools-tf

 # TensorFlow and JAX.
 gin-config
-tensorflow==2.10
+tensorflow==2.10.1
 keras==2.10
 #tf-models-nightly
 #tensorflow-text-nightly
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,7 @@ google-cloud-storage
 # Testing
 pytest
 pytest-xdist
+pytest-forked
 Pillow
 parameterized

@@ -20,6 +21,9 @@ scipy
 ftfy
 gradio
 altair
+omegaconf
+safetensors

 # Keep PyInstaller at the end. Sometimes Windows Defender flags it but most folks can continue even if it errors
+pefile
 pyinstaller
--- a/setup.py
+++ b/setup.py
@@ -2,11 +2,12 @@ from setuptools import find_packages
 from setuptools import setup

 import os
+import glob

 with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

-PACKAGE_VERSION = os.environ.get("SHARK_PACKAGE_VERSION") or "0.0.4"
+PACKAGE_VERSION = os.environ.get("SHARK_PACKAGE_VERSION") or "0.0.5"
 backend_deps = []
 if "NO_BACKEND" in os.environ.keys():
    backend_deps = [
@@ -34,6 +35,7 @@ setup(
    ],
    packages=find_packages(exclude=("examples")),
    python_requires=">=3.9",
+    data_files=glob.glob("apps/stable_diffusion/resources/**"),
    install_requires=[
        "numpy",
        "PyYAML",
--- a/setup_venv.sh
+++ b/setup_venv.sh
@@ -128,6 +128,7 @@ if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
  TV_VER=$($PYTHON -m pip show torchvision | grep Version)
  TV_VERSION=${TV_VER:9:18}
  $PYTHON -m pip uninstall -y torch torchvision
+  $PYTHON -m pip install -U --pre --no-warn-conflicts triton
  $PYTHON -m pip install --no-deps https://download.pytorch.org/whl/nightly/cu117/torch-${TORCH_VERSION}%2Bcu117-cp310-cp310-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu117/torchvision-${TV_VERSION}%2Bcu117-cp310-cp310-linux_x86_64.whl
  if [ $? -eq 0 ];then
    echo "Successfully Installed torch + cu117."
--- a/shark/examples/shark_inference/ESRGAN/esrgan.py
+++ b/shark/examples/shark_inference/ESRGAN/esrgan.py
@@ -128,7 +128,6 @@ def load_mlir(mlir_loc):


 def compile_through_fx(model, inputs, mlir_loc=None):
-
    module = load_mlir(mlir_loc)
    if module == None:
        fx_g = make_fx(
--- a/shark/examples/shark_inference/simple_dlrm.py
+++ b/shark/examples/shark_inference/simple_dlrm.py
@@ -151,7 +151,6 @@ class DLRM_Net(nn.Module):
            and (ln_top is not None)
            and (arch_interaction_op is not None)
        ):
-
            # save arguments
            self.output_d = 0
            self.arch_interaction_op = arch_interaction_op
@@ -216,7 +215,6 @@ class DLRM_Net(nn.Module):
        return ly

    def interact_features(self, x, ly):
-
        if self.arch_interaction_op == "dot":
            # concatenate dense and sparse features
            (batch_size, d) = x.shape
--- a/shark/examples/shark_inference/sparse_arch.py
+++ b/shark/examples/shark_inference/sparse_arch.py
@@ -99,7 +99,6 @@ class SparseArchShark(nn.Module):
        )

    def forward(self, *batched_inputs):
-
        concatenated_list = []
        input_enum, embedding_enum = 0, 0

@@ -121,7 +120,6 @@ class SparseArchShark(nn.Module):


 def test_sparse_arch() -> None:
-
    D = 3
    eb1_config = EmbeddingBagConfig(
        name="t1",
@@ -211,7 +209,6 @@ class DLRMShark(nn.Module):
    def forward(
        self, dense_features: torch.Tensor, *sparse_features
    ) -> torch.Tensor:
-
        embedded_dense = self.dense_arch(dense_features)
        embedded_sparse = self.sparse_arch(*sparse_features)
        concatenated_dense = self.inter_arch(
--- a/shark/examples/shark_inference/stable_diff.py
+++ b/shark/examples/shark_inference/stable_diff.py
@@ -1,272 +0,0 @@
-from transformers import CLIPTextModel, CLIPTokenizer
-from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
-import torch
-from PIL import Image
-from diffusers import LMSDiscreteScheduler
-from tqdm.auto import tqdm
-from shark.shark_inference import SharkInference
-from torch.fx.experimental.proxy_tensor import make_fx
-from torch._decomp import get_decompositions
-import torch_mlir
-import tempfile
-import numpy as np
-
-# pip install diffusers
-# pip install scipy
-
-############### Parsing args #####################
-import argparse
-
-p = argparse.ArgumentParser(
-    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
-)
-
-p.add_argument(
-    "--prompt",
-    type=str,
-    default="a photograph of an astronaut riding a horse",
-    help="the text prompt to use",
-)
-p.add_argument("--device", type=str, default="cpu", help="the device to use")
-p.add_argument("--steps", type=int, default=10, help="the device to use")
-p.add_argument("--mlir_loc", type=str, default=None, help="the device to use")
-p.add_argument("--vae_loc", type=str, default=None, help="the device to use")
-args = p.parse_args()
-
-#####################################################
-
-
-def load_mlir(mlir_loc):
-    import os
-
-    if mlir_loc == None:
-        return None
-    print(f"Trying to load the model from {mlir_loc}.")
-    with open(os.path.join(mlir_loc)) as f:
-        mlir_module = f.read()
-    return mlir_module
-
-
-def compile_through_fx(model, inputs, mlir_loc=None, extra_args=[]):
-
-    module = load_mlir(mlir_loc)
-    if mlir_loc == None:
-        fx_g = make_fx(
-            model,
-            decomposition_table=get_decompositions(
-                [
-                    torch.ops.aten.embedding_dense_backward,
-                    torch.ops.aten.native_layer_norm_backward,
-                    torch.ops.aten.slice_backward,
-                    torch.ops.aten.select_backward,
-                    torch.ops.aten.norm.ScalarOpt_dim,
-                    torch.ops.aten.native_group_norm,
-                    torch.ops.aten.upsample_bilinear2d.vec,
-                    torch.ops.aten.split.Tensor,
-                    torch.ops.aten.split_with_sizes,
-                ]
-            ),
-        )(*inputs)
-
-        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
-        fx_g.recompile()
-
-        def strip_overloads(gm):
-            """
-            Modifies the target of graph nodes in :attr:`gm` to strip overloads.
-            Args:
-                gm(fx.GraphModule): The input Fx graph module to be modified
-            """
-            for node in gm.graph.nodes:
-                if isinstance(node.target, torch._ops.OpOverload):
-                    node.target = node.target.overloadpacket
-            gm.recompile()
-
-        strip_overloads(fx_g)
-
-        ts_g = torch.jit.script(fx_g)
-
-        module = torch_mlir.compile(
-            ts_g,
-            inputs,
-            torch_mlir.OutputType.LINALG_ON_TENSORS,
-            use_tracing=False,
-            verbose=False,
-        )
-
-    mlir_model = module
-    func_name = "forward"
-
-    shark_module = SharkInference(
-        mlir_model,
-        func_name,
-        device=args.device,
-        mlir_dialect="tm_tensor",
-    )
-    shark_module.compile(extra_args)
-
-    return shark_module
-
-
-if __name__ == "__main__":
-
-    YOUR_TOKEN = "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"
-
-    # 1. Load the autoencoder model which will be used to decode the latents into image space.
-    vae = AutoencoderKL.from_pretrained(
-        "CompVis/stable-diffusion-v1-4",
-        subfolder="vae",
-        use_auth_token=YOUR_TOKEN,
-    )
-
-    # 2. Load the tokenizer and text encoder to tokenize and encode the text.
-    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-    text_encoder = CLIPTextModel.from_pretrained(
-        "openai/clip-vit-large-patch14"
-    )
-
-    class VaeModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.vae = AutoencoderKL.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
-                subfolder="vae",
-                use_auth_token=YOUR_TOKEN,
-            )
-
-        def forward(self, input):
-            return self.vae.decode(input, return_dict=False)[0]
-
-    vae = VaeModel()
-    vae_input = torch.rand(1, 4, 64, 64)
-    shark_vae = compile_through_fx(vae, (vae_input,), args.vae_loc)
-
-    # Wrap the unet model to return tuples.
-    class UnetModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.unet = UNet2DConditionModel.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
-                subfolder="unet",
-                use_auth_token=YOUR_TOKEN,
-            )
-            self.in_channels = self.unet.in_channels
-            self.train(False)
-
-        def forward(self, x, y, z):
-            return self.unet.forward(x, y, z, return_dict=False)[0]
-
-    # 3. The UNet model for generating the latents.
-    unet = UnetModel()
-    latent_model_input = torch.rand([2, 4, 64, 64])
-    text_embeddings = torch.rand([2, 77, 768])
-    shark_unet = compile_through_fx(
-        unet,
-        (latent_model_input, torch.tensor([1.0]), text_embeddings),
-        args.mlir_loc,
-        ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
-    )
-
-    # torch.jit.script(unet)
-
-    scheduler = LMSDiscreteScheduler(
-        beta_start=0.00085,
-        beta_end=0.012,
-        beta_schedule="scaled_linear",
-        num_train_timesteps=1000,
-    )
-
-    prompt = [args.prompt]
-
-    height = 512  # default height of Stable Diffusion
-    width = 512  # default width of Stable Diffusion
-
-    num_inference_steps = args.steps  # Number of denoising steps
-
-    guidance_scale = 7.5  # Scale for classifier-free guidance
-
-    generator = torch.manual_seed(
-        42
-    )  # Seed generator to create the inital latent noise
-
-    batch_size = len(prompt)
-
-    text_input = tokenizer(
-        prompt,
-        padding="max_length",
-        max_length=tokenizer.model_max_length,
-        truncation=True,
-        return_tensors="pt",
-    )
-
-    text_embeddings = text_encoder(text_input.input_ids)[0]
-
-    max_length = text_input.input_ids.shape[-1]
-    uncond_input = tokenizer(
-        [""] * batch_size,
-        padding="max_length",
-        max_length=max_length,
-        return_tensors="pt",
-    )
-    uncond_embeddings = text_encoder(uncond_input.input_ids)[0]
-
-    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-    latents = torch.randn(
-        (batch_size, unet.in_channels, height // 8, width // 8),
-        generator=generator,
-    )
-    # latents = latents.to(torch_device)
-
-    scheduler.set_timesteps(num_inference_steps)
-
-    latents = latents * scheduler.sigmas[0]
-    # print(latents, latents.shape)
-
-    for i, t in tqdm(enumerate(scheduler.timesteps)):
-
-        print(f"i = {i} t = {t}")
-        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
-        latent_model_input = torch.cat([latents] * 2)
-        sigma = scheduler.sigmas[i]
-        latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
-
-        # predict the noise residual
-
-        # with torch.no_grad():
-        # noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)
-
-        latent_model_input_numpy = latent_model_input.detach().numpy()
-        text_embeddings_numpy = text_embeddings.detach().numpy()
-
-        noise_pred = shark_unet.forward(
-            (
-                latent_model_input_numpy,
-                np.array([t]).astype(np.float32),
-                text_embeddings_numpy,
-            )
-        )
-        noise_pred = torch.from_numpy(noise_pred)
-
-        # perform guidance
-        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-        noise_pred = noise_pred_uncond + guidance_scale * (
-            noise_pred_text - noise_pred_uncond
-        )
-
-        # compute the previous noisy sample x_t -> x_t-1
-        latents = scheduler.step(noise_pred, i, latents)["prev_sample"]
-
-    # print("Latents shape : ", latents.shape)
-
-    # scale and decode the image latents with vae
-    latents = 1 / 0.18215 * latents
-    latents_numpy = latents.detach().numpy()
-    image = shark_vae.forward((latents_numpy,))
-    image = torch.from_numpy(image)
-
-    image = (image / 2 + 0.5).clamp(0, 1)
-    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
-    images = (image * 255).round().astype("uint8")
-    pil_images = [Image.fromarray(image) for image in images]
-    pil_images[0].save("astro.jpg")
--- a/shark/examples/shark_inference/stable_diff_f16.py
+++ b/shark/examples/shark_inference/stable_diff_f16.py
@@ -1,280 +0,0 @@
-from transformers import CLIPTextModel, CLIPTokenizer
-from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
-import torch
-from PIL import Image
-from diffusers import LMSDiscreteScheduler
-from tqdm.auto import tqdm
-from shark.shark_inference import SharkInference
-from torch.fx.experimental.proxy_tensor import make_fx
-from torch._decomp import get_decompositions
-import torch_mlir
-import tempfile
-import numpy as np
-
-# pip install diffusers
-# pip install scipy
-
-############### Parsing args #####################
-import argparse
-
-p = argparse.ArgumentParser(
-    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
-)
-
-p.add_argument(
-    "--prompt",
-    type=str,
-    default="a photograph of an astronaut riding a horse",
-    help="the text prompt to use",
-)
-p.add_argument("--device", type=str, default="cpu", help="the device to use")
-p.add_argument("--steps", type=int, default=50, help="the device to use")
-p.add_argument("--mlir_loc", type=str, default=None, help="the device to use")
-p.add_argument("--vae_loc", type=str, default=None, help="the device to use")
-args = p.parse_args()
-
-#####################################################
-
-
-def fp16_unet():
-    from shark.shark_downloader import download_model
-
-    mlir_model, func_name, inputs, golden_out = download_model(
-        "stable_diff_f16_18_OCT",
-        tank_url="gs://shark_tank/prashant_nod",
-        frontend="torch",
-    )
-    shark_module = SharkInference(
-        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
-    )
-    shark_module.compile()
-    return shark_module
-
-
-def load_mlir(mlir_loc):
-    import os
-
-    if mlir_loc == None:
-        return None
-    print(f"Trying to load the model from {mlir_loc}.")
-    with open(os.path.join(mlir_loc)) as f:
-        mlir_module = f.read()
-    return mlir_module
-
-
-def compile_through_fx(model, inputs, mlir_loc=None):
-
-    module = load_mlir(mlir_loc)
-    if mlir_loc == None:
-        fx_g = make_fx(
-            model,
-            decomposition_table=get_decompositions(
-                [
-                    torch.ops.aten.embedding_dense_backward,
-                    torch.ops.aten.native_layer_norm_backward,
-                    torch.ops.aten.slice_backward,
-                    torch.ops.aten.select_backward,
-                    torch.ops.aten.norm.ScalarOpt_dim,
-                    torch.ops.aten.native_group_norm,
-                    torch.ops.aten.upsample_bilinear2d.vec,
-                    torch.ops.aten.split.Tensor,
-                    torch.ops.aten.split_with_sizes,
-                ]
-            ),
-        )(*inputs)
-
-        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
-        fx_g.recompile()
-
-        def strip_overloads(gm):
-            """
-            Modifies the target of graph nodes in :attr:`gm` to strip overloads.
-            Args:
-                gm(fx.GraphModule): The input Fx graph module to be modified
-            """
-            for node in gm.graph.nodes:
-                if isinstance(node.target, torch._ops.OpOverload):
-                    node.target = node.target.overloadpacket
-            gm.recompile()
-
-        strip_overloads(fx_g)
-
-        ts_g = torch.jit.script(fx_g)
-
-        module = torch_mlir.compile(
-            ts_g,
-            inputs,
-            torch_mlir.OutputType.LINALG_ON_TENSORS,
-            use_tracing=False,
-            verbose=False,
-        )
-
-    mlir_model = module
-    func_name = "forward"
-
-    shark_module = SharkInference(
-        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
-    )
-    shark_module.compile()
-
-    return shark_module
-
-
-if __name__ == "__main__":
-
-    YOUR_TOKEN = "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"
-
-    # 1. Load the autoencoder model which will be used to decode the latents into image space.
-    vae = AutoencoderKL.from_pretrained(
-        "CompVis/stable-diffusion-v1-4",
-        subfolder="vae",
-        use_auth_token=YOUR_TOKEN,
-    )
-
-    # 2. Load the tokenizer and text encoder to tokenize and encode the text.
-    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-    text_encoder = CLIPTextModel.from_pretrained(
-        "openai/clip-vit-large-patch14"
-    )
-
-    class VaeModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.vae = AutoencoderKL.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
-                subfolder="vae",
-                use_auth_token=YOUR_TOKEN,
-            )
-
-        def forward(self, input):
-            return self.vae.decode(input, return_dict=False)[0]
-
-    vae = VaeModel()
-    vae_input = torch.rand(1, 4, 64, 64)
-    shark_vae = compile_through_fx(vae, (vae_input,), args.vae_loc)
-
-    # Wrap the unet model to return tuples.
-    class UnetModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.unet = UNet2DConditionModel.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
-                subfolder="unet",
-                use_auth_token=YOUR_TOKEN,
-            )
-            self.in_channels = self.unet.in_channels
-            self.train(False)
-
-    def forward(self, x, y, z):
-        return self.unet.forward(x, y, z, return_dict=False)[0]
-
-    # # 3. The UNet model for generating the latents.
-    unet = UnetModel()
-
-    shark_unet = fp16_unet()
-
-    scheduler = LMSDiscreteScheduler(
-        beta_start=0.00085,
-        beta_end=0.012,
-        beta_schedule="scaled_linear",
-        num_train_timesteps=1000,
-    )
-
-    prompt = [args.prompt]
-
-    height = 512  # default height of Stable Diffusion
-    width = 512  # default width of Stable Diffusion
-
-    num_inference_steps = args.steps  # Number of denoising steps
-
-    guidance_scale = 7.5  # Scale for classifier-free guidance
-
-    generator = torch.manual_seed(
-        42
-    )  # Seed generator to create the inital latent noise
-
-    batch_size = len(prompt)
-
-    text_input = tokenizer(
-        prompt,
-        padding="max_length",
-        max_length=tokenizer.model_max_length,
-        truncation=True,
-        return_tensors="pt",
-    )
-
-    text_embeddings = text_encoder(text_input.input_ids)[0]
-
-    max_length = text_input.input_ids.shape[-1]
-    uncond_input = tokenizer(
-        [""] * batch_size,
-        padding="max_length",
-        max_length=max_length,
-        return_tensors="pt",
-    )
-    uncond_embeddings = text_encoder(uncond_input.input_ids)[0]
-
-    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-    latents = torch.randn(
-        (batch_size, unet.in_channels, height // 8, width // 8),
-        generator=generator,
-    )
-    # latents = latents.to(torch_device)
-
-    scheduler.set_timesteps(num_inference_steps)
-
-    latents = latents * scheduler.sigmas[0]
-    # print(latents, latents.shape)
-
-    for i, t in tqdm(enumerate(scheduler.timesteps)):
-
-        print(f"i = {i} t = {t}")
-        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
-        latent_model_input = torch.cat([latents] * 2)
-        sigma = scheduler.sigmas[i]
-        latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
-
-        # predict the noise residual
-
-        # with torch.no_grad():
-        # noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)
-
-        latent_model_input_numpy = (
-            latent_model_input.detach().numpy().astype(np.half)
-        )
-        text_embeddings_numpy = (
-            text_embeddings.detach().numpy().astype(np.half)
-        )
-
-        noise_pred = shark_unet.forward(
-            (
-                latent_model_input_numpy,
-                np.array([t]).astype(np.half),
-                text_embeddings_numpy,
-            )
-        )
-        noise_pred = torch.from_numpy(noise_pred).to(torch.float32)
-
-        # perform guidance
-        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-        noise_pred = noise_pred_uncond + guidance_scale * (
-            noise_pred_text - noise_pred_uncond
-        )
-
-        # compute the previous noisy sample x_t -> x_t-1
-        latents = scheduler.step(noise_pred, i, latents)["prev_sample"]
-
-    # print("Latents shape : ", latents.shape)
-
-    # scale and decode the image latents with vae
-    latents = 1 / 0.18215 * latents
-    latents_numpy = latents.detach().numpy()
-    image = shark_vae.forward((latents_numpy,))
-    image = torch.from_numpy(image)
-
-    image = (image / 2 + 0.5).clamp(0, 1)
-    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
-    images = (image * 255).round().astype("uint8")
-    pil_images = [Image.fromarray(image) for image in images]
-    pil_images[0].save("astro.jpg")
--- a/shark/examples/shark_inference/stable_diff_tf.py
+++ b/shark/examples/shark_inference/stable_diff_tf.py
@@ -1,313 +0,0 @@
-import math
-import numpy as np
-import tensorflow as tf
-from tensorflow import keras
-from keras_cv.models.generative.stable_diffusion.clip_tokenizer import (
-    SimpleTokenizer,
-)
-from keras_cv.models.generative.stable_diffusion.constants import (
-    _ALPHAS_CUMPROD,
-)
-from keras_cv.models.generative.stable_diffusion.constants import (
-    _UNCONDITIONAL_TOKENS,
-)
-from keras_cv.models.generative.stable_diffusion.decoder import Decoder
-from keras_cv.models.generative.stable_diffusion.text_encoder import (
-    TextEncoder,
-)
-
-from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_model
-from PIL import Image
-
-# pip install "git+https://github.com/keras-team/keras-cv.git"
-# pip install tensorflow_dataset
-
-############### Parsing args #####################
-import argparse
-
-p = argparse.ArgumentParser(
-    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
-)
-
-p.add_argument(
-    "--prompt",
-    type=str,
-    default="a photograph of an astronaut riding a horse",
-    help="the text prompt to use",
-)
-p.add_argument("--device", type=str, default="cpu", help="the device to use")
-p.add_argument(
-    "--steps", type=int, default=10, help="the number of steps to use"
-)
-p.add_argument(
-    "--save_path",
-    type=str,
-    default=None,
-    help="the file to save the resulting image to. (default to <input prompt>.jpg)",
-)
-args = p.parse_args()
-
-#####################################################
-
-MAX_PROMPT_LENGTH = 77
-
-
-class SharkStableDiffusion:
-    """Shark implementation of Stable Diffusion based on model from keras_cv.
-    Stable Diffusion is a powerful image generation model that can be used,
-    among other things, to generate pictures according to a short text description
-    (called a "prompt").
-    Arguments:
-        device: Device to use with SHARK. Default: cpu
-        jit_compile: Whether to compile the underlying models to XLA.
-            This can lead to a significant speedup on some systems. Default: False.
-    References:
-    - [About Stable Diffusion](https://stability.ai/blog/stable-diffusion-announcement)
-    - [Original implementation](https://github.com/CompVis/stable-diffusion)
-    """
-
-    def __init__(self, device="cpu", jit_compile=True):
-        self.img_height = 512
-        self.img_width = 512
-        self.tokenizer = SimpleTokenizer()
-
-        # Create models
-        self.text_encoder = TextEncoder(MAX_PROMPT_LENGTH)
-
-        mlir_model, func_name, inputs, golden_out = download_model(
-            "stable_diff", tank_url="gs://shark_tank/quinn", frontend="tf"
-        )
-        shark_module = SharkInference(
-            mlir_model, func_name, device=device, mlir_dialect="mhlo"
-        )
-        shark_module.compile()
-        self.diffusion_model = shark_module
-        self.decoder = Decoder(self.img_height, self.img_width)
-        if jit_compile:
-            self.text_encoder.compile(jit_compile=True)
-            self.decoder.compile(jit_compile=True)
-
-        print(
-            "By using this model checkpoint, you acknowledge that its usage is "
-            "subject to the terms of the CreativeML Open RAIL-M license at "
-            "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/LICENSE"
-        )
-        # Load weights
-        text_encoder_weights_fpath = keras.utils.get_file(
-            origin="https://huggingface.co/fchollet/stable-diffusion/resolve/main/kcv_encoder.h5",
-            file_hash="4789e63e07c0e54d6a34a29b45ce81ece27060c499a709d556c7755b42bb0dc4",
-        )
-        decoder_weights_fpath = keras.utils.get_file(
-            origin="https://huggingface.co/fchollet/stable-diffusion/resolve/main/kcv_decoder.h5",
-            file_hash="ad350a65cc8bc4a80c8103367e039a3329b4231c2469a1093869a345f55b1962",
-        )
-        self.text_encoder.load_weights(text_encoder_weights_fpath)
-        self.decoder.load_weights(decoder_weights_fpath)
-
-    def text_to_image(
-        self,
-        prompt,
-        batch_size=1,
-        num_steps=25,
-        unconditional_guidance_scale=7.5,
-        seed=None,
-    ):
-        encoded_text = self.encode_text(prompt)
-
-        return self.generate_image(
-            encoded_text,
-            batch_size=batch_size,
-            num_steps=num_steps,
-            unconditional_guidance_scale=unconditional_guidance_scale,
-            seed=seed,
-        )
-
-    def encode_text(self, prompt):
-        """Encodes a prompt into a latent text encoding.
-        The encoding produced by this method should be used as the
-        `encoded_text` parameter of `StableDiffusion.generate_image`. Encoding
-        text separately from generating an image can be used to arbitrarily
-        modify the text encoding priot to image generation, e.g. for walking
-        between two prompts.
-        Args:
-            prompt: a string to encode, must be 77 tokens or shorter.
-        Example:
-        ```python
-        from keras_cv.models import StableDiffusion
-        model = StableDiffusion(img_height=512, img_width=512, jit_compile=True)
-        encoded_text  = model.encode_text("Tacos at dawn")
-        img = model.generate_image(encoded_text)
-        ```
-        """
-        # Tokenize prompt (i.e. starting context)
-        inputs = self.tokenizer.encode(prompt)
-        if len(inputs) > MAX_PROMPT_LENGTH:
-            raise ValueError(
-                f"Prompt is too long (should be <= {MAX_PROMPT_LENGTH} tokens)"
-            )
-        phrase = inputs + [49407] * (MAX_PROMPT_LENGTH - len(inputs))
-        phrase = tf.convert_to_tensor([phrase], dtype=tf.int32)
-
-        context = self.text_encoder.predict_on_batch(
-            [phrase, self._get_pos_ids()]
-        )
-
-        return context
-
-    def generate_image(
-        self,
-        encoded_text,
-        batch_size=1,
-        num_steps=25,
-        unconditional_guidance_scale=7.5,
-        diffusion_noise=None,
-        seed=None,
-    ):
-        """Generates an image based on encoded text.
-        The encoding passed to this method should be derived from
-        `StableDiffusion.encode_text`.
-        Args:
-            encoded_text: Tensor of shape (`batch_size`, 77, 768), or a Tensor
-            of shape (77, 768). When the batch axis is omitted, the same encoded
-            text will be used to produce every generated image.
-            batch_size: number of images to generate. Default: 1.
-            num_steps: number of diffusion steps (controls image quality).
-                Default: 25.
-            unconditional_guidance_scale: float controling how closely the image
-                should adhere to the prompt. Larger values result in more
-                closely adhering to the prompt, but will make the image noisier.
-                Default: 7.5.
-            diffusion_noise: Tensor of shape (`batch_size`, img_height // 8,
-                img_width // 8, 4), or a Tensor of shape (img_height // 8,
-                img_width // 8, 4). Optional custom noise to seed the diffusion
-                process. When the batch axis is omitted, the same noise will be
-                used to seed diffusion for every generated image.
-            seed: integer which is used to seed the random generation of
-                diffusion noise, only to be specified if `diffusion_noise` is
-                None.
-        Example:
-        ```python
-        from keras_cv.models import StableDiffusion
-        batch_size = 8
-        model = StableDiffusion(img_height=512, img_width=512, jit_compile=True)
-        e_tacos = model.encode_text("Tacos at dawn")
-        e_watermelons = model.encode_text("Watermelons at dusk")
-        e_interpolated = tf.linspace(e_tacos, e_watermelons, batch_size)
-        images = model.generate_image(e_interpolated, batch_size=batch_size)
-        ```
-        """
-        if diffusion_noise is not None and seed is not None:
-            raise ValueError(
-                "`diffusion_noise` and `seed` should not both be passed to "
-                "`generate_image`. `seed` is only used to generate diffusion "
-                "noise when it's not already user-specified."
-            )
-
-        encoded_text = tf.squeeze(encoded_text)
-        if encoded_text.shape.rank == 2:
-            encoded_text = tf.repeat(
-                tf.expand_dims(encoded_text, axis=0), batch_size, axis=0
-            )
-
-        context = encoded_text
-        unconditional_context = tf.repeat(
-            self._get_unconditional_context(), batch_size, axis=0
-        )
-        context = tf.concat([context, unconditional_context], 0)
-
-        if diffusion_noise is not None:
-            diffusion_noise = tf.squeeze(diffusion_noise)
-            if diffusion_noise.shape.rank == 3:
-                diffusion_noise = tf.repeat(
-                    tf.expand_dims(diffusion_noise, axis=0), batch_size, axis=0
-                )
-            latent = diffusion_noise
-        else:
-            latent = self._get_initial_diffusion_noise(batch_size, seed)
-
-        # Iterative reverse diffusion stage
-        timesteps = tf.range(1, 1000, 1000 // num_steps)
-        alphas, alphas_prev = self._get_initial_alphas(timesteps)
-        progbar = keras.utils.Progbar(len(timesteps))
-        iteration = 0
-        for index, timestep in list(enumerate(timesteps))[::-1]:
-            latent_prev = latent  # Set aside the previous latent vector
-            t_emb = self._get_timestep_embedding(timestep, batch_size)
-
-            # Prepare the latent and unconditional latent to be run with a single forward call
-            latent = tf.concat([latent, latent], 0)
-            t_emb = tf.concat([t_emb, t_emb], 0)
-            latent_numpy = self.diffusion_model.forward(
-                [latent.numpy(), t_emb.numpy(), context.numpy()]
-            )
-            latent = tf.convert_to_tensor(latent_numpy, dtype=tf.float32)
-            latent, unconditional_latent = tf.split(latent, 2)
-
-            latent = unconditional_latent + unconditional_guidance_scale * (
-                latent - unconditional_latent
-            )
-            a_t, a_prev = alphas[index], alphas_prev[index]
-            pred_x0 = (latent_prev - math.sqrt(1 - a_t) * latent) / math.sqrt(
-                a_t
-            )
-            latent = (
-                latent * math.sqrt(1.0 - a_prev) + math.sqrt(a_prev) * pred_x0
-            )
-            iteration += 1
-            progbar.update(iteration)
-
-        # Decoding stage
-        decoded = self.decoder.predict_on_batch(latent)
-        decoded = ((decoded + 1) / 2) * 255
-        return np.clip(decoded, 0, 255).astype("uint8")
-
-    def _get_unconditional_context(self):
-        unconditional_tokens = tf.convert_to_tensor(
-            [_UNCONDITIONAL_TOKENS], dtype=tf.int32
-        )
-        unconditional_context = self.text_encoder.predict_on_batch(
-            [unconditional_tokens, self._get_pos_ids()]
-        )
-
-        return unconditional_context
-
-    def _get_timestep_embedding(
-        self, timestep, batch_size, dim=320, max_period=10000
-    ):
-        half = dim // 2
-        freqs = tf.math.exp(
-            -math.log(max_period) * tf.range(0, half, dtype=tf.float32) / half
-        )
-        args = tf.convert_to_tensor([timestep], dtype=tf.float32) * freqs
-        embedding = tf.concat([tf.math.cos(args), tf.math.sin(args)], 0)
-        embedding = tf.reshape(embedding, [1, -1])
-        return tf.repeat(embedding, batch_size, axis=0)
-
-    def _get_initial_alphas(self, timesteps):
-        alphas = [_ALPHAS_CUMPROD[t] for t in timesteps]
-        alphas_prev = [1.0] + alphas[:-1]
-
-        return alphas, alphas_prev
-
-    def _get_initial_diffusion_noise(self, batch_size, seed):
-        return tf.random.normal(
-            (batch_size, self.img_height // 8, self.img_width // 8, 4),
-            seed=seed,
-        )
-
-    @staticmethod
-    def _get_pos_ids():
-        return tf.convert_to_tensor(
-            [list(range(MAX_PROMPT_LENGTH))], dtype=tf.int32
-        )
-
-
-if __name__ == "__main__":
-    SD = SharkStableDiffusion(device=args.device)
-    images = SD.text_to_image(args.prompt, num_steps=args.steps)
-    pil_images = [Image.fromarray(image) for image in images]
-    save_fname = args.prompt + ".jpg"
-    if args.save_path is not None:
-        save_fname = args.save_path
-    pil_images[0].save(save_fname)
--- a/shark/examples/shark_inference/stable_diffusion/.gitignore
+++ b/shark/examples/shark_inference/stable_diffusion/.gitignore
@@ -1,2 +0,0 @@
-*.vmfb
-*.jpg
--- a/shark/examples/shark_inference/stable_diffusion/README.md
+++ b/shark/examples/shark_inference/stable_diffusion/README.md
@@ -1,56 +0,0 @@
-# STABLE DIFFUSION
-
-## Installation
-
-Follow setup instructions in the main [README.md](https://github.com/nod-ai/SHARK#readme) for regular usage. 
-
-## Debug commands and other advanced usage follows.
-
-```shell
-python main.py --precision="fp32"|"fp16" --device="cpu"|"cuda"|"vulkan" --import_mlir|--no-import_mlir --prompt "enter the text" 
-
-```
-
-## dump all dispatch .spv and isa using amdllpc
-
-```shell
-python main.py --precision="fp16" --device="vulkan" --iree-vulkan-target-triple=rdna3-unknown-linux --no-load_vmfb --dispatch_benchmarks="all" --dispatch_benchmarks_dir="SD_dispatches" --dump_isa
-```
-
-## Compile and save the .vmfb (using vulkan fp16 as an example):
-
-```shell
-python shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb
-```
-
-## Capture an RGP trace
-
-```shell
-python shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb --enable_rgp
-```
-
-## Run the vae module with iree-benchmark-module (NCHW, fp16, vulkan, for example):
-
-```shell
-iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --device=vulkan --function_input=1x4x64x64xf16  
-```
-
-## Run the unet module with iree-benchmark-module (same config as above):
-```shell
-##if you want to use .npz inputs:
-unzip ~/.local/shark_tank/<your unet>/inputs.npz
-
-iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --function_input=@arr_0.npy --function_input=1xf16 --function_input=@arr_2.npy --function_input=@arr_3.npy --function_input=@arr_4.npy  
-```
-
-## Using other supported Stable Diffusion variants with SHARK:
-
-Currently we support the following fine-tuned versions of Stable Diffusion:
- [AnythingV3](https://huggingface.co/Linaqruf/anything-v3.0)
- [Analog Diffusion](https://huggingface.co/wavymulder/Analog-Diffusion)
-
-use the flag `--variant=` to specify the model to be used.
-
-```shell
-python .\shark\examples\shark_inference\stable_diffusion\main.py --variant=anythingv3 --max_length=77 --prompt="1girl, brown hair, green eyes, colorful, autumn, cumulonimbus clouds, lighting, blue sky, falling leaves, garden"
-```
--- a/shark/examples/shark_inference/stable_diffusion/download_hf_models.py
+++ b/shark/examples/shark_inference/stable_diffusion/download_hf_models.py
@@ -1,25 +0,0 @@
-from PIL import Image
-import requests
-
-from transformers import CLIPProcessor, CLIPModel
-
-model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
-processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
-
-url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-image = Image.open(requests.get(url, stream=True).raw)
-
-inputs = processor(
-    text=["a photo of a cat", "a photo of a dog"],
-    images=image,
-    return_tensors="pt",
-    padding=True,
-)
-
-outputs = model(**inputs)
-logits_per_image = (
-    outputs.logits_per_image
-)  # this is the image-text similarity score
-probs = logits_per_image.softmax(
-    dim=1
-)  # we can take the softmax to get the label probabilities
--- a/shark/examples/shark_inference/stable_diffusion/main.py
+++ b/shark/examples/shark_inference/stable_diffusion/main.py
@@ -1,280 +0,0 @@
-import os
-
-os.environ["AMD_ENABLE_LLPC"] = "1"
-
-from transformers import CLIPTextModel, CLIPTokenizer
-import torch
-from PIL import Image
-import torchvision.transforms as T
-from diffusers import (
-    LMSDiscreteScheduler,
-    PNDMScheduler,
-    DDIMScheduler,
-    DPMSolverMultistepScheduler,
-    EulerDiscreteScheduler,
-)
-from tqdm.auto import tqdm
-import numpy as np
-from random import randint
-from stable_args import args
-from datetime import datetime as dt
-import json
-import re
-from pathlib import Path
-
-# This has to come before importing cache objects
-if args.clear_all:
-    print("CLEARING ALL, EXPECT SEVERAL MINUTES TO RECOMPILE")
-    from glob import glob
-    import shutil
-
-    vmfbs = glob(os.path.join(os.getcwd(), "*.vmfb"))
-    for vmfb in vmfbs:
-        if os.path.exists(vmfb):
-            os.remove(vmfb)
-    home = os.path.expanduser("~")
-    if os.name == "nt":  # Windows
-        appdata = os.getenv("LOCALAPPDATA")
-        shutil.rmtree(os.path.join(appdata, "AMD/VkCache"), ignore_errors=True)
-        shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
-    elif os.name == "unix":
-        shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"))
-        shutil.rmtree(os.path.join(home, ".local/shark_tank"))
-
-
-from utils import set_init_device_flags
-
-from opt_params import get_unet, get_vae, get_clip
-from schedulers import (
-    SharkEulerDiscreteScheduler,
-)
-import time
-import sys
-from shark.iree_utils.compile_utils import dump_isas
-
-# Helper function to profile the vulkan device.
-def start_profiling(file_path="foo.rdc", profiling_mode="queue"):
-    if args.vulkan_debug_utils and "vulkan" in args.device:
-        import iree
-
-        print(f"Profiling and saving to {file_path}.")
-        vulkan_device = iree.runtime.get_device(args.device)
-        vulkan_device.begin_profiling(mode=profiling_mode, file_path=file_path)
-        return vulkan_device
-    return None
-
-
-def end_profiling(device):
-    if device:
-        return device.end_profiling()
-
-
-if __name__ == "__main__":
-
-    dtype = torch.float32 if args.precision == "fp32" else torch.half
-
-    prompt = args.prompts
-    neg_prompt = args.negative_prompts
-    height = 512  # default height of Stable Diffusion
-    width = 512  # default width of Stable Diffusion
-    if args.version == "v2_1":
-        height = 768
-        width = 768
-
-    num_inference_steps = args.steps  # Number of denoising steps
-
-    # Scale for classifier-free guidance
-    guidance_scale = torch.tensor(args.guidance_scale).to(torch.float32)
-
-    # Handle out of range seeds.
-    uint32_info = np.iinfo(np.uint32)
-    uint32_min, uint32_max = uint32_info.min, uint32_info.max
-    seed = args.seed
-    if seed < uint32_min or seed >= uint32_max:
-        seed = randint(uint32_min, uint32_max)
-    generator = torch.manual_seed(
-        seed
-    )  # Seed generator to create the inital latent noise
-
-    # TODO: Add support for batch_size > 1.
-    batch_size = len(prompt)
-    if batch_size != 1:
-        sys.exit("More than one prompt is not supported yet.")
-    if batch_size != len(neg_prompt):
-        sys.exit("prompts and negative prompts must be of same length")
-
-    set_init_device_flags()
-    clip = get_clip()
-    unet = get_unet()
-    vae = get_vae()
-    if args.dump_isa:
-        dump_isas(args.dispatch_benchmarks_dir)
-
-    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-    scheduler = DPMSolverMultistepScheduler.from_pretrained(
-        "CompVis/stable-diffusion-v1-4",
-        subfolder="scheduler",
-    )
-    cpu_scheduling = True
-    if args.version == "v2_1":
-        tokenizer = CLIPTokenizer.from_pretrained(
-            "stabilityai/stable-diffusion-2-1", subfolder="tokenizer"
-        )
-
-        scheduler = DPMSolverMultistepScheduler.from_pretrained(
-            "stabilityai/stable-diffusion-2-1",
-            subfolder="scheduler",
-        )
-
-    if args.version == "v2_1base" and args.variant == "stablediffusion":
-        tokenizer = CLIPTokenizer.from_pretrained(
-            "stabilityai/stable-diffusion-2-1-base", subfolder="tokenizer"
-        )
-
-        if args.use_compiled_scheduler:
-            scheduler = SharkEulerDiscreteScheduler.from_pretrained(
-                "stabilityai/stable-diffusion-2-1-base",
-                subfolder="scheduler",
-            )
-            scheduler.compile()
-            cpu_scheduling = False
-        else:
-            scheduler = EulerDiscreteScheduler.from_pretrained(
-                "stabilityai/stable-diffusion-2-1-base",
-                subfolder="scheduler",
-            )
-
-    # create a random initial latent.
-    latents = torch.randn(
-        (batch_size, 4, height // 8, width // 8),
-        generator=generator,
-        dtype=torch.float32,
-    ).to(dtype)
-    # Warmup phase to improve performance.
-    if args.warmup_count >= 1:
-        vae_warmup_input = torch.clone(latents).detach().numpy()
-        clip_warmup_input = torch.randint(1, 2, (2, args.max_length))
-    for i in range(args.warmup_count):
-        vae("forward", (vae_warmup_input,))
-        clip("forward", (clip_warmup_input,))
-
-    start = time.time()
-
-    text_input = tokenizer(
-        prompt,
-        padding="max_length",
-        max_length=args.max_length,
-        truncation=True,
-        return_tensors="pt",
-    )
-    max_length = text_input.input_ids.shape[-1]
-    uncond_input = tokenizer(
-        neg_prompt,
-        padding="max_length",
-        max_length=max_length,
-        truncation=True,
-        return_tensors="pt",
-    )
-    text_input = torch.cat([uncond_input.input_ids, text_input.input_ids])
-
-    clip_inf_start = time.time()
-    text_embeddings = clip("forward", (text_input,))
-    clip_inf_end = time.time()
-    text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
-    text_embeddings_numpy = text_embeddings.detach().numpy()
-
-    scheduler.set_timesteps(num_inference_steps)
-    scheduler.is_scale_input_called = True
-
-    latents = latents * scheduler.init_noise_sigma
-
-    avg_ms = 0
-    for i, t in tqdm(enumerate(scheduler.timesteps), disable=args.hide_steps):
-        step_start = time.time()
-        if not args.hide_steps:
-            print(f"i = {i} t = {t}", end="")
-        timestep = torch.tensor([t]).to(dtype).detach().numpy()
-        latent_model_input = scheduler.scale_model_input(latents, t)
-        if cpu_scheduling:
-            latent_model_input = latent_model_input.detach().numpy()
-
-        profile_device = start_profiling(file_path="unet.rdc")
-
-        noise_pred = unet(
-            "forward",
-            (
-                latent_model_input,
-                timestep,
-                text_embeddings_numpy,
-                guidance_scale,
-            ),
-            send_to_host=False,
-        )
-
-        end_profiling(profile_device)
-
-        if cpu_scheduling:
-            noise_pred = torch.from_numpy(noise_pred.to_host())
-            latents = scheduler.step(noise_pred, t, latents).prev_sample
-        else:
-            latents = scheduler.step(noise_pred, t, latents)
-        step_time = time.time() - step_start
-        avg_ms += step_time
-        step_ms = int((step_time) * 1000)
-        if not args.hide_steps:
-            print(f" ({step_ms}ms)")
-
-    # scale and decode the image latents with vae
-    if args.use_base_vae:
-        latents = 1 / 0.18215 * latents
-    latents_numpy = latents
-    if cpu_scheduling:
-        latents_numpy = latents.detach().numpy()
-    profile_device = start_profiling(file_path="vae.rdc")
-    vae_start = time.time()
-    images = vae("forward", (latents_numpy,))
-    vae_end = time.time()
-    end_profiling(profile_device)
-    if args.use_base_vae:
-        image = torch.from_numpy(images)
-        image = (image.detach().cpu() * 255.0).numpy()
-        images = image.round()
-    end_time = time.time()
-
-    avg_ms = 1000 * avg_ms / args.steps
-    clip_inf_time = (clip_inf_end - clip_inf_start) * 1000
-    vae_inf_time = (vae_end - vae_start) * 1000
-    total_time = end_time - start
-    print(f"\nAverage step time: {avg_ms}ms/it")
-    print(f"Clip Inference time (ms) = {clip_inf_time:.3f}")
-    print(f"VAE Inference time (ms): {vae_inf_time:.3f}")
-    print(f"\nTotal image generation time: {total_time}sec")
-
-    transform = T.ToPILImage()
-    pil_images = [
-        transform(image) for image in torch.from_numpy(images).to(torch.uint8)
-    ]
-
-    if args.output_dir is not None:
-        output_path = Path(args.output_dir)
-        output_path.mkdir(parents=True, exist_ok=True)
-    else:
-        output_path = Path.cwd()
-    for i in range(batch_size):
-        json_store = {
-            "prompt": args.prompts[i],
-            "negative prompt": args.negative_prompts[i],
-            "seed": args.seed,
-            "variant": args.variant,
-            "precision": args.precision,
-            "steps": args.steps,
-            "guidance_scale": args.guidance_scale,
-            "scheduler": args.scheduler,
-        }
-        prompt_slice = re.sub("[^a-zA-Z0-9]", "_", args.prompts[i][:15])
-        img_name = f"{prompt_slice}_{args.seed}_{i}_{dt.now().strftime('%y%m%d_%H%M%S')}"
-        pil_images[i].save(
-            output_path / f"{img_name}.jpg", quality=95, subsampling=0
-        )
-        with open(output_path / f"{img_name}.json", "w") as f:
-            f.write(json.dumps(json_store, indent=4))
--- a/shark/examples/shark_inference/stable_diffusion/model_wrappers.py
+++ b/shark/examples/shark_inference/stable_diffusion/model_wrappers.py
@@ -1,285 +0,0 @@
-from diffusers import AutoencoderKL, UNet2DConditionModel
-from transformers import CLIPTextModel
-from utils import compile_through_fx
-from stable_args import args
-import torch
-
-model_config = {
-    "v2_1": "stabilityai/stable-diffusion-2-1",
-    "v2_1base": "stabilityai/stable-diffusion-2-1-base",
-    "v1_4": "CompVis/stable-diffusion-v1-4",
-}
-
-# clip has 2 variants of max length 77 or 64.
-model_clip_max_length = 64 if args.max_length == 64 else 77
-if args.variant in ["anythingv3", "analogdiffusion", "dreamlike"]:
-    model_clip_max_length = 77
-elif args.variant == "openjourney":
-    model_clip_max_length = 64
-
-model_variant = {
-    "stablediffusion": "SD",
-    "anythingv3": "Linaqruf/anything-v3.0",
-    "dreamlike": "dreamlike-art/dreamlike-diffusion-1.0",
-    "openjourney": "prompthero/openjourney",
-    "analogdiffusion": "wavymulder/Analog-Diffusion",
-}
-
-model_input = {
-    "v2_1": {
-        "clip": (torch.randint(1, 2, (2, model_clip_max_length)),),
-        "vae": (torch.randn(1, 4, 96, 96),),
-        "unet": (
-            torch.randn(1, 4, 96, 96),  # latents
-            torch.tensor([1]).to(torch.float32),  # timestep
-            torch.randn(2, model_clip_max_length, 1024),  # embedding
-            torch.tensor(1).to(torch.float32),  # guidance_scale
-        ),
-    },
-    "v2_1base": {
-        "clip": (torch.randint(1, 2, (2, model_clip_max_length)),),
-        "vae": (torch.randn(1, 4, 64, 64),),
-        "unet": (
-            torch.randn(1, 4, 64, 64),  # latents
-            torch.tensor([1]).to(torch.float32),  # timestep
-            torch.randn(2, model_clip_max_length, 1024),  # embedding
-            torch.tensor(1).to(torch.float32),  # guidance_scale
-        ),
-    },
-    "v1_4": {
-        "clip": (torch.randint(1, 2, (2, model_clip_max_length)),),
-        "vae": (torch.randn(1, 4, 64, 64),),
-        "unet": (
-            torch.randn(1, 4, 64, 64),
-            torch.tensor([1]).to(torch.float32),  # timestep
-            torch.randn(2, model_clip_max_length, 768),
-            torch.tensor(1).to(torch.float32),
-        ),
-    },
-}
-
-# revision param for from_pretrained defaults to "main" => fp32
-model_revision = {
-    "stablediffusion": "fp16" if args.precision == "fp16" else "main",
-    "anythingv3": "diffusers",
-    "analogdiffusion": "main",
-    "openjourney": "main",
-    "dreamlike": "main",
-}
-
-
-def get_clip_mlir(model_name="clip_text", extra_args=[]):
-
-    text_encoder = CLIPTextModel.from_pretrained(
-        "openai/clip-vit-large-patch14"
-    )
-    if args.variant == "stablediffusion":
-        if args.version != "v1_4":
-            text_encoder = CLIPTextModel.from_pretrained(
-                model_config[args.version], subfolder="text_encoder"
-            )
-
-    elif args.variant in [
-        "anythingv3",
-        "analogdiffusion",
-        "openjourney",
-        "dreamlike",
-    ]:
-        text_encoder = CLIPTextModel.from_pretrained(
-            model_variant[args.variant],
-            subfolder="text_encoder",
-            revision=model_revision[args.variant],
-        )
-    else:
-        raise ValueError(f"{args.variant} not yet added")
-
-    class CLIPText(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.text_encoder = text_encoder
-
-        def forward(self, input):
-            return self.text_encoder(input)[0]
-
-    clip_model = CLIPText()
-    shark_clip = compile_through_fx(
-        clip_model,
-        model_input[args.version]["clip"],
-        model_name=model_name,
-        extra_args=extra_args,
-    )
-    return shark_clip
-
-
-def get_base_vae_mlir(model_name="vae", extra_args=[]):
-    class BaseVaeModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.vae = AutoencoderKL.from_pretrained(
-                model_config[args.version]
-                if args.variant == "stablediffusion"
-                else model_variant[args.variant],
-                subfolder="vae",
-                revision=model_revision[args.variant],
-            )
-
-        def forward(self, input):
-            x = self.vae.decode(input, return_dict=False)[0]
-            return (x / 2 + 0.5).clamp(0, 1)
-
-    vae = BaseVaeModel()
-    if args.variant == "stablediffusion":
-        if args.precision == "fp16":
-            vae = vae.half().cuda()
-            inputs = tuple(
-                [
-                    inputs.half().cuda()
-                    for inputs in model_input[args.version]["vae"]
-                ]
-            )
-        else:
-            inputs = model_input[args.version]["vae"]
-    elif args.variant in [
-        "anythingv3",
-        "analogdiffusion",
-        "openjourney",
-        "dreamlike",
-    ]:
-        if args.precision == "fp16":
-            vae = vae.half().cuda()
-            inputs = tuple(
-                [inputs.half().cuda() for inputs in model_input["v1_4"]["vae"]]
-            )
-        else:
-            inputs = model_input["v1_4"]["vae"]
-    else:
-        raise ValueError(f"{args.variant} not yet added")
-
-    shark_vae = compile_through_fx(
-        vae,
-        inputs,
-        model_name=model_name,
-        extra_args=extra_args,
-    )
-    return shark_vae
-
-
-def get_vae_mlir(model_name="vae", extra_args=[]):
-    class VaeModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.vae = AutoencoderKL.from_pretrained(
-                model_config[args.version]
-                if args.variant == "stablediffusion"
-                else model_variant[args.variant],
-                subfolder="vae",
-                revision=model_revision[args.variant],
-            )
-
-        def forward(self, input):
-            input = 1 / 0.18215 * input
-            x = self.vae.decode(input, return_dict=False)[0]
-            x = (x / 2 + 0.5).clamp(0, 1)
-            x = x * 255.0
-            return x.round()
-
-    vae = VaeModel()
-    if args.variant == "stablediffusion":
-        if args.precision == "fp16":
-            vae = vae.half().cuda()
-            inputs = tuple(
-                [
-                    inputs.half().cuda()
-                    for inputs in model_input[args.version]["vae"]
-                ]
-            )
-        else:
-            inputs = model_input[args.version]["vae"]
-    elif args.variant in [
-        "anythingv3",
-        "analogdiffusion",
-        "openjourney",
-        "dreamlike",
-    ]:
-        if args.precision == "fp16":
-            vae = vae.half().cuda()
-            inputs = tuple(
-                [inputs.half().cuda() for inputs in model_input["v1_4"]["vae"]]
-            )
-        else:
-            inputs = model_input["v1_4"]["vae"]
-    else:
-        raise ValueError(f"{args.variant} not yet added")
-
-    shark_vae = compile_through_fx(
-        vae,
-        inputs,
-        model_name=model_name,
-        extra_args=extra_args,
-    )
-    return shark_vae
-
-
-def get_unet_mlir(model_name="unet", extra_args=[]):
-    class UnetModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.unet = UNet2DConditionModel.from_pretrained(
-                model_config[args.version]
-                if args.variant == "stablediffusion"
-                else model_variant[args.variant],
-                subfolder="unet",
-                revision=model_revision[args.variant],
-            )
-            self.in_channels = self.unet.in_channels
-            self.train(False)
-
-        def forward(self, latent, timestep, text_embedding, guidance_scale):
-            # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
-            latents = torch.cat([latent] * 2)
-            unet_out = self.unet.forward(
-                latents, timestep, text_embedding, return_dict=False
-            )[0]
-            noise_pred_uncond, noise_pred_text = unet_out.chunk(2)
-            noise_pred = noise_pred_uncond + guidance_scale * (
-                noise_pred_text - noise_pred_uncond
-            )
-            return noise_pred
-
-    unet = UnetModel()
-    if args.variant == "stablediffusion":
-        if args.precision == "fp16":
-            unet = unet.half().cuda()
-            inputs = tuple(
-                [
-                    inputs.half().cuda() if len(inputs.shape) != 0 else inputs
-                    for inputs in model_input[args.version]["unet"]
-                ]
-            )
-        else:
-            inputs = model_input[args.version]["unet"]
-    elif args.variant in [
-        "anythingv3",
-        "analogdiffusion",
-        "openjourney",
-        "dreamlike",
-    ]:
-        if args.precision == "fp16":
-            unet = unet.half().cuda()
-            inputs = tuple(
-                [
-                    inputs.half().cuda() if len(inputs.shape) != 0 else inputs
-                    for inputs in model_input["v1_4"]["unet"]
-                ]
-            )
-        else:
-            inputs = model_input["v1_4"]["unet"]
-    else:
-        raise ValueError(f"{args.variant} is not yet added")
-    shark_unet = compile_through_fx(
-        unet,
-        inputs,
-        model_name=model_name,
-        extra_args=extra_args,
-    )
-    return shark_unet
--- a/shark/examples/shark_inference/stable_diffusion/resources.py
+++ b/shark/examples/shark_inference/stable_diffusion/resources.py
@@ -1,31 +0,0 @@
-import os
-import json
-import sys
-
-
-def resource_path(relative_path):
-    """Get absolute path to resource, works for dev and for PyInstaller"""
-    base_path = getattr(
-        sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
-    )
-    return os.path.join(base_path, relative_path)
-
-
-prompt_examples = []
-prompts_loc = resource_path("resources/prompts.json")
-if os.path.exists(prompts_loc):
-    with open(prompts_loc, encoding="utf-8") as fopen:
-        prompt_examples = json.load(fopen)
-
-if not prompt_examples:
-    print("Unable to fetch prompt examples.")
-
-
-models_db = []
-models_loc = resource_path("resources/model_db.json")
-if os.path.exists(models_loc):
-    with open(models_loc, encoding="utf-8") as fopen:
-        models_db = json.load(fopen)
-
-if len(models_db) != 3:
-    sys.exit("Error: Unable to load models database.")
--- a/shark/examples/shark_inference/stable_diffusion/schedulers.py
+++ b/shark/examples/shark_inference/stable_diffusion/schedulers.py
@@ -1,133 +0,0 @@
-import sys
-import numpy as np
-from typing import List, Optional, Tuple, Union
-from diffusers import (
-    LMSDiscreteScheduler,
-    PNDMScheduler,
-    DDIMScheduler,
-    DPMSolverMultistepScheduler,
-    EulerDiscreteScheduler,
-)
-from diffusers.configuration_utils import register_to_config
-from utils import compile_through_fx, get_shark_model
-from stable_args import args
-import torch
-
-SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"
-
-model_input = {
-    "euler": {
-        "latent": torch.randn(1, 4, 64, 64),
-        "output": torch.randn(1, 4, 64, 64),
-        "sigma": torch.tensor(1).to(torch.float32),
-        "dt": torch.tensor(1).to(torch.float32),
-    },
-}
-
-
-class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
-    @register_to_config
-    def __init__(
-        self,
-        num_train_timesteps: int = 1000,
-        beta_start: float = 0.0001,
-        beta_end: float = 0.02,
-        beta_schedule: str = "linear",
-        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
-        prediction_type: str = "epsilon",
-    ):
-        super().__init__(
-            num_train_timesteps,
-            beta_start,
-            beta_end,
-            beta_schedule,
-            trained_betas,
-            prediction_type,
-        )
-
-    def compile(self):
-        example_latent = model_input["euler"]["latent"]
-        example_output = model_input["euler"]["output"]
-        if args.precision == "fp16":
-            example_latent = example_latent.half()
-            example_output = example_output.half()
-        example_sigma = model_input["euler"]["sigma"]
-        example_dt = model_input["euler"]["dt"]
-
-        class ScalingModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
-            def forward(self, latent, sigma):
-                return latent / ((sigma**2 + 1) ** 0.5)
-
-        class SchedulerStepModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
-            def forward(self, noise_pred, sigma, latent, dt):
-                pred_original_sample = latent - sigma * noise_pred
-                derivative = (latent - pred_original_sample) / sigma
-                return latent + derivative * dt
-
-        iree_flags = []
-        if len(args.iree_vulkan_target_triple) > 0:
-            iree_flags.append(
-                f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
-            )
-        # Disable bindings fusion to work with moltenVK.
-        if sys.platform == "darwin":
-            iree_flags.append("-iree-stream-fuse-binding=false")
-
-        if args.import_mlir:
-            scaling_model = ScalingModel()
-            self.scaling_model = compile_through_fx(
-                scaling_model,
-                (example_latent, example_sigma),
-                model_name="euler_scale_model_input_" + args.precision,
-                extra_args=iree_flags,
-            )
-
-            step_model = SchedulerStepModel()
-            self.step_model = compile_through_fx(
-                step_model,
-                (example_output, example_sigma, example_latent, example_dt),
-                model_name="euler_step_" + args.precision,
-                extra_args=iree_flags,
-            )
-        else:
-            self.scaling_model = get_shark_model(
-                SCHEDULER_BUCKET,
-                "euler_scale_model_input_" + args.precision,
-                iree_flags,
-            )
-            self.step_model = get_shark_model(
-                SCHEDULER_BUCKET, "euler_step_" + args.precision, iree_flags
-            )
-
-    def scale_model_input(self, sample, timestep):
-        step_index = (self.timesteps == timestep).nonzero().item()
-        sigma = self.sigmas[step_index]
-        return self.scaling_model(
-            "forward",
-            (
-                sample,
-                sigma,
-            ),
-            send_to_host=False,
-        )
-
-    def step(self, noise_pred, timestep, latent):
-        step_index = (self.timesteps == timestep).nonzero().item()
-        sigma = self.sigmas[step_index]
-        dt = self.sigmas[step_index + 1] - sigma
-        return self.step_model(
-            "forward",
-            (
-                noise_pred,
-                sigma,
-                latent,
-                dt,
-            ),
-            send_to_host=False,
-        )
--- a/shark/examples/shark_inference/stable_diffusion/sd_annotation.py
+++ b/shark/examples/shark_inference/stable_diffusion/sd_annotation.py
@@ -1,122 +0,0 @@
-import os
-from shark.model_annotation import model_annotation, create_context
-from shark.iree_utils._common import run_cmd, iree_target_map
-from shark.shark_downloader import (
-    download_model,
-    download_public_file,
-    WORKDIR,
-)
-from shark.parser import shark_args
-from stable_args import args
-from opt_params import get_params
-from utils import set_init_device_flags
-
-
-set_init_device_flags()
-device = (
-    args.device if "://" not in args.device else args.device.split("://")[0]
-)
-
-# Downloads the model (Unet or VAE fp16) from shark_tank
-shark_args.local_tank_cache = args.local_tank_cache
-bucket_key = f"{args.variant}/untuned"
-if args.annotation_model == "unet":
-    model_key = f"{args.variant}/{args.version}/unet/{args.precision}/length_{args.max_length}/untuned"
-elif args.annotation_model == "vae":
-    is_base = "/base" if args.use_base_vae else ""
-    model_key = f"{args.variant}/{args.version}/vae/{args.precision}/length_77/untuned{is_base}"
-
-bucket, model_name, iree_flags = get_params(
-    bucket_key, model_key, args.annotation_model, "untuned", args.precision
-)
-mlir_model, func_name, inputs, golden_out = download_model(
-    model_name,
-    tank_url=bucket,
-    frontend="torch",
-)
-
-# Downloads the tuned config files from shark_tank
-config_bucket = "gs://shark_tank/sd_tuned/configs/"
-if args.use_winograd:
-    config_name = f"{args.annotation_model}_winograd_{device}.json"
-    full_gs_url = config_bucket + config_name
-    winograd_config_dir = f"{WORKDIR}configs/" + config_name
-    download_public_file(full_gs_url, winograd_config_dir, True)
-
-if args.annotation_model == "unet" or device == "cuda":
-    if (
-        args.variant in ["anythingv3", "analogdiffusion"]
-        or args.annotation_model == "vae"
-    ):
-        args.max_length = 77
-    config_name = f"{args.annotation_model}_{args.version}_{args.precision}_len{args.max_length}_{device}.json"
-    full_gs_url = config_bucket + config_name
-    lowering_config_dir = f"{WORKDIR}configs/" + config_name
-    download_public_file(full_gs_url, lowering_config_dir, True)
-
-# Annotate the model with Winograd attribute on selected conv ops
-if args.use_winograd:
-    with create_context() as ctx:
-        winograd_model = model_annotation(
-            ctx,
-            input_contents=mlir_model,
-            config_path=winograd_config_dir,
-            search_op="conv",
-            winograd=args.use_winograd,
-        )
-        with open(
-            f"{args.annotation_output}/{model_name}_tuned_torch.mlir", "w"
-        ) as f:
-            f.write(str(winograd_model))
-
-# For Unet annotate the model with tuned lowering configs
-if args.annotation_model == "unet" or device == "cuda":
-    if args.use_winograd:
-        input_mlir = f"{args.annotation_output}/{model_name}_tuned_torch.mlir"
-        dump_after = "iree-linalg-ext-convert-conv2d-to-winograd"
-    else:
-        input_mlir = f"{WORKDIR}{model_name}_torch/{model_name}_torch.mlir"
-        dump_after = "iree-flow-pad-linalg-ops"
-
-    # Dump IR after padding/img2col/winograd passes
-    device_spec_args = ""
-    if device == "cuda":
-        from shark.iree_utils.gpu_utils import get_iree_gpu_args
-
-        gpu_flags = get_iree_gpu_args()
-        for flag in gpu_flags:
-            device_spec_args += flag + " "
-    elif device == "vulkan":
-        device_spec_args = (
-            f"--iree-vulkan-target-triple={args.iree_vulkan_target_triple} "
-        )
-    run_cmd(
-        f"iree-compile {input_mlir} "
-        "--iree-input-type=tm_tensor "
-        f"--iree-hal-target-backends={iree_target_map(device)} "
-        f"{device_spec_args}"
-        "--iree-stream-resource-index-bits=64 "
-        "--iree-vm-target-index-bits=64 "
-        "--iree-flow-enable-padding-linalg-ops "
-        "--iree-flow-linalg-ops-padding-size=32 "
-        "--iree-flow-enable-conv-img2col-transform "
-        f"--mlir-print-ir-after={dump_after} "
-        "--compile-to=flow "
-        f"2>{args.annotation_output}/dump_after_winograd.mlir "
-    )
-
-    # Annotate the model with lowering configs in the config file
-    with create_context() as ctx:
-        tuned_model = model_annotation(
-            ctx,
-            input_contents=f"{args.annotation_output}/dump_after_winograd.mlir",
-            config_path=lowering_config_dir,
-            search_op="all",
-        )
-
-    # Remove the intermediate mlir and save the final annotated model
-    os.remove(f"{args.annotation_output}/dump_after_winograd.mlir")
-    output_path = f"{args.annotation_output}/{model_name}_tuned_torch.mlir"
-    with open(output_path, "w") as f:
-        f.write(str(tuned_model))
-    print(f"Saved the annotated mlir in {output_path}.")
--- a/shark/examples/shark_inference/upscaler/model_wrappers.py
+++ b/shark/examples/shark_inference/upscaler/model_wrappers.py
@@ -9,16 +9,15 @@ model_input = {
    "clip": (torch.randint(1, 2, (1, 77)),),
    "vae": (torch.randn(1, 4, 128, 128),),
    "unet": (
-        torch.randn(2, 7, 128, 128).half(),  # latents
+        torch.randn(2, 7, 128, 128),  # latents
        torch.tensor([1]).to(torch.float32),  # timestep
-        torch.randn(2, 77, 1024).half(),  # embedding
+        torch.randn(2, 77, 1024),  # embedding
        torch.randn(2).to(torch.int64),  # noise_level
    ),
 }


 def get_clip_mlir(model_name="clip_text", extra_args=[]):
-
    text_encoder = CLIPTextModel.from_pretrained(
        model_id,
        subfolder="text_encoder",
@@ -72,7 +71,6 @@ def get_unet_mlir(model_name="unet", extra_args=[]):
            self.unet = UNet2DConditionModel.from_pretrained(
                model_id,
                subfolder="unet",
-                revision="fp16",
            )
            self.in_channels = self.unet.in_channels
            self.train(False)
@@ -88,12 +86,13 @@ def get_unet_mlir(model_name="unet", extra_args=[]):
            return unet_out

    unet = UnetModel()
-    unet = unet.half().cuda()
-    inputs = tuple([inputs.cuda() for inputs in model_input["unet"]])
+    f16_input_mask = (True, True, True, False)
    shark_unet = compile_through_fx(
        unet,
-        inputs,
+        model_input["unet"],
        model_name=model_name,
+        is_f16=True,
+        f16_input_mask=f16_input_mask,
        extra_args=extra_args,
    )
    return shark_unet
--- a/shark/examples/shark_inference/upscaler/pipeline_shark_stable_diffusion_upscale.py
+++ b/shark/examples/shark_inference/upscaler/pipeline_shark_stable_diffusion_upscale.py
@@ -339,7 +339,6 @@ class SharkStableDiffusionUpscalePipeline:
        ] = None,
        callback_steps: Optional[int] = 1,
    ):
-
        # 1. Check inputs
        self.check_inputs(prompt, image, noise_level, callback_steps)

--- a/shark/examples/shark_inference/upscaler/utils.py
+++ b/shark/examples/shark_inference/upscaler/utils.py
@@ -59,10 +59,12 @@ def get_shark_model(tank_url, model_name, extra_args=[]):


 # Converts the torch-module into a shark_module.
-def compile_through_fx(model, inputs, model_name, extra_args=[]):
-
-    mlir_module, func_name = import_with_fx(model, inputs)
-
+def compile_through_fx(
+    model, inputs, model_name, is_f16=False, f16_input_mask=None, extra_args=[]
+):
+    mlir_module, func_name = import_with_fx(
+        model, inputs, is_f16, f16_input_mask
+    )
    shark_module = SharkInference(
        mlir_module,
        device=args.device,
@@ -73,7 +75,6 @@ def compile_through_fx(model, inputs, model_name, extra_args=[]):


 def set_iree_runtime_flags():
-
    vulkan_runtime_flags = [
        f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}",
        f"--vulkan_validation_layers={'true' if args.vulkan_validation_layers else 'false'}",
--- a/shark/examples/shark_training/bert_training.py
+++ b/shark/examples/shark_training/bert_training.py
@@ -1,7 +1,7 @@
 import torch
 from torch.nn.utils import _stateless
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
-from shark.shark_runner import SharkTrainer
+from shark.shark_trainer import SharkTrainer


 class MiniLMSequenceClassification(torch.nn.Module):
@@ -42,6 +42,7 @@ def forward(params, buffers, args):
    return params, buffers


-shark_module = SharkTrainer(mod, inp, custom_inference_fn=forward)
+shark_module = SharkTrainer(mod, inp)
+shark_module.compile(forward)

-print(shark_module.forward())
+print(shark_module.train())
--- a/shark/examples/shark_training/stable-diffusion-img2img/stable_diffusion_img2img.py
+++ b/shark/examples/shark_training/stable-diffusion-img2img/stable_diffusion_img2img.py
@@ -169,6 +169,7 @@ imagenet_style_templates_small = [
    "a large painting in the style of {}",
 ]

+
 # Setup the dataset
 class TextualInversionDataset(Dataset):
    def __init__(
@@ -184,7 +185,6 @@ class TextualInversionDataset(Dataset):
        placeholder_token="*",
        center_crop=False,
    ):
-
        self.data_root = data_root
        self.tokenizer = tokenizer
        self.learnable_property = learnable_property
@@ -244,7 +244,10 @@ class TextualInversionDataset(Dataset):

        if self.center_crop:
            crop = min(img.shape[0], img.shape[1])
-            h, w, = (
+            (
+                h,
+                w,
+            ) = (
                img.shape[0],
                img.shape[1],
            )
--- a/shark/iree_utils/_common.py
+++ b/shark/iree_utils/_common.py
@@ -33,8 +33,9 @@ def run_cmd(cmd):
        )
        result_str = result.stdout.decode()
        return result_str
-    except Exception:
-        sys.exit("Exiting program due to error running:", cmd)
+    except subprocess.CalledProcessError as e:
+        print(e.output)
+        sys.exit(f"Exiting program due to error running {cmd}")


 def iree_device_map(device):
--- a/shark/iree_utils/benchmark_utils.py
+++ b/shark/iree_utils/benchmark_utils.py
@@ -18,6 +18,7 @@ from shark.iree_utils.cpu_utils import get_cpu_count
 import numpy as np
 import os
 import re
+import platform

 UNIT_TO_SECOND_MAP = {"us": 1e-6, "ms": 0.001, "s": 1}

@@ -62,7 +63,16 @@ def build_benchmark_args(
    Outputs: string that execute benchmark-module on target model.
    """
    path = benchmark_module.__path__[0]
-    benchmarker_path = os.path.join(path, "..", "..", "iree-benchmark-module")
+    if platform.system() == "Windows":
+        benchmarker_path = os.path.join(
+            path, "..", "..", "iree-benchmark-module.exe"
+        )
+        time_extractor = None
+    else:
+        benchmarker_path = os.path.join(
+            path, "..", "..", "iree-benchmark-module"
+        )
+        time_extractor = "| awk 'END{{print $2 $3}}'"
    benchmark_cl = [benchmarker_path, f"--module_file={input_file}"]
    # TODO: The function named can be passed as one of the args.
    fn_name = "forward"
@@ -78,8 +88,8 @@ def build_benchmark_args(
        num_cpus = get_cpu_count()
        if num_cpus is not None:
            benchmark_cl.append(f"--task_topology_max_group_count={num_cpus}")
-    time_extractor = "| awk 'END{{print $2 $3}}'"
-    benchmark_cl.append(time_extractor)
+    # if time_extractor:
+    #    benchmark_cl.append(time_extractor)
    return benchmark_cl


@@ -96,7 +106,14 @@ def build_benchmark_args_non_tensor_input(
    Outputs: string that execute benchmark-module on target model.
    """
    path = benchmark_module.__path__[0]
-    benchmarker_path = os.path.join(path, "..", "..", "iree-benchmark-module")
+    if platform.system() == "Windows":
+        benchmarker_path = os.path.join(
+            path, "..", "..", "iree-benchmark-module.exe"
+        )
+    else:
+        benchmarker_path = os.path.join(
+            path, "..", "..", "iree-benchmark-module"
+        )
    benchmark_cl = [benchmarker_path, f"--module_file={input_file}"]
    # TODO: The function named can be passed as one of the args.
    if function_name:
@@ -104,8 +121,9 @@ def build_benchmark_args_non_tensor_input(
    benchmark_cl.append(f"--device={iree_device_map(device)}")
    for input in inputs:
        benchmark_cl.append(f"--function_input={input}")
-    time_extractor = "| awk 'END{{print $2 $3}}'"
-    benchmark_cl.append(time_extractor)
+    if platform.system() != "Windows":
+        time_extractor = "| awk 'END{{print $2 $3}}'"
+        benchmark_cl.append(time_extractor)
    return benchmark_cl


@@ -121,8 +139,9 @@ def run_benchmark_module(benchmark_cl):
        benchmark_path
    ), "Cannot find benchmark_module, Please contact SHARK maintainer on discord."
    bench_result = run_cmd(" ".join(benchmark_cl))
-    regex_split = re.compile("([0-9]+[.]*[0-9]*)([a-zA-Z]+)")
-    match = regex_split.match(bench_result)
+    print(bench_result)
+    regex_split = re.compile("(\d+[.]*\d*)(  *)([a-zA-Z]+)")
+    match = regex_split.search(bench_result)
    time = float(match.group(1))
-    unit = match.group(2)
-    return 1.0 / (time * UNIT_TO_SECOND_MAP[unit])
+    unit = match.group(3)
+    return 1.0 / (time * 0.001)
--- a/shark/iree_utils/compile_utils.py
+++ b/shark/iree_utils/compile_utils.py
@@ -143,7 +143,6 @@ def compile_benchmark_dirs(bench_dir, device, dispatch_benchmarks):
                    in_dispatches = True
            if all_dispatches or in_dispatches:
                for f_ in os.listdir(f"{bench_dir}/{d_}"):
-
                    if "benchmark.mlir" in f_:
                        dispatch_file = open(f"{bench_dir}/{d_}/{f_}", "r")
                        module = dispatch_file.read()
@@ -276,9 +275,19 @@ def compile_module_to_flatbuffer(
    return flatbuffer_blob


-def get_iree_module(flatbuffer_blob, device):
+def get_iree_module(flatbuffer_blob, device, device_idx=None):
    # Returns the compiled module and the configs.
-    config = get_iree_runtime_config(device)
+    if device_idx is not None:
+        device = iree_device_map(device)
+        print("registering device id: ", device_idx)
+        haldriver = ireert.get_driver(device)
+
+        haldevice = haldriver.create_device(
+            haldriver.query_available_devices()[device_idx]["device_id"]
+        )
+        config = ireert.Config(device=haldevice)
+    else:
+        config = get_iree_runtime_config(device)
    vm_module = ireert.VmModule.from_flatbuffer(
        config.vm_instance, flatbuffer_blob
    )
@@ -294,20 +303,20 @@ def get_iree_compiled_module(
    frontend: str = "torch",
    model_config_path: str = None,
    extra_args: list = [],
+    device_idx: int = None,
 ):
    """Given a module returns the compiled .vmfb and configs"""
    flatbuffer_blob = compile_module_to_flatbuffer(
        module, device, frontend, model_config_path, extra_args
    )
-    return get_iree_module(flatbuffer_blob, device)
+    return get_iree_module(flatbuffer_blob, device, device_idx=device_idx)


-def load_flatbuffer(flatbuffer_path: str, device: str):
-
+def load_flatbuffer(flatbuffer_path: str, device: str, device_idx: int = None):
    with open(os.path.join(flatbuffer_path), "rb") as f:
        flatbuffer_blob = f.read()

-    return get_iree_module(flatbuffer_blob, device)
+    return get_iree_module(flatbuffer_blob, device, device_idx=device_idx)


 def export_iree_module_to_vmfb(
--- a/shark/iree_utils/gpu_utils.py
+++ b/shark/iree_utils/gpu_utils.py
@@ -18,6 +18,7 @@ import iree.runtime as ireert
 import ctypes
 from shark.parser import shark_args

+
 # Get the default gpu args given the architecture.
 def get_iree_gpu_args():
    ireert.flags.FUNCTION_INPUT_VALIDATION = False
@@ -39,8 +40,17 @@ def get_iree_gpu_args():
 # Get the default gpu args given the architecture.
 def get_iree_rocm_args():
    ireert.flags.FUNCTION_INPUT_VALIDATION = False
-    # TODO: find a way to get arch from code.
-    rocm_arch = "gfx908"
+    # get arch from rocminfo.
+    import re
+    import subprocess
+
+    rocm_arch = re.match(
+        r".*(gfx\w+)",
+        subprocess.check_output(
+            "rocminfo | grep -i 'gfx'", shell=True, text=True
+        ),
+    ).group(1)
+    print(f"Found rocm arch {rocm_arch}...")
    return [
        f"--iree-rocm-target-chip={rocm_arch}",
        "--iree-rocm-link-bc=true",
--- a/shark/iree_utils/vulkan_target_env_utils.py
+++ b/shark/iree_utils/vulkan_target_env_utils.py
@@ -16,7 +16,6 @@ from collections import OrderedDict


 def get_vulkan_target_env(vulkan_target_triple):
-
    arch, product, os = vulkan_target_triple.split("=")[1].split("-")
    triple = (arch, product, os)
    # get version
@@ -37,7 +36,6 @@ def get_vulkan_target_env(vulkan_target_triple):


 def get_vulkan_target_env_flag(vulkan_target_triple):
-
    target_env = get_vulkan_target_env(vulkan_target_triple)
    target_env_flag = f"--iree-vulkan-target-env={target_env}"
    return target_env_flag
@@ -124,7 +122,6 @@ def get_extensions(triple):


 def get_vendor(triple):
-
    arch, product, os = triple
    if arch == "unknown":
        return "Unknown"
@@ -206,7 +203,6 @@ def get_vulkan_target_capabilities(triple):
    cap["coopmatCases"] = None

    if arch in ["rdna1", "rdna2", "rdna3"]:
-
        cap["maxComputeSharedMemorySize"] = 65536
        cap["maxComputeWorkGroupInvocations"] = 1024
        cap["maxComputeWorkGroupSize"] = [1024, 1024, 1024]
@@ -287,7 +283,6 @@ def get_vulkan_target_capabilities(triple):
        cap["variablePointersStorageBuffer"] = True

    elif arch == "m1":
-
        cap["maxComputeSharedMemorySize"] = 32768
        cap["maxComputeWorkGroupInvocations"] = 1024
        cap["maxComputeWorkGroupSize"] = [1024, 1024, 1024]
@@ -362,7 +357,6 @@ def get_vulkan_target_capabilities(triple):
            ]

    elif arch in ["ampere", "turing"]:
-
        cap["maxComputeSharedMemorySize"] = 49152
        cap["maxComputeWorkGroupInvocations"] = 1024
        cap["maxComputeWorkGroupSize"] = [1024, 1024, 1024]
@@ -402,7 +396,6 @@ def get_vulkan_target_capabilities(triple):
        ]

    elif arch == "adreno":
-
        cap["maxComputeSharedMemorySize"] = 32768
        cap["maxComputeWorkGroupInvocations"] = 1024
        cap["maxComputeWorkGroupSize"] = [1024, 1024, 64]
@@ -447,7 +440,6 @@ def get_vulkan_target_capabilities(triple):

    res = ""
    for k, v in cap.items():
-
        if v is None or v == False:
            continue
        if isinstance(v, bool):
--- a/shark/iree_utils/vulkan_utils.py
+++ b/shark/iree_utils/vulkan_utils.py
@@ -66,11 +66,24 @@ def get_vulkan_target_triple(device_name):
    elif all(x in device_name for x in ("RTX", "2080")):
        triple = f"turing-rtx2080-{system_os}"
    elif all(x in device_name for x in ("A100", "SXM4")):
-        triple = f"ampere-rtx3080-{system_os}"
+        triple = f"ampere-a100-{system_os}"
    elif all(x in device_name for x in ("RTX", "3090")):
        triple = f"ampere-rtx3090-{system_os}"
+    elif all(x in device_name for x in ("RTX", "3080")):
+        triple = f"ampere-rtx3080-{system_os}"
+    elif all(x in device_name for x in ("RTX", "3070")):
+        triple = f"ampere-rtx3070-{system_os}"
+    elif all(x in device_name for x in ("RTX", "3060")):
+        triple = f"ampere-rtx3060-{system_os}"
+    elif all(x in device_name for x in ("RTX", "3050")):
+        triple = f"ampere-rtx3050-{system_os}"
+    # We use ampere until lovelace target triples are plumbed in.
    elif all(x in device_name for x in ("RTX", "4090")):
-        triple = f"ampere-rtx3090-{system_os}"
+        triple = f"ampere-rtx4090-{system_os}"
+    elif all(x in device_name for x in ("RTX", "4080")):
+        triple = f"ampere-rtx4080-{system_os}"
+    elif all(x in device_name for x in ("RTX", "4070")):
+        triple = f"ampere-rtx4070-{system_os}"
    elif all(x in device_name for x in ("RTX", "4000")):
        triple = f"turing-rtx4000-{system_os}"
    elif all(x in device_name for x in ("RTX", "5000")):
@@ -89,7 +102,9 @@ def get_vulkan_target_triple(device_name):
        triple = f"pascal-gtx1080-{system_os}"

    # Amd Targets
-    elif all(x in device_name for x in ("AMD", "7900")):
+    # Linux: Radeon RX 7900 XTX
+    # Windows: AMD Radeon RX 7900 XTX
+    elif all(x in device_name for x in ("RX", "7900")):
        triple = f"rdna3-7900-{system_os}"
    elif any(x in device_name for x in ("AMD", "Radeon")):
        triple = f"rdna2-unknown-{system_os}"
--- a/shark/model_annotation.py
+++ b/shark/model_annotation.py
@@ -47,6 +47,9 @@ def model_annotation(
            input_contents = f.read()
    module = ir.Module.parse(input_contents)

+    if config_path == "":
+        return module
+
    if winograd:
        with open(config_path, "r") as f:
            data = json.load(f)
@@ -162,7 +165,6 @@ def walk_children(
                        add_attributes(
                            child_op, configs[child_op_shape]["options"][0]
                        )
-                    print(f"Updated op {child_op}", file=sys.stderr)

                walk_children(child_op, configs, search_op, winograd)

@@ -394,7 +396,6 @@ def add_winograd_attribute(op: ir.Operation, config: List):
        op.attributes["iree_winograd_conv"] = ir.IntegerAttr.get(
            ir.IntegerType.get_signless(64), 1
        )
-        print("Apply Winograd on selected conv op: ", op)


 def add_attribute_by_name(op: ir.Operation, name: str, val: int):
--- a/shark/parser.py
+++ b/shark/parser.py
@@ -44,7 +44,7 @@ parser.add_argument(
    "--repro_dir",
    help="Directory to which module files will be saved for reproduction or debugging.",
    type=dir_path,
-    default="./shark_tmp",
+    default="shark_tmp",
 )
 parser.add_argument(
    "--enable_tf32",
@@ -89,7 +89,7 @@ parser.add_argument(
 )
 parser.add_argument(
    "--local_tank_cache",
-    default="",
+    default=None,
    help="Specify where to save downloaded shark_tank artifacts. If this is not set, the default is ~/.local/shark_tank/.",
 )

--- a/shark/shark_benchmark_runner.py
+++ b/shark/shark_benchmark_runner.py
@@ -23,8 +23,6 @@ from datetime import datetime
 import time
 import csv
 import os
-import torch
-import torch._dynamo as dynamo


 class OnnxFusionOptions(object):
@@ -106,6 +104,7 @@ class SharkBenchmarkRunner(SharkRunner):

    def benchmark_torch(self, modelname):
        import torch
+        import torch._dynamo as dynamo
        from tank.model_utils import get_torch_model

        if self.device == "cuda":
@@ -158,7 +157,10 @@ class SharkBenchmarkRunner(SharkRunner):
        # tf_device = "/GPU:0" if self.device == "cuda" else "/CPU:0"
        tf_device = "/CPU:0"
        with tf.device(tf_device):
-            model, input, = get_tf_model(
+            (
+                model,
+                input,
+            ) = get_tf_model(
                modelname
            )[:2]
            frontend_model = model
@@ -278,7 +280,8 @@ for currently supported models. Exiting benchmark ONNX."
            ]

    def get_metadata(self, modelname):
-        with open("./tank/model_metadata.csv", mode="r") as csvfile:
+        metadata_path = os.path.join(".", "tank", "model_metadata.csv")
+        with open(metadata_path, mode="r") as csvfile:
            torch_reader = csv.reader(csvfile, delimiter=",")
            fields = next(torch_reader)
            for row in torch_reader:
--- a/shark/shark_downloader.py
+++ b/shark/shark_downloader.py
@@ -34,7 +34,6 @@ def download_public_file(
    dest_filename = None
    desired_file = None
    if single_file:
-
        desired_file = full_gs_url.split("/")[-1]
        source_blob_name = "/".join(full_gs_url.split("/")[3:-1])
        destination_folder_name, dest_filename = os.path.split(
@@ -80,13 +79,17 @@ input_type_to_np_dtype = {
 # Save the model in the home local so it needn't be fetched everytime in the CI.
 home = str(Path.home())
 alt_path = os.path.join(os.path.dirname(__file__), "../gen_shark_tank/")
-custom_path = shark_args.local_tank_cache
+custom_path_list = None
+if shark_args.local_tank_cache is not None:
+    custom_path_list = shark_args.local_tank_cache.split("/")
+
 if os.path.exists(alt_path):
    WORKDIR = alt_path
    print(
        f"Using {WORKDIR} as shark_tank directory. Delete this directory if you aren't working from locally generated shark_tank."
    )
-if custom_path:
+if custom_path_list:
+    custom_path = os.path.join(*custom_path_list)
    if not os.path.exists(custom_path):
        os.mkdir(custom_path)

--- a/shark/shark_importer.py
+++ b/shark/shark_importer.py
@@ -55,6 +55,7 @@ class SharkImporter:
        inputs: tuple = (),
        frontend: str = "torch",
        raw_model_file: str = "",
+        return_str: bool = False,
    ):
        self.module = module
        self.inputs = None if len(inputs) == 0 else inputs
@@ -65,6 +66,7 @@ class SharkImporter:
            )
            sys.exit(1)
        self.raw_model_file = raw_model_file
+        self.return_str = return_str

    # NOTE: The default function for torch is "forward" and tf-lite is "main".

@@ -72,7 +74,11 @@ class SharkImporter:
        from shark.torch_mlir_utils import get_torch_mlir_module

        return get_torch_mlir_module(
-            self.module, self.inputs, is_dynamic, tracing_required
+            self.module,
+            self.inputs,
+            is_dynamic,
+            tracing_required,
+            self.return_str,
        )

    def _tf_mlir(self, func_name, save_dir="./shark_tmp/"):
@@ -158,6 +164,7 @@ class SharkImporter:
        func_name="forward",
        dir=tempfile.gettempdir(),
        model_name="model",
+        golden_values=None,
    ):
        if self.inputs == None:
            print(
@@ -177,7 +184,11 @@ class SharkImporter:
        if self.frontend in ["torch", "pytorch"]:
            import torch

-            golden_out = self.module(*self.inputs)
+            golden_out = None
+            if golden_values is not None:
+                golden_out = golden_values
+            else:
+                golden_out = self.module(*self.inputs)
            if torch.is_tensor(golden_out):
                golden_out = tuple(
                    golden_out.detach().cpu().numpy(),
@@ -245,12 +256,128 @@ class SharkImporter:
            )


+def get_f16_inputs(inputs, is_f16, f16_input_mask):
+    if is_f16 == False:
+        return inputs
+    if f16_input_mask == None:
+        return tuple([x.half() for x in inputs])
+
+    f16_masked_inputs = []
+    for i in range(len(inputs)):
+        if f16_input_mask[i]:
+            f16_masked_inputs.append(inputs[i].half())
+        else:
+            f16_masked_inputs.append(inputs[i])
+
+    return tuple(f16_masked_inputs)
+
+
+def transform_fx(fx_g):
+    import torch
+
+    kwargs_dict = {
+        "dtype": torch.float16,
+        "device": torch.device(type="cpu"),
+        "pin_memory": False,
+    }
+    for node in fx_g.graph.nodes:
+        if node.op == "call_function":
+            if node.target in [
+                torch.ops.aten.arange,
+                torch.ops.aten.empty,
+            ]:
+                node.kwargs = kwargs_dict
+            # Inputs and outputs of aten.var.mean should be upcasted to fp32.
+            if node.target in [torch.ops.aten.var_mean]:
+                with fx_g.graph.inserting_before(node):
+                    new_node = fx_g.graph.call_function(
+                        torch.ops.prims.convert_element_type,
+                        args=(node.args[0], torch.float32),
+                        kwargs={},
+                    )
+                    node.args = (new_node, node.args[1])
+            if node.name.startswith("getitem"):
+                with fx_g.graph.inserting_before(node):
+                    if node.args[0].target in [torch.ops.aten.var_mean]:
+                        new_node = fx_g.graph.call_function(
+                            torch.ops.aten._to_copy,
+                            args=(node,),
+                            kwargs={"dtype": torch.float16},
+                        )
+                        node.append(new_node)
+                        node.replace_all_uses_with(new_node)
+                        new_node.args = (node,)
+                        new_node.kwargs = {"dtype": torch.float16}
+            # aten.empty should be filled with zeros.
+            if node.target in [torch.ops.aten.empty]:
+                with fx_g.graph.inserting_after(node):
+                    new_node = fx_g.graph.call_function(
+                        torch.ops.aten.zero_,
+                        args=(node,),
+                    )
+                    node.append(new_node)
+                    node.replace_all_uses_with(new_node)
+                    new_node.args = (node,)
+
+    fx_g.graph.lint()
+
+
+# Doesn't replace the None type.
+def change_fx_graph_return_to_tuple(fx_g):
+    for node in fx_g.graph.nodes:
+        if node.op == "output":
+            # output nodes always have one argument
+            node_arg = node.args[0]
+            out_nodes = []
+            if isinstance(node_arg, list):
+                # Don't return NoneType elements.
+                for out_node in node_arg:
+                    if not isinstance(out_node, type(None)):
+                        out_nodes.append(out_node)
+                # If there is a single tensor/element to be returned don't
+                # a tuple for it.
+                if len(out_nodes) == 1:
+                    node.args = out_nodes
+                else:
+                    node.args = (tuple(out_nodes),)
+    fx_g.graph.lint()
+    fx_g.recompile()
+    return fx_g
+
+
+def flatten_training_input(inputs):
+    flattened_input = []
+    for i in inputs:
+        if isinstance(i, dict):
+            for value in i.values():
+                flattened_input.append(value.detach())
+        elif isinstance(i, tuple):
+            for value in i:
+                flattened_input.append(value)
+        else:
+            flattened_input.append(i)
+    return tuple(flattened_input)
+
+
 # Applies fx conversion to the model and imports the mlir.
-def import_with_fx(model, inputs, debug=False):
+def import_with_fx(
+    model,
+    inputs,
+    is_f16=False,
+    f16_input_mask=None,
+    debug=False,
+    training=False,
+    return_str=False,
+    save_dir=tempfile.gettempdir(),
+    model_name="model",
+):
    import torch
    from torch.fx.experimental.proxy_tensor import make_fx
    from torch._decomp import get_decompositions

+    golden_values = None
+    if debug:
+        golden_values = model(*inputs)
    # TODO: Control the decompositions.
    fx_g = make_fx(
        model,
@@ -286,16 +413,29 @@ def import_with_fx(model, inputs, debug=False):

    strip_overloads(fx_g)

+    if is_f16:
+        fx_g = fx_g.half()
+        transform_fx(fx_g)
+        fx_g.recompile()
+
+    if training:
+        change_fx_graph_return_to_tuple(fx_g)
+        inputs = flatten_training_input(inputs)
+
+    ts_graph = torch.jit.script(fx_g)
+    inputs = get_f16_inputs(inputs, is_f16, f16_input_mask)
    mlir_importer = SharkImporter(
-        fx_g,
+        ts_graph,
        inputs,
        frontend="torch",
+        return_str=return_str,
    )

-    if debug:
-        (mlir_module, func_name), _, _ = mlir_importer.import_debug()
+    if debug:  # and not is_f16:
+        (mlir_module, func_name), _, _ = mlir_importer.import_debug(
+            dir=save_dir, model_name=model_name, golden_values=golden_values
+        )
        return mlir_module, func_name

    mlir_module, func_name = mlir_importer.import_mlir()
-
    return mlir_module, func_name
--- a/shark/shark_inference.py
+++ b/shark/shark_inference.py
@@ -69,11 +69,13 @@ class SharkInference:
        is_benchmark: bool = False,
        dispatch_benchmark: str = None,
        dispatch_benchmark_dir: str = "temp_dispatch_benchmarks",
+        device_idx: int = None,
    ):
        self.mlir_module = mlir_module
        self.device = shark_args.device if device == "none" else device
        self.mlir_dialect = mlir_dialect
        self.is_benchmark = is_benchmark
+        self.device_idx = device_idx
        self.dispatch_benchmarks = (
            shark_args.dispatch_benchmarks
            if dispatch_benchmark is None
@@ -88,7 +90,6 @@ class SharkInference:
        self.shark_runner = None

    def compile(self, extra_args=[]):
-
        if self.dispatch_benchmarks is not None:
            extra_args.append(
                f"--iree-hal-dump-executable-sources-to={self.dispatch_benchmarks_dir}"
@@ -120,6 +121,7 @@ class SharkInference:
                self.device,
                self.mlir_dialect,
                extra_args=extra_args,
+                device_idx=self.device_idx,
            )

        if self.dispatch_benchmarks is not None:
@@ -205,5 +207,6 @@ class SharkInference:
        ) = load_flatbuffer(
            path,
            self.device,
+            self.device_idx,
        )
        return
--- a/shark/shark_runner.py
+++ b/shark/shark_runner.py
@@ -64,11 +64,13 @@ class SharkRunner:
        mlir_dialect: str = "linalg",
        extra_args: list = [],
        compile_vmfb: bool = True,
+        device_idx: int = None,
    ):
        self.mlir_module = mlir_module
        self.device = shark_args.device if device == "none" else device
        self.mlir_dialect = mlir_dialect
        self.extra_args = extra_args
+        self.device_idx = device_idx

        if check_device_drivers(self.device):
            print(device_driver_info(self.device))
@@ -84,6 +86,7 @@ class SharkRunner:
                self.device,
                self.mlir_dialect,
                extra_args=self.extra_args,
+                device_idx=self.device_idx,
            )

    def run(self, function_name, inputs: tuple, send_to_host=False):
--- a/shark/shark_trainer.py
+++ b/shark/shark_trainer.py
@@ -15,6 +15,7 @@
 from shark.parser import shark_args
 from shark.shark_runner import SharkRunner
 from shark.backward_makefx import MakeFxModule
+from shark.shark_importer import import_with_fx
 import numpy as np
 from tqdm import tqdm
 import sys
@@ -67,23 +68,21 @@ class SharkTrainer:
            self.frontend = frontend

    # Training function is needed in the case of torch_fn.
-    def compile(self, training_fn=None):
+    def compile(self, training_fn=None, extra_args=[]):
        if self.frontend in ["torch", "pytorch"]:
-            aot_module = MakeFxModule(
-                self.model, tuple(self.input), custom_inference_fn=training_fn
+            packed_inputs = (
+                dict(self.model.named_parameters()),
+                dict(self.model.named_buffers()),
+                tuple(self.input),
+            )
+            mlir_module, func_name = import_with_fx(
+                training_fn, packed_inputs, False, [], training=True
            )
-            aot_module.generate_graph()
-            # Returns the backward graph.
-            training_graph = aot_module.training_graph
-            weights = self.get_torch_params()
            self.shark_runner = SharkRunner(
-                training_graph,
-                weights + self.input,
-                self.dynamic,
+                mlir_module,
                self.device,
-                self.jit_trace,
-                self.from_aot,
-                self.frontend,
+                "tm_tensor",
+                extra_args=extra_args,
            )
        elif self.frontend in ["tensorflow", "tf", "mhlo"]:
            self.shark_runner = SharkRunner(
@@ -112,8 +111,8 @@ class SharkTrainer:
        params = [x.numpy() for x in params]
        print(f"Training started for {num_iters} iterations:")
        for i in tqdm(range(num_iters)):
-            params = self.shark_runner.forward(
-                params + self.input, self.frontend
+            params = self.shark_runner.run(
+                "forward", params + self.input, self.frontend
            )

        return params
--- a/shark/sharkdynamo/utils.py
+++ b/shark/sharkdynamo/utils.py
@@ -9,6 +9,7 @@ from torch._decomp import get_decompositions

 import torch_mlir

+
 # TODO: Control decompositions.
 def default_decompositions():
    return get_decompositions(
--- a/shark/torch_mlir_utils.py
+++ b/shark/torch_mlir_utils.py
@@ -56,6 +56,7 @@ def get_torch_mlir_module(
    input: tuple,
    dynamic: bool,
    jit_trace: bool,
+    return_str: bool = False,
 ):
    """Get the MLIR's linalg-on-tensors module from the torchscipt module."""
    ignore_traced_shapes = False
@@ -73,6 +74,8 @@ def get_torch_mlir_module(
        use_tracing=jit_trace,
        ignore_traced_shapes=ignore_traced_shapes,
    )
+    if return_str:
+        return mlir_module.operation.get_asm()
    bytecode_stream = io.BytesIO()
    mlir_module.operation.write_bytecode(bytecode_stream)
    bytecode = bytecode_stream.getvalue()
--- a/tank/all_models.csv
+++ b/tank/all_models.csv
@@ -1,36 +1,36 @@
-resnet50,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error: mostly conv"
-albert-base-v2,mhlo,tf,1e-2,1e-2,default,None,False,False,False,""
-roberta-base,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,False,""
-bert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
-camembert-base,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
-dbmdz/convbert-base-turkish-cased,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,True,True,True,"https://github.com/iree-org/iree/issues/9971"
-distilbert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
-facebook/convnext-tiny-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,True,True,True,"https://github.com/nod-ai/SHARK/issues/311 & https://github.com/nod-ai/SHARK/issues/342"
-funnel-transformer/small,mhlo,tf,1e-2,1e-3,default,None,True,True,True,"https://github.com/nod-ai/SHARK/issues/201"
-google/electra-small-discriminator,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
-google/mobilebert-uncased,mhlo,tf,1e-2,1e-3,default,None,True,False,False,"Fails during iree-compile."
-google/vit-base-patch16-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
-microsoft/MiniLM-L12-H384-uncased,mhlo,tf,1e-2,1e-3,tf_hf,None,True,False,False,"Fails during iree-compile."
-microsoft/layoutlm-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
-microsoft/mpnet-base,mhlo,tf,1e-2,1e-2,default,None,False,False,False,""
-albert-base-v2,linalg,torch,1e-2,1e-3,default,None,True,True,True,"issue with aten.tanh in torch-mlir"
-alexnet,linalg,torch,1e-2,1e-3,default,None,False,False,True,"Assertion Error: Zeros Output"
-bert-base-cased,linalg,torch,1e-2,1e-3,default,None,False,False,False,""
-bert-base-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,""
-bert-base-uncased_fp16,linalg,torch,1e-1,1e-1,default,None,True,False,True,""
-facebook/deit-small-distilled-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"Fails during iree-compile."
-google/vit-base-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/311"
-microsoft/beit-base-patch16-224-pt22k-ft22k,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/390"
-microsoft/MiniLM-L12-H384-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,True,""
-microsoft/resnet-50,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
-google/mobilebert-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,True,"https://github.com/nod-ai/SHARK/issues/344"
-mobilenet_v3_small,linalg,torch,1e-1,1e-2,default,nhcw-nhwc,False,True,True,"https://github.com/nod-ai/SHARK/issues/388"
-nvidia/mit-b0,linalg,torch,1e-2,1e-3,default,None,True,True,True,"https://github.com/nod-ai/SHARK/issues/343"
-resnet101,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
-resnet18,linalg,torch,1e-2,1e-3,default,None,True,True,True,""
-resnet50,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
-resnet50_fp16,linalg,torch,1e-2,1e-2,default,nhcw-nhwc,True,False,True,""
-squeezenet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"https://github.com/nod-ai/SHARK/issues/388"
-wide_resnet50_2,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
-efficientnet-v2-s,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,True,"https://github.com/nod-ai/SHARK/issues/575"
-mnasnet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"https://github.com/nod-ai/SHARK/issues/388"
+resnet50,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
+albert-base-v2,mhlo,tf,1e-2,1e-2,default,None,False,False,False,"",""
+roberta-base,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
+bert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
+camembert-base,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
+dbmdz/convbert-base-turkish-cased,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,True,True,False,"https://github.com/iree-org/iree/issues/9971",""
+distilbert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
+facebook/convnext-tiny-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,True,True,False,"https://github.com/nod-ai/SHARK/issues/311 & https://github.com/nod-ai/SHARK/issues/342",""
+funnel-transformer/small,mhlo,tf,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/201",""
+google/electra-small-discriminator,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
+google/mobilebert-uncased,mhlo,tf,1e-2,1e-3,default,None,True,False,False,"Fails during iree-compile",""
+google/vit-base-patch16-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,False,False,False,"",""
+microsoft/MiniLM-L12-H384-uncased,mhlo,tf,1e-2,1e-3,tf_hf,None,True,False,False,"Fails during iree-compile.",""
+microsoft/layoutlm-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
+microsoft/mpnet-base,mhlo,tf,1e-2,1e-2,default,None,False,False,False,"",""
+albert-base-v2,linalg,torch,1e-2,1e-3,default,None,True,True,True,"issue with aten.tanh in torch-mlir",""
+alexnet,linalg,torch,1e-2,1e-3,default,None,True,False,False,"https://github.com/nod-ai/SHARK/issues/879",""
+bert-base-cased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
+bert-base-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
+bert-base-uncased_fp16,linalg,torch,1e-1,1e-1,default,None,True,False,True,"",""
+facebook/deit-small-distilled-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"Fails during iree-compile.",""
+google/vit-base-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/311",""
+microsoft/beit-base-patch16-224-pt22k-ft22k,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/390",""
+microsoft/MiniLM-L12-H384-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
+microsoft/resnet-50,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
+google/mobilebert-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"https://github.com/nod-ai/SHARK/issues/344",""
+mobilenet_v3_small,linalg,torch,1e-1,1e-2,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/388","macos"
+nvidia/mit-b0,linalg,torch,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/343","macos"
+resnet101,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
+resnet18,linalg,torch,1e-2,1e-3,default,None,True,True,False,"","macos"
+resnet50,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
+resnet50_fp16,linalg,torch,1e-2,1e-2,default,nhcw-nhwc,True,False,True,"",""
+squeezenet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
+wide_resnet50_2,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
+efficientnet-v2-s,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
+mnasnet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
--- a/tank/examples/opt/hacked_hf_opt.py
+++ b/tank/examples/opt/hacked_hf_opt.py
@@ -338,7 +338,6 @@ class OPTDecoderLayer(nn.Module):
        torch.FloatTensor,
        Optional[Tuple[torch.FloatTensor, torch.FloatTensor]],
    ]:
-
        # TODO: Refactor this function

        residual = hidden_states
@@ -509,7 +508,6 @@ class OPTDecoder(OPTPreTrainedModel):
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
-
        # TODO: Refactor this function

        output_attentions = (
@@ -788,7 +786,6 @@ class OPTForCausalLM(OPTPreTrainedModel):
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
-
        # TODO: Refactor this function

        output_attentions = (
--- a/tank/model_utils.py
+++ b/tank/model_utils.py
@@ -83,10 +83,10 @@ def get_hf_img_cls_model(name):
    # you can use preprocess_input_image to get the test_input or just random value.
    test_input = preprocess_input_image(name)
    # test_input = torch.FloatTensor(1, 3, 224, 224).uniform_(-1, 1)
-    print("test_input.shape: ", test_input.shape)
+    # print("test_input.shape: ", test_input.shape)
    # test_input.shape:  torch.Size([1, 3, 224, 224])
    actual_out = model(test_input)
-    print("actual_out.shape： ", actual_out.shape)
+    # print("actual_out.shape： ", actual_out.shape)
    # actual_out.shape：  torch.Size([1, 1000])
    return model, test_input, actual_out

--- a/tank/test_models.py
+++ b/tank/test_models.py
@@ -43,6 +43,7 @@ def load_csv_and_convert(filename, gen=False):
                    "xfail_cuda": row[8],
                    "xfail_vkm": row[9],
                    "xfail_reason": row[10],
+                    "xfail_other": row[11],
                }
            )
    # This is a pytest workaround
@@ -89,6 +90,8 @@ def get_valid_test_params():
 def is_valid_case(test_params):
    if test_params[0] == True and test_params[2]["framework"] == "tf":
        return False
+    elif "fp16" in test_params[2]["model_name"] and test_params[1] != "cuda":
+        return False
    else:
        return True

@@ -132,13 +135,14 @@ class SharkModuleTester:
        self.config = config

    def create_and_check_module(self, dynamic, device):
-
        shark_args.local_tank_cache = self.local_tank_cache
        shark_args.update_tank = self.update_tank
        if "nhcw-nhwc" in self.config["flags"] and not os.path.isfile(
            ".use-iree"
        ):
            shark_args.enable_conv_transform = True
+        else:
+            shark_args.enable_conv_transform = False

        model, func_name, inputs, golden_out = download_model(
            self.config["model_name"],
@@ -177,26 +181,12 @@ class SharkModuleTester:
            if self.ci == True:
                self.upload_repro()
            if self.benchmark == True:
-                # p = multiprocessing.Process(
-                #    target=self.benchmark_module,
-                #    args=(shark_module, inputs, dynamic, device),
-                # )
-                # p.start()
-                # p.join()
                self.benchmark_module(shark_module, inputs, dynamic, device)
                print(msg)
-                pytest.xfail(reason="Numerics Issue")
-
+                pytest.xfail(
+                    reason=f"Numerics Mismatch: Use -s flag to print stderr during pytests."
+                )
        if self.benchmark == True:
-            # We must create a new process each time we benchmark a model to allow
-            # for Tensorflow to release GPU resources. Using the same process to
-            # benchmark multiple models leads to OOM.
-            # p = multiprocessing.Process(
-            #    target=self.benchmark_module,
-            #    args=(shark_module, inputs, dynamic, device),
-            # )
-            # p.start()
-            # p.join()
            self.benchmark_module(shark_module, inputs, dynamic, device)

        if self.save_repro == True:
@@ -219,10 +209,11 @@ class SharkModuleTester:

    def save_reproducers(self):
        # Saves contents of IREE TempFileSaver temporary directory to ./shark_tmp/saved/<test_case>.
-        src = self.temp_dir
-        trg = f"./shark_tmp/saved/{self.tmp_prefix}"
-        if not os.path.isdir("./shark_tmp/saved/"):
-            os.mkdir("./shark_tmp/saved/")
+        src = os.path.join(*self.temp_dir.split("/"))
+        saves = os.path.join(".", "shark_tmp", "saved")
+        trg = os.path.join(saves, self.tmp_prefix)
+        if not os.path.isdir(saves):
+            os.mkdir(saves)
        if not os.path.isdir(trg):
            os.mkdir(trg)
        files = os.listdir(src)
@@ -232,7 +223,12 @@ class SharkModuleTester:
    def upload_repro(self):
        import subprocess

-        bashCommand = f"gsutil cp -r ./shark_tmp/saved/{self.tmp_prefix}/* gs://shark-public/builder/repro_artifacts/{self.ci_sha}/{self.tmp_prefix}/"
+        src = os.path.join(*self.temp_dir.split("/"))
+        repro_path = os.path.join(
+            ".", "shark_tmp", "saved", self.tmp_prefix, "*"
+        )
+
+        bashCommand = f"gsutil cp -r {repro_path} gs://shark-public/builder/repro_artifacts/{self.ci_sha}/{self.tmp_prefix}/"
        process = subprocess.run(bashCommand.split())

    def postprocess_outputs(self, golden_out, result):
@@ -293,31 +289,15 @@ class SharkModuleTest(unittest.TestCase):
            pytest.xfail(reason=config["xfail_reason"])

        # Special cases that need to be marked.
-        if config["model_name"] == "resnet50" and device in [
+        if "macos" in config["xfail_other"] and device in [
            "metal",
            "vulkan",
        ]:
            if get_vulkan_triple_flag() is not None:
                if "m1-moltenvk-macos" in get_vulkan_triple_flag():
                    pytest.xfail(
-                        reason="M2: Assert Error & M1: CompilerToolError"
+                        reason="conv-related issue on MacStudio, returns VK_ERROR_DEVICE_LOST."
                    )
-        if (
-            config["model_name"] == "camembert-base"
-            and dynamic == False
-            and device in ["metal", "vulkan"]
-        ):
-            pytest.xfail(
-                reason="chlo.broadcast_compare failed to satify constraint"
-            )
-        if (
-            config["model_name"] == "roberta-base"
-            and dynamic == False
-            and device in ["metal", "vulkan"]
-        ):
-            pytest.xfail(
-                reason="chlo.broadcast_compare failed to satify constraint"
-            )
        if (
            config["model_name"]
            in [
@@ -345,11 +325,11 @@ class SharkModuleTest(unittest.TestCase):
        )
        self.module_tester.tmp_prefix = safe_name.replace("/", "_")

-        if not os.path.isdir("./shark_tmp/"):
-            os.mkdir("./shark_tmp/")
+        if not os.path.isdir("shark_tmp"):
+            os.mkdir("shark_tmp")

        tempdir = tempfile.TemporaryDirectory(
-            prefix=self.module_tester.tmp_prefix, dir="./shark_tmp/"
+            prefix=self.module_tester.tmp_prefix, dir="shark_tmp"
        )
        self.module_tester.temp_dir = tempdir.name

--- a/tank/tflite/albert_lite_base/albert_lite_base_tflite_test.py
+++ b/tank/tflite/albert_lite_base/albert_lite_base_tflite_test.py
@@ -9,6 +9,7 @@ from shark.parser import shark_args
 # model_path = "https://tfhub.dev/tensorflow/lite-model/albert_lite_base/squadv1/1?lite-format=tflite"
 # model_path = model_path

+
 # Inputs modified to be useful albert inputs.
 def generate_inputs(input_details):
    for input in input_details:
--- a/web/README.md
+++ b/web/README.md
@@ -1,16 +0,0 @@
-In order to launch SHARK-web, from the root SHARK directory, run:
-
-## Linux
-```shell
-IMPORTER=1 ./setup_venv.sh
-source shark.venv/bin/activate
-cd web
-python index.py
-```
-
-## Windows
-```shell
-./setup_venv.ps1
-cd web
-python index.py --local_tank_cache=<current_working_dir>
-```
--- a/web/index.py
+++ b/web/index.py
@@ -1,182 +0,0 @@
-import os
-
-os.environ["AMD_ENABLE_LLPC"] = "1"
-import gradio as gr
-from PIL import Image
-from models.stable_diffusion.resources import resource_path, prompt_examples
-from models.stable_diffusion.main import stable_diff_inf
-from models.stable_diffusion.stable_args import args
-from models.stable_diffusion.utils import get_available_devices
-
-nodlogo_loc = resource_path("logos/nod-logo.png")
-sdlogo_loc = resource_path("logos/sd-demo-logo.png")
-
-
-demo_css = """
-.gradio-container {background-color: black}
-.container {background-color: black !important; padding-top:20px !important; }
-#ui_title {padding: 10px !important; }
-#top_logo {background-color: transparent; border-radius: 0 !important; border: 0; } 
-#demo_title {background-color: black; border-radius: 0 !important; border: 0; padding-top: 50px; padding-bottom: 0px; width: 460px !important;} 
-
-#demo_title_outer  {border-radius: 0; } 
-#prompt_box_outer div:first-child  {border-radius: 0 !important}
-#prompt_box textarea  {background-color:#1d1d1d !important}
-#prompt_examples {margin:0 !important}
-#prompt_examples svg {display: none !important;}
-
-.gr-sample-textbox { border-radius: 1rem !important; border-color: rgb(31,41,55) !important; border-width:2px !important; }
-#ui_body {background-color: #111111 !important; padding: 10px !important; border-radius: 0.5em !important;}
-
-#img_result+div {display: none !important;}
-
-footer {display: none !important;}
-"""
-
-
-with gr.Blocks(title="Stable Diffusion", css=demo_css) as shark_web:
-
-    with gr.Row(elem_id="ui_title"):
-        nod_logo = Image.open(nodlogo_loc)
-        logo2 = Image.open(sdlogo_loc)
-        with gr.Row():
-            with gr.Column(scale=1, elem_id="demo_title_outer"):
-                gr.Image(
-                    value=nod_logo,
-                    show_label=False,
-                    interactive=False,
-                    elem_id="top_logo",
-                ).style(width=150, height=100)
-            with gr.Column(scale=5, elem_id="demo_title_outer"):
-                gr.Image(
-                    value=logo2,
-                    show_label=False,
-                    interactive=False,
-                    elem_id="demo_title",
-                ).style(width=150, height=100)
-
-    with gr.Row(elem_id="ui_body"):
-
-        with gr.Row():
-            with gr.Column(scale=1, min_width=600):
-                with gr.Group(elem_id="prompt_box_outer"):
-                    prompt = gr.Textbox(
-                        label="Prompt",
-                        value="cyberpunk forest by Salvador Dali",
-                        lines=1,
-                        elem_id="prompt_box",
-                    )
-                    negative_prompt = gr.Textbox(
-                        label="Negative Prompt",
-                        value="trees, green",
-                        lines=1,
-                        elem_id="prompt_box",
-                    )
-                with gr.Row():
-                    variant = gr.Dropdown(
-                        label="Model Variant",
-                        value="stablediffusion",
-                        choices=[
-                            "stablediffusion",
-                            "anythingv3",
-                            "analogdiffusion",
-                            "openjourney",
-                            "dreamlike",
-                        ],
-                    )
-                    scheduler_key = gr.Dropdown(
-                        label="Scheduler",
-                        value="SharkEulerDiscrete",
-                        choices=[
-                            "DDIM",
-                            "PNDM",
-                            "LMSDiscrete",
-                            "DPMSolverMultistep",
-                            "EulerDiscrete",
-                            "EulerAncestralDiscrete",
-                            "SharkEulerDiscrete",
-                        ],
-                    )
-                with gr.Row():
-                    steps = gr.Slider(1, 100, value=50, step=1, label="Steps")
-                    guidance_scale = gr.Slider(
-                        0,
-                        50,
-                        value=7.5,
-                        step=0.1,
-                        label="CFG Scale",
-                    )
-                with gr.Row():
-                    seed = gr.Number(value=-1, precision=0, label="Seed")
-                    available_devices = get_available_devices()
-                    device_key = gr.Dropdown(
-                        label="Device",
-                        value=available_devices[0],
-                        choices=available_devices,
-                    )
-                with gr.Row():
-                    random_seed = gr.Button("Randomize Seed")
-                    random_seed.click(
-                        None,
-                        inputs=[],
-                        outputs=[seed],
-                        _js="() => Math.floor(Math.random() * 4294967295)",
-                    )
-                    stable_diffusion = gr.Button("Generate Image")
-                with gr.Accordion(label="Prompt Examples!"):
-                    ex = gr.Examples(
-                        examples=prompt_examples,
-                        inputs=prompt,
-                        cache_examples=False,
-                        elem_id="prompt_examples",
-                    )
-
-            with gr.Column(scale=1, min_width=600):
-                with gr.Group():
-                    generated_img = gr.Image(
-                        type="pil", interactive=False
-                    ).style(height=512)
-                    std_output = gr.Textbox(
-                        value="Nothing to show.",
-                        lines=4,
-                        show_label=False,
-                    )
-
-        prompt.submit(
-            stable_diff_inf,
-            inputs=[
-                prompt,
-                negative_prompt,
-                steps,
-                guidance_scale,
-                seed,
-                scheduler_key,
-                variant,
-                device_key,
-            ],
-            outputs=[generated_img, std_output],
-            show_progress=args.progress_bar,
-        )
-        stable_diffusion.click(
-            stable_diff_inf,
-            inputs=[
-                prompt,
-                negative_prompt,
-                steps,
-                guidance_scale,
-                seed,
-                scheduler_key,
-                variant,
-                device_key,
-            ],
-            outputs=[generated_img, std_output],
-            show_progress=args.progress_bar,
-        )
-
-shark_web.queue()
-shark_web.launch(
-    share=args.share,
-    inbrowser=True,
-    server_name="0.0.0.0",
-    server_port=args.server_port,
-)
--- a/web/models/albert_maskfill.py
+++ b/web/models/albert_maskfill.py
@@ -1,108 +0,0 @@
-from transformers import AutoModelForMaskedLM, AutoTokenizer
-import torch
-from shark.shark_inference import SharkInference
-from shark.shark_importer import SharkImporter
-import numpy as np
-
-################################## Albert Module #########################
-
-
-class AlbertModule(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.model = AutoModelForMaskedLM.from_pretrained("albert-base-v2")
-        self.model.eval()
-
-    def forward(self, input_ids, attention_mask):
-        return self.model(
-            input_ids=input_ids, attention_mask=attention_mask
-        ).logits
-
-
-################################## Preprocessing inputs ####################
-
-DEBUG = False
-compiled_module = {}
-compiled_module["tokenizer"] = AutoTokenizer.from_pretrained("albert-base-v2")
-
-
-def preprocess_data(text):
-
-    global compiled_module
-
-    # Preparing Data
-    tokenizer = compiled_module["tokenizer"]
-    encoded_inputs = tokenizer(
-        text,
-        padding="max_length",
-        truncation=True,
-        max_length=512,
-        return_tensors="pt",
-    )
-    inputs = (encoded_inputs["input_ids"], encoded_inputs["attention_mask"])
-    return inputs
-
-
-def top5_possibilities(text, inputs, token_logits, log_write):
-
-    global DEBUG
-    global compiled_module
-
-    if DEBUG:
-        log_write.write("Retrieving top 5 possible outcomes.\n")
-    tokenizer = compiled_module["tokenizer"]
-    mask_id = torch.where(inputs[0] == tokenizer.mask_token_id)[1]
-    mask_token_logits = token_logits[0, mask_id, :]
-    percentage = torch.nn.functional.softmax(mask_token_logits, dim=1)[0]
-    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
-    top5 = {}
-    for token in top_5_tokens:
-        label = text.replace(tokenizer.mask_token, tokenizer.decode(token))
-        top5[label] = percentage[token].item()
-    if DEBUG:
-        log_write.write("Done.\n")
-    return top5
-
-
-##############################################################################
-
-
-def albert_maskfill_inf(masked_text, device):
-
-    global DEBUG
-    global compiled_module
-
-    DEBUG = False
-    log_write = open(r"logs/albert_maskfill_log.txt", "w")
-    if log_write:
-        DEBUG = True
-
-    inputs = preprocess_data(masked_text)
-    if device not in compiled_module.keys():
-        if DEBUG:
-            log_write.write("Compiling the Albert Maskfill module.\n")
-        mlir_importer = SharkImporter(
-            AlbertModule(),
-            inputs,
-            frontend="torch",
-        )
-        minilm_mlir, func_name = mlir_importer.import_mlir(
-            is_dynamic=False, tracing_required=True
-        )
-        shark_module = SharkInference(
-            minilm_mlir, func_name, mlir_dialect="linalg", device=device
-        )
-        shark_module.compile()
-        compiled_module[device] = shark_module
-        if DEBUG:
-            log_write.write("Compilation successful.\n")
-
-    token_logits = torch.tensor(compiled_module[device].forward(inputs))
-    output = top5_possibilities(masked_text, inputs, token_logits, log_write)
-    log_write.close()
-
-    std_output = ""
-    with open(r"logs/albert_maskfill_log.txt", "r") as log_read:
-        std_output = log_read.read()
-
-    return output, std_output
--- a/web/models/diffusion/setup_vdiffusion.sh
+++ b/web/models/diffusion/setup_vdiffusion.sh
@@ -1,5 +0,0 @@
-git clone --recursive https://github.com/crowsonkb/v-diffusion-pytorch.git
-pip install ftfy regex tqdm
-
-mkdir checkpoints
-wget https://the-eye.eu/public/AI/models/v-diffusion/cc12m_1_cfg.pth -P checkpoints/
--- a/web/models/diffusion/v_diffusion.py
+++ b/web/models/diffusion/v_diffusion.py
@@ -1,215 +0,0 @@
-"""classifier-free guidance sampling from a diffusion model."""
-
-from functools import partial
-from pathlib import Path
-
-from PIL import Image
-import torch
-from torch import nn
-from torch.nn import functional as F
-from torchvision import transforms
-from torchvision.transforms import functional as TF
-from tqdm import trange
-
-from shark.shark_inference import SharkInference
-from torch.fx.experimental.proxy_tensor import make_fx
-from torch._decomp import get_decompositions
-import torch_mlir
-
-import sys
-
-sys.path.append("models/diffusion/v-diffusion-pytorch")
-
-from CLIP import clip
-from diffusion import get_model, get_models, sampling, utils
-
-import gradio as gr
-
-MODULE_DIR = Path(__file__).resolve().parent
-
-set_global_parameters = False
-device = None
-model = None
-checkpoint = None
-clip_model = None
-
-
-def parse_prompt(prompt, default_weight=3.0):
-    if prompt.startswith("http://") or prompt.startswith("https://"):
-        vals = prompt.rsplit(":", 2)
-        vals = [vals[0] + ":" + vals[1], *vals[2:]]
-    else:
-        vals = prompt.rsplit(":", 1)
-    vals = vals + ["", default_weight][len(vals) :]
-    print(vals[1])
-    print(vals[0])
-    return vals[0], float(vals[1])
-
-
-def run(x, steps, shark_module, args):
-    def compiled_cfg_model_fn(x, t):
-        x_ny = x.detach().numpy()
-        t_ny = t.detach().numpy()
-        inputs = (x_ny, t_ny)
-        result = shark_module.forward(inputs)
-        return torch.from_numpy(result)
-
-    return sampling.plms_sample(compiled_cfg_model_fn, x, steps, {})
-
-
-def run_all(
-    x,
-    t,
-    steps,
-    n,
-    batch_size,
-    side_x,
-    side_y,
-    shark_module,
-    args,
-):
-    x = torch.randn([n, 3, side_y, side_x], device=device)
-    t = torch.linspace(1, 0, args["steps"] + 1, device=device)[:-1]
-    steps = utils.get_spliced_ddpm_cosine_schedule(t)
-    pil_images = []
-    for i in trange(0, n, batch_size):
-        cur_batch_size = min(n - i, batch_size)
-        outs = run(x[i : i + cur_batch_size], steps, shark_module, args)
-        for j, out in enumerate(outs):
-            pil_images.append(utils.to_pil_image(out))
-    return pil_images[0]
-
-
-def cache_model():
-    global set_global_parameters
-    global device
-    global model
-    global checkpoint
-    global clip_model
-    if not set_global_parameters:
-        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-        model = get_model("cc12m_1_cfg")()
-        checkpoint = MODULE_DIR / f"checkpoints/cc12m_1_cfg.pth"
-        model.load_state_dict(torch.load(checkpoint, map_location="cpu"))
-        if device.type == "cuda":
-            model = model.half()
-        model = model.to(device).eval().requires_grad_(False)
-        clip_model_name = (
-            model.clip_model if hasattr(model, "clip_model") else "ViT-B/16"
-        )
-        clip_model = clip.load(clip_model_name, jit=False, device=device)[0]
-        clip_model.eval().requires_grad_(False)
-        set_global_parameters = True
-
-
-def vdiff_inf(prompts: str, n, bs, steps, _device):
-
-    global device
-    global model
-    global checkpoint
-    global clip_model
-
-    args = {}
-    target_embeds = []
-    weights = []
-    args["prompts"] = prompts
-    args["batch_size"] = int(bs)
-    args["n"] = int(n)
-    args["seed"] = 0
-    args["steps"] = int(steps)
-    args["device"] = _device
-
-    cache_model()
-
-    _, side_y, side_x = model.shape
-    normalize = transforms.Normalize(
-        mean=[0.48145466, 0.4578275, 0.40821073],
-        std=[0.26862954, 0.26130258, 0.27577711],
-    )
-
-    zero_embed = torch.zeros([1, clip_model.visual.output_dim], device=device)
-    target_embeds.append(zero_embed)
-
-    prompt_list = args["prompts"].rsplit(";")
-    for prompt in prompt_list:
-        txt, weight = parse_prompt(prompt)
-        target_embeds.append(
-            clip_model.encode_text(clip.tokenize(txt).to(device)).float()
-        )
-        weights.append(weight)
-    weights = torch.tensor([1 - sum(weights), *weights], device=device)
-
-    torch.manual_seed(args["seed"])
-
-    x = torch.randn([args["n"], 3, side_y, side_x], device=device)
-    t = torch.linspace(1, 0, args["steps"] + 1, device=device)[:-1]
-    steps = utils.get_spliced_ddpm_cosine_schedule(t)
-    min_batch_size = min(args["n"], args["batch_size"])
-    x_in = x[0:min_batch_size, :, :, :]
-    ts = x_in.new_ones([x_in.shape[0]])
-    t_in = t[0] * ts
-
-    def cfg_model_fn(x, t):
-        n = x.shape[0]
-        n_conds = len(target_embeds)
-        x_in = x.repeat([n_conds, 1, 1, 1])
-        t_in = t.repeat([n_conds])
-        clip_embed_in = torch.cat([*target_embeds]).repeat([n, 1])
-        vs = model(x_in, t_in, clip_embed_in).view([n_conds, n, *x.shape[1:]])
-        v = vs.mul(weights[:, None, None, None, None]).sum(0)
-        return v
-
-    fx_g = make_fx(
-        cfg_model_fn,
-        decomposition_table=get_decompositions(
-            [
-                torch.ops.aten.embedding_dense_backward,
-                torch.ops.aten.native_layer_norm_backward,
-                torch.ops.aten.slice_backward,
-                torch.ops.aten.select_backward,
-                torch.ops.aten.norm.ScalarOpt_dim,
-                torch.ops.aten.native_group_norm,
-                torch.ops.aten.upsample_bilinear2d.vec,
-                torch.ops.aten.split.Tensor,
-                torch.ops.aten.split_with_sizes,
-            ]
-        ),
-    )(x_in, t_in)
-
-    fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
-    fx_g.recompile()
-
-    for node in fx_g.graph.nodes:
-        if isinstance(node.target, torch._ops.OpOverload):
-            node.target = node.target.overloadpacket
-    fx_g.recompile()
-
-    ts_g = torch.jit.script(fx_g)
-
-    module = torch_mlir.compile(
-        ts_g,
-        [x_in, t_in],
-        torch_mlir.OutputType.LINALG_ON_TENSORS,
-        use_tracing=False,
-    )
-
-    mlir_model = module
-    func_name = "forward"
-    shark_module = SharkInference(
-        mlir_model, func_name, device=args["device"], mlir_dialect="linalg"
-    )
-    shark_module.compile()
-    return (
-        run_all(
-            x,
-            t,
-            args["steps"],
-            args["n"],
-            args["batch_size"],
-            side_x,
-            side_y,
-            shark_module,
-            args,
-        ),
-        "Testing..",
-    )
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Ean Garvey	a90812133b	Enable pytests on Windows (#901 )	2023-02-01 18:36:41 -06:00
powderluv	e26a70aa4f	Drop old cli and webui (#911 )	2023-02-01 13:13:46 -08:00
Daniel Garvey	6a32a4e26c	move ci sd stuff to apps (#912 ) Co-authored-by: dan <dan@nod-labs.com> Co-authored-by: powderluv <powderluv@users.noreply.github.com>	2023-02-01 12:15:07 -08:00
powderluv	e853abf98b	Update stable_diffusion_amd.md	2023-02-01 11:11:58 -08:00
powderluv	51e81e6ef8	update main readme	2023-02-01 11:09:00 -08:00
powderluv	e355000ceb	Drop torchvision	2023-02-01 10:26:37 -08:00
Daniel Garvey	e374074013	Windows test (#896 ) * add generate_sharktank for stable_diffusion model defaults * add windows test for sd --------- Co-authored-by: dan <dan@nod-labs.com>	2023-02-01 12:03:54 -06:00
powderluv	81e3d1c2c6	switch to apps/	2023-02-01 06:54:20 -08:00
powderluv	ab0cbb4475	Add PyInstaller for apps/ webui and cli (#909 ) tested webui, cli and webui exe and cli exe	2023-02-01 06:51:27 -08:00
powderluv	1c64e40722	Add PyInstaller for apps/ (#907 ) Build with pyinstaller.exe .\apps\stable_diffusion\web\shark_sd.spec normal flow works. exe is missing a few json files	2023-02-01 06:04:49 -08:00
Evan Guan	8cafe56eb4	Added flags for metadata information. (#894 )	2023-02-01 05:16:11 -08:00
Eliasj42	3eceeb7b23	fixed a bug that would sometimes cause intel-gpu to appear unsupported (#899 ) Co-authored-by: Elias Joseph <elias@nod-labs.com>	2023-01-31 22:32:05 -08:00
powderluv	1a37675435	Revert "move beta to release (#898 )" (#905 ) This reverts commit `7edcaf5a06`.	2023-01-31 20:31:41 -08:00
powderluv	198ebede8d	Revert "replace new model_db.json (#902 )" (#904 ) This reverts commit `842adef29c`.	2023-01-31 20:29:40 -08:00
Ean Garvey	a504903dd5	Fix formatting issues. (#903 )	2023-02-01 09:12:45 +05:30
Daniel Garvey	842adef29c	replace new model_db.json (#902 )	2023-01-31 18:55:22 -08:00
Daniel Garvey	7edcaf5a06	move beta to release (#898 ) Co-authored-by: dan <dan@nod-labs.com>	2023-01-31 17:14:08 -06:00
Gaurav Shukla	c124b76328	[SD] Reorganize the stable diffusion model. (#806 ) The stable diffusion codebase has been reorganized to make it more modular so that the same script can be used for web as well as cli, instead of duplicating the whole codebase. Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-01-31 14:42:41 -08:00
aldesilv	e9c744ee5d	find rocm arch used in rocminfo (#893 ) Co-authored-by: Alex <alexander@nod-labs.com>	2023-01-31 10:22:31 -08:00
Ean Garvey	83302930d8	Update generate_sharktank.py (#897 )	2023-01-31 10:21:22 -08:00
Daniel Garvey	a4634632ba	add generate_sharktank for stable_diffusion model defaults (#742 ) Co-authored-by: dan <dan@nod-labs.com> Co-authored-by: powderluv <powderluv@users.noreply.github.com>	2023-01-31 09:44:54 -08:00
Abhishek Varma	d17e8dc5ad	[NFC] Rename SD negative_prompts flag -- This commit renames SD `negative-prompts` -> `negative_prompts` flag. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-01-31 21:38:59 +05:30
powderluv	9fe63de4d4	Pin macOS SDK to 216	2023-01-31 01:09:44 -08:00
Eliasj42	8111f8bf35	added ability to select gpu (#891 ) Co-authored-by: Elias Joseph <elias@nod-labs.com>	2023-01-30 13:39:12 -08:00
Abhishek Varma	fcd62513cf	[SD-CLI] Add support for .safetensors + Use diffusers pipeline to load SD -- This commit uses `load_pipeline_from_original_stable_diffusion_ckpt` as exposed due to [Diffusers PR](https://github.com/huggingface/diffusers/pull/2019). -- It also adds a support for the end users to use `.safetensors` along with `.ckpt` file. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-01-31 00:00:37 +05:30
Abhishek Varma	c3c701e654	Update requirements.txt + README.md of SD -- This commit includes two python modules as part of requirements.txt. -- It also updates README.md to also inclue `--no-use_tuned` for users to be able to try `hf_model_id` or `ckpt_loc` without any issue. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-01-30 14:12:54 +05:30
Daniel Garvey	6bf991edf6	adding more robust main.py testing (#889 ) Co-authored-by: dan <dan@nod-labs.com>	2023-01-30 00:14:26 -08:00
yzhang93	9644e78545	Fix CUDA tuned model annotation (#880 )	2023-01-27 11:35:18 -08:00
dymil	c911189ef0	Add note about latest RDNA3 driver support (#881 ) Also tweak other wording	2023-01-27 09:39:19 -08:00
Abhishek Varma	1118b4b651	[SD-CLI] Clean up vmfbs if a retry method fails -- This commit cleans up vmfb files generated as a result of retry method. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-01-27 21:55:36 +05:30
PhaneeshB	4be75d4418	fix seed values in SD json and filename	2023-01-27 18:40:26 +05:30
Ean Garvey	fb6beae27c	Adds pytest-forked dependency to fix pytest memory accumulation issues. (#876 ) * Minor improvements to test-models workflow - cleaned up pytest command line args in Validate Models job scripts. - Removed -s flag to provide more readable logs - Changed shark_cache location to within github workspace and removed --update_tank flag from Linux workflows. * Use pytest-forked for managing pytest memory usage.	2023-01-26 18:20:15 -06:00
yzhang93	fee73b0b63	Add SD model annotation on fly (#869 ) * Add SD model annotation on fly * Move tuned_compile_through_fx to utils * Fix SD compilation flags	2023-01-26 11:46:36 -08:00
powderluv	9bbffa519e	Add an option to respect LLPC env var (#875 ) Also add OSX paths	2023-01-25 13:56:55 -08:00
jinchen62	c3a641f0ab	Address TODOs for dataset annotator (#872 ) - add args usage, pass gs_url by CL flag - add support for no existing prompts	2023-01-25 09:28:23 -08:00
yzhang93	aafe7c4701	Add more cuda devices to use tuned model (#868 )	2023-01-25 06:36:17 -08:00
Abhishek Varma	9a0b082cf8	[SD-CLI] Add `batch_size` command-line arg + prompt processing -- This commit adds `batch_size` command-line arg. -- It also involves replicating the prompt `batch_size` no. of times. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-01-25 19:21:25 +05:30
powderluv	8265e34a29	Add SHARK SD CLI tool (#870 )	2023-01-24 23:14:32 -08:00
powderluv	8ef8ae097f	Update to build 469	2023-01-24 22:16:13 -08:00
powderluv	c3d14293c0	Update sample results	2023-01-24 22:14:06 -08:00
powderluv	d55d8be504	Add signing of release builds	2023-01-24 21:32:21 -08:00
powderluv	03543030d3	use pefile	2023-01-24 18:35:51 -08:00
powderluv	fc6b474b92	Add ordlookup to requirements.txt	2023-01-24 18:30:16 -08:00
powderluv	a5db785dd7	checkoutv2 on windows	2023-01-24 18:23:22 -08:00
powderluv	1c1c5cd611	Build Windows nightly on 7950x	2023-01-24 16:21:56 -08:00
Abhishek Varma	6ed02f70ec	[SD-CLI] Make using `ckpt_loc` and `hf_model_id` easier -- Currently we require users to specify the base model on which the custom model (.ckpt) is tuned on. Even for running a HuggingFace repo-id, we require the users to go a tedious way of adding things to variants.json. -- This commit aims to address the above issues and will be treated as a starting point for a series of design changes which makes using SHARK's SD easier. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-01-24 23:03:46 +05:30
Prashant Kumar	cb78cd8ac0	Add the support for the batch size parameter.	2023-01-24 22:33:13 +05:30
Ean Garvey	0c4590b45a	Update generate_sharktank.py	2023-01-24 10:18:03 +05:30
jinchen62	d2e2ee6efa	Add multiple prompts support for dataset annotator (#862 )	2023-01-23 18:40:36 -08:00
powderluv	6a380a0b48	Add more nvidia cards	2023-01-23 17:07:45 -08:00
powderluv	e5d5acbf1f	Remove torchvision requirements from web (#860 )	2023-01-23 13:48:53 -08:00
powderluv	00e38abbf0	Add 4080 support	2023-01-23 09:56:34 -08:00
Abhishek Varma	e3e4ea5443	Update README.md -- Make usage of `hf_model_id` clearer.	2023-01-23 23:25:23 +05:30
Prashant Kumar	a3e4ea3228	Remove the dependency of the torchvision. (#858 ) Remove the dependency of torchvision library for the conversion of tensor layout format to what PIL library expects.	2023-01-23 08:49:57 -08:00
powderluv	56f16d6baf	Update SD readme	2023-01-23 06:51:54 -08:00
Abhishek Varma	7a55ab900e	[SD-CLI] Fix CKPT script + add more variants + update README.md -- This commit fixes CKPT script to rely on the previous CKPT to Diffusers script. TODO: Let go of the script once the CKPT is included in next release of diffusers. -- It also adds many variants as part of `variants.json` and updates `README.md` to reflect change in default `hf_model_id`. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-01-23 18:34:24 +05:30
Abhishek Varma	137643fe72	[SD-CLI] Update README.md of custom models to include `hf_model_id`	2023-01-23 11:37:13 +05:30
Anush Elangovan	d6e59c6241	black format comments	2023-01-22 16:34:40 -08:00
powderluv	458eb5d34c	detect RX 7900 better	2023-01-22 16:32:27 -08:00
Erkin Alp Güney	8259f08864	Collapsibles for Win10 and Linux users (#851 ) Co-authored-by: powderluv <powderluv@users.noreply.github.com>	2023-01-22 09:50:33 -08:00
Prashant Kumar	b3ab0a1843	Add width and height support for the scheduler.	2023-01-22 23:16:50 +05:30
dependabot[bot]	f09f217478	Bump tensorflow from 2.10 to 2.10.1 (#853 ) Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.10 to 2.10.1. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.10.0...v2.10.1) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2023-01-22 06:40:17 -08:00
Daniel Garvey	e842c8c19b	add main.py testing for sdiff (#836 ) Co-authored-by: dan <dan@nod-labs.com> Co-authored-by: powderluv <powderluv@users.noreply.github.com>	2023-01-22 01:16:17 -08:00
powderluv	f6c3112d44	Revert "potential fix to pre-load DLL dir for torch-mlir (#848 )" (#852 ) This reverts commit `6c470d8131`.	2023-01-22 00:09:35 -08:00
yzhang93	7059610632	Modify the default for --hf_model_id flag	2023-01-21 11:21:47 +05:30
powderluv	2d272930d9	Update to signed build 455	2023-01-20 16:50:42 -08:00
powderluv	6c470d8131	potential fix to pre-load DLL dir for torch-mlir (#848 ) Doesn't regress the main.py script but system already pre-loaded the DLL so needs more testing.	2023-01-20 14:48:45 -08:00
jinchen62	30b29ce8cd	Add readme for dataset annotator (#847 )	2023-01-20 01:03:33 -08:00
jinchen62	1a9933002f	Add dataset annotation tool (#835 )	2023-01-19 16:56:08 -08:00
stanley	c4a9365aa1	[Shark][Training] Refresh SharkTrainer to latest APIs.	2023-01-19 20:30:15 +00:00
Prashant Kumar	9d3af37104	bugfix related to the height width params.	2023-01-20 00:21:44 +05:30
Prashant Kumar	7b3d57cff7	Add height and width as args.	2023-01-19 23:43:29 +05:30
Abhishek Varma	a802270da9	[SD-CLI] Update README.md about variants.json	2023-01-19 22:46:54 +05:30
Abhishek Varma	dd194a8758	[SD-CLI] Reorder loading of opt_params when needed -- This commit reorders loading of opt_params when `import_mlir` is not used. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-01-19 22:02:51 +05:30
Abhishek Varma	6de02de221	[SD-CLI] Make using custom models easier -- This commit makes using custom models easier using a combination of `import_mlir`, `ckpt_loc` and `hf_model_id`. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-01-19 22:02:36 +05:30
Abhishek Varma	85259750bf	[SD-CLI] Fix variants.json mapping -- This commit fixes variants.json's mapping. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-01-19 22:02:36 +05:30
Prashant Kumar	1249f0007d	Remove args.variant and args.version with args.custom_model.	2023-01-19 19:55:12 +05:30
Abhishek Varma	db0514d3fa	[SD-CLI] Fix get_model_configuration to use max_length -- This commit fixes `get_model_configuration` to use `max_length`. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-01-19 19:10:04 +05:30
Abhishek Varma	dce42a7fad	[SD-CLI] Fix args.max_length range check This commit fixes args.max_length range check. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-01-19 18:26:23 +05:30
Prashant Kumar	ec0b380194	Refactor shark_tank models and custom models. The custom models shouldn't depend on shark_tank in anyway.	2023-01-19 13:56:11 +05:30
Ean Garvey	7f27b61c98	Update setup_venv.sh to install triton if BENCHMARK=1	2023-01-19 00:26:46 -06:00
Guy Nachshon	f0b3557b02	fix: replace malicious and deleted package (#833 )	2023-01-18 13:41:05 -08:00
xzuyn	2a1d1c1001	make jpeg optimized and progressive (#820 ) * GUI make jpeg optimized and progressive * CLI make jpeg optimized and progressive	2023-01-17 16:35:36 -08:00
Abhishek Varma	df7eb80e5b	[SD-CLI] Make `custom_model` take highest priority for generating models if present -- This commit makes `custom_model` take highest priority for generating models if present. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-01-17 22:50:58 +05:30
Fraser Humphries	b9d947ce6f	style: 🎨 Restore whitespace	2023-01-17 17:45:32 +05:30
Fraser Humphries	e6589d2454	fix: 🏗️ Add demo.css to spec file datas	2023-01-17 17:45:32 +05:30
Fraser Humphries	0f5ac6afcf	fix: 🐛 resolve css file path relative to __file__ issues-816	2023-01-17 17:45:32 +05:30
Abhishek Varma	bc1bb1d188	[SD-CLI] Fix vmfb naming + update README.md for `custom_model` -- This commit introduces a fix for .vmfb naming to strip away any non-alphanumeric characters from `custom_model` path. -- It also updates the README.md to include the `custom_model` arg. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-01-17 16:27:54 +05:30
Abhishek Varma	3af2dd10ce	[SD-CLI] Add CKPT support to update models irrespective of `import_mlir` flag -- This commit adds CKPT support to update models irrespective of `import_mlir` flag. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>	2023-01-17 13:24:27 +05:30
yzhang93	dd22c65855	Add CUDA tuned models for SD variants (#814 )	2023-01-16 09:38:27 -08:00
PhaneeshB	48137ced19	add png as default format	2023-01-16 18:37:36 +05:30
Phaneesh Barwaria	6eb47c12d1	add multi-run in single execution (#812 )	2023-01-13 11:12:43 -08:00
Prashant Kumar	5a1fc6675a	This PR adds --import-mlir for f16 tensors without cuda.	2023-01-13 22:19:53 +05:30
Prashant Kumar	6f80825814	Modify import_with_fx to import with dtype=f16.	2023-01-13 22:19:53 +05:30
PhaneeshB	f0dd48ed2a	remaining disk space warning	2023-01-13 19:34:05 +05:30
Gaurav Shukla	15e2df0db0	[SD][web] Add a UI textbox to show the output location Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-01-13 19:33:04 +05:30
Fraser Humphries	4ad0109769	fix: 🐛 Extract demo css string to css file fix: 🐛 Extract demo css string to css file issues/807 fix: 🐛 Revert background colors	2023-01-13 16:42:05 +05:30
				`@@ -0,0 +1 @@`
				`from apps.stable_diffusion.scripts.txt2img import txt2img_inf`