add generate_sharktank for stable_diffusion model defaults

2026-04-20 03:00:34 -04:00 · 2023-01-29 03:24:18 +00:00
100 changed files with 4271 additions and 1795 deletions
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -50,10 +50,10 @@ jobs:
      shell: powershell
      run: |
        ./setup_venv.ps1
-        pyinstaller .\apps\stable_diffusion\shark_sd.spec
+        pyinstaller web/shark_sd.spec
        mv ./dist/shark_sd.exe ./dist/shark_sd_${{ env.package_version_ }}.exe
        signtool sign /f C:\shark_2023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/shark_sd_${{ env.package_version_ }}.exe
-        pyinstaller .\apps\stable_diffusion\shark_sd_cli.spec
+        pyinstaller .\shark\examples\shark_inference\stable_diffusion\shark_sd_cli.spec
        mv ./dist/shark_sd_cli.exe ./dist/shark_sd_cli_${{ env.package_version_ }}.exe
        signtool sign /f C:\shark_2023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/shark_sd_cli_${{ env.package_version_ }}.exe

--- a/.github/workflows/test-models.yml
+++ b/.github/workflows/test-models.yml
@@ -29,7 +29,7 @@ jobs:
    strategy:
      fail-fast: true
      matrix:
-        os: [7950x, icelake, a100, MacStudio, ubuntu-latest]
+        os: [icelake, a100, MacStudio, ubuntu-latest]
        suite: [cpu,cuda,vulkan]
        python-version: ["3.10"]
        include:
@@ -52,19 +52,13 @@ jobs:
            suite: cuda
          - os: a100
            suite: cpu
-          - os: 7950x
-            suite: cpu
-          - os: 7950x
-            suite: cuda

    runs-on: ${{ matrix.os }}

    steps:
    - uses: actions/checkout@v3
-      if: matrix.os != '7950x'
    
    - name: Set Environment Variables
-      if: matrix.os != '7950x'
      run: |
        echo "SHORT_SHA=`git rev-parse --short=4 HEAD`" >> $GITHUB_ENV
        echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
@@ -84,9 +78,6 @@ jobs:
        #cache-dependency-path: |
        #  **/requirements-importer.txt
        #  **/requirements.txt
-    
-    - uses: actions/checkout@v2
-      if: matrix.os == '7950x'
          
    - name: Install dependencies
      if: matrix.suite == 'lint'
@@ -111,7 +102,7 @@ jobs:
        cd $GITHUB_WORKSPACE
        PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./shark_tmp/shark_cache" -k cpu
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="${GITHUB_WORKSPACE}/shark_tmp/shark_cache" -k cpu
        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv

@@ -121,41 +112,26 @@ jobs:
        cd $GITHUB_WORKSPACE
        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./shark_tmp/shark_cache" -k cuda
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="${GITHUB_WORKSPACE}/shark_tmp/shark_cache" -k cuda
        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
-        # Disabled due to black image bug
-        # python build_tools/stable_diffusion_testing.py --device=cuda 
+        sh build_tools/stable_diff_main_test.sh

    - name: Validate Vulkan Models (MacOS)
      if: matrix.suite == 'vulkan' && matrix.os == 'MacStudio'
      run: |
        cd $GITHUB_WORKSPACE
-        PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
+        PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
        source shark.venv/bin/activate
        export DYLD_LIBRARY_PATH=/usr/local/lib/
        echo $PATH
        pip list | grep -E "torch|iree"
-        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./shark_tmp/shark_cache" -k vulkan
+        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" -k vulkan --update_tank

    - name: Validate Vulkan Models (a100)
-      if: matrix.suite == 'vulkan' && matrix.os == 'a100'
+      if: matrix.suite == 'vulkan' && matrix.os != 'MacStudio'
      run: |
        cd $GITHUB_WORKSPACE
        PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./shark_tmp/shark_cache" -k vulkan
-        python build_tools/stable_diffusion_testing.py --device=vulkan
-
-    - name: Validate Vulkan Models (Windows)
-      if: matrix.suite == 'vulkan' && matrix.os == '7950x'
-      run: |
-        ./setup_venv.ps1
-        pytest --benchmark -k vulkan -s
-        type bench_results.csv
-
-    - name: Validate Stable Diffusion Models (Windows)
-      if: matrix.suite == 'vulkan' && matrix.os == '7950x'
-      run: |
-        ./setup_venv.ps1
-        python build_tools/stable_diffusion_testing.py --device=vulkan
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="${GITHUB_WORKSPACE}/shark_tmp/shark_cache" -k vulkan
--- a/.gitignore
+++ b/.gitignore
@@ -170,5 +170,6 @@ tank/dict_configs.py
 cache_models/
 onnx_models/

-# Generated images
-generated_imgs/
+#web logging
+web/logs/
+web/stored_results/stable_diffusion/
--- a/README.md
+++ b/README.md
@@ -1,47 +1,12 @@
 # SHARK

-High Performance Machine Learning Distribution
+High Performance Machine Learning and Data Analytics for CPUs, GPUs, Accelerators and Heterogeneous Clusters

 [![Nightly Release](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml)
 [![Validate torch-models on Shark Runtime](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml)


-<details>
-  <summary>Prerequisites - Drivers </summary>
-  
-#### Install your Windows hardware drivers
-* [AMD RDNA Users] Download this specific driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mril-iree). Latest drivers may not work.
-* [macOS Users] Download and install the 1.3.216 Vulkan SDK from [here](https://sdk.lunarg.com/sdk/download/1.3.216.0/mac/vulkansdk-macos-1.3.216.0.dmg). Newer versions of the SDK will not work. 
-* [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)
-  
-#### Linux Drivers
-* MESA / RADV drivers wont work with FP16. Please use the latest AMGPU-PRO drivers (non-pro OSS drivers also wont work) or the latest NVidia Linux Drivers.
-
-Other users please ensure you have your latest vendor drivers and Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home) and if you are using vulkan check `vulkaninfo` works in a terminal window
-
-</details>
-
-
- 
-### Quick Start for SHARK Stable Diffusion for Windows 10/11 Users
-
-Install Driver from [Prerequisites](https://github.com/nod-ai/SHARK#install-your-hardware-drivers) above 
-
-Download the latest .exe https://github.com/nod-ai/SHARK/releases. 
-
-Double click the .exe and you should have the [UI]( http://localhost:8080/?__theme=dark) in the browser. 
-
-If you have custom models (ckpt, safetensors) put in a `models/` directory where the .exe is. 
-
-Enjoy. 
-
-Some known AMD Driver quirks and fixes with cursors are documented [here](https://github.com/nod-ai/SHARK/blob/main/apps/stable_diffusion/stable_diffusion_amd.md ).
-
-
-<details>
-  <summary>Advanced Installation (Only for developers)</summary>
-  
-## Advanced Installation (Windows, Linux and macOS) for developers
+## Installation (Windows, Linux and macOS)

 ## Check out the code

@@ -80,12 +45,12 @@ source shark.venv/bin/activate

 #### Windows 10/11 Users
 ```powershell
-(shark.venv) PS C:\g\shark> cd .\apps\stable_diffusion\web\
-(shark.venv) PS C:\g\shark\apps\stable_diffusion\web> python .\index.py
+(shark.venv) PS C:\Users\nod\SHARK> cd web
+(shark.venv) PS C:\Users\nod\SHARK\web> python index.py
 ```
-#### Linux / macOS Users
+#### Linux Users
 ```shell
-(shark.venv) > cd apps/stable_diffusion/web
+(shark.venv) > cd web
 (shark.venv) > python index.py
 ```

@@ -98,18 +63,25 @@ source shark.venv/bin/activate

 ### Run Stable Diffusion on your device - Commandline

+#### Install your hardware drivers
+* [AMD RDNA Users] Download the latest driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mril-iree)
+* [macOS Users] Download and install the latest Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home)
+* [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)
+
+Other users please ensure you have your latest vendor drivers and Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home) and if you are using vulkan check `vulkaninfo` works in a terminal window
+
+
 #### Windows 10/11 Users
 ```powershell
-(shark.venv) PS C:\g\shark> python .\apps\stable_diffusion\scripts\txt2img.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
+(shark.venv) PS C:\g\shark> python .\shark\examples\shark_inference\stable_diffusion\main.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
 ```

 #### Linux / macOS Users
 ```shell
-python3.10 apps/stable_diffusion/scripts/txt2img.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
+python3.10 shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
 ```

 You can replace `vulkan` with `cpu` to run on your CPU or with `cuda` to run on CUDA devices. If you have multiple vulkan devices you can address them with `--device=vulkan://1` etc
-</details>

 The output on a 7900XTX would like:

--- a/apps/stable_diffusion/scripts/init.py
+++ b/apps/stable_diffusion/scripts/init.py
@@ -1 +0,0 @@
-from apps.stable_diffusion.scripts.txt2img import txt2img_inf
--- a/apps/stable_diffusion/scripts/txt2img.py
+++ b/apps/stable_diffusion/scripts/txt2img.py
@@ -1,309 +0,0 @@
-import os
-
-if "AMD_ENABLE_LLPC" not in os.environ:
-    os.environ["AMD_ENABLE_LLPC"] = "1"
-
-import sys
-import json
-import torch
-import re
-import time
-from pathlib import Path
-from PIL import PngImagePlugin
-from datetime import datetime as dt
-from dataclasses import dataclass
-from csv import DictWriter
-from apps.stable_diffusion.src import (
-    args,
-    Text2ImagePipeline,
-    get_schedulers,
-    set_init_device_flags,
-)
-
-
-@dataclass
-class Config:
-    model_id: str
-    ckpt_loc: str
-    precision: str
-    batch_size: int
-    max_length: int
-    height: int
-    width: int
-    device: str
-
-
-# This has to come before importing cache objects
-if args.clear_all:
-    print("CLEARING ALL, EXPECT SEVERAL MINUTES TO RECOMPILE")
-    from glob import glob
-    import shutil
-
-    vmfbs = glob(os.path.join(os.getcwd(), "*.vmfb"))
-    for vmfb in vmfbs:
-        if os.path.exists(vmfb):
-            os.remove(vmfb)
-    # Temporary workaround of deleting yaml files to incorporate diffusers' pipeline.
-    # TODO: Remove this once we have better weight updation logic.
-    inference_yaml = ["v2-inference-v.yaml", "v1-inference.yaml"]
-    for yaml in inference_yaml:
-        if os.path.exists(yaml):
-            os.remove(yaml)
-    home = os.path.expanduser("~")
-    if os.name == "nt":  # Windows
-        appdata = os.getenv("LOCALAPPDATA")
-        shutil.rmtree(os.path.join(appdata, "AMD/VkCache"), ignore_errors=True)
-        shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
-    elif os.name == "unix":
-        shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"))
-        shutil.rmtree(os.path.join(home, ".local/shark_tank"))
-
-
-# save output images and the inputs correspoding to it.
-def save_output_img(output_img):
-    output_path = args.output_dir if args.output_dir else Path.cwd()
-    generated_imgs_path = Path(output_path, "generated_imgs")
-    generated_imgs_path.mkdir(parents=True, exist_ok=True)
-    csv_path = Path(generated_imgs_path, "imgs_details.csv")
-
-    prompt_slice = re.sub("[^a-zA-Z0-9]", "_", args.prompts[0][:15])
-    out_img_name = (
-        f"{prompt_slice}_{args.seed}_{dt.now().strftime('%y%m%d_%H%M%S')}"
-    )
-
-    if args.output_img_format == "jpg":
-        out_img_path = Path(generated_imgs_path, f"{out_img_name}.jpg")
-        output_img.save(out_img_path, quality=95, subsampling=0)
-    else:
-        out_img_path = Path(generated_imgs_path, f"{out_img_name}.png")
-        pngInfo = PngImagePlugin.PngInfo()
-
-        if args.write_metadata_to_png:
-            pngInfo.add_text(
-                "parameters",
-                f"{args.prompts[0]}\nNegative prompt: {args.negative_prompts[0]}\nSteps:{args.steps}, Sampler: {args.scheduler}, CFG scale: {args.guidance_scale}, Seed: {args.seed}, Size: {args.width}x{args.height}, Model: {args.hf_model_id}",
-            )
-
-        output_img.save(out_img_path, "PNG", pnginfo=pngInfo)
-
-        if args.output_img_format not in ["png", "jpg"]:
-            print(
-                f"[ERROR] Format {args.output_img_format} is not supported yet."
-                "Image saved as png instead. Supported formats: png / jpg"
-            )
-
-    new_entry = {
-        "VARIANT": args.hf_model_id,
-        "SCHEDULER": args.scheduler,
-        "PROMPT": args.prompts[0],
-        "NEG_PROMPT": args.negative_prompts[0],
-        "SEED": args.seed,
-        "CFG_SCALE": args.guidance_scale,
-        "PRECISION": args.precision,
-        "STEPS": args.steps,
-        "HEIGHT": args.height,
-        "WIDTH": args.width,
-        "MAX_LENGTH": args.max_length,
-        "OUTPUT": out_img_path,
-    }
-
-    with open(csv_path, "a") as csv_obj:
-        dictwriter_obj = DictWriter(csv_obj, fieldnames=list(new_entry.keys()))
-        dictwriter_obj.writerow(new_entry)
-        csv_obj.close()
-
-    if args.save_metadata_to_json:
-        del new_entry["OUTPUT"]
-        json_path = Path(generated_imgs_path, f"{out_img_name}.json")
-        with open(json_path, "w") as f:
-            json.dump(new_entry, f, indent=4)
-
-
-txt2img_obj = None
-config_obj = None
-schedulers = None
-
-
-# Exposed to UI.
-def txt2img_inf(
-    prompt: str,
-    negative_prompt: str,
-    height: int,
-    width: int,
-    steps: int,
-    guidance_scale: float,
-    seed: int,
-    batch_size: int,
-    scheduler: str,
-    custom_model: str,
-    hf_model_id: str,
-    precision: str,
-    device: str,
-    max_length: int,
-    save_metadata_to_json: bool,
-    save_metadata_to_png: bool,
-):
-    global txt2img_obj
-    global config_obj
-    global schedulers
-
-    args.prompts = [prompt]
-    args.negative_prompts = [negative_prompt]
-    args.guidance_scale = guidance_scale
-    args.seed = seed
-    args.steps = steps
-    args.scheduler = scheduler
-
-    # set ckpt_loc and hf_model_id.
-    types = (
-        ".ckpt",
-        ".safetensors",
-    )  # the tuple of file types
-    args.ckpt_loc = ""
-    args.hf_model_id = ""
-    if custom_model == "None":
-        if not hf_model_id:
-            return (
-                None,
-                "Please provide either custom model or huggingface model ID, both must not be empty",
-            )
-        args.hf_model_id = hf_model_id
-    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
-        args.ckpt_loc = custom_model
-    else:
-        args.hf_model_id = custom_model
-
-    args.save_metadata_to_json = save_metadata_to_json
-    args.write_metadata_to_png = save_metadata_to_png
-
-    dtype = torch.float32 if precision == "fp32" else torch.half
-    cpu_scheduling = not scheduler.startswith("Shark")
-    new_config_obj = Config(
-        args.hf_model_id,
-        args.ckpt_loc,
-        precision,
-        batch_size,
-        max_length,
-        height,
-        width,
-        device,
-    )
-    if config_obj != new_config_obj:
-        config_obj = new_config_obj
-        args.precision = precision
-        args.batch_size = batch_size
-        args.max_length = max_length
-        args.height = height
-        args.width = width
-        args.device = device.split("=>", 1)[1].strip()
-        args.use_tuned = True
-        args.import_mlir = False
-        set_init_device_flags()
-        model_id = (
-            args.hf_model_id
-            if args.hf_model_id
-            else "stabilityai/stable-diffusion-2-1-base"
-        )
-        schedulers = get_schedulers(model_id)
-        scheduler_obj = schedulers[scheduler]
-        txt2img_obj = Text2ImagePipeline.from_pretrained(
-            scheduler_obj,
-            args.import_mlir,
-            args.hf_model_id,
-            args.ckpt_loc,
-            args.precision,
-            args.max_length,
-            args.batch_size,
-            args.height,
-            args.width,
-            args.use_base_vae,
-            args.use_tuned,
-        )
-
-    if not txt2img_obj:
-        sys.exit("text to image pipeline must not return a null value")
-
-    txt2img_obj.scheduler = schedulers[scheduler]
-
-    start_time = time.time()
-    txt2img_obj.log = ""
-    generated_imgs = txt2img_obj.generate_images(
-        prompt,
-        negative_prompt,
-        batch_size,
-        height,
-        width,
-        steps,
-        guidance_scale,
-        seed,
-        args.max_length,
-        dtype,
-        args.use_base_vae,
-        cpu_scheduling,
-    )
-    total_time = time.time() - start_time
-    save_output_img(generated_imgs[0])
-    text_output = f"prompt={args.prompts}"
-    text_output += f"\nnegative prompt={args.negative_prompts}"
-    text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
-    text_output += f"\nscheduler={args.scheduler}, device={device}"
-    text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={args.seed}, size={args.height}x{args.width}"
-    text_output += (
-        f", batch size={args.batch_size}, max_length={args.max_length}"
-    )
-    text_output += txt2img_obj.log
-    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
-
-    return generated_imgs, text_output
-
-
-if __name__ == "__main__":
-    dtype = torch.float32 if args.precision == "fp32" else torch.half
-    cpu_scheduling = not args.scheduler.startswith("Shark")
-    set_init_device_flags()
-    schedulers = get_schedulers(args.hf_model_id)
-    scheduler_obj = schedulers[args.scheduler]
-
-    txt2img_obj = Text2ImagePipeline.from_pretrained(
-        scheduler_obj,
-        args.import_mlir,
-        args.hf_model_id,
-        args.ckpt_loc,
-        args.precision,
-        args.max_length,
-        args.batch_size,
-        args.height,
-        args.width,
-        args.use_base_vae,
-        args.use_tuned,
-    )
-
-    start_time = time.time()
-    generated_imgs = txt2img_obj.generate_images(
-        args.prompts,
-        args.negative_prompts,
-        args.batch_size,
-        args.height,
-        args.width,
-        args.steps,
-        args.guidance_scale,
-        args.seed,
-        args.max_length,
-        dtype,
-        args.use_base_vae,
-        cpu_scheduling,
-    )
-    total_time = time.time() - start_time
-    text_output = f"prompt={args.prompts}"
-    text_output += f"\nnegative prompt={args.negative_prompts}"
-    text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
-    text_output += f"\nscheduler={args.scheduler}, device={args.device}"
-    text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={args.seed}, size={args.height}x{args.width}"
-    text_output += (
-        f", batch size={args.batch_size}, max_length={args.max_length}"
-    )
-    text_output += txt2img_obj.log
-    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
-
-    save_output_img(generated_imgs[0])
-    print(text_output)
--- a/apps/stable_diffusion/src/init.py
+++ b/apps/stable_diffusion/src/init.py
@@ -1,8 +0,0 @@
-from apps.stable_diffusion.src.utils import (
-    args,
-    set_init_device_flags,
-    prompt_examples,
-    get_available_devices,
-)
-from apps.stable_diffusion.src.pipelines import Text2ImagePipeline
-from apps.stable_diffusion.src.schedulers import get_schedulers
--- a/apps/stable_diffusion/src/models/init.py
+++ b/apps/stable_diffusion/src/models/init.py
@@ -1,11 +0,0 @@
-from apps.stable_diffusion.src.models.model_wrappers import (
-    SharkifyStableDiffusionModel,
-)
-from apps.stable_diffusion.src.models.opt_params import (
-    get_vae,
-    get_unet,
-    get_clip,
-    get_tokenizer,
-    get_params,
-    get_variant_version,
-)
--- a/apps/stable_diffusion/src/pipelines/init.py
+++ b/apps/stable_diffusion/src/pipelines/init.py
@@ -1,3 +0,0 @@
-from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_txt2img import (
-    Text2ImagePipeline,
-)
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_img2img.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_img2img.py
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_txt2img.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_txt2img.py
@@ -1,134 +0,0 @@
-import torch
-from tqdm.auto import tqdm
-import numpy as np
-from random import randint
-from transformers import CLIPTokenizer
-from typing import Union
-from shark.shark_inference import SharkInference
-from diffusers import (
-    DDIMScheduler,
-    PNDMScheduler,
-    LMSDiscreteScheduler,
-    EulerDiscreteScheduler,
-    EulerAncestralDiscreteScheduler,
-    DPMSolverMultistepScheduler,
-)
-from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
-from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
-    StableDiffusionPipeline,
-)
-
-
-class Text2ImagePipeline(StableDiffusionPipeline):
-    def __init__(
-        self,
-        vae: SharkInference,
-        text_encoder: SharkInference,
-        tokenizer: CLIPTokenizer,
-        unet: SharkInference,
-        scheduler: Union[
-            DDIMScheduler,
-            PNDMScheduler,
-            LMSDiscreteScheduler,
-            EulerDiscreteScheduler,
-            EulerAncestralDiscreteScheduler,
-            DPMSolverMultistepScheduler,
-            SharkEulerDiscreteScheduler,
-        ],
-    ):
-        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
-
-    def prepare_latents(
-        self,
-        batch_size,
-        height,
-        width,
-        generator,
-        num_inference_steps,
-        dtype,
-    ):
-        latents = torch.randn(
-            (
-                batch_size,
-                4,
-                height // 8,
-                width // 8,
-            ),
-            generator=generator,
-            dtype=torch.float32,
-        ).to(dtype)
-
-        self.scheduler.set_timesteps(num_inference_steps)
-        self.scheduler.is_scale_input_called = True
-        latents = latents * self.scheduler.init_noise_sigma
-        return latents
-
-    def generate_images(
-        self,
-        prompts,
-        neg_prompts,
-        batch_size,
-        height,
-        width,
-        num_inference_steps,
-        guidance_scale,
-        seed,
-        max_length,
-        dtype,
-        use_base_vae,
-        cpu_scheduling,
-    ):
-        # prompts and negative prompts must be a list.
-        if isinstance(prompts, str):
-            prompts = [prompts]
-
-        if isinstance(neg_prompts, str):
-            neg_prompts = [neg_prompts]
-
-        prompts = prompts * batch_size
-        neg_prompts = neg_prompts * batch_size
-
-        # seed generator to create the inital latent noise. Also handle out of range seeds.
-        uint32_info = np.iinfo(np.uint32)
-        uint32_min, uint32_max = uint32_info.min, uint32_info.max
-        if seed < uint32_min or seed >= uint32_max:
-            seed = randint(uint32_min, uint32_max)
-        generator = torch.manual_seed(seed)
-
-        # Get initial latents
-        init_latents = self.prepare_latents(
-            batch_size=batch_size,
-            height=height,
-            width=width,
-            generator=generator,
-            num_inference_steps=num_inference_steps,
-            dtype=dtype,
-        )
-
-        # Get text embeddings from prompts
-        text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
-
-        # guidance scale as a float32 tensor.
-        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
-
-        # Get Image latents
-        latents = self.produce_img_latents(
-            latents=init_latents,
-            text_embeddings=text_embeddings,
-            guidance_scale=guidance_scale,
-            total_timesteps=self.scheduler.timesteps,
-            dtype=dtype,
-            cpu_scheduling=cpu_scheduling,
-        )
-
-        # Img latents -> PIL images
-        all_imgs = []
-        for i in tqdm(range(0, latents.shape[0], batch_size)):
-            imgs = self.decode_latents(
-                latents=latents[i : i + batch_size],
-                use_base_vae=use_base_vae,
-                cpu_scheduling=cpu_scheduling,
-            )
-            all_imgs.extend(imgs)
-
-        return all_imgs
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py
@@ -1,204 +0,0 @@
-import torch
-from transformers import CLIPTokenizer
-from PIL import Image
-from tqdm.auto import tqdm
-import time
-from typing import Union
-from diffusers import (
-    DDIMScheduler,
-    PNDMScheduler,
-    LMSDiscreteScheduler,
-    EulerDiscreteScheduler,
-    EulerAncestralDiscreteScheduler,
-    DPMSolverMultistepScheduler,
-)
-from shark.shark_inference import SharkInference
-from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
-from apps.stable_diffusion.src.models import (
-    SharkifyStableDiffusionModel,
-    get_vae,
-    get_clip,
-    get_unet,
-    get_tokenizer,
-)
-from apps.stable_diffusion.src.utils import (
-    start_profiling,
-    end_profiling,
-)
-
-
-class StableDiffusionPipeline:
-    def __init__(
-        self,
-        vae: SharkInference,
-        text_encoder: SharkInference,
-        tokenizer: CLIPTokenizer,
-        unet: SharkInference,
-        scheduler: Union[
-            DDIMScheduler,
-            PNDMScheduler,
-            LMSDiscreteScheduler,
-            EulerDiscreteScheduler,
-            EulerAncestralDiscreteScheduler,
-            DPMSolverMultistepScheduler,
-            SharkEulerDiscreteScheduler,
-        ],
-    ):
-        self.vae = vae
-        self.text_encoder = text_encoder
-        self.tokenizer = tokenizer
-        self.unet = unet
-        self.scheduler = scheduler
-        # TODO: Implement using logging python utility.
-        self.log = ""
-
-    def encode_prompts(self, prompts, neg_prompts, max_length):
-        # Tokenize text and get embeddings
-        text_input = self.tokenizer(
-            prompts,
-            padding="max_length",
-            max_length=max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-
-        # Get unconditional embeddings as well
-        uncond_input = self.tokenizer(
-            neg_prompts,
-            padding="max_length",
-            max_length=max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-
-        text_input = torch.cat([uncond_input.input_ids, text_input.input_ids])
-
-        clip_inf_start = time.time()
-        text_embeddings = self.text_encoder("forward", (text_input,))
-        clip_inf_time = (time.time() - clip_inf_start) * 1000
-        self.log += f"\nClip Inference time (ms) = {clip_inf_time:.3f}"
-
-        return text_embeddings
-
-    def decode_latents(self, latents, use_base_vae, cpu_scheduling):
-        if use_base_vae:
-            latents = 1 / 0.18215 * latents
-
-        latents_numpy = latents
-        if cpu_scheduling:
-            latents_numpy = latents.detach().numpy()
-
-        profile_device = start_profiling(file_path="vae.rdc")
-        vae_start = time.time()
-        images = self.vae("forward", (latents_numpy,))
-        vae_inf_time = (time.time() - vae_start) * 1000
-        end_profiling(profile_device)
-        self.log += f"\nVAE Inference time (ms): {vae_inf_time:.3f}"
-
-        if use_base_vae:
-            images = torch.from_numpy(images)
-            images = (images.detach().cpu() * 255.0).numpy()
-            images = images.round()
-
-        images = torch.from_numpy(images).to(torch.uint8).permute(0, 2, 3, 1)
-        pil_images = [Image.fromarray(image) for image in images.numpy()]
-        return pil_images
-
-    def produce_img_latents(
-        self,
-        latents,
-        text_embeddings,
-        guidance_scale,
-        total_timesteps,
-        dtype,
-        cpu_scheduling,
-        return_all_latents=False,
-    ):
-        step_time_sum = 0
-        latent_history = [latents]
-        text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
-        text_embeddings_numpy = text_embeddings.detach().numpy()
-        for i, t in tqdm(enumerate(total_timesteps)):
-            step_start_time = time.time()
-            timestep = torch.tensor([t]).to(dtype).detach().numpy()
-            latent_model_input = self.scheduler.scale_model_input(latents, t)
-            if cpu_scheduling:
-                latent_model_input = latent_model_input.detach().numpy()
-
-            # Profiling Unet.
-            profile_device = start_profiling(file_path="unet.rdc")
-            noise_pred = self.unet(
-                "forward",
-                (
-                    latent_model_input,
-                    timestep,
-                    text_embeddings_numpy,
-                    guidance_scale,
-                ),
-                send_to_host=False,
-            )
-            end_profiling(profile_device)
-
-            if cpu_scheduling:
-                noise_pred = torch.from_numpy(noise_pred.to_host())
-                latents = self.scheduler.step(
-                    noise_pred, t, latents
-                ).prev_sample
-            else:
-                latents = self.scheduler.step(noise_pred, t, latents)
-
-            latent_history.append(latents)
-            step_time = (time.time() - step_start_time) * 1000
-            #  self.log += (
-            #      f"\nstep = {i} | timestep = {t} | time = {step_time:.2f}ms"
-            #  )
-            step_time_sum += step_time
-
-        avg_step_time = step_time_sum / len(total_timesteps)
-        self.log += f"\nAverage step time: {avg_step_time}ms/it"
-
-        if not return_all_latents:
-            return latents
-        all_latents = torch.cat(latent_history, dim=0)
-        return all_latents
-
-    @classmethod
-    def from_pretrained(
-        cls,
-        scheduler: Union[
-            DDIMScheduler,
-            PNDMScheduler,
-            LMSDiscreteScheduler,
-            EulerDiscreteScheduler,
-            EulerAncestralDiscreteScheduler,
-            DPMSolverMultistepScheduler,
-            SharkEulerDiscreteScheduler,
-        ],
-        import_mlir: bool,
-        model_id: str,
-        ckpt_loc: str,
-        precision: str,
-        max_length: int,
-        batch_size: int,
-        height: int,
-        width: int,
-        use_base_vae: bool,
-        use_tuned: bool,
-    ):
-        if import_mlir:
-            mlir_import = SharkifyStableDiffusionModel(
-                model_id,
-                ckpt_loc,
-                precision,
-                max_len=max_length,
-                batch_size=batch_size,
-                height=height,
-                width=width,
-                use_base_vae=use_base_vae,
-                use_tuned=use_tuned,
-            )
-            clip, unet, vae = mlir_import()
-            return cls(vae, clip, get_tokenizer(), unet, scheduler)
-        return cls(
-            get_vae(), get_clip(), get_tokenizer(), get_unet(), scheduler
-        )
--- a/apps/stable_diffusion/src/schedulers/init.py
+++ b/apps/stable_diffusion/src/schedulers/init.py
@@ -1,4 +0,0 @@
-from apps.stable_diffusion.src.schedulers.sd_schedulers import get_schedulers
-from apps.stable_diffusion.src.schedulers.shark_eulerdiscrete import (
-    SharkEulerDiscreteScheduler,
-)
--- a/apps/stable_diffusion/src/schedulers/sd_schedulers.py
+++ b/apps/stable_diffusion/src/schedulers/sd_schedulers.py
@@ -1,51 +0,0 @@
-from diffusers import (
-    LMSDiscreteScheduler,
-    PNDMScheduler,
-    DDIMScheduler,
-    DPMSolverMultistepScheduler,
-    EulerDiscreteScheduler,
-    EulerAncestralDiscreteScheduler,
-)
-from apps.stable_diffusion.src.schedulers.shark_eulerdiscrete import (
-    SharkEulerDiscreteScheduler,
-)
-
-
-def get_schedulers(model_id):
-    schedulers = dict()
-    schedulers["PNDM"] = PNDMScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
-    schedulers["LMSDiscrete"] = LMSDiscreteScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
-    schedulers["DDIM"] = DDIMScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
-    schedulers[
-        "DPMSolverMultistep"
-    ] = DPMSolverMultistepScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
-    schedulers["EulerDiscrete"] = EulerDiscreteScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
-    schedulers[
-        "EulerAncestralDiscrete"
-    ] = EulerAncestralDiscreteScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
-    schedulers[
-        "SharkEulerDiscrete"
-    ] = SharkEulerDiscreteScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
-    schedulers["SharkEulerDiscrete"].compile()
-    return schedulers
--- a/apps/stable_diffusion/src/utils/init.py
+++ b/apps/stable_diffusion/src/utils/init.py
@@ -1,25 +0,0 @@
-from apps.stable_diffusion.src.utils.profiler import (
-    start_profiling,
-    end_profiling,
-)
-from apps.stable_diffusion.src.utils.resources import (
-    prompt_examples,
-    models_db,
-    base_models,
-    opt_flags,
-    resource_path,
-)
-from apps.stable_diffusion.src.utils.sd_annotation import sd_model_annotation
-from apps.stable_diffusion.src.utils.stable_args import args
-from apps.stable_diffusion.src.utils.utils import (
-    get_shark_model,
-    compile_through_fx,
-    set_iree_runtime_flags,
-    map_device_to_name_path,
-    set_init_device_flags,
-    get_available_devices,
-    get_opt_flags,
-    preprocessCKPT,
-    fetch_or_delete_vmfbs,
-    get_path_to_diffusers_checkpoint,
-)
--- a/apps/stable_diffusion/src/utils/profiler.py
+++ b/apps/stable_diffusion/src/utils/profiler.py
@@ -1,18 +0,0 @@
-from apps.stable_diffusion.src.utils.stable_args import args
-
-
-# Helper function to profile the vulkan device.
-def start_profiling(file_path="foo.rdc", profiling_mode="queue"):
-    if args.vulkan_debug_utils and "vulkan" in args.device:
-        import iree
-
-        print(f"Profiling and saving to {file_path}.")
-        vulkan_device = iree.runtime.get_device(args.device)
-        vulkan_device.begin_profiling(mode=profiling_mode, file_path=file_path)
-        return vulkan_device
-    return None
-
-
-def end_profiling(device):
-    if device:
-        return device.end_profiling()
--- a/apps/stable_diffusion/src/utils/resources/opt_flags.json
+++ b/apps/stable_diffusion/src/utils/resources/opt_flags.json
@@ -1,101 +0,0 @@
-{
-  "unet": {
-    "tuned": {
-      "fp16": {
-        "default_compilation_flags": []
-      },
-      "fp32": {
-        "default_compilation_flags": []
-      }
-    },
-    "untuned": {
-      "fp16": {
-        "default_compilation_flags": [
-          "--iree-flow-enable-padding-linalg-ops",
-          "--iree-flow-linalg-ops-padding-size=32"
-        ],
-        "specified_compilation_flags": {
-          "cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
-          "default_device": ["--iree-flow-enable-conv-img2col-transform"]
-        }
-      },
-      "fp32": {
-        "default_compilation_flags": [
-          "--iree-flow-enable-conv-nchw-to-nhwc-transform",
-          "--iree-flow-enable-padding-linalg-ops",
-          "--iree-flow-linalg-ops-padding-size=16"
-        ]
-      }
-    }
-  },
-  "vae": {
-    "tuned": {
-      "fp16": {
-        "default_compilation_flags": [],
-        "specified_compilation_flags": {
-          "cuda": [],
-          "default_device": ["--iree-flow-enable-padding-linalg-ops",
-                             "--iree-flow-linalg-ops-padding-size=32",
-                             "--iree-flow-enable-conv-img2col-transform"]
-        }
-      },
-      "fp32": {
-        "default_compilation_flags": [],
-        "specified_compilation_flags": {
-          "cuda": [],
-          "default_device": [
-            "--iree-flow-enable-padding-linalg-ops",
-            "--iree-flow-linalg-ops-padding-size=32",
-            "--iree-flow-enable-conv-img2col-transform"
-          ]
-        }
-      }
-    },
-    "untuned": {
-      "fp16": {
-        "default_compilation_flags": [
-          "--iree-flow-enable-padding-linalg-ops",
-          "--iree-flow-linalg-ops-padding-size=32",
-          "--iree-flow-enable-conv-img2col-transform"
-        ]
-      },
-      "fp32": {
-        "default_compilation_flags": [
-          "--iree-flow-enable-conv-nchw-to-nhwc-transform",
-          "--iree-flow-enable-padding-linalg-ops",
-          "--iree-flow-linalg-ops-padding-size=16"
-        ]
-      }
-    }
-  },
-  "clip": {
-    "tuned": {
-      "fp16": {
-        "default_compilation_flags": [
-          "--iree-flow-linalg-ops-padding-size=16",
-          "--iree-flow-enable-padding-linalg-ops"
-        ]
-      },
-      "fp32": {
-        "default_compilation_flags": [
-          "--iree-flow-linalg-ops-padding-size=16",
-          "--iree-flow-enable-padding-linalg-ops"
-        ]
-      }
-    },
-    "untuned": {
-      "fp16": {
-        "default_compilation_flags": [
-          "--iree-flow-linalg-ops-padding-size=16",
-          "--iree-flow-enable-padding-linalg-ops"
-        ]
-      },
-      "fp32": {
-        "default_compilation_flags": [
-          "--iree-flow-linalg-ops-padding-size=16",
-          "--iree-flow-enable-padding-linalg-ops"
-        ]
-      }
-    }
-  }
-}
--- a/apps/stable_diffusion/web/gradio/img2img_ui.py
+++ b/apps/stable_diffusion/web/gradio/img2img_ui.py
--- a/apps/stable_diffusion/web/gradio/txt2img_ui.py
+++ b/apps/stable_diffusion/web/gradio/txt2img_ui.py
--- a/apps/stable_diffusion/web/index.py
+++ b/apps/stable_diffusion/web/index.py
@@ -1,270 +0,0 @@
-import os
-import sys
-from pathlib import Path
-import glob
-
-if "AMD_ENABLE_LLPC" not in os.environ:
-    os.environ["AMD_ENABLE_LLPC"] = "1"
-
-if sys.platform == "darwin":
-    os.environ["DYLD_LIBRARY_PATH"] = "/usr/local/lib"
-
-
-def resource_path(relative_path):
-    """Get absolute path to resource, works for dev and for PyInstaller"""
-    base_path = getattr(
-        sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
-    )
-    return os.path.join(base_path, relative_path)
-
-
-import gradio as gr
-from PIL import Image
-from apps.stable_diffusion.src import (
-    prompt_examples,
-    args,
-    get_available_devices,
-)
-from apps.stable_diffusion.scripts import txt2img_inf
-
-nodlogo_loc = resource_path("logos/nod-logo.png")
-sdlogo_loc = resource_path("logos/sd-demo-logo.png")
-
-
-demo_css = resource_path("css/sd_dark_theme.css")
-
-
-with gr.Blocks(title="Stable Diffusion", css=demo_css) as shark_web:
-    with gr.Row(elem_id="ui_title"):
-        nod_logo = Image.open(nodlogo_loc)
-        logo2 = Image.open(sdlogo_loc)
-        with gr.Row():
-            with gr.Column(scale=1, elem_id="demo_title_outer"):
-                gr.Image(
-                    value=nod_logo,
-                    show_label=False,
-                    interactive=False,
-                    elem_id="top_logo",
-                ).style(width=150, height=100)
-            with gr.Column(scale=5, elem_id="demo_title_outer"):
-                gr.Image(
-                    value=logo2,
-                    show_label=False,
-                    interactive=False,
-                    elem_id="demo_title",
-                ).style(width=150, height=100)
-
-    with gr.Row(elem_id="ui_body"):
-        with gr.Row():
-            with gr.Column(scale=1, min_width=600):
-                with gr.Row():
-                    ckpt_path = (
-                        Path(args.ckpt_dir)
-                        if args.ckpt_dir
-                        else Path(Path.cwd(), "models")
-                    )
-                    ckpt_path.mkdir(parents=True, exist_ok=True)
-                    types = (
-                        "*.ckpt",
-                        "*.safetensors",
-                    )  # the tuple of file types
-                    ckpt_files = ["None"]
-                    for extn in types:
-                        files = glob.glob(os.path.join(ckpt_path, extn))
-                        ckpt_files.extend(files)
-                    custom_model = gr.Dropdown(
-                        label=f"Models (Custom Model path: {ckpt_path})",
-                        value="None",
-                        choices=ckpt_files
-                        + [
-                            "Linaqruf/anything-v3.0",
-                            "prompthero/openjourney",
-                            "wavymulder/Analog-Diffusion",
-                            "stabilityai/stable-diffusion-2-1",
-                            "stabilityai/stable-diffusion-2-1-base",
-                            "CompVis/stable-diffusion-v1-4",
-                        ],
-                    )
-                    hf_model_id = gr.Textbox(
-                        placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: SG161222/Realistic_Vision_V1.3",
-                        value="",
-                        label="HuggingFace Model ID",
-                    )
-
-                with gr.Group(elem_id="prompt_box_outer"):
-                    prompt = gr.Textbox(
-                        label="Prompt",
-                        value="cyberpunk forest by Salvador Dali",
-                        lines=1,
-                        elem_id="prompt_box",
-                    )
-                    negative_prompt = gr.Textbox(
-                        label="Negative Prompt",
-                        value="trees, green",
-                        lines=1,
-                        elem_id="prompt_box",
-                    )
-                with gr.Accordion(label="Advanced Options", open=False):
-                    with gr.Row():
-                        scheduler = gr.Dropdown(
-                            label="Scheduler",
-                            value="SharkEulerDiscrete",
-                            choices=[
-                                "DDIM",
-                                "PNDM",
-                                "LMSDiscrete",
-                                "DPMSolverMultistep",
-                                "EulerDiscrete",
-                                "EulerAncestralDiscrete",
-                                "SharkEulerDiscrete",
-                            ],
-                        )
-                        batch_size = gr.Slider(
-                            1, 4, value=1, step=1, label="Number of Images"
-                        )
-                    with gr.Row():
-                        height = gr.Slider(
-                            384, 786, value=512, step=8, label="Height"
-                        )
-                        width = gr.Slider(
-                            384, 786, value=512, step=8, label="Width"
-                        )
-                        precision = gr.Radio(
-                            label="Precision",
-                            value="fp16",
-                            choices=[
-                                "fp16",
-                                "fp32",
-                            ],
-                            visible=False,
-                        )
-                        max_length = gr.Radio(
-                            label="Max Length",
-                            value=64,
-                            choices=[
-                                64,
-                                77,
-                            ],
-                            visible=False,
-                        )
-                    with gr.Row():
-                        steps = gr.Slider(
-                            1, 100, value=50, step=1, label="Steps"
-                        )
-                        guidance_scale = gr.Slider(
-                            0,
-                            50,
-                            value=7.5,
-                            step=0.1,
-                            label="CFG Scale",
-                        )
-                    with gr.Row():
-                        save_metadata_to_png = gr.Checkbox(
-                            label="Save prompt information to PNG",
-                            value=True,
-                            interactive=True,
-                        )
-                        save_metadata_to_json = gr.Checkbox(
-                            label="Save prompt information to JSON file",
-                            value=False,
-                            interactive=True,
-                        )
-                with gr.Row():
-                    seed = gr.Number(value=-1, precision=0, label="Seed")
-                    available_devices = get_available_devices()
-                    device = gr.Dropdown(
-                        label="Device",
-                        value=available_devices[0],
-                        choices=available_devices,
-                    )
-                with gr.Row():
-                    random_seed = gr.Button("Randomize Seed")
-                    random_seed.click(
-                        None,
-                        inputs=[],
-                        outputs=[seed],
-                        _js="() => Math.floor(Math.random() * 4294967295)",
-                    )
-                    stable_diffusion = gr.Button("Generate Image")
-                with gr.Accordion(label="Prompt Examples!", open=False):
-                    ex = gr.Examples(
-                        examples=prompt_examples,
-                        inputs=prompt,
-                        cache_examples=False,
-                        elem_id="prompt_examples",
-                    )
-
-            with gr.Column(scale=1, min_width=600):
-                with gr.Group():
-                    gallery = gr.Gallery(
-                        label="Generated images",
-                        show_label=False,
-                        elem_id="gallery",
-                    ).style(grid=[2], height="auto")
-                    std_output = gr.Textbox(
-                        value="Nothing to show.",
-                        lines=4,
-                        show_label=False,
-                    )
-                output_dir = args.output_dir if args.output_dir else Path.cwd()
-                output_dir = Path(output_dir, "generated_imgs")
-                output_loc = gr.Textbox(
-                    label="Saving Images at",
-                    value=output_dir,
-                    interactive=False,
-                )
-
-        prompt.submit(
-            txt2img_inf,
-            inputs=[
-                prompt,
-                negative_prompt,
-                height,
-                width,
-                steps,
-                guidance_scale,
-                seed,
-                batch_size,
-                scheduler,
-                custom_model,
-                hf_model_id,
-                precision,
-                device,
-                max_length,
-                save_metadata_to_json,
-                save_metadata_to_png,
-            ],
-            outputs=[gallery, std_output],
-            show_progress=args.progress_bar,
-        )
-        stable_diffusion.click(
-            txt2img_inf,
-            inputs=[
-                prompt,
-                negative_prompt,
-                height,
-                width,
-                steps,
-                guidance_scale,
-                seed,
-                batch_size,
-                scheduler,
-                custom_model,
-                hf_model_id,
-                precision,
-                device,
-                max_length,
-                save_metadata_to_json,
-                save_metadata_to_png,
-            ],
-            outputs=[gallery, std_output],
-            show_progress=args.progress_bar,
-        )
-
-shark_web.queue()
-shark_web.launch(
-    share=args.share,
-    inbrowser=True,
-    server_name="0.0.0.0",
-    server_port=args.server_port,
-)
--- a/build_tools/image_comparison.py
+++ b/build_tools/image_comparison.py
@@ -1,5 +1,5 @@
 import argparse
-from PIL import Image
+import torchvision
 import numpy as np

 import requests
@@ -22,24 +22,20 @@ def get_image(url, local_filename):
    if res.status_code == 200:
        with open(local_filename, "wb") as f:
            shutil.copyfileobj(res.raw, f)
-
-
-def compare_images(new_filename, golden_filename):
-    new = np.array(Image.open(new_filename)) / 255.0
-    golden = np.array(Image.open(golden_filename)) / 255.0
-    diff = np.abs(new - golden)
-    mean = np.mean(diff)
-    if mean > 0.01:
-        subprocess.run(
-            ["gsutil", "cp", new_filename, "gs://shark_tank/testdata/builder/"]
-        )
-        raise SystemExit("new and golden not close")
-    else:
-        print("SUCCESS")
+    return torchvision.io.read_image(local_filename).numpy()


 if __name__ == "__main__":
    args = parser.parse_args()
+    new = torchvision.io.read_image(args.newfile).numpy() / 255.0
    tempfile_name = os.path.join(os.getcwd(), "golden.png")
-    get_image(args.golden_url, tempfile_name)
-    compare_images(args.newfile, tempfile_name)
+    golden = get_image(args.golden_url, tempfile_name) / 255.0
+    diff = np.abs(new - golden)
+    mean = np.mean(diff)
+    if not mean < 0.2:
+        subprocess.run(
+            ["gsutil", "cp", args.newfile, "gs://shark_tank/testdata/builder/"]
+        )
+        raise SystemExit("new and golden not close")
+    else:
+        print("SUCCESS")
--- a/build_tools/stable_diffusion_testing.py
+++ b/build_tools/stable_diffusion_testing.py
@@ -1,77 +0,0 @@
-import os
-import subprocess
-from apps.stable_diffusion.src.utils.resources import (
-    get_json_file,
-)
-from shark.shark_downloader import download_public_file
-from image_comparison import compare_images
-import argparse
-from glob import glob
-import shutil
-
-model_config_dicts = get_json_file(
-    os.path.join(
-        os.getcwd(),
-        "apps/stable_diffusion/src/utils/resources/model_config.json",
-    )
-)
-
-
-def test_loop(device="vulkan", beta=False, extra_flags=[]):
-    # Get golden values from tank
-    shutil.rmtree("./test_images", ignore_errors=True)
-    os.mkdir("./test_images")
-    os.mkdir("./test_images/golden")
-    hf_model_names = model_config_dicts[0].values()
-    tuned_options = ["--no-use_tuned"]  #'use_tuned']
-    devices = ["vulkan"]
-    if beta:
-        extra_flags.append("--beta_models=True")
-    for model_name in hf_model_names:
-        for use_tune in tuned_options:
-            command = [
-                "python",
-                "apps/stable_diffusion/scripts/txt2img.py",
-                "--device=" + device,
-                "--output_dir=./test_images/" + model_name,
-                "--hf_model_id=" + model_name,
-                use_tune,
-            ]
-            command += extra_flags
-            generated_image = not subprocess.call(
-                command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
-            )
-            if generated_image:
-                os.makedirs(
-                    "./test_images/golden/" + model_name, exist_ok=True
-                )
-                download_public_file(
-                    "gs://shark_tank/testdata/golden/" + model_name,
-                    "./test_images/golden/" + model_name,
-                )
-                comparison = [
-                    "python",
-                    "build_tools/image_comparison.py",
-                    "--golden_url=gs://shark_tank/testdata/golden/"
-                    + model_name
-                    + "/*.png",
-                    "--newfile=./test_images/" + model_name + "/*.png",
-                ]
-                test_file = glob("./test_images/" + model_name + "/*.png")[0]
-                golden_path = "./test_images/golden/" + model_name + "/*.png"
-                golden_file = glob(golden_path)[0]
-                compare_images(test_file, golden_file)
-
-
-parser = argparse.ArgumentParser()
-
-parser.add_argument("-d", "--device", default="vulkan")
-parser.add_argument(
-    "-b", "--beta", action=argparse.BooleanOptionalAction, default=False
-)
-
-
-if __name__ == "__main__":
-    args = parser.parse_args()
-    print(args)
-    test_loop(args.device, args.beta, [])
--- a/dataset/annotation_tool.py
+++ b/dataset/annotation_tool.py
@@ -16,6 +16,7 @@ nodlogo_loc = shark_root.joinpath(


 with gr.Blocks(title="Dataset Annotation Tool", css=demo_css) as shark_web:
+
    with gr.Row(elem_id="ui_title"):
        nod_logo = Image.open(nodlogo_loc)
        with gr.Column(scale=1, elem_id="demo_title_outer"):
--- a/generate_sharktank.py
+++ b/generate_sharktank.py
@@ -18,10 +18,10 @@ import subprocess as sp
 import hashlib
 import numpy as np
 from pathlib import Path
-from apps.stable_diffusion.src.models import (
+from shark.examples.shark_inference.stable_diffusion import (
    model_wrappers as mw,
 )
-from apps.stable_diffusion.src.utils.stable_args import (
+from shark.examples.shark_inference.stable_diffusion.stable_args import (
    args,
 )

@@ -58,6 +58,7 @@ def save_torch_model(torch_model_list):
            model = None
            input = None
            if model_type == "stable_diffusion":
+
                args.use_tuned = False
                args.import_mlir = True
                args.use_tuned = False
@@ -280,3 +281,8 @@ if __name__ == "__main__":

    if args.tflite_model_csv:
        save_tflite_model(args.tflite_model_csv)
+
+    if args.upload:
+        git_hash = sp.getoutput("git log -1 --format='%h'") + "/"
+        print("uploading files to gs://shark_tank/" + git_hash)
+        os.system(f"gsutil cp -r {WORKDIR}* gs://shark_tank/" + git_hash)
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,8 +21,6 @@ scipy
 ftfy
 gradio
 altair
-omegaconf
-safetensors

 # Keep PyInstaller at the end. Sometimes Windows Defender flags it but most folks can continue even if it errors
 pefile
--- a/setup.py
+++ b/setup.py
@@ -2,12 +2,11 @@ from setuptools import find_packages
 from setuptools import setup

 import os
-import glob

 with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

-PACKAGE_VERSION = os.environ.get("SHARK_PACKAGE_VERSION") or "0.0.5"
+PACKAGE_VERSION = os.environ.get("SHARK_PACKAGE_VERSION") or "0.0.4"
 backend_deps = []
 if "NO_BACKEND" in os.environ.keys():
    backend_deps = [
@@ -35,7 +34,6 @@ setup(
    ],
    packages=find_packages(exclude=("examples")),
    python_requires=">=3.9",
-    data_files=glob.glob("apps/stable_diffusion/resources/**"),
    install_requires=[
        "numpy",
        "PyYAML",
--- a/shark/examples/shark_inference/ESRGAN/esrgan.py
+++ b/shark/examples/shark_inference/ESRGAN/esrgan.py
@@ -128,6 +128,7 @@ def load_mlir(mlir_loc):


 def compile_through_fx(model, inputs, mlir_loc=None):
+
    module = load_mlir(mlir_loc)
    if module == None:
        fx_g = make_fx(
--- a/shark/examples/shark_inference/simple_dlrm.py
+++ b/shark/examples/shark_inference/simple_dlrm.py
@@ -151,6 +151,7 @@ class DLRM_Net(nn.Module):
            and (ln_top is not None)
            and (arch_interaction_op is not None)
        ):
+
            # save arguments
            self.output_d = 0
            self.arch_interaction_op = arch_interaction_op
@@ -215,6 +216,7 @@ class DLRM_Net(nn.Module):
        return ly

    def interact_features(self, x, ly):
+
        if self.arch_interaction_op == "dot":
            # concatenate dense and sparse features
            (batch_size, d) = x.shape
--- a/shark/examples/shark_inference/sparse_arch.py
+++ b/shark/examples/shark_inference/sparse_arch.py
@@ -99,6 +99,7 @@ class SparseArchShark(nn.Module):
        )

    def forward(self, *batched_inputs):
+
        concatenated_list = []
        input_enum, embedding_enum = 0, 0

@@ -120,6 +121,7 @@ class SparseArchShark(nn.Module):


 def test_sparse_arch() -> None:
+
    D = 3
    eb1_config = EmbeddingBagConfig(
        name="t1",
@@ -209,6 +211,7 @@ class DLRMShark(nn.Module):
    def forward(
        self, dense_features: torch.Tensor, *sparse_features
    ) -> torch.Tensor:
+
        embedded_dense = self.dense_arch(dense_features)
        embedded_sparse = self.sparse_arch(*sparse_features)
        concatenated_dense = self.inter_arch(
--- a/shark/examples/shark_inference/stable_diff.py
+++ b/shark/examples/shark_inference/stable_diff.py
@@ -0,0 +1,272 @@
+from transformers import CLIPTextModel, CLIPTokenizer
+from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
+import torch
+from PIL import Image
+from diffusers import LMSDiscreteScheduler
+from tqdm.auto import tqdm
+from shark.shark_inference import SharkInference
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch._decomp import get_decompositions
+import torch_mlir
+import tempfile
+import numpy as np
+
+# pip install diffusers
+# pip install scipy
+
+############### Parsing args #####################
+import argparse
+
+p = argparse.ArgumentParser(
+    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+
+p.add_argument(
+    "--prompt",
+    type=str,
+    default="a photograph of an astronaut riding a horse",
+    help="the text prompt to use",
+)
+p.add_argument("--device", type=str, default="cpu", help="the device to use")
+p.add_argument("--steps", type=int, default=10, help="the device to use")
+p.add_argument("--mlir_loc", type=str, default=None, help="the device to use")
+p.add_argument("--vae_loc", type=str, default=None, help="the device to use")
+args = p.parse_args()
+
+#####################################################
+
+
+def load_mlir(mlir_loc):
+    import os
+
+    if mlir_loc == None:
+        return None
+    print(f"Trying to load the model from {mlir_loc}.")
+    with open(os.path.join(mlir_loc)) as f:
+        mlir_module = f.read()
+    return mlir_module
+
+
+def compile_through_fx(model, inputs, mlir_loc=None, extra_args=[]):
+
+    module = load_mlir(mlir_loc)
+    if mlir_loc == None:
+        fx_g = make_fx(
+            model,
+            decomposition_table=get_decompositions(
+                [
+                    torch.ops.aten.embedding_dense_backward,
+                    torch.ops.aten.native_layer_norm_backward,
+                    torch.ops.aten.slice_backward,
+                    torch.ops.aten.select_backward,
+                    torch.ops.aten.norm.ScalarOpt_dim,
+                    torch.ops.aten.native_group_norm,
+                    torch.ops.aten.upsample_bilinear2d.vec,
+                    torch.ops.aten.split.Tensor,
+                    torch.ops.aten.split_with_sizes,
+                ]
+            ),
+        )(*inputs)
+
+        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
+        fx_g.recompile()
+
+        def strip_overloads(gm):
+            """
+            Modifies the target of graph nodes in :attr:`gm` to strip overloads.
+            Args:
+                gm(fx.GraphModule): The input Fx graph module to be modified
+            """
+            for node in gm.graph.nodes:
+                if isinstance(node.target, torch._ops.OpOverload):
+                    node.target = node.target.overloadpacket
+            gm.recompile()
+
+        strip_overloads(fx_g)
+
+        ts_g = torch.jit.script(fx_g)
+
+        module = torch_mlir.compile(
+            ts_g,
+            inputs,
+            torch_mlir.OutputType.LINALG_ON_TENSORS,
+            use_tracing=False,
+            verbose=False,
+        )
+
+    mlir_model = module
+    func_name = "forward"
+
+    shark_module = SharkInference(
+        mlir_model,
+        func_name,
+        device=args.device,
+        mlir_dialect="tm_tensor",
+    )
+    shark_module.compile(extra_args)
+
+    return shark_module
+
+
+if __name__ == "__main__":
+
+    YOUR_TOKEN = "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"
+
+    # 1. Load the autoencoder model which will be used to decode the latents into image space.
+    vae = AutoencoderKL.from_pretrained(
+        "CompVis/stable-diffusion-v1-4",
+        subfolder="vae",
+        use_auth_token=YOUR_TOKEN,
+    )
+
+    # 2. Load the tokenizer and text encoder to tokenize and encode the text.
+    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+    text_encoder = CLIPTextModel.from_pretrained(
+        "openai/clip-vit-large-patch14"
+    )
+
+    class VaeModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.vae = AutoencoderKL.from_pretrained(
+                "CompVis/stable-diffusion-v1-4",
+                subfolder="vae",
+                use_auth_token=YOUR_TOKEN,
+            )
+
+        def forward(self, input):
+            return self.vae.decode(input, return_dict=False)[0]
+
+    vae = VaeModel()
+    vae_input = torch.rand(1, 4, 64, 64)
+    shark_vae = compile_through_fx(vae, (vae_input,), args.vae_loc)
+
+    # Wrap the unet model to return tuples.
+    class UnetModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.unet = UNet2DConditionModel.from_pretrained(
+                "CompVis/stable-diffusion-v1-4",
+                subfolder="unet",
+                use_auth_token=YOUR_TOKEN,
+            )
+            self.in_channels = self.unet.in_channels
+            self.train(False)
+
+        def forward(self, x, y, z):
+            return self.unet.forward(x, y, z, return_dict=False)[0]
+
+    # 3. The UNet model for generating the latents.
+    unet = UnetModel()
+    latent_model_input = torch.rand([2, 4, 64, 64])
+    text_embeddings = torch.rand([2, 77, 768])
+    shark_unet = compile_through_fx(
+        unet,
+        (latent_model_input, torch.tensor([1.0]), text_embeddings),
+        args.mlir_loc,
+        ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
+    )
+
+    # torch.jit.script(unet)
+
+    scheduler = LMSDiscreteScheduler(
+        beta_start=0.00085,
+        beta_end=0.012,
+        beta_schedule="scaled_linear",
+        num_train_timesteps=1000,
+    )
+
+    prompt = [args.prompt]
+
+    height = 512  # default height of Stable Diffusion
+    width = 512  # default width of Stable Diffusion
+
+    num_inference_steps = args.steps  # Number of denoising steps
+
+    guidance_scale = 7.5  # Scale for classifier-free guidance
+
+    generator = torch.manual_seed(
+        42
+    )  # Seed generator to create the inital latent noise
+
+    batch_size = len(prompt)
+
+    text_input = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=tokenizer.model_max_length,
+        truncation=True,
+        return_tensors="pt",
+    )
+
+    text_embeddings = text_encoder(text_input.input_ids)[0]
+
+    max_length = text_input.input_ids.shape[-1]
+    uncond_input = tokenizer(
+        [""] * batch_size,
+        padding="max_length",
+        max_length=max_length,
+        return_tensors="pt",
+    )
+    uncond_embeddings = text_encoder(uncond_input.input_ids)[0]
+
+    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+    latents = torch.randn(
+        (batch_size, unet.in_channels, height // 8, width // 8),
+        generator=generator,
+    )
+    # latents = latents.to(torch_device)
+
+    scheduler.set_timesteps(num_inference_steps)
+
+    latents = latents * scheduler.sigmas[0]
+    # print(latents, latents.shape)
+
+    for i, t in tqdm(enumerate(scheduler.timesteps)):
+
+        print(f"i = {i} t = {t}")
+        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+        latent_model_input = torch.cat([latents] * 2)
+        sigma = scheduler.sigmas[i]
+        latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
+
+        # predict the noise residual
+
+        # with torch.no_grad():
+        # noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)
+
+        latent_model_input_numpy = latent_model_input.detach().numpy()
+        text_embeddings_numpy = text_embeddings.detach().numpy()
+
+        noise_pred = shark_unet.forward(
+            (
+                latent_model_input_numpy,
+                np.array([t]).astype(np.float32),
+                text_embeddings_numpy,
+            )
+        )
+        noise_pred = torch.from_numpy(noise_pred)
+
+        # perform guidance
+        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+        noise_pred = noise_pred_uncond + guidance_scale * (
+            noise_pred_text - noise_pred_uncond
+        )
+
+        # compute the previous noisy sample x_t -> x_t-1
+        latents = scheduler.step(noise_pred, i, latents)["prev_sample"]
+
+    # print("Latents shape : ", latents.shape)
+
+    # scale and decode the image latents with vae
+    latents = 1 / 0.18215 * latents
+    latents_numpy = latents.detach().numpy()
+    image = shark_vae.forward((latents_numpy,))
+    image = torch.from_numpy(image)
+
+    image = (image / 2 + 0.5).clamp(0, 1)
+    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
+    images = (image * 255).round().astype("uint8")
+    pil_images = [Image.fromarray(image) for image in images]
+    pil_images[0].save("astro.jpg")
--- a/shark/examples/shark_inference/stable_diff_f16.py
+++ b/shark/examples/shark_inference/stable_diff_f16.py
@@ -0,0 +1,280 @@
+from transformers import CLIPTextModel, CLIPTokenizer
+from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
+import torch
+from PIL import Image
+from diffusers import LMSDiscreteScheduler
+from tqdm.auto import tqdm
+from shark.shark_inference import SharkInference
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch._decomp import get_decompositions
+import torch_mlir
+import tempfile
+import numpy as np
+
+# pip install diffusers
+# pip install scipy
+
+############### Parsing args #####################
+import argparse
+
+p = argparse.ArgumentParser(
+    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+
+p.add_argument(
+    "--prompt",
+    type=str,
+    default="a photograph of an astronaut riding a horse",
+    help="the text prompt to use",
+)
+p.add_argument("--device", type=str, default="cpu", help="the device to use")
+p.add_argument("--steps", type=int, default=50, help="the device to use")
+p.add_argument("--mlir_loc", type=str, default=None, help="the device to use")
+p.add_argument("--vae_loc", type=str, default=None, help="the device to use")
+args = p.parse_args()
+
+#####################################################
+
+
+def fp16_unet():
+    from shark.shark_downloader import download_model
+
+    mlir_model, func_name, inputs, golden_out = download_model(
+        "stable_diff_f16_18_OCT",
+        tank_url="gs://shark_tank/prashant_nod",
+        frontend="torch",
+    )
+    shark_module = SharkInference(
+        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
+    )
+    shark_module.compile()
+    return shark_module
+
+
+def load_mlir(mlir_loc):
+    import os
+
+    if mlir_loc == None:
+        return None
+    print(f"Trying to load the model from {mlir_loc}.")
+    with open(os.path.join(mlir_loc)) as f:
+        mlir_module = f.read()
+    return mlir_module
+
+
+def compile_through_fx(model, inputs, mlir_loc=None):
+
+    module = load_mlir(mlir_loc)
+    if mlir_loc == None:
+        fx_g = make_fx(
+            model,
+            decomposition_table=get_decompositions(
+                [
+                    torch.ops.aten.embedding_dense_backward,
+                    torch.ops.aten.native_layer_norm_backward,
+                    torch.ops.aten.slice_backward,
+                    torch.ops.aten.select_backward,
+                    torch.ops.aten.norm.ScalarOpt_dim,
+                    torch.ops.aten.native_group_norm,
+                    torch.ops.aten.upsample_bilinear2d.vec,
+                    torch.ops.aten.split.Tensor,
+                    torch.ops.aten.split_with_sizes,
+                ]
+            ),
+        )(*inputs)
+
+        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
+        fx_g.recompile()
+
+        def strip_overloads(gm):
+            """
+            Modifies the target of graph nodes in :attr:`gm` to strip overloads.
+            Args:
+                gm(fx.GraphModule): The input Fx graph module to be modified
+            """
+            for node in gm.graph.nodes:
+                if isinstance(node.target, torch._ops.OpOverload):
+                    node.target = node.target.overloadpacket
+            gm.recompile()
+
+        strip_overloads(fx_g)
+
+        ts_g = torch.jit.script(fx_g)
+
+        module = torch_mlir.compile(
+            ts_g,
+            inputs,
+            torch_mlir.OutputType.LINALG_ON_TENSORS,
+            use_tracing=False,
+            verbose=False,
+        )
+
+    mlir_model = module
+    func_name = "forward"
+
+    shark_module = SharkInference(
+        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
+    )
+    shark_module.compile()
+
+    return shark_module
+
+
+if __name__ == "__main__":
+
+    YOUR_TOKEN = "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"
+
+    # 1. Load the autoencoder model which will be used to decode the latents into image space.
+    vae = AutoencoderKL.from_pretrained(
+        "CompVis/stable-diffusion-v1-4",
+        subfolder="vae",
+        use_auth_token=YOUR_TOKEN,
+    )
+
+    # 2. Load the tokenizer and text encoder to tokenize and encode the text.
+    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+    text_encoder = CLIPTextModel.from_pretrained(
+        "openai/clip-vit-large-patch14"
+    )
+
+    class VaeModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.vae = AutoencoderKL.from_pretrained(
+                "CompVis/stable-diffusion-v1-4",
+                subfolder="vae",
+                use_auth_token=YOUR_TOKEN,
+            )
+
+        def forward(self, input):
+            return self.vae.decode(input, return_dict=False)[0]
+
+    vae = VaeModel()
+    vae_input = torch.rand(1, 4, 64, 64)
+    shark_vae = compile_through_fx(vae, (vae_input,), args.vae_loc)
+
+    # Wrap the unet model to return tuples.
+    class UnetModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.unet = UNet2DConditionModel.from_pretrained(
+                "CompVis/stable-diffusion-v1-4",
+                subfolder="unet",
+                use_auth_token=YOUR_TOKEN,
+            )
+            self.in_channels = self.unet.in_channels
+            self.train(False)
+
+    def forward(self, x, y, z):
+        return self.unet.forward(x, y, z, return_dict=False)[0]
+
+    # # 3. The UNet model for generating the latents.
+    unet = UnetModel()
+
+    shark_unet = fp16_unet()
+
+    scheduler = LMSDiscreteScheduler(
+        beta_start=0.00085,
+        beta_end=0.012,
+        beta_schedule="scaled_linear",
+        num_train_timesteps=1000,
+    )
+
+    prompt = [args.prompt]
+
+    height = 512  # default height of Stable Diffusion
+    width = 512  # default width of Stable Diffusion
+
+    num_inference_steps = args.steps  # Number of denoising steps
+
+    guidance_scale = 7.5  # Scale for classifier-free guidance
+
+    generator = torch.manual_seed(
+        42
+    )  # Seed generator to create the inital latent noise
+
+    batch_size = len(prompt)
+
+    text_input = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=tokenizer.model_max_length,
+        truncation=True,
+        return_tensors="pt",
+    )
+
+    text_embeddings = text_encoder(text_input.input_ids)[0]
+
+    max_length = text_input.input_ids.shape[-1]
+    uncond_input = tokenizer(
+        [""] * batch_size,
+        padding="max_length",
+        max_length=max_length,
+        return_tensors="pt",
+    )
+    uncond_embeddings = text_encoder(uncond_input.input_ids)[0]
+
+    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+    latents = torch.randn(
+        (batch_size, unet.in_channels, height // 8, width // 8),
+        generator=generator,
+    )
+    # latents = latents.to(torch_device)
+
+    scheduler.set_timesteps(num_inference_steps)
+
+    latents = latents * scheduler.sigmas[0]
+    # print(latents, latents.shape)
+
+    for i, t in tqdm(enumerate(scheduler.timesteps)):
+
+        print(f"i = {i} t = {t}")
+        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+        latent_model_input = torch.cat([latents] * 2)
+        sigma = scheduler.sigmas[i]
+        latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
+
+        # predict the noise residual
+
+        # with torch.no_grad():
+        # noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)
+
+        latent_model_input_numpy = (
+            latent_model_input.detach().numpy().astype(np.half)
+        )
+        text_embeddings_numpy = (
+            text_embeddings.detach().numpy().astype(np.half)
+        )
+
+        noise_pred = shark_unet.forward(
+            (
+                latent_model_input_numpy,
+                np.array([t]).astype(np.half),
+                text_embeddings_numpy,
+            )
+        )
+        noise_pred = torch.from_numpy(noise_pred).to(torch.float32)
+
+        # perform guidance
+        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+        noise_pred = noise_pred_uncond + guidance_scale * (
+            noise_pred_text - noise_pred_uncond
+        )
+
+        # compute the previous noisy sample x_t -> x_t-1
+        latents = scheduler.step(noise_pred, i, latents)["prev_sample"]
+
+    # print("Latents shape : ", latents.shape)
+
+    # scale and decode the image latents with vae
+    latents = 1 / 0.18215 * latents
+    latents_numpy = latents.detach().numpy()
+    image = shark_vae.forward((latents_numpy,))
+    image = torch.from_numpy(image)
+
+    image = (image / 2 + 0.5).clamp(0, 1)
+    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
+    images = (image * 255).round().astype("uint8")
+    pil_images = [Image.fromarray(image) for image in images]
+    pil_images[0].save("astro.jpg")
--- a/shark/examples/shark_inference/stable_diff_tf.py
+++ b/shark/examples/shark_inference/stable_diff_tf.py
@@ -0,0 +1,313 @@
+import math
+import numpy as np
+import tensorflow as tf
+from tensorflow import keras
+from keras_cv.models.generative.stable_diffusion.clip_tokenizer import (
+    SimpleTokenizer,
+)
+from keras_cv.models.generative.stable_diffusion.constants import (
+    _ALPHAS_CUMPROD,
+)
+from keras_cv.models.generative.stable_diffusion.constants import (
+    _UNCONDITIONAL_TOKENS,
+)
+from keras_cv.models.generative.stable_diffusion.decoder import Decoder
+from keras_cv.models.generative.stable_diffusion.text_encoder import (
+    TextEncoder,
+)
+
+from shark.shark_inference import SharkInference
+from shark.shark_downloader import download_model
+from PIL import Image
+
+# pip install "git+https://github.com/keras-team/keras-cv.git"
+# pip install tensorflow_dataset
+
+############### Parsing args #####################
+import argparse
+
+p = argparse.ArgumentParser(
+    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+
+p.add_argument(
+    "--prompt",
+    type=str,
+    default="a photograph of an astronaut riding a horse",
+    help="the text prompt to use",
+)
+p.add_argument("--device", type=str, default="cpu", help="the device to use")
+p.add_argument(
+    "--steps", type=int, default=10, help="the number of steps to use"
+)
+p.add_argument(
+    "--save_path",
+    type=str,
+    default=None,
+    help="the file to save the resulting image to. (default to <input prompt>.jpg)",
+)
+args = p.parse_args()
+
+#####################################################
+
+MAX_PROMPT_LENGTH = 77
+
+
+class SharkStableDiffusion:
+    """Shark implementation of Stable Diffusion based on model from keras_cv.
+    Stable Diffusion is a powerful image generation model that can be used,
+    among other things, to generate pictures according to a short text description
+    (called a "prompt").
+    Arguments:
+        device: Device to use with SHARK. Default: cpu
+        jit_compile: Whether to compile the underlying models to XLA.
+            This can lead to a significant speedup on some systems. Default: False.
+    References:
+    - [About Stable Diffusion](https://stability.ai/blog/stable-diffusion-announcement)
+    - [Original implementation](https://github.com/CompVis/stable-diffusion)
+    """
+
+    def __init__(self, device="cpu", jit_compile=True):
+        self.img_height = 512
+        self.img_width = 512
+        self.tokenizer = SimpleTokenizer()
+
+        # Create models
+        self.text_encoder = TextEncoder(MAX_PROMPT_LENGTH)
+
+        mlir_model, func_name, inputs, golden_out = download_model(
+            "stable_diff", tank_url="gs://shark_tank/quinn", frontend="tf"
+        )
+        shark_module = SharkInference(
+            mlir_model, func_name, device=device, mlir_dialect="mhlo"
+        )
+        shark_module.compile()
+        self.diffusion_model = shark_module
+        self.decoder = Decoder(self.img_height, self.img_width)
+        if jit_compile:
+            self.text_encoder.compile(jit_compile=True)
+            self.decoder.compile(jit_compile=True)
+
+        print(
+            "By using this model checkpoint, you acknowledge that its usage is "
+            "subject to the terms of the CreativeML Open RAIL-M license at "
+            "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/LICENSE"
+        )
+        # Load weights
+        text_encoder_weights_fpath = keras.utils.get_file(
+            origin="https://huggingface.co/fchollet/stable-diffusion/resolve/main/kcv_encoder.h5",
+            file_hash="4789e63e07c0e54d6a34a29b45ce81ece27060c499a709d556c7755b42bb0dc4",
+        )
+        decoder_weights_fpath = keras.utils.get_file(
+            origin="https://huggingface.co/fchollet/stable-diffusion/resolve/main/kcv_decoder.h5",
+            file_hash="ad350a65cc8bc4a80c8103367e039a3329b4231c2469a1093869a345f55b1962",
+        )
+        self.text_encoder.load_weights(text_encoder_weights_fpath)
+        self.decoder.load_weights(decoder_weights_fpath)
+
+    def text_to_image(
+        self,
+        prompt,
+        batch_size=1,
+        num_steps=25,
+        unconditional_guidance_scale=7.5,
+        seed=None,
+    ):
+        encoded_text = self.encode_text(prompt)
+
+        return self.generate_image(
+            encoded_text,
+            batch_size=batch_size,
+            num_steps=num_steps,
+            unconditional_guidance_scale=unconditional_guidance_scale,
+            seed=seed,
+        )
+
+    def encode_text(self, prompt):
+        """Encodes a prompt into a latent text encoding.
+        The encoding produced by this method should be used as the
+        `encoded_text` parameter of `StableDiffusion.generate_image`. Encoding
+        text separately from generating an image can be used to arbitrarily
+        modify the text encoding priot to image generation, e.g. for walking
+        between two prompts.
+        Args:
+            prompt: a string to encode, must be 77 tokens or shorter.
+        Example:
+        ```python
+        from keras_cv.models import StableDiffusion
+        model = StableDiffusion(img_height=512, img_width=512, jit_compile=True)
+        encoded_text  = model.encode_text("Tacos at dawn")
+        img = model.generate_image(encoded_text)
+        ```
+        """
+        # Tokenize prompt (i.e. starting context)
+        inputs = self.tokenizer.encode(prompt)
+        if len(inputs) > MAX_PROMPT_LENGTH:
+            raise ValueError(
+                f"Prompt is too long (should be <= {MAX_PROMPT_LENGTH} tokens)"
+            )
+        phrase = inputs + [49407] * (MAX_PROMPT_LENGTH - len(inputs))
+        phrase = tf.convert_to_tensor([phrase], dtype=tf.int32)
+
+        context = self.text_encoder.predict_on_batch(
+            [phrase, self._get_pos_ids()]
+        )
+
+        return context
+
+    def generate_image(
+        self,
+        encoded_text,
+        batch_size=1,
+        num_steps=25,
+        unconditional_guidance_scale=7.5,
+        diffusion_noise=None,
+        seed=None,
+    ):
+        """Generates an image based on encoded text.
+        The encoding passed to this method should be derived from
+        `StableDiffusion.encode_text`.
+        Args:
+            encoded_text: Tensor of shape (`batch_size`, 77, 768), or a Tensor
+            of shape (77, 768). When the batch axis is omitted, the same encoded
+            text will be used to produce every generated image.
+            batch_size: number of images to generate. Default: 1.
+            num_steps: number of diffusion steps (controls image quality).
+                Default: 25.
+            unconditional_guidance_scale: float controling how closely the image
+                should adhere to the prompt. Larger values result in more
+                closely adhering to the prompt, but will make the image noisier.
+                Default: 7.5.
+            diffusion_noise: Tensor of shape (`batch_size`, img_height // 8,
+                img_width // 8, 4), or a Tensor of shape (img_height // 8,
+                img_width // 8, 4). Optional custom noise to seed the diffusion
+                process. When the batch axis is omitted, the same noise will be
+                used to seed diffusion for every generated image.
+            seed: integer which is used to seed the random generation of
+                diffusion noise, only to be specified if `diffusion_noise` is
+                None.
+        Example:
+        ```python
+        from keras_cv.models import StableDiffusion
+        batch_size = 8
+        model = StableDiffusion(img_height=512, img_width=512, jit_compile=True)
+        e_tacos = model.encode_text("Tacos at dawn")
+        e_watermelons = model.encode_text("Watermelons at dusk")
+        e_interpolated = tf.linspace(e_tacos, e_watermelons, batch_size)
+        images = model.generate_image(e_interpolated, batch_size=batch_size)
+        ```
+        """
+        if diffusion_noise is not None and seed is not None:
+            raise ValueError(
+                "`diffusion_noise` and `seed` should not both be passed to "
+                "`generate_image`. `seed` is only used to generate diffusion "
+                "noise when it's not already user-specified."
+            )
+
+        encoded_text = tf.squeeze(encoded_text)
+        if encoded_text.shape.rank == 2:
+            encoded_text = tf.repeat(
+                tf.expand_dims(encoded_text, axis=0), batch_size, axis=0
+            )
+
+        context = encoded_text
+        unconditional_context = tf.repeat(
+            self._get_unconditional_context(), batch_size, axis=0
+        )
+        context = tf.concat([context, unconditional_context], 0)
+
+        if diffusion_noise is not None:
+            diffusion_noise = tf.squeeze(diffusion_noise)
+            if diffusion_noise.shape.rank == 3:
+                diffusion_noise = tf.repeat(
+                    tf.expand_dims(diffusion_noise, axis=0), batch_size, axis=0
+                )
+            latent = diffusion_noise
+        else:
+            latent = self._get_initial_diffusion_noise(batch_size, seed)
+
+        # Iterative reverse diffusion stage
+        timesteps = tf.range(1, 1000, 1000 // num_steps)
+        alphas, alphas_prev = self._get_initial_alphas(timesteps)
+        progbar = keras.utils.Progbar(len(timesteps))
+        iteration = 0
+        for index, timestep in list(enumerate(timesteps))[::-1]:
+            latent_prev = latent  # Set aside the previous latent vector
+            t_emb = self._get_timestep_embedding(timestep, batch_size)
+
+            # Prepare the latent and unconditional latent to be run with a single forward call
+            latent = tf.concat([latent, latent], 0)
+            t_emb = tf.concat([t_emb, t_emb], 0)
+            latent_numpy = self.diffusion_model.forward(
+                [latent.numpy(), t_emb.numpy(), context.numpy()]
+            )
+            latent = tf.convert_to_tensor(latent_numpy, dtype=tf.float32)
+            latent, unconditional_latent = tf.split(latent, 2)
+
+            latent = unconditional_latent + unconditional_guidance_scale * (
+                latent - unconditional_latent
+            )
+            a_t, a_prev = alphas[index], alphas_prev[index]
+            pred_x0 = (latent_prev - math.sqrt(1 - a_t) * latent) / math.sqrt(
+                a_t
+            )
+            latent = (
+                latent * math.sqrt(1.0 - a_prev) + math.sqrt(a_prev) * pred_x0
+            )
+            iteration += 1
+            progbar.update(iteration)
+
+        # Decoding stage
+        decoded = self.decoder.predict_on_batch(latent)
+        decoded = ((decoded + 1) / 2) * 255
+        return np.clip(decoded, 0, 255).astype("uint8")
+
+    def _get_unconditional_context(self):
+        unconditional_tokens = tf.convert_to_tensor(
+            [_UNCONDITIONAL_TOKENS], dtype=tf.int32
+        )
+        unconditional_context = self.text_encoder.predict_on_batch(
+            [unconditional_tokens, self._get_pos_ids()]
+        )
+
+        return unconditional_context
+
+    def _get_timestep_embedding(
+        self, timestep, batch_size, dim=320, max_period=10000
+    ):
+        half = dim // 2
+        freqs = tf.math.exp(
+            -math.log(max_period) * tf.range(0, half, dtype=tf.float32) / half
+        )
+        args = tf.convert_to_tensor([timestep], dtype=tf.float32) * freqs
+        embedding = tf.concat([tf.math.cos(args), tf.math.sin(args)], 0)
+        embedding = tf.reshape(embedding, [1, -1])
+        return tf.repeat(embedding, batch_size, axis=0)
+
+    def _get_initial_alphas(self, timesteps):
+        alphas = [_ALPHAS_CUMPROD[t] for t in timesteps]
+        alphas_prev = [1.0] + alphas[:-1]
+
+        return alphas, alphas_prev
+
+    def _get_initial_diffusion_noise(self, batch_size, seed):
+        return tf.random.normal(
+            (batch_size, self.img_height // 8, self.img_width // 8, 4),
+            seed=seed,
+        )
+
+    @staticmethod
+    def _get_pos_ids():
+        return tf.convert_to_tensor(
+            [list(range(MAX_PROMPT_LENGTH))], dtype=tf.int32
+        )
+
+
+if __name__ == "__main__":
+    SD = SharkStableDiffusion(device=args.device)
+    images = SD.text_to_image(args.prompt, num_steps=args.steps)
+    pil_images = [Image.fromarray(image) for image in images]
+    save_fname = args.prompt + ".jpg"
+    if args.save_path is not None:
+        save_fname = args.save_path
+    pil_images[0].save(save_fname)
--- a/shark/examples/shark_inference/stable_diffusion/.gitignore
+++ b/shark/examples/shark_inference/stable_diffusion/.gitignore
@@ -0,0 +1,2 @@
+*.vmfb
+*.jpg
--- a/shark/examples/shark_inference/stable_diffusion/README.md
+++ b/shark/examples/shark_inference/stable_diffusion/README.md
@@ -0,0 +1,100 @@
+# STABLE DIFFUSION
+
+## Installation
+
+Follow setup instructions in the main [README.md](https://github.com/nod-ai/SHARK#readme) for regular usage. 
+
+ 
+## Using other supported Stable Diffusion variants with SHARK:
+
+Currently we support fine-tuned versions of Stable Diffusion such as:
+- [AnythingV3](https://huggingface.co/Linaqruf/anything-v3.0)
+- [Analog Diffusion](https://huggingface.co/wavymulder/Analog-Diffusion)
+
+use the flag `--hf_model_id=` to specify the repo-id of the model to be used.
+
+```shell
+python .\shark\examples\shark_inference\stable_diffusion\main.py --hf_model_id="Linaqruf/anything-v3.0" --max_length=77 --prompt="1girl, brown hair, green eyes, colorful, autumn, cumulonimbus clouds, lighting, blue sky, falling leaves, garden"
+```
+
+## Run a custom model using a HuggingFace `.ckpt` file:
+* Install the following by running :-
+```shell
+pip install omegaconf safetensors pytorch_lightning
+```
+* Download a [.ckpt](https://huggingface.co/andite/anything-v4.0/resolve/main/anything-v4.0-pruned-fp32.ckpt) file in case you don't have a locally generated `.ckpt` file for StableDiffusion.
+
+* Now pass the above `.ckpt` file to `ckpt_loc` command-line argument using the following :-
+```shell
+python3.10 main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd" --max_length=64 --import_mlir --ckpt_loc="/path/to/.ckpt/file"
+```
+* We use a combination of 2 flags to make this feature work : `import_mlir` and `ckpt_loc`.
+* In case `ckpt_loc` is NOT specified then a [default](https://huggingface.co/stabilityai/stable-diffusion-2-1-base) HuggingFace repo-id is run via `hf_model_id`. So, you can use `import_mlir` and `hf_model_id` to run HuggingFace's StableDiffusion variants.
+
+* Use custom model `.ckpt` files from [HuggingFace-StableDiffusion](https://huggingface.co/models?other=stable-diffusion) to generate images.
+
+
+## Running the model for a `batch_size` and for a set of `runs`:
+We currently support batch size in the range `[1, 3]`.
+You can specify batch size using `batch_size` flag (defaults to `1`) and the number of times you want to run the model using `runs` flag (defaults to `1`).
+In total, you'll be able to generate `batch_size * runs` number of images.
+- Usage 1: Using the same prompt -
+```shell
+python3.10 main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd" --max_length=64 --import_mlir --hf_model_id="runwayml/stable-diffusion-v1-5" --batch_size=3
+```
+The example above generates `3` different images in total with the same prompt `tajmahal, oil on canvas, sunflowers, 4k, uhd`.
+- Usage 2: Using different prompts -
+```shell
+python3.10 main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd" --max_length=64 --import_mlir --hf_model_id="runwayml/stable-diffusion-v1-5" --batch_size=3 -p="batman riding a horse, oil on canvas, 4k, uhd" -p="superman riding a horse, oil on canvas, 4k, uhd"
+```
+The example above generates `1` image for each different prompt, thus generating `3` images in total.
+- Usage 3: Using `runs` -
+```shell
+python3.10 main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd" --max_length=64 --import_mlir --hf_model_id="runwayml/stable-diffusion-v1-5" --batch_size=2 --runs=3
+```
+The example above generates `6` different images in total, `2` images for each `runs`.
+
+</details>
+  <details>
+  <summary>Debug Commands</summary>
+
+## Debug commands and other advanced usage follows.
+
+```shell
+python main.py --precision="fp32"|"fp16" --device="cpu"|"cuda"|"vulkan" --import_mlir|--no-import_mlir --prompt "enter the text" 
+
+```
+
+## dump all dispatch .spv and isa using amdllpc
+
+```shell
+python main.py --precision="fp16" --device="vulkan" --iree-vulkan-target-triple=rdna3-unknown-linux --no-load_vmfb --dispatch_benchmarks="all" --dispatch_benchmarks_dir="SD_dispatches" --dump_isa
+```
+
+## Compile and save the .vmfb (using vulkan fp16 as an example):
+
+```shell
+python shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb
+```
+
+## Capture an RGP trace
+
+```shell
+python shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb --enable_rgp
+```
+
+## Run the vae module with iree-benchmark-module (NCHW, fp16, vulkan, for example):
+
+```shell
+iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --device=vulkan --function_input=1x4x64x64xf16  
+```
+
+## Run the unet module with iree-benchmark-module (same config as above):
+```shell
+##if you want to use .npz inputs:
+unzip ~/.local/shark_tank/<your unet>/inputs.npz
+
+iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --function_input=@arr_0.npy --function_input=1xf16 --function_input=@arr_2.npy --function_input=@arr_3.npy --function_input=@arr_4.npy  
+```
+
+</details>
--- a/shark/examples/shark_inference/stable_diffusion/download_hf_models.py
+++ b/shark/examples/shark_inference/stable_diffusion/download_hf_models.py
@@ -0,0 +1,25 @@
+from PIL import Image
+import requests
+
+from transformers import CLIPProcessor, CLIPModel
+
+model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+inputs = processor(
+    text=["a photo of a cat", "a photo of a dog"],
+    images=image,
+    return_tensors="pt",
+    padding=True,
+)
+
+outputs = model(**inputs)
+logits_per_image = (
+    outputs.logits_per_image
+)  # this is the image-text similarity score
+probs = logits_per_image.softmax(
+    dim=1
+)  # we can take the softmax to get the label probabilities
--- a/shark/examples/shark_inference/stable_diffusion/main.py
+++ b/shark/examples/shark_inference/stable_diffusion/main.py
@@ -0,0 +1,330 @@
+import os
+import sys
+
+if "AMD_ENABLE_LLPC" not in os.environ:
+    os.environ["AMD_ENABLE_LLPC"] = "1"
+
+if sys.platform == "darwin":
+    os.environ["DYLD_LIBRARY_PATH"] = "/usr/local/lib"
+
+from transformers import CLIPTextModel, CLIPTokenizer
+import torch
+from PIL import Image
+from diffusers import (
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+)
+from tqdm.auto import tqdm
+import numpy as np
+from random import randint
+from stable_args import args
+from datetime import datetime as dt
+import json
+import re
+from pathlib import Path
+from model_wrappers import SharkifyStableDiffusionModel
+
+# This has to come before importing cache objects
+if args.clear_all:
+    print("CLEARING ALL, EXPECT SEVERAL MINUTES TO RECOMPILE")
+    from glob import glob
+    import shutil
+
+    vmfbs = glob(os.path.join(os.getcwd(), "*.vmfb"))
+    for vmfb in vmfbs:
+        if os.path.exists(vmfb):
+            os.remove(vmfb)
+    home = os.path.expanduser("~")
+    if os.name == "nt":  # Windows
+        appdata = os.getenv("LOCALAPPDATA")
+        shutil.rmtree(os.path.join(appdata, "AMD/VkCache"), ignore_errors=True)
+        shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
+    elif os.name == "unix":
+        shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"))
+        shutil.rmtree(os.path.join(home, ".local/shark_tank"))
+
+
+from utils import set_init_device_flags, disk_space_check, preprocessCKPT
+
+from schedulers import (
+    SharkEulerDiscreteScheduler,
+)
+import time
+from shark.iree_utils.compile_utils import dump_isas
+
+# Helper function to profile the vulkan device.
+def start_profiling(file_path="foo.rdc", profiling_mode="queue"):
+    if args.vulkan_debug_utils and "vulkan" in args.device:
+        import iree
+
+        print(f"Profiling and saving to {file_path}.")
+        vulkan_device = iree.runtime.get_device(args.device)
+        vulkan_device.begin_profiling(mode=profiling_mode, file_path=file_path)
+        return vulkan_device
+    return None
+
+
+def end_profiling(device):
+    if device:
+        return device.end_profiling()
+
+
+if __name__ == "__main__":
+
+    dtype = torch.float32 if args.precision == "fp32" else torch.half
+
+    # Make it as default prompt
+    if len(args.prompts) == 0:
+        args.prompts = ["cyberpunk forest by Salvador Dali"]
+
+    prompt = args.prompts
+    neg_prompt = args.negative_prompts
+    height = args.height
+    width = args.width
+    num_inference_steps = args.steps  # Number of denoising steps
+
+    # Scale for classifier-free guidance
+    guidance_scale = torch.tensor(args.guidance_scale).to(torch.float32)
+
+    batch_size = args.batch_size
+    prompt = prompt * batch_size if len(prompt) == 1 else prompt
+    len_of_prompt = len(prompt)
+    assert (
+        len_of_prompt == batch_size
+    ), f"no. of prompts ({len_of_prompt}) is not equal to batch_size ({batch_size})"
+    print("Running StableDiffusion with the following config :-")
+    print(f"Batch size : {batch_size}")
+    print(f"Prompts : {prompt}")
+    print(f"Runs : {args.runs}")
+
+    # Try to make neg_prompt equal to batch_size by appending blank strings.
+    for i in range(batch_size - len(neg_prompt)):
+        neg_prompt.append("")
+
+    set_init_device_flags()
+    disk_space_check(Path.cwd())
+
+    if not args.import_mlir:
+        from opt_params import get_unet, get_vae, get_clip
+
+        clip = get_clip()
+        unet = get_unet()
+        vae = get_vae()
+    else:
+        if ".ckpt" in args.ckpt_loc:
+            preprocessCKPT()
+        mlir_import = SharkifyStableDiffusionModel(
+            args.hf_model_id,
+            args.ckpt_loc,
+            args.precision,
+            max_len=args.max_length,
+            batch_size=batch_size,
+            height=height,
+            width=width,
+            use_base_vae=args.use_base_vae,
+            use_tuned=args.use_tuned,
+        )
+        clip, unet, vae = mlir_import()
+
+    if args.dump_isa:
+        dump_isas(args.dispatch_benchmarks_dir)
+
+    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+    scheduler = DPMSolverMultistepScheduler.from_pretrained(
+        "CompVis/stable-diffusion-v1-4",
+        subfolder="scheduler",
+    )
+    cpu_scheduling = True
+    if args.hf_model_id == "stabilityai/stable-diffusion-2-1":
+        tokenizer = CLIPTokenizer.from_pretrained(
+            "stabilityai/stable-diffusion-2-1", subfolder="tokenizer"
+        )
+
+        scheduler = DPMSolverMultistepScheduler.from_pretrained(
+            "stabilityai/stable-diffusion-2-1",
+            subfolder="scheduler",
+        )
+
+    if args.hf_model_id == "stabilityai/stable-diffusion-2-1-base":
+        tokenizer = CLIPTokenizer.from_pretrained(
+            "stabilityai/stable-diffusion-2-1-base", subfolder="tokenizer"
+        )
+
+        if args.use_compiled_scheduler:
+            scheduler = SharkEulerDiscreteScheduler.from_pretrained(
+                "stabilityai/stable-diffusion-2-1-base",
+                subfolder="scheduler",
+            )
+            scheduler.compile()
+            cpu_scheduling = False
+        else:
+            scheduler = EulerDiscreteScheduler.from_pretrained(
+                "stabilityai/stable-diffusion-2-1-base",
+                subfolder="scheduler",
+            )
+    for run in range(args.runs):
+        # Handle out of range seeds.
+        uint32_info = np.iinfo(np.uint32)
+        uint32_min, uint32_max = uint32_info.min, uint32_info.max
+        seed = args.seed
+        if run >= 1 or seed < uint32_min or seed >= uint32_max:
+            seed = randint(uint32_min, uint32_max)
+        generator = torch.manual_seed(
+            seed
+        )  # Seed generator to create the inital latent noise
+
+        # create a random initial latent.
+        latents = torch.randn(
+            (batch_size, 4, height // 8, width // 8),
+            generator=generator,
+            dtype=torch.float32,
+        ).to(dtype)
+        if run == 0:
+            # Warmup phase to improve performance.
+            if args.warmup_count >= 1:
+                vae_warmup_input = torch.clone(latents).detach().numpy()
+                clip_warmup_input = torch.randint(1, 2, (2, args.max_length))
+            for i in range(args.warmup_count):
+                vae("forward", (vae_warmup_input,))
+                clip("forward", (clip_warmup_input,))
+
+        start = time.time()
+        if run == 0:
+            text_input = tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=args.max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            max_length = text_input.input_ids.shape[-1]
+            uncond_input = tokenizer(
+                neg_prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input = torch.cat(
+                [uncond_input.input_ids, text_input.input_ids]
+            )
+
+            clip_inf_start = time.time()
+            text_embeddings = clip("forward", (text_input,))
+            clip_inf_end = time.time()
+            text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
+            text_embeddings_numpy = text_embeddings.detach().numpy()
+
+            scheduler.set_timesteps(num_inference_steps)
+            scheduler.is_scale_input_called = True
+
+        latents = latents * scheduler.init_noise_sigma
+
+        avg_ms = 0
+        for i, t in tqdm(
+            enumerate(scheduler.timesteps), disable=args.hide_steps
+        ):
+            step_start = time.time()
+            if not args.hide_steps:
+                print(f"i = {i} t = {t}", end="")
+            timestep = torch.tensor([t]).to(dtype).detach().numpy()
+            latent_model_input = scheduler.scale_model_input(latents, t)
+            if cpu_scheduling:
+                latent_model_input = latent_model_input.detach().numpy()
+
+            profile_device = start_profiling(file_path="unet.rdc")
+
+            noise_pred = unet(
+                "forward",
+                (
+                    latent_model_input,
+                    timestep,
+                    text_embeddings_numpy,
+                    guidance_scale,
+                ),
+                send_to_host=False,
+            )
+
+            end_profiling(profile_device)
+
+            if cpu_scheduling:
+                noise_pred = torch.from_numpy(noise_pred.to_host())
+                latents = scheduler.step(noise_pred, t, latents).prev_sample
+            else:
+                latents = scheduler.step(noise_pred, t, latents)
+            step_time = time.time() - step_start
+            avg_ms += step_time
+            step_ms = int((step_time) * 1000)
+            if not args.hide_steps:
+                print(f" ({step_ms}ms)")
+
+        # scale and decode the image latents with vae
+        if args.use_base_vae:
+            latents = 1 / 0.18215 * latents
+        latents_numpy = latents
+        if cpu_scheduling:
+            latents_numpy = latents.detach().numpy()
+        profile_device = start_profiling(file_path="vae.rdc")
+        vae_start = time.time()
+        images = vae("forward", (latents_numpy,))
+        vae_end = time.time()
+        end_profiling(profile_device)
+        if args.use_base_vae:
+            image = torch.from_numpy(images)
+            image = (image.detach().cpu() * 255.0).numpy()
+            images = image.round()
+        end_time = time.time()
+
+        avg_ms = 1000 * avg_ms / args.steps
+        clip_inf_time = (clip_inf_end - clip_inf_start) * 1000
+        vae_inf_time = (vae_end - vae_start) * 1000
+        total_time = end_time - start
+
+        print(f"\nStats for run {run}:")
+        print(f"Average step time: {avg_ms}ms/it")
+        print(f"Clip Inference time (ms) = {clip_inf_time:.3f}")
+        print(f"VAE Inference time (ms): {vae_inf_time:.3f}")
+        print(f"\nTotal image generation time: {total_time}sec")
+
+        images = torch.from_numpy(images).to(torch.uint8).permute(0, 2, 3, 1)
+        pil_images = [Image.fromarray(image) for image in images.numpy()]
+
+        if args.output_dir is not None:
+            output_path = Path(args.output_dir)
+            output_path.mkdir(parents=True, exist_ok=True)
+        else:
+            output_path = Path.cwd()
+        disk_space_check(output_path, lim=5)
+        for i in range(batch_size):
+            json_store = {
+                "prompt": prompt[i],
+                "negative prompt": args.negative_prompts[i],
+                "seed": seed,
+                "hf_model_id": args.hf_model_id,
+                "precision": args.precision,
+                "steps": args.steps,
+                "guidance_scale": args.guidance_scale,
+                "scheduler": args.scheduler,
+            }
+            prompt_slice = re.sub("[^a-zA-Z0-9]", "_", prompt[i][:15])
+            img_name = f"{prompt_slice}_{seed}_{run}_{i}_{dt.now().strftime('%y%m%d_%H%M%S')}"
+            if args.output_img_format == "jpg":
+                pil_images[i].save(
+                    output_path / f"{img_name}.jpg",
+                    quality=95,
+                    subsampling=0,
+                    optimize=True,
+                    progressive=True,
+                )
+            else:
+                pil_images[i].save(output_path / f"{img_name}.png", "PNG")
+                if args.output_img_format not in ["png", "jpg"]:
+                    print(
+                        f"[ERROR] Format {args.output_img_format} is not supported yet."
+                        "saving image as png. Supported formats png / jpg"
+                    )
+            with open(output_path / f"{img_name}.json", "w") as f:
+                f.write(json.dumps(json_store, indent=4))
--- a/shark/examples/shark_inference/stable_diffusion/model_wrappers.py
+++ b/shark/examples/shark_inference/stable_diffusion/model_wrappers.py
@@ -1,19 +1,13 @@
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
 from diffusers import AutoencoderKL, UNet2DConditionModel
 from transformers import CLIPTextModel
+from utils import compile_through_fx, get_opt_flags
+from resources import base_models
 from collections import defaultdict
 import torch
-import traceback
-import re
-import sys
-from apps.stable_diffusion.src.utils import (
-    compile_through_fx,
-    get_opt_flags,
-    base_models,
-    args,
-    fetch_or_delete_vmfbs,
-    preprocessCKPT,
-    get_path_to_diffusers_checkpoint,
-)


 # These shapes are parameter dependent.
@@ -72,23 +66,21 @@ class SharkifyStableDiffusionModel:
        batch_size: int = 1,
        use_base_vae: bool = False,
        use_tuned: bool = False,
+        debug: bool = False,
+        sharktank_dir: str = "",
+        generate_vmfb: bool = True,
    ):
        self.check_params(max_len, width, height)
        self.max_len = max_len
        self.height = height // 8
        self.width = width // 8
        self.batch_size = batch_size
-        self.custom_weights = custom_weights
-        if self.custom_weights != "":
-            assert self.custom_weights.lower().endswith(
-                (".ckpt", ".safetensors")
-            ), "checkpoint files supported can be any of [.ckpt, .safetensors] type"
-            custom_weights = get_path_to_diffusers_checkpoint(custom_weights)
        self.model_id = model_id if custom_weights == "" else custom_weights
        self.precision = precision
        self.base_vae = use_base_vae
        self.model_name = (
-            str(batch_size)
+            "_"
+            + str(batch_size)
            + "_"
            + str(max_len)
            + "_"
@@ -99,8 +91,9 @@ class SharkifyStableDiffusionModel:
            + precision
        )
        self.use_tuned = use_tuned
-        if use_tuned:
-            self.model_name = self.model_name + "_tuned"
+        self.debug = debug
+        self.sharktank_dir = sharktank_dir
+        self.generate_vmfb = generate_vmfb
        # We need a better naming convention for the .vmfbs because despite
        # using the custom model variant the .vmfb names remain the same and
        # it'll always pick up the compiled .vmfb instead of compiling the
@@ -108,6 +101,8 @@ class SharkifyStableDiffusionModel:
        # So, currently, we add `self.model_id` in the `self.model_name` of
        # .vmfb file.
        # TODO: Have a better way of naming the vmfbs using self.model_name.
+        import re
+
        model_name = re.sub(r"\W+", "_", self.model_id)
        if model_name[0] == "_":
            model_name = model_name[1:]
@@ -145,13 +140,20 @@ class SharkifyStableDiffusionModel:
        inputs = tuple(self.inputs["vae"])
        is_f16 = True if self.precision == "fp16" else False
        vae_name = "base_vae" if self.base_vae else "vae"
+        vae_model_name = vae_name + self.model_name
+        if self.debug:
+            os.makedirs(
+                os.path.join(self.sharktank_dir, vae_model_name), exist_ok=True
+            )
        shark_vae = compile_through_fx(
            vae,
            inputs,
            is_f16=is_f16,
            use_tuned=self.use_tuned,
-            model_name=vae_name + self.model_name,
+            model_name=vae_model_name,
            extra_args=get_opt_flags("vae", precision=self.precision),
+            debug=self.debug,
+            generate_vmfb=self.generate_vmfb,
        )
        return shark_vae

@@ -184,14 +186,22 @@ class SharkifyStableDiffusionModel:
        is_f16 = True if self.precision == "fp16" else False
        inputs = tuple(self.inputs["unet"])
        input_mask = [True, True, True, False]
+        unet_model_name = "unet" + self.model_name
+        if self.debug:
+            os.makedirs(
+                os.path.join(self.sharktank_dir, unet_model_name),
+                exist_ok=True,
+            )
        shark_unet = compile_through_fx(
            unet,
            inputs,
-            model_name="unet" + self.model_name,
+            model_name=unet_model_name,
            is_f16=is_f16,
            f16_input_mask=input_mask,
            use_tuned=self.use_tuned,
            extra_args=get_opt_flags("unet", precision=self.precision),
+            debug=self.debug,
+            generate_vmfb=self.generate_vmfb,
        )
        return shark_unet

@@ -208,26 +218,33 @@ class SharkifyStableDiffusionModel:
                return self.text_encoder(input)[0]

        clip_model = CLIPText()
+        clip_model_name = "clip" + self.model_name
+        if self.debug:
+            os.makedirs(
+                os.path.join(self.sharktank_dir, clip_model_name),
+                exist_ok=True,
+            )
+
        shark_clip = compile_through_fx(
            clip_model,
            tuple(self.inputs["clip"]),
-            model_name="clip" + self.model_name,
+            model_name=clip_model_name,
            extra_args=get_opt_flags("clip", precision="fp32"),
+            debug=self.debug,
+            generate_vmfb=self.generate_vmfb,
        )
        return shark_clip

    def __call__(self):
-        vmfbs = fetch_or_delete_vmfbs(
-            self.model_name, self.base_vae, self.precision
-        )
-        if vmfbs[0]:
-            print("Loading vmfbs from cache")
-            return vmfbs
-        if self.custom_weights != "":
-            assert self.custom_weights.lower().endswith(
-                (".ckpt", ".safetensors")
-            ), "checkpoint files supported can be any of [.ckpt, .safetensors] type"
-            preprocessCKPT(self.custom_weights)
+        from utils import get_vmfb_path_name
+        from stable_args import args
+        import traceback, functools, operator, os
+
+        model_name = ["clip", "base_vae" if self.base_vae else "vae", "unet"]
+        vmfb_path = [
+            get_vmfb_path_name(model + self.model_name)[0]
+            for model in model_name
+        ]
        for model_id in base_models:
            self.inputs = get_input_info(
                base_models[model_id],
@@ -243,6 +260,16 @@ class SharkifyStableDiffusionModel:
            except Exception as e:
                if args.enable_stack_trace:
                    traceback.print_exc()
+                vmfb_present = [os.path.isfile(vmfb) for vmfb in vmfb_path]
+                all_vmfb_present = functools.reduce(
+                    operator.__and__, vmfb_present
+                )
+                # We need to delete vmfbs only if some of the models were compiled.
+                if not all_vmfb_present:
+                    for i in range(len(vmfb_path)):
+                        if vmfb_present[i]:
+                            os.remove(vmfb_path[i])
+                            print("Deleted: ", vmfb_path[i])
                print("Retrying with a different base model configuration")
                continue
            # This is done just because in main.py we are basing the choice of tokenizer and scheduler
@@ -253,5 +280,5 @@ class SharkifyStableDiffusionModel:
                args.hf_model_id = model_id
            return compiled_clip, compiled_unet, compiled_vae
        sys.exit(
-            "Cannot compile the model. Please re-run the command with `--enable_stack_trace` flag and create an issue with detailed log at https://github.com/nod-ai/SHARK/issues"
+            "Cannot compile the model. Please use `enable_stack_trace` and create an issue at https://github.com/nod-ai/SHARK/issues"
        )
--- a/shark/examples/shark_inference/stable_diffusion/opt_params.py
+++ b/shark/examples/shark_inference/stable_diffusion/opt_params.py
@@ -1,21 +1,26 @@
 import sys
-from transformers import CLIPTokenizer
-from apps.stable_diffusion.src.utils import models_db, args, get_shark_model
+import resources
+from stable_args import args
+from utils import get_shark_model

+models_db = (
+    resources.beta_models_db if args.beta_models else resources.models_db
+)
+BATCH_SIZE = len(args.prompts)
+if BATCH_SIZE != 1:
+    sys.exit("Only batch size 1 is supported.")

 hf_model_variant_map = {
    "Linaqruf/anything-v3.0": ["anythingv3", "v2_1base"],
    "dreamlike-art/dreamlike-diffusion-1.0": ["dreamlike", "v2_1base"],
    "prompthero/openjourney": ["openjourney", "v2_1base"],
    "wavymulder/Analog-Diffusion": ["analogdiffusion", "v2_1base"],
-    "stabilityai/stable-diffusion-2-1": ["stablediffusion", "v2_1base"],
+    "stabilityai/stable-diffusion-2-1": ["stablediffusion", "v2_1"],
    "stabilityai/stable-diffusion-2-1-base": ["stablediffusion", "v2_1base"],
    "CompVis/stable-diffusion-v1-4": ["stablediffusion", "v1_4"],
 }

-
-def get_variant_version(hf_model_id):
-    return hf_model_variant_map[hf_model_id]
+variant, version = hf_model_variant_map[args.hf_model_id]


 def get_params(bucket_key, model_key, model, is_tuned, precision):
@@ -64,7 +69,6 @@ def get_params(bucket_key, model_key, model, is_tuned, precision):


 def get_unet():
-    variant, version = get_variant_version(args.hf_model_id)
    # Tuned model is present only for `fp16` precision.
    is_tuned = "tuned" if args.use_tuned else "untuned"
    if "vulkan" not in args.device and args.use_tuned:
@@ -81,7 +85,6 @@ def get_unet():


 def get_vae():
-    variant, version = get_variant_version(args.hf_model_id)
    # Tuned model is present only for `fp16` precision.
    is_tuned = "tuned" if args.use_tuned else "untuned"
    is_base = "/base" if args.use_base_vae else ""
@@ -99,7 +102,6 @@ def get_vae():


 def get_clip():
-    variant, version = get_variant_version(args.hf_model_id)
    bucket_key = f"{variant}/untuned"
    model_key = (
        f"{variant}/{version}/clip/fp32/length_{args.max_length}/untuned"
@@ -108,10 +110,3 @@ def get_clip():
        bucket_key, model_key, "clip", "untuned", "fp32"
    )
    return get_shark_model(bucket, model_name, iree_flags)
-
-
-def get_tokenizer():
-    tokenizer = CLIPTokenizer.from_pretrained(
-        args.hf_model_id, subfolder="tokenizer"
-    )
-    return tokenizer
--- a/shark/examples/shark_inference/stable_diffusion/profiling_with_iree.md
+++ b/shark/examples/shark_inference/stable_diffusion/profiling_with_iree.md
@@ -42,46 +42,3 @@ To build the vulkan app for profiling UNet follow the instructions [here](https:
 ```shell
 ./build/vulkan_gui/iree-vulkan-gui --module_file=/path/to/unet.vmfb --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
 ```
-
-</details>
-  <details>
-  <summary>Debug Commands</summary>
-
-## Debug commands and other advanced usage follows.
-
-```shell
-python txt2img.py --precision="fp32"|"fp16" --device="cpu"|"cuda"|"vulkan" --import_mlir|--no-import_mlir --prompt "enter the text" 
-```
-
-## dump all dispatch .spv and isa using amdllpc
-
-```shell
-python txt2img.py --precision="fp16" --device="vulkan" --iree-vulkan-target-triple=rdna3-unknown-linux --no-load_vmfb --dispatch_benchmarks="all" --dispatch_benchmarks_dir="SD_dispatches" --dump_isa
-```
-
-## Compile and save the .vmfb (using vulkan fp16 as an example):
-
-```shell
-python txt2img.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb
-```
-
-## Capture an RGP trace
-
-```shell
-python txt2img.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb --enable_rgp
-```
-
-## Run the vae module with iree-benchmark-module (NCHW, fp16, vulkan, for example):
-
-```shell
-iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --device=vulkan --function_input=1x4x64x64xf16  
-```
-
-## Run the unet module with iree-benchmark-module (same config as above):
-```shell
-##if you want to use .npz inputs:
-unzip ~/.local/shark_tank/<your unet>/inputs.npz
-iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --function_input=@arr_0.npy --function_input=1xf16 --function_input=@arr_2.npy --function_input=@arr_3.npy --function_input=@arr_4.npy  
-```
-
-</details>
--- a/shark/examples/shark_inference/stable_diffusion/resources.py
+++ b/shark/examples/shark_inference/stable_diffusion/resources.py
@@ -26,8 +26,9 @@ def get_json_file(path):

 # TODO: This shouldn't be called from here, every time the file imports
 # it will run all the global vars.
-prompt_examples = get_json_file("resources/prompts.json")
+prompts_examples = get_json_file("resources/prompts.json")
 models_db = get_json_file("resources/model_db.json")
+beta_models_db = get_json_file("resources/beta_model_db.json")

 # The base_model contains the input configuration for the different
 # models and also helps in providing information for the variants.
--- a/shark/examples/shark_inference/stable_diffusion/resources/base_model.json
+++ b/shark/examples/shark_inference/stable_diffusion/resources/base_model.json
--- a/shark/examples/shark_inference/stable_diffusion/resources/beta_model_db.json
+++ b/shark/examples/shark_inference/stable_diffusion/resources/beta_model_db.json
@@ -1,6 +1,6 @@
 [
  {
-    "stablediffusion/untuned":"gs://shark_tank/sd_untuned",
+    "stablediffusion/untuned":"gs://shark_tank/latest",
    "stablediffusion/tuned":"gs://shark_tank/sd_tuned",
    "stablediffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
    "anythingv3/untuned":"gs://shark_tank/sd_anythingv3",
@@ -27,20 +27,20 @@
    "stablediffusion/v2_1base/unet/fp16/length_77/untuned":"unet77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
    "stablediffusion/v2_1base/unet/fp16/length_77/tuned":"unet2base_8dec_fp16_tuned_v2",
    "stablediffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"unet2base_8dec_fp16_cuda_tuned",
-    "stablediffusion/v2_1base/unet/fp16/length_64/untuned":"unet64_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
+    "stablediffusion/v2_1base/unet/fp16/length_64/untuned":"unet64_512_512_fp16_stabilityai_stable_diffusion_2_1_basec",
    "stablediffusion/v2_1base/unet/fp16/length_64/tuned":"unet_19dec_v2p1base_fp16_64_tuned",
    "stablediffusion/v2_1base/unet/fp16/length_64/tuned/cuda":"unet_19dec_v2p1base_fp16_64_cuda_tuned",
    "stablediffusion/v2_1base/vae/fp16/length_77/untuned":"vae77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
    "stablediffusion/v2_1base/vae/fp16/length_77/tuned":"vae2base_19dec_fp16_tuned",
    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"vae2base_19dec_fp16_cuda_tuned",
-    "stablediffusion/v2_1base/vae/fp16/length_77/untuned/base":"vae2base_8dec_fp16",
+    "stablediffusion/v2_1base/vae/fp16/length_77/untuned/base":"vae77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base":"vae2base_8dec_fp16_tuned",
    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base/cuda":"vae2base_8dec_fp16_cuda_tuned",
    "stablediffusion/v2_1base/clip/fp32/length_77/untuned":"clip77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
    "stablediffusion/v2_1base/clip/fp32/length_64/untuned":"clip64_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
    "stablediffusion/v2_1/unet/fp16/length_77/untuned":"unet77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
    "stablediffusion/v2_1/vae/fp16/length_77/untuned":"vae77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
-    "stablediffusion/v2_1/vae/fp16/length_77/untuned/base":"vae2_8dec_fp16",
+    "stablediffusion/v2_1/vae/fp16/length_77/untuned/base":"77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
    "stablediffusion/v2_1/clip/fp32/length_77/untuned":"clip77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
    "anythingv3/v2_1base/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
    "anythingv3/v2_1base/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
--- a/shark/examples/shark_inference/stable_diffusion/resources/model_config.json
+++ b/shark/examples/shark_inference/stable_diffusion/resources/model_config.json
@@ -0,0 +1,21 @@
+[
+    {
+      "stablediffusion/v1_4":"CompVis/stable-diffusion-v1-4",
+      "stablediffusion/v2_1base":"stabilityai/stable-diffusion-2-1-base",
+      "stablediffusion/v2_1":"stabilityai/stable-diffusion-2-1",
+      "anythingv3/v1_4":"Linaqruf/anything-v3.0",
+      "analogdiffusion/v1_4":"wavymulder/Analog-Diffusion",
+      "openjourney/v1_4":"prompthero/openjourney",
+      "dreamlike/v1_4":"dreamlike-art/dreamlike-diffusion-1.0"
+    },
+    {
+      "stablediffusion/fp16":"fp16",
+      "stablediffusion/fp32":"main",
+      "anythingv3/fp16":"diffusers",
+      "anythingv3/fp32":"diffusers",
+      "analogdiffusion/fp16":"main",
+      "analogdiffusion/fp32":"main",
+      "openjourney/fp16":"main",
+      "openjourney/fp32":"main"
+    }
+  ]
--- a/shark/examples/shark_inference/stable_diffusion/resources/model_db.json
+++ b/shark/examples/shark_inference/stable_diffusion/resources/model_db.json
@@ -0,0 +1,177 @@
+[
+  {
+    "stablediffusion/untuned":"gs://shark_tank/stable_diffusion",
+    "stablediffusion/tuned":"gs://shark_tank/sd_tuned",
+    "stablediffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
+    "anythingv3/untuned":"gs://shark_tank/sd_anythingv3",
+    "anythingv3/tuned":"gs://shark_tank/sd_tuned",
+    "anythingv3/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
+    "analogdiffusion/untuned":"gs://shark_tank/sd_analog_diffusion",
+    "analogdiffusion/tuned":"gs://shark_tank/sd_tuned",
+    "analogdiffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
+    "openjourney/untuned":"gs://shark_tank/sd_openjourney",
+    "openjourney/tuned":"gs://shark_tank/sd_tuned",
+    "dreamlike/untuned":"gs://shark_tank/sd_dreamlike_diffusion"
+  },
+  {
+    "stablediffusion/v1_4/unet/fp16/length_77/untuned":"unet_8dec_fp16",
+    "stablediffusion/v1_4/unet/fp16/length_77/tuned":"unet_8dec_fp16_tuned",
+    "stablediffusion/v1_4/unet/fp16/length_77/tuned/cuda":"unet_8dec_fp16_cuda_tuned",
+    "stablediffusion/v1_4/unet/fp32/length_77/untuned":"unet_1dec_fp32",
+    "stablediffusion/v1_4/vae/fp16/length_77/untuned":"vae_19dec_fp16",
+    "stablediffusion/v1_4/vae/fp16/length_77/tuned":"vae_19dec_fp16_tuned",
+    "stablediffusion/v1_4/vae/fp16/length_77/tuned/cuda":"vae_19dec_fp16_cuda_tuned",
+    "stablediffusion/v1_4/vae/fp16/length_77/untuned/base":"vae_8dec_fp16",
+    "stablediffusion/v1_4/vae/fp32/length_77/untuned":"vae_1dec_fp32",
+    "stablediffusion/v1_4/clip/fp32/length_77/untuned":"clip_18dec_fp32",
+    "stablediffusion/v2_1base/unet/fp16/length_77/untuned":"unet2base_8dec_fp16",
+    "stablediffusion/v2_1base/unet/fp16/length_77/tuned":"unet2base_8dec_fp16_tuned_v2",
+    "stablediffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"unet2base_8dec_fp16_cuda_tuned",
+    "stablediffusion/v2_1base/unet/fp16/length_64/untuned":"unet_19dec_v2p1base_fp16_64",
+    "stablediffusion/v2_1base/unet/fp16/length_64/tuned":"unet_19dec_v2p1base_fp16_64_tuned",
+    "stablediffusion/v2_1base/unet/fp16/length_64/tuned/cuda":"unet_19dec_v2p1base_fp16_64_cuda_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/untuned":"vae2base_19dec_fp16",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned":"vae2base_19dec_fp16_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"vae2base_19dec_fp16_cuda_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/untuned/base":"vae2base_8dec_fp16",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base":"vae2base_8dec_fp16_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base/cuda":"vae2base_8dec_fp16_cuda_tuned",
+    "stablediffusion/v2_1base/clip/fp32/length_77/untuned":"clip2base_18dec_fp32",
+    "stablediffusion/v2_1base/clip/fp32/length_64/untuned":"clip_19dec_v2p1base_fp32_64",
+    "stablediffusion/v2_1/unet/fp16/length_77/untuned":"unet2_14dec_fp16",
+    "stablediffusion/v2_1/vae/fp16/length_77/untuned":"vae2_19dec_fp16",
+    "stablediffusion/v2_1/vae/fp16/length_77/untuned/base":"vae2_8dec_fp16",
+    "stablediffusion/v2_1/clip/fp32/length_77/untuned":"clip2_18dec_fp32",
+    "anythingv3/v2_1base/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
+    "anythingv3/v2_1base/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
+    "anythingv3/v2_1base/unet/fp16/length_77/tuned/cuda":"av3_unet_19dec_fp16_cuda_tuned",
+    "anythingv3/v2_1base/unet/fp32/length_77/untuned":"av3_unet_19dec_fp32",
+    "anythingv3/v2_1base/vae/fp16/length_77/untuned":"av3_vae_19dec_fp16",
+    "anythingv3/v2_1base/vae/fp16/length_77/tuned":"av3_vae_19dec_fp16_tuned",
+    "anythingv3/v2_1base/vae/fp16/length_77/tuned/cuda":"av3_vae_19dec_fp16_cuda_tuned",
+    "anythingv3/v2_1base/vae/fp16/length_77/untuned/base":"av3_vaebase_22dec_fp16",
+    "anythingv3/v2_1base/vae/fp32/length_77/untuned":"av3_vae_19dec_fp32",
+    "anythingv3/v2_1base/vae/fp32/length_77/untuned/base":"av3_vaebase_22dec_fp32",
+    "anythingv3/v2_1base/clip/fp32/length_77/untuned":"av3_clip_19dec_fp32",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/untuned":"ad_unet_19dec_fp16",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/tuned":"ad_unet_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"ad_unet_19dec_fp16_cuda_tuned",
+    "analogdiffusion/v2_1base/unet/fp32/length_77/untuned":"ad_unet_19dec_fp32",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned":"ad_vae_19dec_fp16",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/tuned":"ad_vae_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"ad_vae_19dec_fp16_cuda_tuned",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned/base":"ad_vaebase_22dec_fp16",
+    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned":"ad_vae_19dec_fp32",
+    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned/base":"ad_vaebase_22dec_fp32",
+    "analogdiffusion/v2_1base/clip/fp32/length_77/untuned":"ad_clip_19dec_fp32",
+    "openjourney/v2_1base/unet/fp16/length_64/untuned":"oj_unet_22dec_fp16_64",
+    "openjourney/v2_1base/unet/fp32/length_64/untuned":"oj_unet_22dec_fp32_64",
+    "openjourney/v2_1base/vae/fp16/length_77/untuned":"oj_vae_22dec_fp16",
+    "openjourney/v2_1base/vae/fp16/length_77/untuned/base":"oj_vaebase_22dec_fp16",
+    "openjourney/v2_1base/vae/fp32/length_77/untuned":"oj_vae_22dec_fp32",
+    "openjourney/v2_1base/vae/fp32/length_77/untuned/base":"oj_vaebase_22dec_fp32",
+    "openjourney/v2_1base/clip/fp32/length_64/untuned":"oj_clip_22dec_fp32_64",
+    "dreamlike/v2_1base/unet/fp16/length_77/untuned":"dl_unet_23dec_fp16_77",
+    "dreamlike/v2_1base/unet/fp32/length_77/untuned":"dl_unet_23dec_fp32_77",
+    "dreamlike/v2_1base/vae/fp16/length_77/untuned":"dl_vae_23dec_fp16",
+    "dreamlike/v2_1base/vae/fp16/length_77/untuned/base":"dl_vaebase_23dec_fp16",
+    "dreamlike/v2_1base/vae/fp32/length_77/untuned":"dl_vae_23dec_fp32",
+    "dreamlike/v2_1base/vae/fp32/length_77/untuned/base":"dl_vaebase_23dec_fp32",
+    "dreamlike/v2_1base/clip/fp32/length_77/untuned":"dl_clip_23dec_fp32_77"
+  },
+  {
+    "unet": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": []
+        },
+        "fp32": {
+          "default_compilation_flags": []
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32"
+          ],
+          "specified_compilation_flags": {
+            "cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
+            "default_device": ["--iree-flow-enable-conv-img2col-transform"]
+          }
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "vae": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "clip": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      }
+    }
+  }
+]
--- a/shark/examples/shark_inference/stable_diffusion/resources/opt_flags.json
+++ b/shark/examples/shark_inference/stable_diffusion/resources/opt_flags.json
@@ -0,0 +1,101 @@
+  {
+    "unet": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": []
+        },
+        "fp32": {
+          "default_compilation_flags": []
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32"
+          ],
+          "specified_compilation_flags": {
+            "cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
+            "default_device": ["--iree-flow-enable-conv-img2col-transform"]
+          }
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "vae": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [],
+          "specified_compilation_flags": {
+            "cuda": [],
+            "default_device": ["--iree-flow-enable-padding-linalg-ops",
+                               "--iree-flow-linalg-ops-padding-size=32",
+                               "--iree-flow-enable-conv-img2col-transform"]
+          }
+        },
+        "fp32": {
+          "default_compilation_flags": [],
+          "specified_compilation_flags": {
+            "cuda": [],
+            "default_device": [
+              "--iree-flow-enable-padding-linalg-ops",
+              "--iree-flow-linalg-ops-padding-size=32",
+              "--iree-flow-enable-conv-img2col-transform"
+            ]
+          }
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "clip": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      }
+    }
+  }
--- a/shark/examples/shark_inference/stable_diffusion/resources/prompts.json
+++ b/shark/examples/shark_inference/stable_diffusion/resources/prompts.json
--- a/shark/examples/shark_inference/stable_diffusion/schedulers.py
+++ b/shark/examples/shark_inference/stable_diffusion/schedulers.py
@@ -9,13 +9,30 @@ from diffusers import (
    EulerDiscreteScheduler,
 )
 from diffusers.configuration_utils import register_to_config
-from apps.stable_diffusion.src.utils import (
-    compile_through_fx,
-    get_shark_model,
-    args,
-)
+from utils import compile_through_fx, get_shark_model
+from stable_args import args
 import torch

+SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"
+
+
+BATCH_SIZE = len(args.prompts)
+if len(args.prompts) == 0:
+    BATCH_SIZE = 1
+
+model_input = {
+    "euler": {
+        "latent": torch.randn(
+            BATCH_SIZE, 4, args.height // 8, args.width // 8
+        ),
+        "output": torch.randn(
+            BATCH_SIZE, 4, args.height // 8, args.width // 8
+        ),
+        "sigma": torch.tensor(1).to(torch.float32),
+        "dt": torch.tensor(1).to(torch.float32),
+    },
+}
+

 class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
    @register_to_config
@@ -38,22 +55,6 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
        )

    def compile(self):
-        SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"
-        BATCH_SIZE = args.batch_size
-
-        model_input = {
-            "euler": {
-                "latent": torch.randn(
-                    BATCH_SIZE, 4, args.height // 8, args.width // 8
-                ),
-                "output": torch.randn(
-                    BATCH_SIZE, 4, args.height // 8, args.width // 8
-                ),
-                "sigma": torch.tensor(1).to(torch.float32),
-                "dt": torch.tensor(1).to(torch.float32),
-            },
-        }
-
        example_latent = model_input["euler"]["latent"]
        example_output = model_input["euler"]["output"]
        if args.precision == "fp16":
--- a/shark/examples/shark_inference/stable_diffusion/sd_annotation.py
+++ b/shark/examples/shark_inference/stable_diffusion/sd_annotation.py
@@ -7,26 +7,17 @@ from shark.shark_downloader import (
    WORKDIR,
 )
 from shark.parser import shark_args
-from apps.stable_diffusion.src.utils.stable_args import args
+from stable_args import args


-def get_device():
-    device = (
-        args.device
-        if "://" not in args.device
-        else args.device.split("://")[0]
-    )
-    return device
+device = (
+    args.device if "://" not in args.device else args.device.split("://")[0]
+)


 # Download the model (Unet or VAE fp16) from shark_tank
 def load_model_from_tank():
-    from apps.stable_diffusion.src.models import (
-        get_params,
-        get_variant_version,
-    )
-
-    version, variant = get_variant_version(args.hf_model_id)
+    from opt_params import get_params, version, variant

    shark_args.local_tank_cache = args.local_tank_cache
    bucket_key = f"{variant}/untuned"
@@ -49,7 +40,6 @@ def load_model_from_tank():

 # Download the tuned config files from shark_tank
 def load_winograd_configs():
-    device = get_device()
    config_bucket = "gs://shark_tank/sd_tuned/configs/"
    config_name = f"{args.annotation_model}_winograd_{device}.json"
    full_gs_url = config_bucket + config_name
@@ -60,9 +50,7 @@ def load_winograd_configs():


 def load_lower_configs():
-    from apps.stable_diffusion.src.models import get_variant_version
-
-    version, variant = get_variant_version(args.hf_model_id)
+    from opt_params import version, variant

    config_bucket = "gs://shark_tank/sd_tuned/configs/"
    config_version = version
@@ -71,7 +59,6 @@ def load_lower_configs():
        config_version = "v1_4"
    if args.annotation_model == "vae":
        args.max_length = 77
-    device = get_device()
    config_name = f"{args.annotation_model}_{config_version}_{args.precision}_len{args.max_length}_{device}.json"
    full_gs_url = config_bucket + config_name
    lowering_config_dir = f"{WORKDIR}configs/" + config_name
@@ -114,7 +101,6 @@ def annotate_with_lower_configs(

    # Dump IR after padding/img2col/winograd passes
    device_spec_args = ""
-    device = get_device()
    if device == "cuda":
        from shark.iree_utils.gpu_utils import get_iree_gpu_args

@@ -166,7 +152,6 @@ def annotate_with_lower_configs(


 def sd_model_annotation(mlir_model, model_name, model_from_tank=False):
-    device = get_device()
    if args.annotation_model == "unet" and device == "vulkan":
        use_winograd = True
        winograd_config_dir = load_winograd_configs()
--- a/shark/examples/shark_inference/stable_diffusion/shark_sd_cli.spec
+++ b/shark/examples/shark_inference/stable_diffusion/shark_sd_cli.spec
@@ -19,17 +19,14 @@ datas += copy_metadata('torchvision')
 datas += copy_metadata('torch-mlir')
 datas += copy_metadata('diffusers')
 datas += copy_metadata('transformers')
-datas += copy_metadata('omegaconf')
-datas += copy_metadata('safetensors')
-datas += collect_data_files('gradio')
 datas += collect_data_files('iree')
 datas += collect_data_files('google-cloud-storage')
 datas += collect_data_files('shark')
 datas += [
-         ( 'src/utils/resources/prompts.json', 'resources' ),
-         ( 'src/utils/resources/model_db.json', 'resources' ),
-         ( 'src/utils/resources/opt_flags.json', 'resources' ),
-         ( 'src/utils/resources/base_model.json', 'resources' ),
+         ( 'resources/prompts.json', 'resources'),
+         ( 'resources/model_db.json', 'resources'),
+         ( 'resources/base_model.json', 'resources'),
+         ( 'resources/opt_flags.json', 'resources'),
         ]

 binaries = []
@@ -38,11 +35,11 @@ block_cipher = None


 a = Analysis(
-    ['scripts/txt2img.py'],
+    ['main.py'],
    pathex=['.'],
    binaries=binaries,
    datas=datas,
-    hiddenimports=['shark', 'shark.*', 'shark.shark_inference', 'shark_inference', 'iree.tools.core', 'gradio', 'apps'],
+    hiddenimports=['shark', 'shark.*', 'shark.shark_inference', 'shark_inference', 'iree.tools.core' ],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
--- a/shark/examples/shark_inference/stable_diffusion/stable_args.py
+++ b/shark/examples/shark_inference/stable_diffusion/stable_args.py
@@ -1,3 +1,4 @@
+import os
 import argparse
 from pathlib import Path

@@ -6,6 +7,13 @@ def path_expand(s):
    return Path(s).expanduser().resolve()


+def is_valid_file(arg):
+    if not os.path.exists(arg):
+        return None
+    else:
+        return arg
+
+
 p = argparse.ArgumentParser(
    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
 )
@@ -23,7 +31,7 @@ p.add_argument(
 )

 p.add_argument(
-    "--negative_prompts",
+    "--negative-prompts",
    nargs="+",
    default=[""],
    help="text you don't want to see in the generated image.",
@@ -174,7 +182,12 @@ p.add_argument(
    action=argparse.BooleanOptionalAction,
    help="Enable showing the stack trace when retrying the base model configuration",
 )
-
+p.add_argument(
+    "--beta_models",
+    default=False,
+    type=bool,
+    help="(False/True), use beta model files",
+)
 ##############################################################################
 ### IREE - Vulkan supported flags
 ##############################################################################
@@ -270,20 +283,6 @@ p.add_argument(
    help="flag to clear all mlir and vmfb from common locations. Recompiling will take several minutes",
 )

-p.add_argument(
-    "--save_metadata_to_json",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="flag for whether or not to save a generation information json file with the image.",
-)
-
-p.add_argument(
-    "--write_metadata_to_png",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="flag for whether or not to save generation information in PNG chunk text to generated images.",
-)
-
 ##############################################################################
 ### Web UI flags
 ##############################################################################
@@ -295,28 +294,6 @@ p.add_argument(
    help="flag for removing the pregress bar animation during image generation",
 )

-p.add_argument(
-    "--ckpt_dir",
-    type=str,
-    default="",
-    help="Path to directory where all .ckpts are stored in order to populate them in the web UI",
-)
-
-
-p.add_argument(
-    "--share",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="flag for generating a public URL",
-)
-
-p.add_argument(
-    "--server_port",
-    type=int,
-    default=8080,
-    help="flag for setting server port",
-)
-
 ##############################################################################
 ### SD model auto-annotation flags
 ##############################################################################
@@ -341,5 +318,41 @@ p.add_argument(
    action=argparse.BooleanOptionalAction,
    help="Apply Winograd on selected conv ops.",
 )
+##############################################################################
+### CI generation tags
+##############################################################################

-args, unknown = p.parse_known_args()
+# TODO: remove from here once argparse is not required by half of sd, none of these are relevant to main.py
+
+p.add_argument(
+    "--ci_tank_dir",
+    default=True,
+    type=bool,
+    help="used for CI generation purposes only.",
+)
+p.add_argument(
+    "--upload",
+    default=False,
+    type=bool,
+    help="upload generated models to shark tank (builder only), irrelevant to main.py",
+)
+p.add_argument(
+    "--torch_model_csv",
+    type=lambda x: is_valid_file(x),
+    default="./tank/torch_model_list.csv",
+    help="""Contains the file with torch_model name and args.
+		 Please see: https://github.com/nod-ai/SHARK/blob/main/tank/torch_model_list.csv""",
+)
+p.add_argument(
+    "--tf_model_csv",
+    type=lambda x: is_valid_file(x),
+    default="./tank/tf_model_list.csv",
+    help="Contains the file with tf model name and args.",
+)
+p.add_argument(
+    "--tflite_model_csv",
+    type=lambda x: is_valid_file(x),
+    default="./tank/tflite/tflite_model_list.csv",
+    help="Contains the file with tf model name and args.",
+)
+args = p.parse_args()
--- a/shark/examples/shark_inference/stable_diffusion/stable_diffusion_amd.md
+++ b/shark/examples/shark_inference/stable_diffusion/stable_diffusion_amd.md
@@ -1,6 +1,6 @@
 # Stable Diffusion optimized for AMD RDNA2/RDNA3 GPUs

-Before you start, please be aware that this is beta software that relies on a special AMD driver. Like all StableDiffusion GUIs published so far, you need some technical expertise to set it up. We apologize in advance if you bump into issues. If that happens, please don't hesitate to ask our Discord community for help! Please be assured that we (Nod and AMD) are working hard to improve the user experience in coming months.
+Before you start, please be aware that this is beta software that relies on a special AMD driver. Like all StableDiffusion GUIs published so far, you need some technical expertise to set it up. We apologize in advance if you bump into issues. If that happens, please don't hesitate to ask our Discord community for help! If you still can't get it to work, we're sorry, and please be assured that we (Nod and AMD) are working hard to improve the user experience in coming months.
 If it works well for you, please "star" the following GitHub projects... this is one of the best ways to help and spread the word!

 * https://github.com/nod-ai/SHARK
@@ -23,10 +23,10 @@ KNOWN ISSUES with this special AMD driver:

 ## Installation

-Download the latest Windows SHARK SD binary [492 here](https://github.com/nod-ai/SHARK/releases/download/20230203.492/shark_sd_20230203_492.exe) in a folder of your choice. If you want nighly builds, you can look for them on the GitHub releases page.
+Download the latest Windows SHARK SD binary [469 here](https://github.com/nod-ai/SHARK/releases/download/20230124.469/shark_sd_20230124_469.exe) in a folder of your choice. If you want nighly builds, you can look for them on the GitHub releases page.

 Notes:
-* We recommend that you download this EXE in a new folder, whenever you download a new EXE version. If you download it in the same folder as a previous install, you must delete the old `*.vmfb` files. Those contain Vulkan dispatches compiled from MLIR which can be outdated if you run a new EXE from the same folder. You can use `--clear_all` flag once to clean all the old files. 
+* We recommend that you download this EXE in a new folder, whenever you download a new EXE version. If you download it in the same folder as a previous install, you must delete the old `*.vmfb` files. Those contain Vulkan dispatches compiled from MLIR which can be outdated if you run a new EXE from the same folder. You can use `--clean_all` flag once to clean all the old files. 
 * If you recently updated the driver or this binary (EXE file), we recommend you:
  * clear all the local artifacts with `--clear_all` OR 
  * clear the Vulkan shader cache: For Windows users this can be done by clearing the contents of `C:\Users\%username%\AppData\Local\AMD\VkCache\`. On Linux the same cache is typically located at `~/.cache/AMD/VkCache/`.
@@ -56,6 +56,84 @@ Here are some samples generated:
 ![a photo of a crab playing a trumpet](https://user-images.githubusercontent.com/74956/204933258-252e7240-8548-45f7-8253-97647d38313d.jpg)


+<details>
+  <summary>Advanced Installation </summary>
+
+
+## Setup your Python Virtual Environment and Dependencies
+<details>
+ <summary> Windows 10/11 Users </summary>
+
+* Install the latest Python 3.10.x version from [here](https://www.python.org/downloads/windows/)
+
+* Install Git for Windows from [here](https://git-scm.com/download/win)
+
+#### Allow the install script to run in Powershell
+```powershell
+set-executionpolicy remotesigned 
+```
+
+#### Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...)
+```powershell
+git clone https://github.com/nod-ai/SHARK.git
+cd SHARK
+./setup_venv.ps1 #You can re-run this script to get the latest version
+```
+</details> 
+
+ <details>
+  <summary>Linux</summary>
+
+```shell
+git clone https://github.com/nod-ai/SHARK.git
+cd SHARK
+./setup_venv.sh
+source shark.venv/bin/activate
+```
+ </details>
+
+### Run Stable Diffusion on your device - WebUI
+
+<details>
+ <summary>Windows 10/11 Users</summary>
+ 
+```powershell
+(shark.venv) PS C:\Users\nod\SHARK> cd web
+(shark.venv) PS C:\Users\nod\SHARK\web> python index.py
+```
+ 
+ </details>
+ 
+<details>
+ <summary>Linux Users</summary>
+ 
+```shell
+(shark.venv) > cd web
+(shark.venv) > python index.py
+```
+ 
+</details>
+
+### Run Stable Diffusion on your device - Commandline
+
+<details>
+ <summary>Windows 10/11 Users</summary>
+ 
+```powershell
+(shark.venv) PS C:\g\shark> python .\shark\examples\shark_inference\stable_diffusion\main.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
+```
+ 
+  </details>
+
+<details>
+ <summary>Linux</summary>
+ 
+```shell
+python3.10 shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
+```
+ 
+  </details>
+
 The output on a 7900XTX would like:

 ```shell 
@@ -67,4 +145,10 @@ VAE Inference time (ms): 78.590
 Total image generation time: 2.5788655281066895sec
 ```

+For more options to the Stable Diffusion model read [this](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md)
+ 
+</details>
+  <details>
+  <summary>Discord link</summary>
 Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware. 
+</details>
--- a/shark/examples/shark_inference/stable_diffusion/stable_diffusion_telegram_bot.md
+++ b/shark/examples/shark_inference/stable_diffusion/stable_diffusion_telegram_bot.md
--- a/shark/examples/shark_inference/stable_diffusion/utils.py
+++ b/shark/examples/shark_inference/stable_diffusion/utils.py
@@ -1,20 +1,18 @@
 import os
 import gc
-from pathlib import Path
+import tempfile
+import torch
 from shark.shark_inference import SharkInference
+from shark.examples.shark_inference.stable_diffusion.stable_args import args
 from shark.shark_importer import import_with_fx
 from shark.iree_utils.vulkan_utils import (
    set_iree_vulkan_runtime_flags,
    get_vulkan_target_triple,
 )
 from shark.iree_utils.gpu_utils import get_cuda_sm_cc
-from apps.stable_diffusion.src.utils.stable_args import args
-from apps.stable_diffusion.src.utils.resources import opt_flags
-from apps.stable_diffusion.src.utils.sd_annotation import sd_model_annotation
-import sys, functools, operator
-from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
-    load_pipeline_from_original_stable_diffusion_ckpt,
-)
+from resources import opt_flags
+from sd_annotation import sd_model_annotation
+import sys


 def get_vmfb_path_name(model_name):
@@ -82,7 +80,11 @@ def compile_through_fx(
    f16_input_mask=None,
    use_tuned=False,
    extra_args=[],
+    save_dir=tempfile.gettempdir(),
+    debug=False,
+    generate_vmfb=True,
 ):
+
    from shark.parser import shark_args

    if "cuda" in args.device:
@@ -93,6 +95,7 @@ def compile_through_fx(
    )

    if use_tuned:
+        model_name = model_name + "_tuned"
        tuned_model_path = f"{args.annotation_output}/{model_name}_torch.mlir"
        if not os.path.exists(tuned_model_path):
            if "vae" in model_name.split("_")[0]:
@@ -108,15 +111,29 @@ def compile_through_fx(
            mlir_module = f.read()
            f.close()

-    shark_module = SharkInference(
-        mlir_module,
-        device=args.device,
-        mlir_dialect="linalg",
+    save_dir = os.path.join(args.local_tank_cache, model_name)
+
+    mlir_module, func_name, = import_with_fx(
+        model=model,
+        inputs=inputs,
+        is_f16=is_f16,
+        f16_input_mask=f16_input_mask,
+        debug=debug,
+        model_name=model_name,
+        save_dir=save_dir,
    )
-    return _compile_module(shark_module, model_name, extra_args)
+    if generate_vmfb:
+        shark_module = SharkInference(
+            mlir_module,
+            device=args.device,
+            mlir_dialect="linalg",
+        )
+
+        return _compile_module(shark_module, model_name, extra_args)


 def set_iree_runtime_flags():
+
    vulkan_runtime_flags = [
        f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}",
        f"--vulkan_validation_layers={'true' if args.vulkan_validation_layers else 'false'}",
@@ -234,8 +251,8 @@ def set_init_device_flags():

    # Use tuned models in the case of fp16, vulkan rdna3 or cuda sm devices.
    if (
-        args.hf_model_id == "prompthero/openjourney"
-        or args.ckpt_loc != ""
+        args.hf_model_id
+        in ["prompthero/openjourney", "dreamlike-art/dreamlike-diffusion-1.0"]
        or args.precision != "fp16"
        or args.height != 512
        or args.width != 512
@@ -254,6 +271,7 @@ def set_init_device_flags():
        "sm_80",
        "sm_84",
        "sm_86",
+        "sm_89",
    ]:
        args.use_tuned = False

@@ -263,29 +281,28 @@ def set_init_device_flags():
    ]:
        args.use_tuned = False

+    # Use tuned model in the case of stablediffusion/fp16 and cuda device sm_80
+    if (
+        args.hf_model_id
+        in [
+            "stabilityai/stable-diffusion-2-1-base",
+            "Linaqruf/anything-v3.0",
+            "wavymulder/Analog-Diffusion",
+        ]
+        and args.precision == "fp16"
+        and "cuda" in args.device
+        and get_cuda_sm_cc() in ["sm_80", "sm_89"]
+        and args.use_tuned  # required to avoid always forcing true on these cards
+    ):
+        args.use_tuned = True
+    else:
+        args.use_tuned = False
+
    if args.use_tuned:
-        print(f"Using tuned models for {args.hf_model_id}/fp16/{args.device}.")
+        print(f"Using {args.device} tuned models for stablediffusion/fp16.")
    else:
        print("Tuned models are currently not supported for this setting.")

-    # set import_mlir to True for unuploaded models.
-    if args.ckpt_loc != "":
-        args.import_mlir = True
-
-    elif args.hf_model_id not in [
-        "Linaqruf/anything-v3.0",
-        "dreamlike-art/dreamlike-diffusion-1.0",
-        "prompthero/openjourney",
-        "wavymulder/Analog-Diffusion",
-        "stabilityai/stable-diffusion-2-1",
-        "stabilityai/stable-diffusion-2-1-base",
-        "CompVis/stable-diffusion-v1-4",
-    ]:
-        args.import_mlir = True
-
-    elif args.height != 512 or args.width != 512 or args.batch_size != 1:
-        args.import_mlir = True
-

 # Utility to get list of devices available.
 def get_available_devices():
@@ -301,7 +318,7 @@ def get_available_devices():
            print(f"{driver_name} devices are not available.")
        else:
            for i, device in enumerate(device_list_dict):
-                device_list.append(f"{device['name']} => {driver_name}://{i}")
+                device_list.append(f"{driver_name}://{i} => {device['name']}")
        return device_list

    set_iree_runtime_flags()
@@ -360,74 +377,40 @@ def get_opt_flags(model, precision="fp16"):
    return iree_flags


-def get_path_to_diffusers_checkpoint(custom_weights):
-    path = Path(custom_weights)
+def preprocessCKPT():
+    from pathlib import Path
+
+    path = Path(args.ckpt_loc)
    diffusers_path = path.parent.absolute()
    diffusers_directory_name = path.stem
    complete_path_to_diffusers = diffusers_path / diffusers_directory_name
    complete_path_to_diffusers.mkdir(parents=True, exist_ok=True)
-    path_to_diffusers = complete_path_to_diffusers.as_posix()
-    return path_to_diffusers
-
-
-def preprocessCKPT(custom_weights):
-    path_to_diffusers = get_path_to_diffusers_checkpoint(custom_weights)
-    if next(Path(path_to_diffusers).iterdir(), None):
-        print("Checkpoint already loaded at : ", path_to_diffusers)
-        return
-    else:
-        print(
-            "Diffusers' checkpoint will be identified here : ",
-            path_to_diffusers,
-        )
-    from_safetensors = (
-        True if custom_weights.lower().endswith(".safetensors") else False
-    )
-    # EMA weights usually yield higher quality images for inference but non-EMA weights have
-    # been yielding better results in our case.
-    # TODO: Add an option `--ema` (`--no-ema`) for users to specify if they want to go for EMA
-    #       weight extraction or not.
-    extract_ema = False
    print(
-        "Loading diffusers' pipeline from original stable diffusion checkpoint"
+        "Created directory : ",
+        diffusers_directory_name,
+        " at -> ",
+        diffusers_path,
    )
-    pipe = load_pipeline_from_original_stable_diffusion_ckpt(
-        checkpoint_path=custom_weights,
-        extract_ema=extract_ema,
-        from_safetensors=from_safetensors,
-    )
-    pipe.save_pretrained(path_to_diffusers)
-    print("Loading complete")
+    path_to_diffusers = complete_path_to_diffusers.as_posix()
+    # TODO: Use the SD to Diffusers CKPT pipeline once it's included in the release.
+    sd_to_diffusers = os.path.join(os.getcwd(), "sd_to_diffusers.py")
+    if not os.path.isfile(sd_to_diffusers):
+        url = "https://raw.githubusercontent.com/huggingface/diffusers/8a3f0c1f7178f4a3d5a5b21ae8c2906f473e240d/scripts/convert_original_stable_diffusion_to_diffusers.py"
+        import requests

-
-def load_vmfb(vmfb_path, model, precision):
-    model = "vae" if "base_vae" in model else model
-    precision = "fp32" if "clip" in model else precision
-    extra_args = get_opt_flags(model, precision)
-    shark_module = SharkInference(mlir_module=None, device=args.device)
-    shark_module.load_module(vmfb_path, extra_args=extra_args)
-    return shark_module
-
-
-# This utility returns vmfbs of Clip, Unet and Vae, in case all three of them
-# are present; deletes them otherwise.
-def fetch_or_delete_vmfbs(basic_model_name, use_base_vae, precision="fp32"):
-    model_name = ["clip", "unet", "base_vae" if use_base_vae else "vae"]
-    vmfb_path = [
-        get_vmfb_path_name(model + basic_model_name)[0] for model in model_name
-    ]
-    vmfb_present = [os.path.isfile(vmfb) for vmfb in vmfb_path]
-    all_vmfb_present = functools.reduce(operator.__and__, vmfb_present)
-    compiled_models = [None] * 3
-    # We need to delete vmfbs only if some of the models were compiled.
-    if not all_vmfb_present:
-        for i in range(len(vmfb_path)):
-            if vmfb_present[i]:
-                os.remove(vmfb_path[i])
-                print("Deleted: ", vmfb_path[i])
+        req = requests.get(url)
+        open(sd_to_diffusers, "wb").write(req.content)
+        print("Downloaded SD to Diffusers converter")
    else:
-        for i in range(len(vmfb_path)):
-            compiled_models[i] = load_vmfb(
-                vmfb_path[i], model_name[i], precision
-            )
-    return compiled_models
+        print("SD to Diffusers converter already exists")
+
+    os.system(
+        "python "
+        + sd_to_diffusers
+        + " --checkpoint_path="
+        + args.ckpt_loc
+        + " --dump_path="
+        + path_to_diffusers
+    )
+    args.ckpt_loc = path_to_diffusers
+    print("Custom model path is : ", args.ckpt_loc)
--- a/shark/examples/shark_inference/upscaler/model_wrappers.py
+++ b/shark/examples/shark_inference/upscaler/model_wrappers.py
@@ -18,6 +18,7 @@ model_input = {


 def get_clip_mlir(model_name="clip_text", extra_args=[]):
+
    text_encoder = CLIPTextModel.from_pretrained(
        model_id,
        subfolder="text_encoder",
--- a/shark/examples/shark_inference/upscaler/pipeline_shark_stable_diffusion_upscale.py
+++ b/shark/examples/shark_inference/upscaler/pipeline_shark_stable_diffusion_upscale.py
@@ -339,6 +339,7 @@ class SharkStableDiffusionUpscalePipeline:
        ] = None,
        callback_steps: Optional[int] = 1,
    ):
+
        # 1. Check inputs
        self.check_inputs(prompt, image, noise_level, callback_steps)

--- a/shark/examples/shark_inference/upscaler/utils.py
+++ b/shark/examples/shark_inference/upscaler/utils.py
@@ -62,6 +62,7 @@ def get_shark_model(tank_url, model_name, extra_args=[]):
 def compile_through_fx(
    model, inputs, model_name, is_f16=False, f16_input_mask=None, extra_args=[]
 ):
+
    mlir_module, func_name = import_with_fx(
        model, inputs, is_f16, f16_input_mask
    )
@@ -75,6 +76,7 @@ def compile_through_fx(


 def set_iree_runtime_flags():
+
    vulkan_runtime_flags = [
        f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}",
        f"--vulkan_validation_layers={'true' if args.vulkan_validation_layers else 'false'}",
--- a/shark/examples/shark_training/stable-diffusion-img2img/stable_diffusion_img2img.py
+++ b/shark/examples/shark_training/stable-diffusion-img2img/stable_diffusion_img2img.py
@@ -169,7 +169,6 @@ imagenet_style_templates_small = [
    "a large painting in the style of {}",
 ]

-
 # Setup the dataset
 class TextualInversionDataset(Dataset):
    def __init__(
@@ -185,6 +184,7 @@ class TextualInversionDataset(Dataset):
        placeholder_token="*",
        center_crop=False,
    ):
+
        self.data_root = data_root
        self.tokenizer = tokenizer
        self.learnable_property = learnable_property
@@ -244,10 +244,7 @@ class TextualInversionDataset(Dataset):

        if self.center_crop:
            crop = min(img.shape[0], img.shape[1])
-            (
-                h,
-                w,
-            ) = (
+            h, w, = (
                img.shape[0],
                img.shape[1],
            )
--- a/shark/iree_utils/_common.py
+++ b/shark/iree_utils/_common.py
@@ -33,9 +33,8 @@ def run_cmd(cmd):
        )
        result_str = result.stdout.decode()
        return result_str
-    except subprocess.CalledProcessError as e:
-        print(e.output)
-        sys.exit(f"Exiting program due to error running {cmd}")
+    except Exception:
+        sys.exit("Exiting program due to error running:", cmd)


 def iree_device_map(device):
--- a/shark/iree_utils/benchmark_utils.py
+++ b/shark/iree_utils/benchmark_utils.py
@@ -18,7 +18,6 @@ from shark.iree_utils.cpu_utils import get_cpu_count
 import numpy as np
 import os
 import re
-import platform

 UNIT_TO_SECOND_MAP = {"us": 1e-6, "ms": 0.001, "s": 1}

@@ -63,16 +62,7 @@ def build_benchmark_args(
    Outputs: string that execute benchmark-module on target model.
    """
    path = benchmark_module.__path__[0]
-    if platform.system() == "Windows":
-        benchmarker_path = os.path.join(
-            path, "..", "..", "iree-benchmark-module.exe"
-        )
-        time_extractor = None
-    else:
-        benchmarker_path = os.path.join(
-            path, "..", "..", "iree-benchmark-module"
-        )
-        time_extractor = "| awk 'END{{print $2 $3}}'"
+    benchmarker_path = os.path.join(path, "..", "..", "iree-benchmark-module")
    benchmark_cl = [benchmarker_path, f"--module_file={input_file}"]
    # TODO: The function named can be passed as one of the args.
    fn_name = "forward"
@@ -88,8 +78,8 @@ def build_benchmark_args(
        num_cpus = get_cpu_count()
        if num_cpus is not None:
            benchmark_cl.append(f"--task_topology_max_group_count={num_cpus}")
-    # if time_extractor:
-    #    benchmark_cl.append(time_extractor)
+    time_extractor = "| awk 'END{{print $2 $3}}'"
+    benchmark_cl.append(time_extractor)
    return benchmark_cl


@@ -106,14 +96,7 @@ def build_benchmark_args_non_tensor_input(
    Outputs: string that execute benchmark-module on target model.
    """
    path = benchmark_module.__path__[0]
-    if platform.system() == "Windows":
-        benchmarker_path = os.path.join(
-            path, "..", "..", "iree-benchmark-module.exe"
-        )
-    else:
-        benchmarker_path = os.path.join(
-            path, "..", "..", "iree-benchmark-module"
-        )
+    benchmarker_path = os.path.join(path, "..", "..", "iree-benchmark-module")
    benchmark_cl = [benchmarker_path, f"--module_file={input_file}"]
    # TODO: The function named can be passed as one of the args.
    if function_name:
@@ -121,9 +104,8 @@ def build_benchmark_args_non_tensor_input(
    benchmark_cl.append(f"--device={iree_device_map(device)}")
    for input in inputs:
        benchmark_cl.append(f"--function_input={input}")
-    if platform.system() != "Windows":
-        time_extractor = "| awk 'END{{print $2 $3}}'"
-        benchmark_cl.append(time_extractor)
+    time_extractor = "| awk 'END{{print $2 $3}}'"
+    benchmark_cl.append(time_extractor)
    return benchmark_cl


@@ -139,9 +121,8 @@ def run_benchmark_module(benchmark_cl):
        benchmark_path
    ), "Cannot find benchmark_module, Please contact SHARK maintainer on discord."
    bench_result = run_cmd(" ".join(benchmark_cl))
-    print(bench_result)
-    regex_split = re.compile("(\d+[.]*\d*)(  *)([a-zA-Z]+)")
-    match = regex_split.search(bench_result)
+    regex_split = re.compile("([0-9]+[.]*[0-9]*)([a-zA-Z]+)")
+    match = regex_split.match(bench_result)
    time = float(match.group(1))
-    unit = match.group(3)
-    return 1.0 / (time * 0.001)
+    unit = match.group(2)
+    return 1.0 / (time * UNIT_TO_SECOND_MAP[unit])
--- a/shark/iree_utils/compile_utils.py
+++ b/shark/iree_utils/compile_utils.py
@@ -81,10 +81,6 @@ def get_model_specific_args():
    ms_args = []
    if shark_args.enable_conv_transform == True:
        ms_args += ["--iree-flow-enable-conv-nchw-to-nhwc-transform"]
-    if shark_args.enable_img2col_transform == True:
-        ms_args += ["--iree-flow-enable-conv-img2col-transform"]
-    if shark_args.use_winograd == True:
-        ms_args += ["--iree-flow-enable-conv-winograd-transform"]
    return ms_args


@@ -147,6 +143,7 @@ def compile_benchmark_dirs(bench_dir, device, dispatch_benchmarks):
                    in_dispatches = True
            if all_dispatches or in_dispatches:
                for f_ in os.listdir(f"{bench_dir}/{d_}"):
+
                    if "benchmark.mlir" in f_:
                        dispatch_file = open(f"{bench_dir}/{d_}/{f_}", "r")
                        module = dispatch_file.read()
@@ -279,19 +276,9 @@ def compile_module_to_flatbuffer(
    return flatbuffer_blob


-def get_iree_module(flatbuffer_blob, device, device_idx=None):
+def get_iree_module(flatbuffer_blob, device):
    # Returns the compiled module and the configs.
-    if device_idx is not None:
-        device = iree_device_map(device)
-        print("registering device id: ", device_idx)
-        haldriver = ireert.get_driver(device)
-
-        haldevice = haldriver.create_device(
-            haldriver.query_available_devices()[device_idx]["device_id"]
-        )
-        config = ireert.Config(device=haldevice)
-    else:
-        config = get_iree_runtime_config(device)
+    config = get_iree_runtime_config(device)
    vm_module = ireert.VmModule.from_flatbuffer(
        config.vm_instance, flatbuffer_blob
    )
@@ -307,20 +294,20 @@ def get_iree_compiled_module(
    frontend: str = "torch",
    model_config_path: str = None,
    extra_args: list = [],
-    device_idx: int = None,
 ):
    """Given a module returns the compiled .vmfb and configs"""
    flatbuffer_blob = compile_module_to_flatbuffer(
        module, device, frontend, model_config_path, extra_args
    )
-    return get_iree_module(flatbuffer_blob, device, device_idx=device_idx)
+    return get_iree_module(flatbuffer_blob, device)


-def load_flatbuffer(flatbuffer_path: str, device: str, device_idx: int = None):
+def load_flatbuffer(flatbuffer_path: str, device: str):
+
    with open(os.path.join(flatbuffer_path), "rb") as f:
        flatbuffer_blob = f.read()

-    return get_iree_module(flatbuffer_blob, device, device_idx=device_idx)
+    return get_iree_module(flatbuffer_blob, device)


 def export_iree_module_to_vmfb(
--- a/shark/iree_utils/gpu_utils.py
+++ b/shark/iree_utils/gpu_utils.py
@@ -18,7 +18,6 @@ import iree.runtime as ireert
 import ctypes
 from shark.parser import shark_args

-
 # Get the default gpu args given the architecture.
 def get_iree_gpu_args():
    ireert.flags.FUNCTION_INPUT_VALIDATION = False
@@ -40,17 +39,8 @@ def get_iree_gpu_args():
 # Get the default gpu args given the architecture.
 def get_iree_rocm_args():
    ireert.flags.FUNCTION_INPUT_VALIDATION = False
-    # get arch from rocminfo.
-    import re
-    import subprocess
-
-    rocm_arch = re.match(
-        r".*(gfx\w+)",
-        subprocess.check_output(
-            "rocminfo | grep -i 'gfx'", shell=True, text=True
-        ),
-    ).group(1)
-    print(f"Found rocm arch {rocm_arch}...")
+    # TODO: find a way to get arch from code.
+    rocm_arch = "gfx908"
    return [
        f"--iree-rocm-target-chip={rocm_arch}",
        "--iree-rocm-link-bc=true",
--- a/shark/iree_utils/vulkan_target_env_utils.py
+++ b/shark/iree_utils/vulkan_target_env_utils.py
@@ -16,6 +16,7 @@ from collections import OrderedDict


 def get_vulkan_target_env(vulkan_target_triple):
+
    arch, product, os = vulkan_target_triple.split("=")[1].split("-")
    triple = (arch, product, os)
    # get version
@@ -36,6 +37,7 @@ def get_vulkan_target_env(vulkan_target_triple):


 def get_vulkan_target_env_flag(vulkan_target_triple):
+
    target_env = get_vulkan_target_env(vulkan_target_triple)
    target_env_flag = f"--iree-vulkan-target-env={target_env}"
    return target_env_flag
@@ -122,6 +124,7 @@ def get_extensions(triple):


 def get_vendor(triple):
+
    arch, product, os = triple
    if arch == "unknown":
        return "Unknown"
@@ -203,6 +206,7 @@ def get_vulkan_target_capabilities(triple):
    cap["coopmatCases"] = None

    if arch in ["rdna1", "rdna2", "rdna3"]:
+
        cap["maxComputeSharedMemorySize"] = 65536
        cap["maxComputeWorkGroupInvocations"] = 1024
        cap["maxComputeWorkGroupSize"] = [1024, 1024, 1024]
@@ -283,6 +287,7 @@ def get_vulkan_target_capabilities(triple):
        cap["variablePointersStorageBuffer"] = True

    elif arch == "m1":
+
        cap["maxComputeSharedMemorySize"] = 32768
        cap["maxComputeWorkGroupInvocations"] = 1024
        cap["maxComputeWorkGroupSize"] = [1024, 1024, 1024]
@@ -357,6 +362,7 @@ def get_vulkan_target_capabilities(triple):
            ]

    elif arch in ["ampere", "turing"]:
+
        cap["maxComputeSharedMemorySize"] = 49152
        cap["maxComputeWorkGroupInvocations"] = 1024
        cap["maxComputeWorkGroupSize"] = [1024, 1024, 1024]
@@ -396,6 +402,7 @@ def get_vulkan_target_capabilities(triple):
        ]

    elif arch == "adreno":
+
        cap["maxComputeSharedMemorySize"] = 32768
        cap["maxComputeWorkGroupInvocations"] = 1024
        cap["maxComputeWorkGroupSize"] = [1024, 1024, 64]
@@ -440,6 +447,7 @@ def get_vulkan_target_capabilities(triple):

    res = ""
    for k, v in cap.items():
+
        if v is None or v == False:
            continue
        if isinstance(v, bool):
--- a/shark/parser.py
+++ b/shark/parser.py
@@ -44,7 +44,7 @@ parser.add_argument(
    "--repro_dir",
    help="Directory to which module files will be saved for reproduction or debugging.",
    type=dir_path,
-    default="shark_tmp",
+    default="./shark_tmp",
 )
 parser.add_argument(
    "--enable_tf32",
@@ -89,7 +89,7 @@ parser.add_argument(
 )
 parser.add_argument(
    "--local_tank_cache",
-    default=None,
+    default="",
    help="Specify where to save downloaded shark_tank artifacts. If this is not set, the default is ~/.local/shark_tank/.",
 )

@@ -112,18 +112,4 @@ parser.add_argument(
    help="Enables the --iree-flow-enable-conv-nchw-to-nhwc-transform flag.",
 )

-parser.add_argument(
-    "--enable_img2col_transform",
-    default=False,
-    action="store_true",
-    help="Enables the --iree-flow-enable-conv-img2col-transform flag.",
-)
-
-parser.add_argument(
-    "--use_winograd",
-    default=False,
-    action="store_true",
-    help="Enables the --iree-flow-enable-conv-winograd-transform flag.",
-)
-
 shark_args, unknown = parser.parse_known_args()
--- a/shark/shark_benchmark_runner.py
+++ b/shark/shark_benchmark_runner.py
@@ -23,6 +23,8 @@ from datetime import datetime
 import time
 import csv
 import os
+import torch
+import torch._dynamo as dynamo


 class OnnxFusionOptions(object):
@@ -104,7 +106,6 @@ class SharkBenchmarkRunner(SharkRunner):

    def benchmark_torch(self, modelname):
        import torch
-        import torch._dynamo as dynamo
        from tank.model_utils import get_torch_model

        if self.device == "cuda":
@@ -118,7 +119,7 @@ class SharkBenchmarkRunner(SharkRunner):
        )
        HFmodel, input = get_torch_model(modelname)[:2]
        frontend_model = HFmodel.model
-        # frontend_model = dynamo.optimize("inductor")(frontend_model)
+        frontend_model = dynamo.optimize("inductor")(frontend_model)
        frontend_model.to(torch_device)
        input.to(torch_device)

@@ -157,10 +158,7 @@ class SharkBenchmarkRunner(SharkRunner):
        # tf_device = "/GPU:0" if self.device == "cuda" else "/CPU:0"
        tf_device = "/CPU:0"
        with tf.device(tf_device):
-            (
-                model,
-                input,
-            ) = get_tf_model(
+            model, input, = get_tf_model(
                modelname
            )[:2]
            frontend_model = model
@@ -280,8 +278,7 @@ for currently supported models. Exiting benchmark ONNX."
            ]

    def get_metadata(self, modelname):
-        metadata_path = os.path.join(".", "tank", "model_metadata.csv")
-        with open(metadata_path, mode="r") as csvfile:
+        with open("./tank/model_metadata.csv", mode="r") as csvfile:
            torch_reader = csv.reader(csvfile, delimiter=",")
            fields = next(torch_reader)
            for row in torch_reader:
--- a/shark/shark_downloader.py
+++ b/shark/shark_downloader.py
@@ -34,6 +34,7 @@ def download_public_file(
    dest_filename = None
    desired_file = None
    if single_file:
+
        desired_file = full_gs_url.split("/")[-1]
        source_blob_name = "/".join(full_gs_url.split("/")[3:-1])
        destination_folder_name, dest_filename = os.path.split(
@@ -79,17 +80,13 @@ input_type_to_np_dtype = {
 # Save the model in the home local so it needn't be fetched everytime in the CI.
 home = str(Path.home())
 alt_path = os.path.join(os.path.dirname(__file__), "../gen_shark_tank/")
-custom_path_list = None
-if shark_args.local_tank_cache is not None:
-    custom_path_list = shark_args.local_tank_cache.split("/")
-
+custom_path = shark_args.local_tank_cache
 if os.path.exists(alt_path):
    WORKDIR = alt_path
    print(
        f"Using {WORKDIR} as shark_tank directory. Delete this directory if you aren't working from locally generated shark_tank."
    )
-if custom_path_list:
-    custom_path = os.path.join(*custom_path_list)
+if custom_path:
    if not os.path.exists(custom_path):
        os.mkdir(custom_path)

--- a/shark/shark_importer.py
+++ b/shark/shark_importer.py
@@ -257,6 +257,7 @@ class SharkImporter:


 def get_f16_inputs(inputs, is_f16, f16_input_mask):
+
    if is_f16 == False:
        return inputs
    if f16_input_mask == None:
--- a/shark/shark_inference.py
+++ b/shark/shark_inference.py
@@ -69,13 +69,11 @@ class SharkInference:
        is_benchmark: bool = False,
        dispatch_benchmark: str = None,
        dispatch_benchmark_dir: str = "temp_dispatch_benchmarks",
-        device_idx: int = None,
    ):
        self.mlir_module = mlir_module
        self.device = shark_args.device if device == "none" else device
        self.mlir_dialect = mlir_dialect
        self.is_benchmark = is_benchmark
-        self.device_idx = device_idx
        self.dispatch_benchmarks = (
            shark_args.dispatch_benchmarks
            if dispatch_benchmark is None
@@ -90,6 +88,7 @@ class SharkInference:
        self.shark_runner = None

    def compile(self, extra_args=[]):
+
        if self.dispatch_benchmarks is not None:
            extra_args.append(
                f"--iree-hal-dump-executable-sources-to={self.dispatch_benchmarks_dir}"
@@ -121,7 +120,6 @@ class SharkInference:
                self.device,
                self.mlir_dialect,
                extra_args=extra_args,
-                device_idx=self.device_idx,
            )

        if self.dispatch_benchmarks is not None:
@@ -207,6 +205,5 @@ class SharkInference:
        ) = load_flatbuffer(
            path,
            self.device,
-            self.device_idx,
        )
        return
--- a/shark/shark_runner.py
+++ b/shark/shark_runner.py
@@ -64,13 +64,11 @@ class SharkRunner:
        mlir_dialect: str = "linalg",
        extra_args: list = [],
        compile_vmfb: bool = True,
-        device_idx: int = None,
    ):
        self.mlir_module = mlir_module
        self.device = shark_args.device if device == "none" else device
        self.mlir_dialect = mlir_dialect
        self.extra_args = extra_args
-        self.device_idx = device_idx

        if check_device_drivers(self.device):
            print(device_driver_info(self.device))
@@ -86,7 +84,6 @@ class SharkRunner:
                self.device,
                self.mlir_dialect,
                extra_args=self.extra_args,
-                device_idx=self.device_idx,
            )

    def run(self, function_name, inputs: tuple, send_to_host=False):
--- a/shark/sharkdynamo/utils.py
+++ b/shark/sharkdynamo/utils.py
@@ -9,7 +9,6 @@ from torch._decomp import get_decompositions

 import torch_mlir

-
 # TODO: Control decompositions.
 def default_decompositions():
    return get_decompositions(
--- a/tank/all_models.csv
+++ b/tank/all_models.csv
@@ -1,36 +1,36 @@
-resnet50,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
-albert-base-v2,mhlo,tf,1e-2,1e-2,default,None,False,False,False,"",""
-roberta-base,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
-bert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
-camembert-base,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
-dbmdz/convbert-base-turkish-cased,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,True,True,False,"https://github.com/iree-org/iree/issues/9971",""
-distilbert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
-facebook/convnext-tiny-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,True,True,False,"https://github.com/nod-ai/SHARK/issues/311 & https://github.com/nod-ai/SHARK/issues/342",""
-funnel-transformer/small,mhlo,tf,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/201",""
-google/electra-small-discriminator,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
-google/mobilebert-uncased,mhlo,tf,1e-2,1e-3,default,None,True,False,False,"Fails during iree-compile",""
-google/vit-base-patch16-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,False,False,False,"",""
-microsoft/MiniLM-L12-H384-uncased,mhlo,tf,1e-2,1e-3,tf_hf,None,True,False,False,"Fails during iree-compile.",""
-microsoft/layoutlm-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
-microsoft/mpnet-base,mhlo,tf,1e-2,1e-2,default,None,False,False,False,"",""
-albert-base-v2,linalg,torch,1e-2,1e-3,default,None,True,True,True,"issue with aten.tanh in torch-mlir",""
-alexnet,linalg,torch,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/879",""
-bert-base-cased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
-bert-base-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
-bert-base-uncased_fp16,linalg,torch,1e-1,1e-1,default,None,True,False,True,"",""
-facebook/deit-small-distilled-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"Fails during iree-compile.",""
-google/vit-base-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/311",""
-microsoft/beit-base-patch16-224-pt22k-ft22k,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/390",""
-microsoft/MiniLM-L12-H384-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
-microsoft/resnet-50,linalg,torch,1e-2,1e-3,default,nhcw-nhwc/img2col,False,False,False,"","macos"
-google/mobilebert-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"https://github.com/nod-ai/SHARK/issues/344",""
-mobilenet_v3_small,linalg,torch,1e-1,1e-2,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/388","macos"
-nvidia/mit-b0,linalg,torch,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/343","macos"
-resnet101,linalg,torch,1e-2,1e-3,default,nhcw-nhwc/img2col,False,False,False,"","macos"
-resnet18,linalg,torch,1e-2,1e-3,default,None,True,True,False,"","macos"
-resnet50,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
-resnet50_fp16,linalg,torch,1e-2,1e-2,default,nhcw-nhwc/img2col,True,False,True,"",""
-squeezenet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
-wide_resnet50_2,linalg,torch,1e-2,1e-3,default,nhcw-nhwc/img2col,False,False,False,"","macos"
-efficientnet-v2-s,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
-mnasnet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
+resnet50,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error: mostly conv"
+albert-base-v2,mhlo,tf,1e-2,1e-2,default,None,False,False,False,""
+roberta-base,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,False,""
+bert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
+camembert-base,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
+dbmdz/convbert-base-turkish-cased,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,True,True,True,"https://github.com/iree-org/iree/issues/9971"
+distilbert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
+facebook/convnext-tiny-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,True,True,True,"https://github.com/nod-ai/SHARK/issues/311 & https://github.com/nod-ai/SHARK/issues/342"
+funnel-transformer/small,mhlo,tf,1e-2,1e-3,default,None,True,True,True,"https://github.com/nod-ai/SHARK/issues/201"
+google/electra-small-discriminator,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
+google/mobilebert-uncased,mhlo,tf,1e-2,1e-3,default,None,True,False,False,"Fails during iree-compile."
+google/vit-base-patch16-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
+microsoft/MiniLM-L12-H384-uncased,mhlo,tf,1e-2,1e-3,tf_hf,None,True,False,False,"Fails during iree-compile."
+microsoft/layoutlm-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
+microsoft/mpnet-base,mhlo,tf,1e-2,1e-2,default,None,False,False,False,""
+albert-base-v2,linalg,torch,1e-2,1e-3,default,None,True,True,True,"issue with aten.tanh in torch-mlir"
+alexnet,linalg,torch,1e-2,1e-3,default,None,True,False,True,"https://github.com/nod-ai/SHARK/issues/879"
+bert-base-cased,linalg,torch,1e-2,1e-3,default,None,False,False,False,""
+bert-base-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,""
+bert-base-uncased_fp16,linalg,torch,1e-1,1e-1,default,None,True,False,True,""
+facebook/deit-small-distilled-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"Fails during iree-compile."
+google/vit-base-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/311"
+microsoft/beit-base-patch16-224-pt22k-ft22k,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/390"
+microsoft/MiniLM-L12-H384-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,True,""
+microsoft/resnet-50,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
+google/mobilebert-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,True,"https://github.com/nod-ai/SHARK/issues/344"
+mobilenet_v3_small,linalg,torch,1e-1,1e-2,default,nhcw-nhwc,False,True,True,"https://github.com/nod-ai/SHARK/issues/388"
+nvidia/mit-b0,linalg,torch,1e-2,1e-3,default,None,True,True,True,"https://github.com/nod-ai/SHARK/issues/343"
+resnet101,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
+resnet18,linalg,torch,1e-2,1e-3,default,None,True,True,True,""
+resnet50,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
+resnet50_fp16,linalg,torch,1e-2,1e-2,default,nhcw-nhwc,True,False,True,""
+squeezenet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"https://github.com/nod-ai/SHARK/issues/388"
+wide_resnet50_2,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
+efficientnet-v2-s,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,True,"https://github.com/nod-ai/SHARK/issues/575"
+mnasnet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"https://github.com/nod-ai/SHARK/issues/388"
--- a/tank/examples/opt/hacked_hf_opt.py
+++ b/tank/examples/opt/hacked_hf_opt.py
@@ -338,6 +338,7 @@ class OPTDecoderLayer(nn.Module):
        torch.FloatTensor,
        Optional[Tuple[torch.FloatTensor, torch.FloatTensor]],
    ]:
+
        # TODO: Refactor this function

        residual = hidden_states
@@ -508,6 +509,7 @@ class OPTDecoder(OPTPreTrainedModel):
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
+
        # TODO: Refactor this function

        output_attentions = (
@@ -786,6 +788,7 @@ class OPTForCausalLM(OPTPreTrainedModel):
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
+
        # TODO: Refactor this function

        output_attentions = (
--- a/tank/model_utils.py
+++ b/tank/model_utils.py
@@ -83,10 +83,10 @@ def get_hf_img_cls_model(name):
    # you can use preprocess_input_image to get the test_input or just random value.
    test_input = preprocess_input_image(name)
    # test_input = torch.FloatTensor(1, 3, 224, 224).uniform_(-1, 1)
-    # print("test_input.shape: ", test_input.shape)
+    print("test_input.shape: ", test_input.shape)
    # test_input.shape:  torch.Size([1, 3, 224, 224])
    actual_out = model(test_input)
-    # print("actual_out.shape： ", actual_out.shape)
+    print("actual_out.shape： ", actual_out.shape)
    # actual_out.shape：  torch.Size([1, 1000])
    return model, test_input, actual_out

--- a/tank/test_models.py
+++ b/tank/test_models.py
@@ -43,7 +43,6 @@ def load_csv_and_convert(filename, gen=False):
                    "xfail_cuda": row[8],
                    "xfail_vkm": row[9],
                    "xfail_reason": row[10],
-                    "xfail_other": row[11],
                }
            )
    # This is a pytest workaround
@@ -90,8 +89,6 @@ def get_valid_test_params():
 def is_valid_case(test_params):
    if test_params[0] == True and test_params[2]["framework"] == "tf":
        return False
-    elif "fp16" in test_params[2]["model_name"] and test_params[1] != "cuda":
-        return False
    else:
        return True

@@ -135,18 +132,13 @@ class SharkModuleTester:
        self.config = config

    def create_and_check_module(self, dynamic, device):
+
        shark_args.local_tank_cache = self.local_tank_cache
        shark_args.update_tank = self.update_tank
        if "nhcw-nhwc" in self.config["flags"] and not os.path.isfile(
            ".use-iree"
        ):
            shark_args.enable_conv_transform = True
-        else:
-            shark_args.enable_conv_transform = False
-        if "img2col" in self.config["flags"]:
-            shark_args.enable_img2col_transform = True
-        if "winograd" in self.config["flags"]:
-            shark_args.use_winograd = True

        model, func_name, inputs, golden_out = download_model(
            self.config["model_name"],
@@ -187,9 +179,8 @@ class SharkModuleTester:
            if self.benchmark == True:
                self.benchmark_module(shark_module, inputs, dynamic, device)
                print(msg)
-                pytest.xfail(
-                    reason=f"Numerics Mismatch: Use -s flag to print stderr during pytests."
-                )
+                pytest.xfail(reason="Numerics Issue, awaiting triage.")
+
        if self.benchmark == True:
            self.benchmark_module(shark_module, inputs, dynamic, device)

@@ -213,11 +204,10 @@ class SharkModuleTester:

    def save_reproducers(self):
        # Saves contents of IREE TempFileSaver temporary directory to ./shark_tmp/saved/<test_case>.
-        src = os.path.join(*self.temp_dir.split("/"))
-        saves = os.path.join(".", "shark_tmp", "saved")
-        trg = os.path.join(saves, self.tmp_prefix)
-        if not os.path.isdir(saves):
-            os.mkdir(saves)
+        src = self.temp_dir
+        trg = f"./shark_tmp/saved/{self.tmp_prefix}"
+        if not os.path.isdir("./shark_tmp/saved/"):
+            os.mkdir("./shark_tmp/saved/")
        if not os.path.isdir(trg):
            os.mkdir(trg)
        files = os.listdir(src)
@@ -227,12 +217,7 @@ class SharkModuleTester:
    def upload_repro(self):
        import subprocess

-        src = os.path.join(*self.temp_dir.split("/"))
-        repro_path = os.path.join(
-            ".", "shark_tmp", "saved", self.tmp_prefix, "*"
-        )
-
-        bashCommand = f"gsutil cp -r {repro_path} gs://shark-public/builder/repro_artifacts/{self.ci_sha}/{self.tmp_prefix}/"
+        bashCommand = f"gsutil cp -r ./shark_tmp/saved/{self.tmp_prefix}/* gs://shark-public/builder/repro_artifacts/{self.ci_sha}/{self.tmp_prefix}/"
        process = subprocess.run(bashCommand.split())

    def postprocess_outputs(self, golden_out, result):
@@ -293,15 +278,31 @@ class SharkModuleTest(unittest.TestCase):
            pytest.xfail(reason=config["xfail_reason"])

        # Special cases that need to be marked.
-        if "macos" in config["xfail_other"] and device in [
+        if config["model_name"] == "resnet50" and device in [
            "metal",
            "vulkan",
        ]:
            if get_vulkan_triple_flag() is not None:
                if "m1-moltenvk-macos" in get_vulkan_triple_flag():
                    pytest.xfail(
-                        reason="conv-related issue on MacStudio, returns VK_ERROR_DEVICE_LOST."
+                        reason="M2: Assert Error & M1: CompilerToolError"
                    )
+        if (
+            config["model_name"] == "camembert-base"
+            and dynamic == False
+            and device in ["metal", "vulkan"]
+        ):
+            pytest.xfail(
+                reason="chlo.broadcast_compare failed to satify constraint"
+            )
+        if (
+            config["model_name"] == "roberta-base"
+            and dynamic == False
+            and device in ["metal", "vulkan"]
+        ):
+            pytest.xfail(
+                reason="chlo.broadcast_compare failed to satify constraint"
+            )
        if (
            config["model_name"]
            in [
@@ -329,11 +330,11 @@ class SharkModuleTest(unittest.TestCase):
        )
        self.module_tester.tmp_prefix = safe_name.replace("/", "_")

-        if not os.path.isdir("shark_tmp"):
-            os.mkdir("shark_tmp")
+        if not os.path.isdir("./shark_tmp/"):
+            os.mkdir("./shark_tmp/")

        tempdir = tempfile.TemporaryDirectory(
-            prefix=self.module_tester.tmp_prefix, dir="shark_tmp"
+            prefix=self.module_tester.tmp_prefix, dir="./shark_tmp/"
        )
        self.module_tester.temp_dir = tempdir.name

--- a/tank/tflite/albert_lite_base/albert_lite_base_tflite_test.py
+++ b/tank/tflite/albert_lite_base/albert_lite_base_tflite_test.py
@@ -9,7 +9,6 @@ from shark.parser import shark_args
 # model_path = "https://tfhub.dev/tensorflow/lite-model/albert_lite_base/squadv1/1?lite-format=tflite"
 # model_path = model_path

-
 # Inputs modified to be useful albert inputs.
 def generate_inputs(input_details):
    for input in input_details:
--- a/web/README.md
+++ b/web/README.md
@@ -0,0 +1,16 @@
+In order to launch SHARK-web, from the root SHARK directory, run:
+
+## Linux
+```shell
+IMPORTER=1 ./setup_venv.sh
+source shark.venv/bin/activate
+cd web
+python index.py
+```
+
+## Windows
+```shell
+./setup_venv.ps1
+cd web
+python index.py --local_tank_cache=<current_working_dir>
+```
--- a/apps/stable_diffusion/web/css/sd_dark_theme.css
+++ b/apps/stable_diffusion/web/css/sd_dark_theme.css
@@ -64,4 +64,4 @@

 footer {
    display: none !important;
-}
+}
--- a/web/index.py
+++ b/web/index.py
@@ -0,0 +1,177 @@
+import os
+import sys
+from pathlib import Path
+
+if "AMD_ENABLE_LLPC" not in os.environ:
+    os.environ["AMD_ENABLE_LLPC"] = "1"
+
+if sys.platform == "darwin":
+    os.environ["DYLD_LIBRARY_PATH"] = "/usr/local/lib"
+
+import gradio as gr
+from PIL import Image
+from models.stable_diffusion.resources import resource_path, prompt_examples
+from models.stable_diffusion.main import stable_diff_inf
+from models.stable_diffusion.stable_args import args
+from models.stable_diffusion.utils import get_available_devices
+
+nodlogo_loc = resource_path("logos/nod-logo.png")
+sdlogo_loc = resource_path("logos/sd-demo-logo.png")
+
+
+demo_css = Path(__file__).parent.joinpath("demo.css").resolve()
+
+
+with gr.Blocks(title="Stable Diffusion", css=demo_css) as shark_web:
+
+    with gr.Row(elem_id="ui_title"):
+        nod_logo = Image.open(nodlogo_loc)
+        logo2 = Image.open(sdlogo_loc)
+        with gr.Row():
+            with gr.Column(scale=1, elem_id="demo_title_outer"):
+                gr.Image(
+                    value=nod_logo,
+                    show_label=False,
+                    interactive=False,
+                    elem_id="top_logo",
+                ).style(width=150, height=100)
+            with gr.Column(scale=5, elem_id="demo_title_outer"):
+                gr.Image(
+                    value=logo2,
+                    show_label=False,
+                    interactive=False,
+                    elem_id="demo_title",
+                ).style(width=150, height=100)
+
+    with gr.Row(elem_id="ui_body"):
+
+        with gr.Row():
+            with gr.Column(scale=1, min_width=600):
+                with gr.Group(elem_id="prompt_box_outer"):
+                    prompt = gr.Textbox(
+                        label="Prompt",
+                        value="cyberpunk forest by Salvador Dali",
+                        lines=1,
+                        elem_id="prompt_box",
+                    )
+                    negative_prompt = gr.Textbox(
+                        label="Negative Prompt",
+                        value="trees, green",
+                        lines=1,
+                        elem_id="prompt_box",
+                    )
+                with gr.Row():
+                    variant = gr.Dropdown(
+                        label="Model Variant",
+                        value="stablediffusion",
+                        choices=[
+                            "stablediffusion",
+                            "anythingv3",
+                            "analogdiffusion",
+                            "openjourney",
+                            "dreamlike",
+                        ],
+                    )
+                    scheduler_key = gr.Dropdown(
+                        label="Scheduler",
+                        value="SharkEulerDiscrete",
+                        choices=[
+                            "DDIM",
+                            "PNDM",
+                            "LMSDiscrete",
+                            "DPMSolverMultistep",
+                            "EulerDiscrete",
+                            "EulerAncestralDiscrete",
+                            "SharkEulerDiscrete",
+                        ],
+                    )
+                with gr.Row():
+                    steps = gr.Slider(1, 100, value=50, step=1, label="Steps")
+                    guidance_scale = gr.Slider(
+                        0,
+                        50,
+                        value=7.5,
+                        step=0.1,
+                        label="CFG Scale",
+                    )
+                with gr.Row():
+                    seed = gr.Number(value=-1, precision=0, label="Seed")
+                    available_devices = get_available_devices()
+                    device_key = gr.Dropdown(
+                        label="Device",
+                        value=available_devices[0],
+                        choices=available_devices,
+                    )
+                with gr.Row():
+                    random_seed = gr.Button("Randomize Seed")
+                    random_seed.click(
+                        None,
+                        inputs=[],
+                        outputs=[seed],
+                        _js="() => Math.floor(Math.random() * 4294967295)",
+                    )
+                    stable_diffusion = gr.Button("Generate Image")
+                with gr.Accordion(label="Prompt Examples!"):
+                    ex = gr.Examples(
+                        examples=prompt_examples,
+                        inputs=prompt,
+                        cache_examples=False,
+                        elem_id="prompt_examples",
+                    )
+
+            with gr.Column(scale=1, min_width=600):
+                with gr.Group():
+                    generated_img = gr.Image(
+                        type="pil", interactive=False
+                    ).style(height=512)
+                    std_output = gr.Textbox(
+                        value="Nothing to show.",
+                        lines=4,
+                        show_label=False,
+                    )
+                output_dir = args.output_dir if args.output_dir else Path.cwd()
+                output_dir = Path(output_dir, "generated_imgs")
+                output_loc = gr.Textbox(
+                    label="Saving Images at",
+                    value=output_dir,
+                    interactive=False,
+                )
+
+        prompt.submit(
+            stable_diff_inf,
+            inputs=[
+                prompt,
+                negative_prompt,
+                steps,
+                guidance_scale,
+                seed,
+                scheduler_key,
+                variant,
+                device_key,
+            ],
+            outputs=[generated_img, std_output],
+            show_progress=args.progress_bar,
+        )
+        stable_diffusion.click(
+            stable_diff_inf,
+            inputs=[
+                prompt,
+                negative_prompt,
+                steps,
+                guidance_scale,
+                seed,
+                scheduler_key,
+                variant,
+                device_key,
+            ],
+            outputs=[generated_img, std_output],
+            show_progress=args.progress_bar,
+        )
+
+shark_web.queue()
+shark_web.launch(
+    share=args.share,
+    inbrowser=True,
+    server_name="0.0.0.0",
+    server_port=args.server_port,
+)
--- a/web/models/init.py
+++ b/web/models/init.py
--- a/web/models/albert_maskfill.py
+++ b/web/models/albert_maskfill.py
@@ -0,0 +1,108 @@
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+import torch
+from shark.shark_inference import SharkInference
+from shark.shark_importer import SharkImporter
+import numpy as np
+
+################################## Albert Module #########################
+
+
+class AlbertModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.model = AutoModelForMaskedLM.from_pretrained("albert-base-v2")
+        self.model.eval()
+
+    def forward(self, input_ids, attention_mask):
+        return self.model(
+            input_ids=input_ids, attention_mask=attention_mask
+        ).logits
+
+
+################################## Preprocessing inputs ####################
+
+DEBUG = False
+compiled_module = {}
+compiled_module["tokenizer"] = AutoTokenizer.from_pretrained("albert-base-v2")
+
+
+def preprocess_data(text):
+
+    global compiled_module
+
+    # Preparing Data
+    tokenizer = compiled_module["tokenizer"]
+    encoded_inputs = tokenizer(
+        text,
+        padding="max_length",
+        truncation=True,
+        max_length=512,
+        return_tensors="pt",
+    )
+    inputs = (encoded_inputs["input_ids"], encoded_inputs["attention_mask"])
+    return inputs
+
+
+def top5_possibilities(text, inputs, token_logits, log_write):
+
+    global DEBUG
+    global compiled_module
+
+    if DEBUG:
+        log_write.write("Retrieving top 5 possible outcomes.\n")
+    tokenizer = compiled_module["tokenizer"]
+    mask_id = torch.where(inputs[0] == tokenizer.mask_token_id)[1]
+    mask_token_logits = token_logits[0, mask_id, :]
+    percentage = torch.nn.functional.softmax(mask_token_logits, dim=1)[0]
+    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+    top5 = {}
+    for token in top_5_tokens:
+        label = text.replace(tokenizer.mask_token, tokenizer.decode(token))
+        top5[label] = percentage[token].item()
+    if DEBUG:
+        log_write.write("Done.\n")
+    return top5
+
+
+##############################################################################
+
+
+def albert_maskfill_inf(masked_text, device):
+
+    global DEBUG
+    global compiled_module
+
+    DEBUG = False
+    log_write = open(r"logs/albert_maskfill_log.txt", "w")
+    if log_write:
+        DEBUG = True
+
+    inputs = preprocess_data(masked_text)
+    if device not in compiled_module.keys():
+        if DEBUG:
+            log_write.write("Compiling the Albert Maskfill module.\n")
+        mlir_importer = SharkImporter(
+            AlbertModule(),
+            inputs,
+            frontend="torch",
+        )
+        minilm_mlir, func_name = mlir_importer.import_mlir(
+            is_dynamic=False, tracing_required=True
+        )
+        shark_module = SharkInference(
+            minilm_mlir, func_name, mlir_dialect="linalg", device=device
+        )
+        shark_module.compile()
+        compiled_module[device] = shark_module
+        if DEBUG:
+            log_write.write("Compilation successful.\n")
+
+    token_logits = torch.tensor(compiled_module[device].forward(inputs))
+    output = top5_possibilities(masked_text, inputs, token_logits, log_write)
+    log_write.close()
+
+    std_output = ""
+    with open(r"logs/albert_maskfill_log.txt", "r") as log_read:
+        std_output = log_read.read()
+
+    return output, std_output
--- a/apps/stable_diffusion/init.py
+++ b/apps/stable_diffusion/init.py
--- a/web/models/diffusion/setup_vdiffusion.sh
+++ b/web/models/diffusion/setup_vdiffusion.sh
@@ -0,0 +1,5 @@
+git clone --recursive https://github.com/crowsonkb/v-diffusion-pytorch.git
+pip install ftfy regex tqdm
+
+mkdir checkpoints
+wget https://the-eye.eu/public/AI/models/v-diffusion/cc12m_1_cfg.pth -P checkpoints/
--- a/web/models/diffusion/v_diffusion.py
+++ b/web/models/diffusion/v_diffusion.py
@@ -0,0 +1,215 @@
+"""classifier-free guidance sampling from a diffusion model."""
+
+from functools import partial
+from pathlib import Path
+
+from PIL import Image
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torchvision import transforms
+from torchvision.transforms import functional as TF
+from tqdm import trange
+
+from shark.shark_inference import SharkInference
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch._decomp import get_decompositions
+import torch_mlir
+
+import sys
+
+sys.path.append("models/diffusion/v-diffusion-pytorch")
+
+from CLIP import clip
+from diffusion import get_model, get_models, sampling, utils
+
+import gradio as gr
+
+MODULE_DIR = Path(__file__).resolve().parent
+
+set_global_parameters = False
+device = None
+model = None
+checkpoint = None
+clip_model = None
+
+
+def parse_prompt(prompt, default_weight=3.0):
+    if prompt.startswith("http://") or prompt.startswith("https://"):
+        vals = prompt.rsplit(":", 2)
+        vals = [vals[0] + ":" + vals[1], *vals[2:]]
+    else:
+        vals = prompt.rsplit(":", 1)
+    vals = vals + ["", default_weight][len(vals) :]
+    print(vals[1])
+    print(vals[0])
+    return vals[0], float(vals[1])
+
+
+def run(x, steps, shark_module, args):
+    def compiled_cfg_model_fn(x, t):
+        x_ny = x.detach().numpy()
+        t_ny = t.detach().numpy()
+        inputs = (x_ny, t_ny)
+        result = shark_module.forward(inputs)
+        return torch.from_numpy(result)
+
+    return sampling.plms_sample(compiled_cfg_model_fn, x, steps, {})
+
+
+def run_all(
+    x,
+    t,
+    steps,
+    n,
+    batch_size,
+    side_x,
+    side_y,
+    shark_module,
+    args,
+):
+    x = torch.randn([n, 3, side_y, side_x], device=device)
+    t = torch.linspace(1, 0, args["steps"] + 1, device=device)[:-1]
+    steps = utils.get_spliced_ddpm_cosine_schedule(t)
+    pil_images = []
+    for i in trange(0, n, batch_size):
+        cur_batch_size = min(n - i, batch_size)
+        outs = run(x[i : i + cur_batch_size], steps, shark_module, args)
+        for j, out in enumerate(outs):
+            pil_images.append(utils.to_pil_image(out))
+    return pil_images[0]
+
+
+def cache_model():
+    global set_global_parameters
+    global device
+    global model
+    global checkpoint
+    global clip_model
+    if not set_global_parameters:
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        model = get_model("cc12m_1_cfg")()
+        checkpoint = MODULE_DIR / f"checkpoints/cc12m_1_cfg.pth"
+        model.load_state_dict(torch.load(checkpoint, map_location="cpu"))
+        if device.type == "cuda":
+            model = model.half()
+        model = model.to(device).eval().requires_grad_(False)
+        clip_model_name = (
+            model.clip_model if hasattr(model, "clip_model") else "ViT-B/16"
+        )
+        clip_model = clip.load(clip_model_name, jit=False, device=device)[0]
+        clip_model.eval().requires_grad_(False)
+        set_global_parameters = True
+
+
+def vdiff_inf(prompts: str, n, bs, steps, _device):
+
+    global device
+    global model
+    global checkpoint
+    global clip_model
+
+    args = {}
+    target_embeds = []
+    weights = []
+    args["prompts"] = prompts
+    args["batch_size"] = int(bs)
+    args["n"] = int(n)
+    args["seed"] = 0
+    args["steps"] = int(steps)
+    args["device"] = _device
+
+    cache_model()
+
+    _, side_y, side_x = model.shape
+    normalize = transforms.Normalize(
+        mean=[0.48145466, 0.4578275, 0.40821073],
+        std=[0.26862954, 0.26130258, 0.27577711],
+    )
+
+    zero_embed = torch.zeros([1, clip_model.visual.output_dim], device=device)
+    target_embeds.append(zero_embed)
+
+    prompt_list = args["prompts"].rsplit(";")
+    for prompt in prompt_list:
+        txt, weight = parse_prompt(prompt)
+        target_embeds.append(
+            clip_model.encode_text(clip.tokenize(txt).to(device)).float()
+        )
+        weights.append(weight)
+    weights = torch.tensor([1 - sum(weights), *weights], device=device)
+
+    torch.manual_seed(args["seed"])
+
+    x = torch.randn([args["n"], 3, side_y, side_x], device=device)
+    t = torch.linspace(1, 0, args["steps"] + 1, device=device)[:-1]
+    steps = utils.get_spliced_ddpm_cosine_schedule(t)
+    min_batch_size = min(args["n"], args["batch_size"])
+    x_in = x[0:min_batch_size, :, :, :]
+    ts = x_in.new_ones([x_in.shape[0]])
+    t_in = t[0] * ts
+
+    def cfg_model_fn(x, t):
+        n = x.shape[0]
+        n_conds = len(target_embeds)
+        x_in = x.repeat([n_conds, 1, 1, 1])
+        t_in = t.repeat([n_conds])
+        clip_embed_in = torch.cat([*target_embeds]).repeat([n, 1])
+        vs = model(x_in, t_in, clip_embed_in).view([n_conds, n, *x.shape[1:]])
+        v = vs.mul(weights[:, None, None, None, None]).sum(0)
+        return v
+
+    fx_g = make_fx(
+        cfg_model_fn,
+        decomposition_table=get_decompositions(
+            [
+                torch.ops.aten.embedding_dense_backward,
+                torch.ops.aten.native_layer_norm_backward,
+                torch.ops.aten.slice_backward,
+                torch.ops.aten.select_backward,
+                torch.ops.aten.norm.ScalarOpt_dim,
+                torch.ops.aten.native_group_norm,
+                torch.ops.aten.upsample_bilinear2d.vec,
+                torch.ops.aten.split.Tensor,
+                torch.ops.aten.split_with_sizes,
+            ]
+        ),
+    )(x_in, t_in)
+
+    fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
+    fx_g.recompile()
+
+    for node in fx_g.graph.nodes:
+        if isinstance(node.target, torch._ops.OpOverload):
+            node.target = node.target.overloadpacket
+    fx_g.recompile()
+
+    ts_g = torch.jit.script(fx_g)
+
+    module = torch_mlir.compile(
+        ts_g,
+        [x_in, t_in],
+        torch_mlir.OutputType.LINALG_ON_TENSORS,
+        use_tracing=False,
+    )
+
+    mlir_model = module
+    func_name = "forward"
+    shark_module = SharkInference(
+        mlir_model, func_name, device=args["device"], mlir_dialect="linalg"
+    )
+    shark_module.compile()
+    return (
+        run_all(
+            x,
+            t,
+            args["steps"],
+            args["n"],
+            args["batch_size"],
+            side_x,
+            side_y,
+            shark_module,
+            args,
+        ),
+        "Testing..",
+    )
--- a/web/models/resnet50.py
+++ b/web/models/resnet50.py
@@ -0,0 +1,92 @@
+from PIL import Image
+import requests
+import torch
+from torchvision import transforms
+from shark.shark_inference import SharkInference
+from shark.shark_downloader import download_model
+
+################################## Preprocessing inputs and helper functions ########
+
+DEBUG = False
+compiled_module = {}
+
+
+def preprocess_image(img):
+    image = Image.fromarray(img)
+    preprocess = transforms.Compose(
+        [
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            transforms.Normalize(
+                mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+            ),
+        ]
+    )
+    img_preprocessed = preprocess(image)
+    return torch.unsqueeze(img_preprocessed, 0)
+
+
+def load_labels():
+    classes_text = requests.get(
+        "https://raw.githubusercontent.com/cathyzhyi/ml-data/main/imagenet-classes.txt",
+        stream=True,
+    ).text
+    labels = [line.strip() for line in classes_text.splitlines()]
+    return labels
+
+
+def top3_possibilities(res, log_write):
+
+    global DEBUG
+
+    if DEBUG:
+        log_write.write("Retrieving top 3 possible outcomes.\n")
+    labels = load_labels()
+    _, indexes = torch.sort(res, descending=True)
+    percentage = torch.nn.functional.softmax(res, dim=1)[0]
+    top3 = dict(
+        [(labels[idx], percentage[idx].item()) for idx in indexes[0][:3]]
+    )
+    if DEBUG:
+        log_write.write("Done.\n")
+    return top3
+
+
+##############################################################################
+
+
+def resnet_inf(numpy_img, device):
+
+    global DEBUG
+    global compiled_module
+
+    DEBUG = False
+    log_write = open(r"logs/resnet50_log.txt", "w")
+    if log_write:
+        DEBUG = True
+
+    if device not in compiled_module.keys():
+        if DEBUG:
+            log_write.write("Compiling the Resnet50 module.\n")
+        mlir_model, func_name, inputs, golden_out = download_model(
+            "resnet50", frontend="torch"
+        )
+        shark_module = SharkInference(
+            mlir_model, func_name, device=device, mlir_dialect="linalg"
+        )
+        shark_module.compile()
+        compiled_module[device] = shark_module
+        if DEBUG:
+            log_write.write("Compilation successful.\n")
+
+    img = preprocess_image(numpy_img)
+    result = compiled_module[device].forward((img.detach().numpy(),))
+    output = top3_possibilities(torch.from_numpy(result), log_write)
+    log_write.close()
+
+    std_output = ""
+    with open(r"logs/resnet50_log.txt", "r") as log_read:
+        std_output = log_read.read()
+
+    return output, std_output
--- a/apps/stable_diffusion/scripts/img2img.py
+++ b/apps/stable_diffusion/scripts/img2img.py
--- a/web/models/stable_diffusion/cache_objects.py
+++ b/web/models/stable_diffusion/cache_objects.py
@@ -0,0 +1,111 @@
+from transformers import CLIPTokenizer
+from diffusers import (
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+    EulerAncestralDiscreteScheduler,
+)
+from models.stable_diffusion.opt_params import get_unet, get_vae, get_clip
+from models.stable_diffusion.utils import (
+    set_init_device_flags,
+    set_iree_runtime_flags,
+)
+from models.stable_diffusion.stable_args import args
+from models.stable_diffusion.schedulers import (
+    SharkEulerDiscreteScheduler,
+)
+import gc
+
+
+model_config = {
+    "v2_1": "stabilityai/stable-diffusion-2-1",
+    "v2_1base": "stabilityai/stable-diffusion-2-1-base",
+    "v1_4": "CompVis/stable-diffusion-v1-4",
+}
+
+
+def get_schedulers(version):
+    schedulers = dict()
+    schedulers["PNDM"] = PNDMScheduler.from_pretrained(
+        model_config[version],
+        subfolder="scheduler",
+    )
+    schedulers["LMSDiscrete"] = LMSDiscreteScheduler.from_pretrained(
+        model_config[version],
+        subfolder="scheduler",
+    )
+    schedulers["DDIM"] = DDIMScheduler.from_pretrained(
+        model_config[version],
+        subfolder="scheduler",
+    )
+    schedulers[
+        "DPMSolverMultistep"
+    ] = DPMSolverMultistepScheduler.from_pretrained(
+        model_config[version],
+        subfolder="scheduler",
+    )
+    schedulers["EulerDiscrete"] = EulerDiscreteScheduler.from_pretrained(
+        model_config[version],
+        subfolder="scheduler",
+    )
+    schedulers[
+        "EulerAncestralDiscrete"
+    ] = EulerAncestralDiscreteScheduler.from_pretrained(
+        model_config[version],
+        subfolder="scheduler",
+    )
+    schedulers[
+        "SharkEulerDiscrete"
+    ] = SharkEulerDiscreteScheduler.from_pretrained(
+        model_config[version],
+        subfolder="scheduler",
+    )
+    schedulers["SharkEulerDiscrete"].compile()
+    return schedulers
+
+
+def get_tokenizer(version):
+    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+    if version != "v1_4":
+        tokenizer = CLIPTokenizer.from_pretrained(
+            model_config[version], subfolder="tokenizer"
+        )
+    return tokenizer
+
+
+class ModelCache:
+    def __init__(self):
+        self.device = None
+        self.variant = None
+        self.version = None
+        self.schedulers = None
+        self.tokenizer = None
+        self.vae = None
+        self.clip = None
+        self.unet = None
+
+    def set_models(self, device_key):
+        if self.device != device_key or self.variant != args.variant:
+            self.device = device_key
+            self.variant = args.variant
+            self.version = args.version
+            args.device = device_key.split("=>", 1)[1].strip()
+            args.max_length = 64
+            args.use_tuned = True
+            set_init_device_flags()
+            del self.schedulers
+            del self.tokenizer
+            del self.vae
+            del self.unet
+            del self.clip
+            gc.collect()
+            self.schedulers = get_schedulers(args.version)
+            self.tokenizer = get_tokenizer(args.version)
+            self.vae = get_vae()
+            self.unet = get_unet()
+            self.clip = get_clip()
+
+
+model_cache = ModelCache()
--- a/web/models/stable_diffusion/logos/Nod_logo.png
+++ b/web/models/stable_diffusion/logos/Nod_logo.png
--- a/web/models/stable_diffusion/logos/nod-logo.png
+++ b/web/models/stable_diffusion/logos/nod-logo.png
--- a/web/models/stable_diffusion/logos/sd-demo-logo.png
+++ b/web/models/stable_diffusion/logos/sd-demo-logo.png
--- a/web/models/stable_diffusion/main.py
+++ b/web/models/stable_diffusion/main.py
@@ -0,0 +1,283 @@
+import torch
+import os
+from PIL import Image
+from tqdm.auto import tqdm
+from models.stable_diffusion.cache_objects import model_cache
+from models.stable_diffusion.stable_args import args
+from models.stable_diffusion.utils import disk_space_check
+from random import randint
+import numpy as np
+import time
+import sys
+from datetime import datetime as dt
+from csv import DictWriter
+import re
+from pathlib import Path
+
+
+if args.clear_all:
+    print("CLEARING ALL, EXPECT SEVERAL MINUTES TO RECOMPILE")
+    from glob import glob
+    import shutil
+
+    vmfbs = glob(os.path.join(os.getcwd(), "*.vmfb"))
+    for vmfb in vmfbs:
+        if os.path.exists(vmfb):
+            os.remove(vmfb)
+    home = os.path.expanduser("~")
+    if os.name == "nt":  # Windows
+        appdata = os.getenv("LOCALAPPDATA")
+        shutil.rmtree(os.path.join(appdata, "AMD/VkCache"), ignore_errors=True)
+        shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
+    elif os.name == "unix":
+        shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"))
+        shutil.rmtree(os.path.join(home, ".local/shark_tank"))
+
+
+# Helper function to profile the vulkan device.
+def start_profiling(file_path="foo.rdc", profiling_mode="queue"):
+    if args.vulkan_debug_utils and "vulkan" in args.device:
+        import iree
+
+        print(f"Profiling and saving to {file_path}.")
+        vulkan_device = iree.runtime.get_device(args.device)
+        vulkan_device.begin_profiling(mode=profiling_mode, file_path=file_path)
+        return vulkan_device
+    return None
+
+
+def end_profiling(device):
+    if device:
+        return device.end_profiling()
+
+
+def set_ui_params(
+    prompt,
+    negative_prompt,
+    steps,
+    guidance_scale,
+    seed,
+    scheduler_key,
+    variant,
+):
+    args.prompts = [prompt]
+    args.negative_prompts = [negative_prompt]
+    args.steps = steps
+    args.guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
+    args.seed = seed
+    args.scheduler = scheduler_key
+    args.variant = variant
+
+
+# save output images and the inputs correspoding to it.
+def save_output_img(output_img):
+    output_path = args.output_dir if args.output_dir else Path.cwd()
+    disk_space_check(output_path, lim=5)
+    generated_imgs_path = Path(output_path, "generated_imgs")
+    generated_imgs_path.mkdir(parents=True, exist_ok=True)
+    csv_path = Path(generated_imgs_path, "imgs_history.csv")
+
+    prompt_slice = re.sub("[^a-zA-Z0-9]", "_", args.prompts[0][:15])
+    out_img_name = (
+        f"{prompt_slice}_{args.seed}_{dt.now().strftime('%y%m%d_%H%M%S')}"
+    )
+    if args.output_img_format == "jpg":
+        out_img_path = Path(generated_imgs_path, f"{out_img_name}.jpg")
+        output_img.save(
+            out_img_path,
+            quality=95,
+            subsampling=0,
+            optimize=True,
+            progressive=True,
+        )
+    else:
+        out_img_path = Path(generated_imgs_path, f"{out_img_name}.png")
+        output_img.save(out_img_path, "PNG")
+        if args.output_img_format not in ["png", "jpg"]:
+            print(
+                f"[ERROR] Format {args.output_img_format} is not supported yet."
+                "saving image as png. Supported formats png / jpg"
+            )
+
+    new_entry = {
+        "VARIANT": args.variant,
+        "VERSION": args.version,
+        "SCHEDULER": args.scheduler,
+        "PROMPT": args.prompts[0],
+        "NEG_PROMPT": args.negative_prompts[0],
+        "SEED": args.seed,
+        "CFG_SCALE": float(args.guidance_scale),
+        "PRECISION": args.precision,
+        "STEPS": args.steps,
+        "OUTPUT": out_img_path,
+    }
+
+    with open(csv_path, "a") as csv_obj:
+        dictwriter_obj = DictWriter(csv_obj, fieldnames=list(new_entry.keys()))
+        dictwriter_obj.writerow(new_entry)
+        csv_obj.close()
+
+
+def stable_diff_inf(
+    prompt: str,
+    negative_prompt: str,
+    steps: int,
+    guidance_scale: float,
+    seed: int,
+    scheduler_key: str,
+    variant: str,
+    device_key: str,
+):
+    # Handle out of range seeds.
+    uint32_info = np.iinfo(np.uint32)
+    uint32_min, uint32_max = uint32_info.min, uint32_info.max
+    if seed < uint32_min or seed >= uint32_max:
+        seed = randint(uint32_min, uint32_max)
+
+    set_ui_params(
+        prompt,
+        negative_prompt,
+        steps,
+        guidance_scale,
+        seed,
+        scheduler_key,
+        variant,
+    )
+    dtype = torch.float32 if args.precision == "fp32" else torch.half
+    generator = torch.manual_seed(
+        args.seed
+    )  # Seed generator to create the inital latent noise
+
+    # set height and width.
+    height = 512  # default height of Stable Diffusion
+    width = 512  # default width of Stable Diffusion
+    if args.version == "v2_1":
+        height = 768
+        width = 768
+
+    # get all cached data.
+    disk_space_check(Path.cwd())
+    model_cache.set_models(device_key)
+    tokenizer = model_cache.tokenizer
+    scheduler = model_cache.schedulers[args.scheduler]
+    vae, unet, clip = model_cache.vae, model_cache.unet, model_cache.clip
+    cpu_scheduling = not args.scheduler.startswith("Shark")
+
+    # create a random initial latent.
+    latents = torch.randn(
+        (1, 4, height // 8, width // 8),
+        generator=generator,
+        dtype=torch.float32,
+    ).to(dtype)
+
+    # Warmup phase to improve performance.
+    if args.warmup_count >= 1:
+        vae_warmup_input = torch.clone(latents).detach().numpy()
+        clip_warmup_input = torch.randint(1, 2, (2, args.max_length))
+    for i in range(args.warmup_count):
+        vae("forward", (vae_warmup_input,))
+        clip("forward", (clip_warmup_input,))
+
+    start = time.time()
+    text_input = tokenizer(
+        args.prompts,
+        padding="max_length",
+        max_length=args.max_length,
+        truncation=True,
+        return_tensors="pt",
+    )
+    max_length = text_input.input_ids.shape[-1]
+    uncond_input = tokenizer(
+        args.negative_prompts,
+        padding="max_length",
+        max_length=max_length,
+        truncation=True,
+        return_tensors="pt",
+    )
+    text_input = torch.cat([uncond_input.input_ids, text_input.input_ids])
+
+    clip_inf_start = time.time()
+    text_embeddings = clip("forward", (text_input,))
+    clip_inf_end = time.time()
+    text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
+    text_embeddings_numpy = text_embeddings.detach().numpy()
+
+    scheduler.set_timesteps(args.steps)
+    scheduler.is_scale_input_called = True
+
+    latents = latents * scheduler.init_noise_sigma
+
+    avg_ms = 0
+    for i, t in tqdm(enumerate(scheduler.timesteps)):
+
+        step_start = time.time()
+        timestep = torch.tensor([t]).to(dtype).detach().numpy()
+        latent_model_input = scheduler.scale_model_input(latents, t)
+        if cpu_scheduling:
+            latent_model_input = latent_model_input.detach().numpy()
+
+        profile_device = start_profiling(file_path="unet.rdc")
+        noise_pred = unet(
+            "forward",
+            (
+                latent_model_input,
+                timestep,
+                text_embeddings_numpy,
+                args.guidance_scale,
+            ),
+            send_to_host=False,
+        )
+        end_profiling(profile_device)
+
+        if cpu_scheduling:
+            noise_pred = torch.from_numpy(noise_pred.to_host())
+            latents = scheduler.step(noise_pred, t, latents).prev_sample
+        else:
+            latents = scheduler.step(noise_pred, t, latents)
+        step_time = time.time() - step_start
+        avg_ms += step_time
+        step_ms = int((step_time) * 1000)
+        if not args.hide_steps:
+            print(f" \nIteration = {i}, Time = {step_ms}ms")
+
+    # scale and decode the image latents with vae
+    if args.use_base_vae:
+        latents = 1 / 0.18215 * latents
+    latents_numpy = latents
+    if cpu_scheduling:
+        latents_numpy = latents.detach().numpy()
+    profile_device = start_profiling(file_path="vae.rdc")
+    vae_start = time.time()
+    images = vae("forward", (latents_numpy,))
+    vae_end = time.time()
+    end_profiling(profile_device)
+    if args.use_base_vae:
+        image = torch.from_numpy(images)
+        image = (image.detach().cpu() * 255.0).numpy()
+        images = image.round()
+    end_time = time.time()
+
+    avg_ms = 1000 * avg_ms / args.steps
+    clip_inf_time = (clip_inf_end - clip_inf_start) * 1000
+    vae_inf_time = (vae_end - vae_start) * 1000
+    total_time = end_time - start
+    print(f"\nAverage step time: {avg_ms}ms/it")
+    print(f"Clip Inference time (ms) = {clip_inf_time:.3f}")
+    print(f"VAE Inference time (ms): {vae_inf_time:.3f}")
+    print(f"\nTotal image generation time: {total_time}sec")
+
+    # generate outputs to web.
+    images = torch.from_numpy(images).to(torch.uint8).permute(0, 2, 3, 1)
+    pil_images = [Image.fromarray(image) for image in images.numpy()]
+
+    text_output = f"prompt={args.prompts}"
+    text_output += f"\nnegative prompt={args.negative_prompts}"
+    text_output += f"\nvariant={args.variant}, version={args.version}, scheduler={args.scheduler}"
+    text_output += f"\ndevice={device_key}"
+    text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={args.seed}, size={height}x{width}"
+    text_output += f"\nAverage step time: {avg_ms:.4f}ms/it"
+    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
+
+    save_output_img(pil_images[0])
+
+    return pil_images[0], text_output
--- a/web/models/stable_diffusion/model_wrappers.py
+++ b/web/models/stable_diffusion/model_wrappers.py
@@ -0,0 +1,182 @@
+from diffusers import AutoencoderKL, UNet2DConditionModel
+from transformers import CLIPTextModel
+from models.stable_diffusion.utils import compile_through_fx
+from models.stable_diffusion.resources import models_config
+from models.stable_diffusion.stable_args import args
+import torch
+
+
+# clip has 2 variants of max length 77 or 64.
+model_clip_max_length = 64 if args.max_length == 64 else 77
+if args.variant in ["anythingv3", "analogdiffusion", "dreamlike"]:
+    model_clip_max_length = 77
+elif args.variant == "openjourney":
+    model_clip_max_length = 64
+
+model_input = {
+    "v2_1": {
+        "clip": (torch.randint(1, 2, (2, model_clip_max_length)),),
+        "vae": (torch.randn(1, 4, 96, 96),),
+        "unet": (
+            torch.randn(1, 4, 96, 96),  # latents
+            torch.tensor([1]).to(torch.float32),  # timestep
+            torch.randn(2, model_clip_max_length, 1024),  # embedding
+            torch.tensor(1).to(torch.float32),  # guidance_scale
+        ),
+    },
+    "v2_1base": {
+        "clip": (torch.randint(1, 2, (2, model_clip_max_length)),),
+        "vae": (torch.randn(1, 4, 64, 64),),
+        "unet": (
+            torch.randn(1, 4, 64, 64),  # latents
+            torch.tensor([1]).to(torch.float32),  # timestep
+            torch.randn(2, model_clip_max_length, 1024),  # embedding
+            torch.tensor(1).to(torch.float32),  # guidance_scale
+        ),
+    },
+    "v1_4": {
+        "clip": (torch.randint(1, 2, (2, model_clip_max_length)),),
+        "vae": (torch.randn(1, 4, 64, 64),),
+        "unet": (
+            torch.randn(1, 4, 64, 64),
+            torch.tensor([1]).to(torch.float32),  # timestep
+            torch.randn(2, model_clip_max_length, 768),
+            torch.tensor(1).to(torch.float32),
+        ),
+    },
+}
+
+version = args.version if args.variant == "stablediffusion" else "v1_4"
+
+
+def get_configs():
+    model_id_key = f"{args.variant}/{version}"
+    revision_key = f"{args.variant}/{args.precision}"
+    try:
+        model_id = models_config[0][model_id_key]
+        revision = models_config[1][revision_key]
+    except KeyError:
+        raise Exception(
+            f"No entry for {model_id_key} or {revision_key} in the models configuration"
+        )
+
+    return model_id, revision
+
+
+def get_clip_mlir(model_name="clip_text", extra_args=[]):
+    model_id, revision = get_configs()
+
+    class CLIPText(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.text_encoder = CLIPTextModel.from_pretrained(
+                model_id,
+                subfolder="text_encoder",
+                revision=revision,
+            )
+
+        def forward(self, input):
+            return self.text_encoder(input)[0]
+
+    clip_model = CLIPText()
+    shark_clip = compile_through_fx(
+        clip_model,
+        model_input[version]["clip"],
+        model_name=model_name,
+        extra_args=extra_args,
+    )
+    return shark_clip
+
+
+def get_shark_module(model_key, module, model_name, extra_args):
+    if args.precision == "fp16":
+        module = module.half().cuda()
+        inputs = tuple(
+            [
+                inputs.half().cuda() if len(inputs.shape) != 0 else inputs
+                for inputs in model_input[version][model_key]
+            ]
+        )
+    else:
+        inputs = model_input[version][model_key]
+
+    shark_module = compile_through_fx(
+        module,
+        inputs,
+        model_name=model_name,
+        extra_args=extra_args,
+    )
+    return shark_module
+
+
+def get_base_vae_mlir(model_name="vae", extra_args=[]):
+    model_id, revision = get_configs()
+
+    class BaseVaeModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.vae = AutoencoderKL.from_pretrained(
+                model_id,
+                subfolder="vae",
+                revision=revision,
+            )
+
+        def forward(self, input):
+            x = self.vae.decode(input, return_dict=False)[0]
+            return (x / 2 + 0.5).clamp(0, 1)
+
+    vae = BaseVaeModel()
+    return get_shark_module("vae", vae, model_name, extra_args)
+
+
+def get_vae_mlir(model_name="vae", extra_args=[]):
+    model_id, revision = get_configs()
+
+    class VaeModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.vae = AutoencoderKL.from_pretrained(
+                model_id,
+                subfolder="vae",
+                revision=revision,
+            )
+
+        def forward(self, input):
+            input = 1 / 0.18215 * input
+            x = self.vae.decode(input, return_dict=False)[0]
+            x = (x / 2 + 0.5).clamp(0, 1)
+            x = x * 255.0
+            return x.round()
+
+    vae = VaeModel()
+    return get_shark_module("vae", vae, model_name, extra_args)
+
+
+def get_unet_mlir(model_name="unet", extra_args=[]):
+    model_id, revision = get_configs()
+
+    class UnetModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.unet = UNet2DConditionModel.from_pretrained(
+                model_id,
+                subfolder="unet",
+                revision=revision,
+            )
+            self.in_channels = self.unet.in_channels
+            self.train(False)
+
+        def forward(self, latent, timestep, text_embedding, guidance_scale):
+            # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+            latents = torch.cat([latent] * 2)
+            unet_out = self.unet.forward(
+                latents, timestep, text_embedding, return_dict=False
+            )[0]
+            noise_pred_uncond, noise_pred_text = unet_out.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (
+                noise_pred_text - noise_pred_uncond
+            )
+            return noise_pred
+
+    unet = UnetModel()
+    return get_shark_module("unet", unet, model_name, extra_args)
--- a/web/models/stable_diffusion/opt_params.py
+++ b/web/models/stable_diffusion/opt_params.py
@@ -0,0 +1,99 @@
+import sys
+from models.stable_diffusion.model_wrappers import (
+    get_base_vae_mlir,
+    get_vae_mlir,
+    get_unet_mlir,
+    get_clip_mlir,
+)
+from models.stable_diffusion.resources import models_db
+from models.stable_diffusion.stable_args import args
+from models.stable_diffusion.utils import get_shark_model
+
+BATCH_SIZE = len(args.prompts)
+if BATCH_SIZE != 1:
+    sys.exit("Only batch size 1 is supported.")
+
+
+def get_params(bucket_key, model_key, model, is_tuned, precision):
+    iree_flags = []
+    if len(args.iree_vulkan_target_triple) > 0:
+        iree_flags.append(
+            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
+        )
+
+    # Disable bindings fusion to work with moltenVK.
+    if sys.platform == "darwin":
+        iree_flags.append("-iree-stream-fuse-binding=false")
+
+    try:
+        bucket = models_db[0][bucket_key]
+        model_name = models_db[1][model_key]
+        iree_flags += models_db[2][model][is_tuned][precision][
+            "default_compilation_flags"
+        ]
+    except KeyError:
+        raise Exception(
+            f" there is no entry for {model_key} in the models database"
+        )
+
+    if (
+        "specified_compilation_flags"
+        in models_db[2][model][is_tuned][precision]
+    ):
+        device = (
+            args.device
+            if "://" not in args.device
+            else args.device.split("://")[0]
+        )
+        if (
+            device
+            not in models_db[2][model][is_tuned][precision][
+                "specified_compilation_flags"
+            ]
+        ):
+            device = "default_device"
+        iree_flags += models_db[2][model][is_tuned][precision][
+            "specified_compilation_flags"
+        ][device]
+
+    return bucket, model_name, iree_flags
+
+
+def get_unet():
+    # Tuned model is present only for `fp16` precision.
+    is_tuned = "tuned" if args.use_tuned else "untuned"
+    bucket_key = f"{args.variant}/{is_tuned}"
+    model_key = f"{args.variant}/{args.version}/unet/{args.precision}/length_{args.max_length}/{is_tuned}"
+    bucket, model_name, iree_flags = get_params(
+        bucket_key, model_key, "unet", is_tuned, args.precision
+    )
+    if not args.use_tuned and args.import_mlir:
+        return get_unet_mlir(model_name, iree_flags)
+    return get_shark_model(bucket, model_name, iree_flags)
+
+
+def get_vae():
+    # Tuned model is present only for `fp16` precision.
+    is_tuned = "tuned" if args.use_tuned else "untuned"
+    is_base = "/base" if args.use_base_vae else ""
+    bucket_key = f"{args.variant}/{is_tuned}"
+    model_key = f"{args.variant}/{args.version}/vae/{args.precision}/length_77/{is_tuned}{is_base}"
+    bucket, model_name, iree_flags = get_params(
+        bucket_key, model_key, "vae", is_tuned, args.precision
+    )
+    if not args.use_tuned and args.import_mlir:
+        if args.use_base_vae:
+            return get_base_vae_mlir(model_name, iree_flags)
+        return get_vae_mlir(model_name, iree_flags)
+    return get_shark_model(bucket, model_name, iree_flags)
+
+
+def get_clip():
+    bucket_key = f"{args.variant}/untuned"
+    model_key = f"{args.variant}/{args.version}/clip/fp32/length_{args.max_length}/untuned"
+    bucket, model_name, iree_flags = get_params(
+        bucket_key, model_key, "clip", "untuned", "fp32"
+    )
+    if args.import_mlir:
+        return get_clip_mlir(model_name, iree_flags)
+    return get_shark_model(bucket, model_name, iree_flags)
--- a/web/models/stable_diffusion/resources.py
+++ b/web/models/stable_diffusion/resources.py
@@ -0,0 +1,41 @@
+import os
+import json
+import sys
+
+
+def resource_path(relative_path):
+    """Get absolute path to resource, works for dev and for PyInstaller"""
+    base_path = getattr(
+        sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
+    )
+    return os.path.join(base_path, relative_path)
+
+
+prompt_examples = []
+prompts_loc = resource_path("resources/prompts.json")
+if os.path.exists(prompts_loc):
+    with open(prompts_loc, encoding="utf-8") as fopen:
+        prompt_examples = json.load(fopen)
+
+if not prompt_examples:
+    print("Unable to fetch prompt examples.")
+
+
+models_db = []
+models_loc = resource_path("resources/model_db.json")
+if os.path.exists(models_loc):
+    with open(models_loc, encoding="utf-8") as fopen:
+        models_db = json.load(fopen)
+
+if len(models_db) != 3:
+    sys.exit("Error: Unable to load models database.")
+
+
+models_config = []
+modelconfig_loc = resource_path("resources/model_config.json")
+if os.path.exists(modelconfig_loc):
+    with open(modelconfig_loc, encoding="utf-8") as fopen:
+        models_config = json.load(fopen)
+
+if len(models_config) != 2:
+    sys.exit("Error: Unable to load models configuration.")
--- a/apps/stable_diffusion/src/utils/resources/model_config.json
+++ b/apps/stable_diffusion/src/utils/resources/model_config.json
--- a/web/models/stable_diffusion/resources/model_db.json
+++ b/web/models/stable_diffusion/resources/model_db.json
@@ -0,0 +1,164 @@
+[
+  {
+    "stablediffusion/untuned":"gs://shark_tank/stable_diffusion",
+    "stablediffusion/tuned":"gs://shark_tank/sd_tuned",
+    "anythingv3/untuned":"gs://shark_tank/sd_anythingv3",
+    "anythingv3/tuned":"gs://shark_tank/sd_tuned",
+    "analogdiffusion/untuned":"gs://shark_tank/sd_analog_diffusion",
+    "analogdiffusion/tuned":"gs://shark_tank/sd_tuned",
+    "openjourney/untuned":"gs://shark_tank/sd_openjourney",
+    "openjourney/tuned":"gs://shark_tank/sd_tuned",
+    "dreamlike/untuned":"gs://shark_tank/sd_dreamlike_diffusion"
+  },
+  {
+    "stablediffusion/v1_4/unet/fp16/length_77/untuned":"unet_8dec_fp16",
+    "stablediffusion/v1_4/unet/fp32/length_77/untuned":"unet_1dec_fp32",
+    "stablediffusion/v1_4/vae/fp16/length_77/untuned":"vae_19dec_fp16",
+    "stablediffusion/v1_4/vae/fp16/length_77/untuned/base":"vae_8dec_fp16",
+    "stablediffusion/v1_4/vae/fp32/length_77/untuned":"vae_1dec_fp32",
+    "stablediffusion/v1_4/clip/fp32/length_77/untuned":"clip_18dec_fp32",
+    "stablediffusion/v2_1base/unet/fp16/length_77/untuned":"unet2base_8dec_fp16",
+    "stablediffusion/v2_1base/unet/fp16/length_77/tuned":"unet2base_8dec_fp16_tuned_v2",
+    "stablediffusion/v2_1base/unet/fp16/length_64/untuned":"unet_19dec_v2p1base_fp16_64",
+    "stablediffusion/v2_1base/unet/fp16/length_64/tuned":"unet_19dec_v2p1base_fp16_64_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/untuned":"vae2base_19dec_fp16",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned":"vae2base_19dec_fp16_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/untuned/base":"vae2base_8dec_fp16",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base":"vae2base_8dec_fp16_tuned",
+    "stablediffusion/v2_1base/clip/fp32/length_77/untuned":"clip2base_18dec_fp32",
+    "stablediffusion/v2_1base/clip/fp32/length_64/untuned":"clip_19dec_v2p1base_fp32_64",
+    "stablediffusion/v2_1/unet/fp16/length_77/untuned":"unet2_14dec_fp16",
+    "stablediffusion/v2_1/vae/fp16/length_77/untuned":"vae2_19dec_fp16",
+    "stablediffusion/v2_1/vae/fp16/length_77/untuned/base":"vae2_8dec_fp16",
+    "stablediffusion/v2_1/clip/fp32/length_77/untuned":"clip2_18dec_fp32",
+    "anythingv3/v2_1base/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
+    "anythingv3/v2_1base/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
+    "anythingv3/v2_1base/unet/fp32/length_77/untuned":"av3_unet_19dec_fp32",
+    "anythingv3/v2_1base/vae/fp16/length_77/untuned":"av3_vae_19dec_fp16",
+    "anythingv3/v2_1base/vae/fp16/length_77/tuned":"av3_vae_19dec_fp16_tuned",
+    "anythingv3/v2_1base/vae/fp16/length_77/untuned/base":"av3_vaebase_22dec_fp16",
+    "anythingv3/v2_1base/vae/fp32/length_77/untuned":"av3_vae_19dec_fp32",
+    "anythingv3/v2_1base/vae/fp32/length_77/untuned/base":"av3_vaebase_22dec_fp32",
+    "anythingv3/v2_1base/clip/fp32/length_77/untuned":"av3_clip_19dec_fp32",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/untuned":"ad_unet_19dec_fp16",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/tuned":"ad_unet_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/unet/fp32/length_77/untuned":"ad_unet_19dec_fp32",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned":"ad_vae_19dec_fp16",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/tuned":"ad_vae_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned/base":"ad_vaebase_22dec_fp16",
+    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned":"ad_vae_19dec_fp32",
+    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned/base":"ad_vaebase_22dec_fp32",
+    "analogdiffusion/v2_1base/clip/fp32/length_77/untuned":"ad_clip_19dec_fp32",
+    "openjourney/v2_1base/unet/fp16/length_64/untuned":"oj_unet_22dec_fp16_64",
+    "openjourney/v2_1base/unet/fp32/length_64/untuned":"oj_unet_22dec_fp32_64",
+    "openjourney/v2_1base/vae/fp16/length_77/untuned":"oj_vae_22dec_fp16",
+    "openjourney/v2_1base/vae/fp16/length_77/untuned/base":"oj_vaebase_22dec_fp16",
+    "openjourney/v2_1base/vae/fp32/length_77/untuned":"oj_vae_22dec_fp32",
+    "openjourney/v2_1base/vae/fp32/length_77/untuned/base":"oj_vaebase_22dec_fp32",
+    "openjourney/v2_1base/clip/fp32/length_64/untuned":"oj_clip_22dec_fp32_64",
+    "dreamlike/v2_1base/unet/fp16/length_77/untuned":"dl_unet_23dec_fp16_77",
+    "dreamlike/v2_1base/unet/fp32/length_77/untuned":"dl_unet_23dec_fp32_77",
+    "dreamlike/v2_1base/vae/fp16/length_77/untuned":"dl_vae_23dec_fp16",
+    "dreamlike/v2_1base/vae/fp16/length_77/untuned/base":"dl_vaebase_23dec_fp16",
+    "dreamlike/v2_1base/vae/fp32/length_77/untuned":"dl_vae_23dec_fp32",
+    "dreamlike/v2_1base/vae/fp32/length_77/untuned/base":"dl_vaebase_23dec_fp32",
+    "dreamlike/v2_1base/clip/fp32/length_77/untuned":"dl_clip_23dec_fp32_77"
+  },
+  {
+    "unet": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": []
+        },
+        "fp32": {
+          "default_compilation_flags": []
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32"
+          ],
+          "specified_compilation_flags": {
+            "cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
+            "default_device": ["--iree-flow-enable-conv-img2col-transform"]
+          }
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "vae": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform",
+            "--iree-flow-enable-conv-winograd-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform",
+            "--iree-flow-enable-conv-winograd-transform"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "clip": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      }
+    }
+  }
+]
--- a/web/models/stable_diffusion/resources/prompts.json
+++ b/web/models/stable_diffusion/resources/prompts.json
@@ -0,0 +1,8 @@
+[["A high tech solarpunk utopia in the Amazon rainforest"],
+["A pikachu fine dining with a view to the Eiffel Tower"],
+["A mecha robot in a favela in expressionist style"],
+["an insect robot preparing a delicious meal"],
+["A digital Illustration of the Babel tower, 4k, detailed, trending in artstation, fantasy vivid colors"],
+["Cluttered house in the woods, anime, oil painting, high resolution, cottagecore, ghibli inspired, 4k"],
+["A beautiful mansion beside a waterfall in the woods, by josef thoma, matte painting, trending on artstation HQ"],
+["portrait photo of a asia old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes"]]
--- a/web/models/stable_diffusion/schedulers.py
+++ b/web/models/stable_diffusion/schedulers.py
@@ -0,0 +1,133 @@
+import sys
+import numpy as np
+from typing import List, Optional, Tuple, Union
+from diffusers import (
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+)
+from diffusers.configuration_utils import register_to_config
+from models.stable_diffusion.utils import compile_through_fx, get_shark_model
+from models.stable_diffusion.stable_args import args
+import torch
+
+SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"
+
+model_input = {
+    "euler": {
+        "latent": torch.randn(1, 4, 64, 64),
+        "output": torch.randn(1, 4, 64, 64),
+        "sigma": torch.tensor(1).to(torch.float32),
+        "dt": torch.tensor(1).to(torch.float32),
+    },
+}
+
+
+class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        prediction_type: str = "epsilon",
+    ):
+        super().__init__(
+            num_train_timesteps,
+            beta_start,
+            beta_end,
+            beta_schedule,
+            trained_betas,
+            prediction_type,
+        )
+
+    def compile(self):
+        example_latent = model_input["euler"]["latent"]
+        example_output = model_input["euler"]["output"]
+        if args.precision == "fp16":
+            example_latent = example_latent.half()
+            example_output = example_output.half()
+        example_sigma = model_input["euler"]["sigma"]
+        example_dt = model_input["euler"]["dt"]
+
+        class ScalingModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, latent, sigma):
+                return latent / ((sigma**2 + 1) ** 0.5)
+
+        class SchedulerStepModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, noise_pred, sigma, latent, dt):
+                pred_original_sample = latent - sigma * noise_pred
+                derivative = (latent - pred_original_sample) / sigma
+                return latent + derivative * dt
+
+        iree_flags = []
+        if len(args.iree_vulkan_target_triple) > 0:
+            iree_flags.append(
+                f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
+            )
+        # Disable bindings fusion to work with moltenVK.
+        if sys.platform == "darwin":
+            iree_flags.append("-iree-stream-fuse-binding=false")
+
+        if args.import_mlir:
+            scaling_model = ScalingModel()
+            self.scaling_model = compile_through_fx(
+                scaling_model,
+                (example_latent, example_sigma),
+                model_name="euler_scale_model_input_" + args.precision,
+                extra_args=iree_flags,
+            )
+
+            step_model = SchedulerStepModel()
+            self.step_model = compile_through_fx(
+                step_model,
+                (example_output, example_sigma, example_latent, example_dt),
+                model_name="euler_step_" + args.precision,
+                extra_args=iree_flags,
+            )
+        else:
+            self.scaling_model = get_shark_model(
+                SCHEDULER_BUCKET,
+                "euler_scale_model_input_" + args.precision,
+                iree_flags,
+            )
+            self.step_model = get_shark_model(
+                SCHEDULER_BUCKET, "euler_step_" + args.precision, iree_flags
+            )
+
+    def scale_model_input(self, sample, timestep):
+        step_index = (self.timesteps == timestep).nonzero().item()
+        sigma = self.sigmas[step_index]
+        return self.scaling_model(
+            "forward",
+            (
+                sample,
+                sigma,
+            ),
+            send_to_host=False,
+        )
+
+    def step(self, noise_pred, timestep, latent):
+        step_index = (self.timesteps == timestep).nonzero().item()
+        sigma = self.sigmas[step_index]
+        dt = self.sigmas[step_index + 1] - sigma
+        return self.step_model(
+            "forward",
+            (
+                noise_pred,
+                sigma,
+                latent,
+                dt,
+            ),
+            send_to_host=False,
+        )
--- a/web/models/stable_diffusion/stable_args.py
+++ b/web/models/stable_diffusion/stable_args.py
@@ -0,0 +1,256 @@
+import argparse
+
+p = argparse.ArgumentParser(
+    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+
+##############################################################################
+### Stable Diffusion Params
+##############################################################################
+
+p.add_argument(
+    "--prompts",
+    nargs="+",
+    default=["cyberpunk forest by Salvador Dali"],
+    help="text of which images to be generated.",
+)
+
+p.add_argument(
+    "--negative-prompts",
+    nargs="+",
+    default=[""],
+    help="text you don't want to see in the generated image.",
+)
+
+p.add_argument(
+    "--steps",
+    type=int,
+    default=50,
+    help="the no. of steps to do the sampling.",
+)
+
+p.add_argument(
+    "--seed",
+    type=int,
+    default=42,
+    help="the seed to use.",
+)
+
+p.add_argument(
+    "--guidance_scale",
+    type=float,
+    default=7.5,
+    help="the value to be used for guidance scaling.",
+)
+
+p.add_argument(
+    "--max_length",
+    type=int,
+    default=64,
+    help="max length of the tokenizer output, options are 64 and 77.",
+)
+
+##############################################################################
+### Model Config and Usage Params
+##############################################################################
+
+p.add_argument(
+    "--device", type=str, default="vulkan", help="device to run the model."
+)
+
+p.add_argument(
+    "--version",
+    type=str,
+    default="v2_1base",
+    help="Specify version of stable diffusion model",
+)
+
+p.add_argument(
+    "--precision", type=str, default="fp16", help="precision to run the model."
+)
+
+p.add_argument(
+    "--import_mlir",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="imports the model from torch module to shark_module otherwise downloads the model from shark_tank.",
+)
+
+p.add_argument(
+    "--load_vmfb",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="attempts to load the model from a precompiled flatbuffer and compiles + saves it if not found.",
+)
+
+p.add_argument(
+    "--save_vmfb",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="saves the compiled flatbuffer to the local directory",
+)
+
+p.add_argument(
+    "--use_tuned",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="Download and use the tuned version of the model if available",
+)
+
+p.add_argument(
+    "--use_base_vae",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Do conversion from the VAE output to pixel space on cpu.",
+)
+
+p.add_argument(
+    "--variant",
+    default="stablediffusion",
+    help="We now support multiple vairants of SD finetuned for different dataset. you can use the following anythingv3, ...",  # TODO add more once supported
+)
+
+p.add_argument(
+    "--scheduler",
+    type=str,
+    default="SharkEulerDiscrete",
+    help="other supported schedulers are [PNDM, DDIM, LMSDiscrete, EulerDiscrete, DPMSolverMultistep]",
+)
+
+p.add_argument(
+    "--output_img_format",
+    type=str,
+    default="png",
+    help="specify the format in which output image is save. Supported options: jpg / png",
+)
+
+p.add_argument(
+    "--output_dir",
+    type=str,
+    default=None,
+    help="Directory path to save the output images and json",
+)
+
+##############################################################################
+### IREE - Vulkan supported flags
+##############################################################################
+
+p.add_argument(
+    "--iree-vulkan-target-triple",
+    type=str,
+    default="",
+    help="Specify target triple for vulkan",
+)
+
+p.add_argument(
+    "--vulkan_debug_utils",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Profiles vulkan device and collects the .rdc info",
+)
+
+p.add_argument(
+    "--vulkan_large_heap_block_size",
+    default="4147483648",
+    help="flag for setting VMA preferredLargeHeapBlockSize for vulkan device, default is 4G",
+)
+
+p.add_argument(
+    "--vulkan_validation_layers",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for disabling vulkan validation layers when benchmarking",
+)
+
+##############################################################################
+### Misc. Debug and Optimization flags
+##############################################################################
+
+p.add_argument(
+    "--use_compiled_scheduler",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="use the default scheduler precompiled into the model if available",
+)
+
+p.add_argument(
+    "--local_tank_cache",
+    default="",
+    help="Specify where to save downloaded shark_tank artifacts. If this is not set, the default is ~/.local/shark_tank/.",
+)
+
+p.add_argument(
+    "--dump_isa",
+    default=False,
+    action="store_true",
+    help="When enabled call amdllpc to get ISA dumps. use with dispatch benchmarks.",
+)
+
+p.add_argument(
+    "--dispatch_benchmarks",
+    default=None,
+    help='dispatches to return benchamrk data on.  use "All" for all, and None for none.',
+)
+
+p.add_argument(
+    "--dispatch_benchmarks_dir",
+    default="temp_dispatch_benchmarks",
+    help='directory where you want to store dispatch data generated with "--dispatch_benchmarks"',
+)
+
+p.add_argument(
+    "--enable_rgp",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for inserting debug frames between iterations for use with rgp.",
+)
+
+p.add_argument(
+    "--hide_steps",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="flag for hiding the details of iteration/sec for each step.",
+)
+
+p.add_argument(
+    "--warmup_count",
+    type=int,
+    default=0,
+    help="flag setting warmup count for clip and vae [>= 0].",
+)
+
+p.add_argument(
+    "--clear_all",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag to clear all mlir and vmfb from common locations. Recompiling will take several minutes",
+)
+
+##############################################################################
+### Web UI flags
+##############################################################################
+
+p.add_argument(
+    "--progress_bar",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="flag for removing the pregress bar animation during image generation",
+)
+
+p.add_argument(
+    "--share",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for generating a public URL",
+)
+
+p.add_argument(
+    "--server_port",
+    type=int,
+    default=8080,
+    help="flag for setting server port",
+)
+
+##############################################################################
+
+args = p.parse_args()
--- a/web/models/stable_diffusion/utils.py
+++ b/web/models/stable_diffusion/utils.py
@@ -0,0 +1,243 @@
+import os
+import torch
+from shark.shark_inference import SharkInference
+from models.stable_diffusion.stable_args import args
+from shark.shark_importer import import_with_fx
+from shark.iree_utils.vulkan_utils import (
+    set_iree_vulkan_runtime_flags,
+    get_vulkan_target_triple,
+)
+
+
+def _compile_module(shark_module, model_name, extra_args=[]):
+    if args.load_vmfb or args.save_vmfb:
+        device = (
+            args.device
+            if "://" not in args.device
+            else "-".join(args.device.split("://"))
+        )
+        extended_name = "{}_{}".format(model_name, device)
+        vmfb_path = os.path.join(os.getcwd(), extended_name + ".vmfb")
+        if args.load_vmfb and os.path.isfile(vmfb_path) and not args.save_vmfb:
+            print(f"loading existing vmfb from: {vmfb_path}")
+            shark_module.load_module(vmfb_path, extra_args=extra_args)
+        else:
+            if args.save_vmfb:
+                print("Saving to {}".format(vmfb_path))
+            else:
+                print(
+                    "No vmfb found. Compiling and saving to {}".format(
+                        vmfb_path
+                    )
+                )
+            path = shark_module.save_module(
+                os.getcwd(), extended_name, extra_args
+            )
+            shark_module.load_module(path, extra_args=extra_args)
+    else:
+        shark_module.compile(extra_args)
+    return shark_module
+
+
+# Downloads the model from shark_tank and returns the shark_module.
+def get_shark_model(tank_url, model_name, extra_args=[]):
+    from shark.shark_downloader import download_model
+    from shark.parser import shark_args
+
+    # Set local shark_tank cache directory.
+    shark_args.local_tank_cache = args.local_tank_cache
+
+    mlir_model, func_name, inputs, golden_out = download_model(
+        model_name,
+        tank_url=tank_url,
+        frontend="torch",
+    )
+    shark_module = SharkInference(
+        mlir_model, device=args.device, mlir_dialect="linalg"
+    )
+    return _compile_module(shark_module, model_name, extra_args)
+
+
+# Converts the torch-module into a shark_module.
+def compile_through_fx(model, inputs, model_name, extra_args=[]):
+
+    mlir_module, func_name = import_with_fx(model, inputs)
+
+    shark_module = SharkInference(
+        mlir_module,
+        device=args.device,
+        mlir_dialect="linalg",
+    )
+
+    return _compile_module(shark_module, model_name, extra_args)
+
+
+def set_iree_runtime_flags():
+
+    vulkan_runtime_flags = [
+        f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}",
+        f"--vulkan_validation_layers={'true' if args.vulkan_validation_layers else 'false'}",
+    ]
+    if args.enable_rgp:
+        vulkan_runtime_flags += [
+            f"--enable_rgp=true",
+            f"--vulkan_debug_utils=true",
+        ]
+    set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)
+
+
+def get_all_devices(driver_name):
+    """
+    Inputs: driver_name
+    Returns a list of all the available devices for a given driver sorted by
+    the iree path names of the device as in --list_devices option in iree.
+    """
+    from iree.runtime import get_driver
+
+    driver = get_driver(driver_name)
+    device_list_src = driver.query_available_devices()
+    device_list_src.sort(key=lambda d: d["path"])
+    return device_list_src
+
+
+def get_device_mapping(driver, key_combination=3):
+    """This method ensures consistent device ordering when choosing
+    specific devices for execution
+    Args:
+        driver (str): execution driver (vulkan, cuda, rocm, etc)
+        key_combination (int, optional): choice for mapping value for device name.
+        1 : path
+        2 : name
+        3 : (name, path)
+        Defaults to 3.
+    Returns:
+        dict: map to possible device names user can input mapped to desired combination of name/path.
+    """
+    from shark.iree_utils._common import iree_device_map
+
+    driver = iree_device_map(driver)
+    device_list = get_all_devices(driver)
+    device_map = dict()
+
+    def get_output_value(dev_dict):
+        if key_combination == 1:
+            return f"{driver}://{dev_dict['path']}"
+        if key_combination == 2:
+            return dev_dict["name"]
+        if key_combination == 3:
+            return (dev_dict["name"], f"{driver}://{dev_dict['path']}")
+
+    # mapping driver name to default device (driver://0)
+    device_map[f"{driver}"] = get_output_value(device_list[0])
+    for i, device in enumerate(device_list):
+        # mapping with index
+        device_map[f"{driver}://{i}"] = get_output_value(device)
+        # mapping with full path
+        device_map[f"{driver}://{device['path']}"] = get_output_value(device)
+    return device_map
+
+
+def map_device_to_name_path(device, key_combination=3):
+    """Gives the appropriate device data (supported name/path) for user selected execution device
+    Args:
+        device (str): user
+        key_combination (int, optional): choice for mapping value for device name.
+        1 : path
+        2 : name
+        3 : (name, path)
+        Defaults to 3.
+    Raises:
+        ValueError:
+    Returns:
+        str / tuple: returns the mapping str or tuple of mapping str for the device depending on key_combination value
+    """
+    driver = device.split("://")[0]
+    device_map = get_device_mapping(driver, key_combination)
+    try:
+        device_mapping = device_map[device]
+    except KeyError:
+        raise ValueError(f"Device '{device}' is not a valid device.")
+    return device_mapping
+
+
+def set_init_device_flags():
+    if "vulkan" in args.device:
+        # set runtime flags for vulkan.
+        set_iree_runtime_flags()
+
+        # set triple flag to avoid multiple calls to get_vulkan_triple_flag
+        device_name, args.device = map_device_to_name_path(args.device)
+        if not args.iree_vulkan_target_triple:
+            triple = get_vulkan_target_triple(device_name)
+            if triple is not None:
+                args.iree_vulkan_target_triple = triple
+        print(
+            f"Found device {device_name}. Using target triple {args.iree_vulkan_target_triple}."
+        )
+    elif "cuda" in args.device:
+        args.device = "cuda"
+    elif "cpu" in args.device:
+        args.device = "cpu"
+
+    # set max_length based on availability.
+    if args.version == "v1_4":
+        args.max_length = 77
+    elif args.variant in ["anythingv3", "analogdiffusion", "dreamlike"]:
+        args.max_length = 77
+    elif args.variant == "openjourney":
+        args.max_length = 64
+
+    # use tuned models only in the case of stablediffusion/fp16 and rdna3 cards.
+    if (
+        args.variant in ["openjourney", "dreamlike"]
+        or args.precision != "fp16"
+        or args.version == "v1_4"
+        or "vulkan" not in args.device
+        or "rdna3" not in args.iree_vulkan_target_triple
+    ):
+        args.use_tuned = False
+        print("Tuned models are currently not supported for this setting.")
+
+    elif args.use_base_vae and args.variant != "stablediffusion":
+        args.use_tuned = False
+        print("Tuned models are currently not supported for this setting.")
+
+    if args.use_tuned:
+        print("Using tuned models for stablediffusion/fp16 and rdna3 card.")
+
+
+# Utility to get list of devices available.
+def get_available_devices():
+    def get_devices_by_name(driver_name):
+        from shark.iree_utils._common import iree_device_map
+
+        device_list = []
+        try:
+            driver_name = iree_device_map(driver_name)
+            device_list_dict = get_all_devices(driver_name)
+            print(f"{driver_name} devices are available.")
+        except:
+            print(f"{driver_name} devices are not available.")
+        else:
+            for i, device in enumerate(device_list_dict):
+                device_list.append(f"{device['name']} => {driver_name}://{i}")
+        return device_list
+
+    set_iree_runtime_flags()
+
+    available_devices = []
+    vulkan_devices = get_devices_by_name("vulkan")
+    available_devices.extend(vulkan_devices)
+    cuda_devices = get_devices_by_name("cuda")
+    available_devices.extend(cuda_devices)
+    #  available_devices.append("cpu")
+    return available_devices
+
+
+def disk_space_check(path, lim=20):
+    from shutil import disk_usage
+
+    du = disk_usage(path)
+    free = du.free / (1024 * 1024 * 1024)
+    if free <= lim:
+        print(f"[WARNING] Only {free:.2f}GB space available in {path}.")
--- a/apps/stable_diffusion/shark_sd.spec
+++ b/apps/stable_diffusion/shark_sd.spec
@@ -19,19 +19,17 @@ datas += copy_metadata('torchvision')
 datas += copy_metadata('torch-mlir')
 datas += copy_metadata('diffusers')
 datas += copy_metadata('transformers')
-datas += copy_metadata('omegaconf')
-datas += copy_metadata('safetensors')
 datas += collect_data_files('gradio')
 datas += collect_data_files('iree')
 datas += collect_data_files('google-cloud-storage')
 datas += collect_data_files('shark')
 datas += [
-         ( 'src/utils/resources/prompts.json', 'resources' ),
-         ( 'src/utils/resources/model_db.json', 'resources' ),
-         ( 'src/utils/resources/opt_flags.json', 'resources' ),
-         ( 'src/utils/resources/base_model.json', 'resources' ),
-         ( 'web/logos/*', 'logos' )
+         ( 'models/stable_diffusion/resources/prompts.json', 'resources' ),
+         ( 'models/stable_diffusion/resources/model_db.json', 'resources' ),
+         ( 'models/stable_diffusion/resources/model_config.json', 'resources' ),
+         ( 'models/stable_diffusion/logos/*', 'logos' )
         ]
+datas += [('demo.css', '.')]

 binaries = []

@@ -39,11 +37,11 @@ block_cipher = None


 a = Analysis(
-    ['web/index.py'],
+    ['index.py'],
    pathex=['.'],
    binaries=binaries,
    datas=datas,
-    hiddenimports=['shark', 'shark.*', 'shark.shark_inference', 'shark_inference', 'iree.tools.core', 'gradio', 'apps'],
+    hiddenimports=['shark', 'shark.*', 'shark.shark_inference', 'shark_inference', 'iree.tools.core', 'gradio'],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
--- a/apps/stable_diffusion/scripts/telegram_bot.py
+++ b/apps/stable_diffusion/scripts/telegram_bot.py
				`@@ -1 +0,0 @@`
				`from apps.stable_diffusion.scripts.txt2img import txt2img_inf`