Update to build 423

Post pytorch security breach
replaced <username> with %username% for easy copy/paste (#744 )
2026-04-20 03:00:34 -04:00 · 2023-01-01 12:10:23 -08:00 · 2022-12-31 21:29:37 -08:00 · 2022-12-28 15:14:36 -08:00 · 2022-12-28 12:27:43 -06:00 · 2022-12-27 14:25:20 -08:00
102 changed files with 4905 additions and 2169 deletions
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -9,7 +9,80 @@ on:
  workflow_dispatch:

 jobs:
-  build:
+  windows-build:
+    runs-on: windows-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.10"]
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Compute version
+      shell: powershell
+      run: |
+        $package_version = $(Get-Date -UFormat "%Y%m%d")+"."+${{ github.run_number }}
+        $package_version_ = $(Get-Date -UFormat "%Y%m%d")+"_"+${{ github.run_number }}
+        $tag_name=$package_version
+        echo "package_version=$package_version" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
+        echo "package_version_=$package_version_" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
+        echo "tag_name=$tag_name" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
+
+    - name: Create Release
+      id: create_release
+      uses: actions/create-release@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
+      with:
+        tag_name: ${{ env.tag_name }}
+        release_name: nod.ai SHARK ${{ env.tag_name }}
+        body: |
+          Automatic snapshot release of nod.ai SHARK.
+        draft: true
+        prerelease: false
+
+    - name: Build Package 
+      shell: powershell
+      run: |
+        ./setup_venv.ps1
+        pyinstaller web/shark_sd.spec
+        mv ./dist/shark_sd.exe ./dist/shark_sd_${{ env.package_version_ }}.exe
+
+        
+    # GHA windows VM OOMs so disable for now
+    #- name: Build and validate the SHARK Runtime package
+    #  shell: powershell
+    #  run: |
+    #    $env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
+    #    pip wheel -v -w dist . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
+
+    - uses: actions/upload-artifact@v2
+      with:
+        path: dist/*
+    
+    - name: Upload Release Assets
+      id: upload-release-assets
+      uses: dwenegar/upload-release-assets@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
+      with:
+        release_id: ${{ steps.create_release.outputs.id }}
+        assets_path: ./dist/*
+
+    - name: Publish Release
+      id: publish_release
+      uses: eregon/publish-release@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
+      with:
+        release_id: ${{ steps.create_release.outputs.id }}
+
+  linux-build:

    runs-on: a100
    strategy:
@@ -32,31 +105,10 @@ jobs:
        key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
        restore-keys: |
          ${{ runner.os }}-pip-
-    
-    - name: Compute version
-      run: |
-        package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
-        tag_name="${package_version}"
-        echo "package_version=${package_version}" >> $GITHUB_ENV
-        echo "tag_name=${tag_name}" >> $GITHUB_ENV    
-    - name: Set Environment Variables
-      run: |
-        echo "SHORT_SHA=`git rev-parse --short=4 HEAD`" >> $GITHUB_ENV
-        echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
-    - name: Create Release
-      id: create_release
-      uses: actions/create-release@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      with:
-        tag_name: ${{ env.tag_name }}
-        release_name: nod.ai SHARK ${{ env.tag_name }}
-        body: |
-          Automatic snapshot release of nod.ai SHARK.
-        draft: true
-        prerelease: false
+
    - name: Install dependencies
      run: |
+        echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
        python -m pip install --upgrade pip
        python -m pip install flake8 pytest toml
        if [ -f requirements.txt ]; then pip install -r requirements.txt -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html; fi
@@ -68,6 +120,7 @@ jobs:
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude shark.venv,lit.cfg.py 
    - name: Build and validate the IREE package
      if: ${{ matrix.backend == 'IREE' }}
+      continue-on-error: true
      run: |
        cd $GITHUB_WORKSPACE
        USE_IREE=1 VENV_DIR=iree.venv ./setup_venv.sh
@@ -79,7 +132,7 @@ jobs:
        pip install ./wheelhouse/nodai*
        # Validate the Models
        /bin/bash "$GITHUB_WORKSPACE/build_tools/populate_sharktank_ci.sh"
-        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" tank/test_models.py |
+        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" -k "not metal" |
          tail -n 1 |
          tee -a pytest_results.txt
        if !(grep -Fxq " failed" pytest_results.txt) 
@@ -102,25 +155,6 @@ jobs:
        # Install the built wheel
        pip install ./wheelhouse/nodai*
        # Validate the Models
-        pytest --ci --ci_sha=${SHORT_SHA} tank/test_models.py |
+        pytest --ci --ci_sha=${SHORT_SHA} -k "not metal" |
          tail -n 1 |
          tee -a pytest_results.txt
-    
-    - name: Upload Release Assets
-      if: ${{ matrix.backend == 'SHARK' }}
-      id: upload-release-assets
-      uses: dwenegar/upload-release-assets@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      with:
-        release_id: ${{ steps.create_release.outputs.id }}
-        assets_path: ./wheelhouse/nodai_*.whl
-
-    - name: Publish Release
-      if: ${{ matrix.backend == 'SHARK' }}
-      id: publish_release
-      uses: eregon/publish-release@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      with:
-        release_id: ${{ steps.create_release.outputs.id }}
--- a/.github/workflows/test-models.yml
+++ b/.github/workflows/test-models.yml
@@ -6,8 +6,14 @@ name: Validate Models on Shark Runtime
 on:
  push:
    branches: [ main ]
+    paths-ignore:
+      - '**.md'
+      - 'shark/examples/**'
  pull_request:
    branches: [ main ]
+    paths-ignore:
+      - '**.md'
+      - 'shark/examples/**'
  workflow_dispatch:

 # Ensure that only a single job or workflow using the same
@@ -36,8 +42,6 @@ jobs:
            suite: cuda
          - os: ubuntu-latest
            suite: cpu
-          - os: MacStudio
-            suite: vulkan
          - os: MacStudio
            suite: cuda
          - os: MacStudio
@@ -118,19 +122,15 @@ jobs:
        cd $GITHUB_WORKSPACE
        PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
        source shark.venv/bin/activate
-        echo "VULKAN SDK PATH wo setup: $VULKAN_SDK"
-        cd /Users/anush/VulkanSDK/1.3.224.1/
-        source setup-env.sh
-        cd $GITHUB_WORKSPACE
-        echo "VULKAN SDK PATH with setup: $VULKAN_SDK"
+        export DYLD_LIBRARY_PATH=/usr/local/lib/
        echo $PATH
        pip list | grep -E "torch|iree"
-        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" tank/test_models.py -k vulkan --update_tank
+        pytest -s --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" tank/test_models.py -k vulkan --update_tank

    - name: Validate Vulkan Models (a100)
      if: matrix.suite == 'vulkan' && matrix.os != 'MacStudio'
      run: |
        cd $GITHUB_WORKSPACE
-        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
+        PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --ci --ci_sha=${SHORT_SHA} -s --local_tank_cache="/data/anush/shark_cache" tank/test_models.py -k vulkan --update_tank
+        pytest --benchmark --ci --ci_sha=${SHORT_SHA} -s --local_tank_cache="/data/anush/shark_cache" tank/test_models.py -k vulkan --update_tank
--- a/.gitignore
+++ b/.gitignore
@@ -31,7 +31,6 @@ MANIFEST
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
-*.spec

 # Installer logs
 pip-log.txt
@@ -163,6 +162,9 @@ cython_debug/
 # Shark related artefacts
 *venv/
 shark_tmp/
+*.vmfb
+.use-iree
+tank/dict_configs.py

 # ORT related artefacts
 cache_models/
--- a/README.md
+++ b/README.md
@@ -5,16 +5,114 @@ High Performance Machine Learning and Data Analytics for CPUs, GPUs, Accelerator
 [![Nightly Release](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml)
 [![Validate torch-models on Shark Runtime](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml)

-## Communication Channels

-*   [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real time discussions with the SHARK team and other users
-*   [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bugs etc
+## Installation (Windows, Linux and macOS)
+
+## Check out the code
+
+```shell
+git clone https://github.com/nod-ai/SHARK.git
+cd SHARK
+```
+
+## Setup your Python VirtualEnvironment and Dependencies
+
+### Windows 10/11 Users
+
+* Install the latest Python 3.10.x version from [here](https://www.python.org/downloads/windows/)
+
+* Install Git for Windows from [here](https://git-scm.com/download/win)
+
+#### Allow the install script to run in Powershell
+```powershell
+set-executionpolicy remotesigned
+```
+
+#### Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...)
+```powershell
+./setup_venv.ps1 #You can re-run this script to get the latest version
+```
+
+### Linux / macOS Users
+
+```shell
+./setup_venv.sh
+source shark.venv/bin/activate
+```


-## Installation
+### Run Stable Diffusion on your device - WebUI
+
+#### Windows 10/11 Users
+```powershell
+(shark.venv) PS C:\Users\nod\SHARK> cd web
+(shark.venv) PS C:\Users\nod\SHARK\web> python index.py
+```
+#### Linux Users
+```shell
+(shark.venv) > cd web
+(shark.venv) > python index.py
+```
+
+#### Access Stable Diffusion on http://localhost:8080/?__theme=dark
+
+
+<img width="1607" alt="webui" src="https://user-images.githubusercontent.com/74956/204939260-b8308bc2-8dc4-47f6-9ac0-f60b66edab99.png">
+
+
+
+### Run Stable Diffusion on your device - Commandline
+
+#### Install your hardware drivers
+* [AMD RDNA Users] Download the latest driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mril-iree)
+* [macOS Users] Download and install the latest Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home)
+* [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)
+
+Other users please ensure you have your latest vendor drivers and Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home) and if you are using vulkan check `vulkaninfo` works in a terminal window
+
+
+#### Windows 10/11 Users
+```powershell
+(shark.venv) PS C:\g\shark> python .\shark\examples\shark_inference\stable_diffusion\main.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
+```
+
+#### Linux / macOS Users
+```shell
+python3.10 shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
+```
+
+You can replace `vulkan` with `cpu` to run on your CPU or with `cuda` to run on CUDA devices. If you have multiple vulkan devices you can address them with `--device=vulkan://1` etc
+
+The output on a 6900XT would like:
+
+```shell 
+44it [00:08,  5.14it/s]i = 44 t = 120 (191ms)
+45it [00:08,  5.15it/s]i = 45 t = 100 (191ms)
+46it [00:08,  5.16it/s]i = 46 t = 80 (191ms)
+47it [00:09,  5.16it/s]i = 47 t = 60 (193ms)
+48it [00:09,  5.15it/s]i = 48 t = 40 (195ms)
+49it [00:09,  5.12it/s]i = 49 t = 20 (196ms)
+50it [00:09,  5.14it/s]
+Average step time: 192.8154182434082ms/it
+Total image generation runtime (s): 10.390909433364868
+(shark.venv) PS C:\g\shark>
+```
+
+Here are some samples generated:
+
+![tajmahal, snow, sunflowers, oil on canvas_0](https://user-images.githubusercontent.com/74956/204934186-141f7e43-6eb2-4e89-a99c-4704d20444b3.jpg)
+
+![a photo of a crab playing a trumpet](https://user-images.githubusercontent.com/74956/204933258-252e7240-8548-45f7-8253-97647d38313d.jpg)
+
+
+
+For more options to the Stable Diffusion model read [this](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md)
+
+Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware. 
+

 <details>
-  <summary>Installation (Linux, macOS and Windows)</summary>
+  <summary>Binary Installation</summary>

 ### Setup a new pip Virtual Environment

@@ -66,43 +164,6 @@ python ./minilm_jit.py --device="cpu"  #use cuda or vulkan or metal
 </details>


-<details>
-  <summary>Source Installation</summary>
-
-## Check out the code
-
-```shell
-git clone https://github.com/nod-ai/SHARK.git
-```
-
-## Setup your Python VirtualEnvironment and Dependencies
-
-### Windows Users
-
-```shell
-# Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...). 
-# Requires Python 3.10 and Powershell
-./setup_venv.ps1
-shark.venv/Scripts/activate
-```
-
-### Linux / macOS Users
-
-```shell
-# Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...).
-./setup_venv.sh
-source shark.venv/bin/activate
-```
-
-
-### Run a demo script
-```shell
-python -m  shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
-# Or a pytest
-pytest tank/test_models.py -k "MiniLM"
-```
-
-</details>

 <details>
  <summary>Development, Testing and Benchmarks</summary>
@@ -113,9 +174,17 @@ Set `USE_IREE=1` to use upstream IREE
 # PYTHON=python3.10 VENV_DIR=0617_venv IMPORTER=1 ./setup_venv.sh 
 ```

+### Run any of the hundreds of SHARK tank models via the test framework
+```shell
+python -m  shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
+# Or a pytest
+pytest tank/test_models.py -k "MiniLM"
+```
+  
+
 If you are a *Torch-mlir developer or an IREE developer* and want to test local changes you can uninstall
 the provided packages with `pip uninstall torch-mlir` and / or `pip uninstall iree-compiler iree-runtime` and build locally
-with Python bindings and set your PYTHONPATH as mentioned [here](https://google.github.io/iree/bindings/python/)
+with Python bindings and set your PYTHONPATH as mentioned [here](https://github.com/iree-org/iree/tree/main/docs/api_docs/python#install-iree-binaries)
 for IREE and [here](https://github.com/llvm/torch-mlir/blob/main/development.md#setup-python-environment-to-export-the-built-python-packages)
 for Torch-MLIR.

@@ -154,6 +223,7 @@ shark_module = SharkInference(
 ```

 Output will include:
+- An ordered list ordered-dispatches.txt of all the dispatches with their runtime
 - Inside the specified directory, there will be a directory for each dispatch (there will be mlir files for all dispatches, but only compiled binaries and benchmark data for the specified dispatches)
 - An .mlir file containing the dispatch benchmark 
 - A compiled .vmfb file containing the dispatch benchmark
@@ -232,6 +302,11 @@ SHARK is maintained to support the latest innovations in ML Models:

 For a complete list of the models supported in SHARK, please refer to [tank/README.md](https://github.com/nod-ai/SHARK/blob/main/tank/README.md).

+## Communication Channels
+
+*   [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real time discussions with the SHARK team and other users
+*   [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bugs etc
+
 ## Related Projects

 <details>
--- a/benchmarks/tests/test_benchmark.py
+++ b/benchmarks/tests/test_benchmark.py
@@ -42,7 +42,7 @@ class TFHuggingFaceLanguage(tf.Module):
            input_ids=x, attention_mask=y, token_type_ids=z, training=False
        )

-    @tf.function(input_signature=tf_bert_input)
+    @tf.function(input_signature=tf_bert_input, jit_compile=True)
    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.m.predict(input_ids, attention_mask, token_type_ids)

--- a/cpp/save_img.py
+++ b/cpp/save_img.py
@@ -1,7 +1,6 @@
 import numpy as np
 import tensorflow as tf
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_tf_model


 def load_and_preprocess_image(fname: str):
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,3 +1,3 @@
 [pytest]
 addopts = --verbose -p no:warnings
-norecursedirs = inference tank/tflite 
+norecursedirs = inference tank/tflite examples benchmarks shark 
--- a/requirements-importer-macos.txt
+++ b/requirements-importer-macos.txt
@@ -2,7 +2,7 @@
 --pre

 numpy
-torch==1.14.0.dev20221021
+torch
 torchvision

 tqdm
@@ -28,6 +28,7 @@ Pillow

 # web dependecies.
 gradio
+altair

 # Testing and support.
 #lit
--- a/requirements-importer.txt
+++ b/requirements-importer.txt
@@ -2,7 +2,6 @@
 --pre

 numpy==1.22.4
-torch
 torchvision

 tqdm
@@ -35,6 +34,7 @@ sacremoses

 # web dependecies.
 gradio
+altair
 scipy

 #ONNX and ORT for benchmarking
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,10 +5,21 @@ wheel
 tqdm

 # SHARK Downloader
-gsutil
+google-cloud-storage

 # Testing
 pytest
 pytest-xdist
 Pillow
 parameterized
+
+# Add transformers, diffusers and scipy since it most commonly used
+transformers
+diffusers
+scipy
+ftfy
+gradio
+altair
+
+# Keep PyInstaller at the end. Sometimes Windows Defender flags it but most folks can continue even if it errors
+pyinstaller
--- a/setup_venv.ps1
+++ b/setup_venv.ps1
@@ -31,10 +31,9 @@ Write-Host "Installing Build Dependencies"
 python -m venv .\shark.venv\
 .\shark.venv\Scripts\activate
 pip install -r requirements.txt
-pip install --pre torch-mlir torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116 -f https://llvm.github.io/torch-mlir/package-index/
+pip install --pre torch-mlir torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/
 pip install --upgrade -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html iree-compiler iree-runtime
 Write-Host "Building SHARK..."
 pip install -e . -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
-pip install diffusers transformers scipy pillow gradio
 Write-Host "Build and installation completed successfully"
 Write-Host "Source your venv with ./shark.venv/Scripts/activate"
--- a/setup_venv.sh
+++ b/setup_venv.sh
@@ -77,7 +77,8 @@ $PYTHON -m pip install --upgrade pip || die "Could not upgrade pip"
 $PYTHON -m pip install --upgrade -r "$TD/requirements.txt"
 if [ "$torch_mlir_bin" = true ]; then
  if [[ $(uname -s) = 'Darwin' ]]; then
-    echo "MacOS detected. Please install torch-mlir from source or .whl, as dependency problems may occur otherwise."
+    echo "MacOS detected. Installing torch-mlir from .whl, to avoid dependency problems with torch."
+    $PYTHON -m pip install --pre --no-cache-dir  torch-mlir -f https://llvm.github.io/torch-mlir/package-index/ -f https://download.pytorch.org/whl/nightly/torch/
  else
    $PYTHON -m pip install --pre torch-mlir -f https://llvm.github.io/torch-mlir/package-index/
    if [ $? -eq 0 ];then
@@ -93,8 +94,10 @@ else
  exit 1
 fi
 if [[ -z "${USE_IREE}" ]]; then
+  rm .use-iree
  RUNTIME="https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html"
 else
+  touch ./.use-iree
  RUNTIME="https://iree-org.github.io/iree/pip-release-links.html"
 fi
 if [[ -z "${NO_BACKEND}" ]]; then
@@ -103,29 +106,29 @@ if [[ -z "${NO_BACKEND}" ]]; then
 else
  echo "Not installing a backend, please make sure to add your backend to PYTHONPATH"
 fi
+
 if [[ ! -z "${IMPORTER}" ]]; then
  echo "${Yellow}Installing importer tools.."
  if [[ $(uname -s) = 'Linux' ]]; then
    echo "${Yellow}Linux detected.. installing Linux importer tools"
    #Always get the importer tools from upstream IREE
-    $PYTHON -m pip install --upgrade -r "$TD/requirements-importer.txt" -f https://iree-org.github.io/iree/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+    $PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer.txt" -f https://iree-org.github.io/iree/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
  elif [[ $(uname -s) = 'Darwin' ]]; then
    echo "${Yellow}macOS detected.. installing macOS importer tools"
    #Conda seems to have some problems installing these packages and hope they get resolved upstream.
-    $PYTHON -m pip install --upgrade -r "$TD/requirements-importer-macos.txt" -f ${RUNTIME} --extra-index-url https://download.pytorch.org/whl/nightly/cpu
-    $PYTHON -m pip install https://github.com/llvm/torch-mlir/releases/download/snapshot-20221024.636/torch_mlir-20221024.636-cp310-cp310-macosx_11_0_universal2.whl
+    $PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer-macos.txt" -f ${RUNTIME} --extra-index-url https://download.pytorch.org/whl/nightly/cpu
  fi
 fi

-$PYTHON -m pip install -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME}
+$PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f https://download.pytorch.org/whl/nightly/torch/

 if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
  $PYTHON -m pip uninstall -y torch torchvision
-  $PYTHON -m pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116
+  $PYTHON -m pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu117
  if [ $? -eq 0 ];then
-    echo "Successfully Installed torch + cu116."
+    echo "Successfully Installed torch + cu117."
  else
-    echo "Could not install torch + cu116." >&2
+    echo "Could not install torch + cu117." >&2
  fi
 fi

--- a/shark/examples/shark_eager/dynamo_demo.ipynb
+++ b/shark/examples/shark_eager/dynamo_demo.ipynb
@@ -36,7 +36,9 @@
    "    from torchdynamo.optimizations.backends import create_backend\n",
    "    from torchdynamo.optimizations.subgraph import SubGraph\n",
    "except ModuleNotFoundError:\n",
-    "    print(\"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo\")\n",
+    "    print(\n",
+    "        \"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo\"\n",
+    "    )\n",
    "    exit()\n",
    "\n",
    "# torch-mlir imports for compiling\n",
@@ -97,7 +99,9 @@
    "\n",
    "        for node in fx_g.graph.nodes:\n",
    "            if node.op == \"output\":\n",
-    "                assert len(node.args) == 1, \"Output node must have a single argument\"\n",
+    "                assert (\n",
+    "                    len(node.args) == 1\n",
+    "                ), \"Output node must have a single argument\"\n",
    "                node_arg = node.args[0]\n",
    "                if isinstance(node_arg, tuple) and len(node_arg) == 1:\n",
    "                    node.args = (node_arg[0],)\n",
@@ -116,8 +120,12 @@
    "    if len(args) == 1 and isinstance(args[0], list):\n",
    "        args = args[0]\n",
    "\n",
-    "    linalg_module = compile(ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS)\n",
-    "    callable, _ = get_iree_compiled_module(linalg_module, \"cuda\", func_name=\"forward\")\n",
+    "    linalg_module = compile(\n",
+    "        ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS\n",
+    "    )\n",
+    "    callable, _ = get_iree_compiled_module(\n",
+    "        linalg_module, \"cuda\", func_name=\"forward\"\n",
+    "    )\n",
    "\n",
    "    def forward(*inputs):\n",
    "        return callable(*inputs)\n",
@@ -212,6 +220,7 @@
    "    assert isinstance(subgraph, SubGraph), \"Model must be a dynamo SubGraph.\"\n",
    "    return __torch_mlir(subgraph.model, *list(subgraph.example_inputs))\n",
    "\n",
+    "\n",
    "@torchdynamo.optimize(\"torch_mlir\")\n",
    "def toy_example2(*args):\n",
    "    a, b = args\n",
--- a/shark/examples/shark_inference/CLIPModel_tf.py
+++ b/shark/examples/shark_inference/CLIPModel_tf.py
@@ -22,7 +22,7 @@ class CLIPModule(tf.Module):
            input_ids=x, attention_mask=y, pixel_values=z
        )

-    @tf.function(input_signature=clip_vit_inputs)
+    @tf.function(input_signature=clip_vit_inputs, jit_compile=True)
    def forward(self, input_ids, attention_mask, pixel_values):
        return self.m.predict(
            input_ids, attention_mask, pixel_values
--- a/shark/examples/shark_inference/ESRGAN/README.md
+++ b/shark/examples/shark_inference/ESRGAN/README.md
@@ -0,0 +1,15 @@
+## Running ESRGAN
+
+```
+1. pip install numpy opencv-python
+2. mkdir InputImages
+   (this is where all the input images will reside in)
+3. mkdir OutputImages
+   (this is where the model will generate all the images)
+4. mkdir models
+   (save the .pth checkpoint file here)
+5. python esrgan.py
+```
+
+- Download [RRDB_ESRGAN_x4.pth](https://drive.google.com/drive/u/0/folders/17VYV_SoZZesU6mbxz2dMAIccSSlqLecY) and place it in the `models` directory as mentioned above in step 4.
+- Credits : [ESRGAN](https://github.com/xinntao/ESRGAN)
--- a/shark/examples/shark_inference/ESRGAN/esrgan.py
+++ b/shark/examples/shark_inference/ESRGAN/esrgan.py
@@ -0,0 +1,240 @@
+from ast import arg
+import os.path as osp
+import glob
+import cv2
+import numpy as np
+import torch
+
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch._decomp import get_decompositions
+from shark.shark_inference import SharkInference
+import torch_mlir
+import tempfile
+import functools
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def make_layer(block, n_layers):
+    layers = []
+    for _ in range(n_layers):
+        layers.append(block())
+    return nn.Sequential(*layers)
+
+
+class ResidualDenseBlock_5C(nn.Module):
+    def __init__(self, nf=64, gc=32, bias=True):
+        super(ResidualDenseBlock_5C, self).__init__()
+        # gc: growth channel, i.e. intermediate channels
+        self.conv1 = nn.Conv2d(nf, gc, 3, 1, 1, bias=bias)
+        self.conv2 = nn.Conv2d(nf + gc, gc, 3, 1, 1, bias=bias)
+        self.conv3 = nn.Conv2d(nf + 2 * gc, gc, 3, 1, 1, bias=bias)
+        self.conv4 = nn.Conv2d(nf + 3 * gc, gc, 3, 1, 1, bias=bias)
+        self.conv5 = nn.Conv2d(nf + 4 * gc, nf, 3, 1, 1, bias=bias)
+        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
+
+        # initialization
+        # mutil.initialize_weights([self.conv1, self.conv2, self.conv3, self.conv4, self.conv5], 0.1)
+
+    def forward(self, x):
+        x1 = self.lrelu(self.conv1(x))
+        x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
+        x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
+        x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
+        x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
+        return x5 * 0.2 + x
+
+
+class RRDB(nn.Module):
+    """Residual in Residual Dense Block"""
+
+    def __init__(self, nf, gc=32):
+        super(RRDB, self).__init__()
+        self.RDB1 = ResidualDenseBlock_5C(nf, gc)
+        self.RDB2 = ResidualDenseBlock_5C(nf, gc)
+        self.RDB3 = ResidualDenseBlock_5C(nf, gc)
+
+    def forward(self, x):
+        out = self.RDB1(x)
+        out = self.RDB2(out)
+        out = self.RDB3(out)
+        return out * 0.2 + x
+
+
+class RRDBNet(nn.Module):
+    def __init__(self, in_nc, out_nc, nf, nb, gc=32):
+        super(RRDBNet, self).__init__()
+        RRDB_block_f = functools.partial(RRDB, nf=nf, gc=gc)
+
+        self.conv_first = nn.Conv2d(in_nc, nf, 3, 1, 1, bias=True)
+        self.RRDB_trunk = make_layer(RRDB_block_f, nb)
+        self.trunk_conv = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
+        #### upsampling
+        self.upconv1 = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
+        self.upconv2 = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
+        self.HRconv = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
+        self.conv_last = nn.Conv2d(nf, out_nc, 3, 1, 1, bias=True)
+
+        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
+
+    def forward(self, x):
+        fea = self.conv_first(x)
+        trunk = self.trunk_conv(self.RRDB_trunk(fea))
+        fea = fea + trunk
+
+        fea = self.lrelu(
+            self.upconv1(F.interpolate(fea, scale_factor=2, mode="nearest"))
+        )
+        fea = self.lrelu(
+            self.upconv2(F.interpolate(fea, scale_factor=2, mode="nearest"))
+        )
+        out = self.conv_last(self.lrelu(self.HRconv(fea)))
+
+        return out
+
+
+############### Parsing args #####################
+import argparse
+
+p = argparse.ArgumentParser(
+    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+
+p.add_argument("--device", type=str, default="cpu", help="the device to use")
+p.add_argument(
+    "--mlir_loc",
+    type=str,
+    default=None,
+    help="location of the model's mlir file",
+)
+args = p.parse_args()
+###################################################
+
+
+def inference(input_m):
+    return model(input_m)
+
+
+def load_mlir(mlir_loc):
+    import os
+
+    if mlir_loc == None:
+        return None
+    print(f"Trying to load the model from {mlir_loc}.")
+    with open(os.path.join(mlir_loc)) as f:
+        mlir_module = f.read()
+    return mlir_module
+
+
+def compile_through_fx(model, inputs, mlir_loc=None):
+
+    module = load_mlir(mlir_loc)
+    if module == None:
+        fx_g = make_fx(
+            model,
+            decomposition_table=get_decompositions(
+                [
+                    torch.ops.aten.embedding_dense_backward,
+                    torch.ops.aten.native_layer_norm_backward,
+                    torch.ops.aten.slice_backward,
+                    torch.ops.aten.select_backward,
+                    torch.ops.aten.norm.ScalarOpt_dim,
+                    torch.ops.aten.native_group_norm,
+                    torch.ops.aten.upsample_bilinear2d.vec,
+                    torch.ops.aten.split.Tensor,
+                    torch.ops.aten.split_with_sizes,
+                ]
+            ),
+        )(inputs)
+
+        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
+        fx_g.recompile()
+
+        def strip_overloads(gm):
+            """
+            Modifies the target of graph nodes in :attr:`gm` to strip overloads.
+            Args:
+                gm(fx.GraphModule): The input Fx graph module to be modified
+            """
+            for node in gm.graph.nodes:
+                if isinstance(node.target, torch._ops.OpOverload):
+                    node.target = node.target.overloadpacket
+            gm.recompile()
+
+        strip_overloads(fx_g)
+
+        ts_g = torch.jit.script(fx_g)
+
+        print("Torchscript graph generated successfully")
+        module = torch_mlir.compile(
+            ts_g,
+            inputs,
+            torch_mlir.OutputType.LINALG_ON_TENSORS,
+            use_tracing=False,
+            verbose=False,
+        )
+
+    mlir_model = str(module)
+    func_name = "forward"
+    shark_module = SharkInference(
+        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
+    )
+    shark_module.compile()
+
+    return shark_module
+
+
+model_path = "models/RRDB_ESRGAN_x4.pth"  # models/RRDB_ESRGAN_x4.pth OR models/RRDB_PSNR_x4.pth
+# device = torch.device('cuda')  # if you want to run on CPU, change 'cuda' -> cpu
+device = torch.device("cpu")
+
+test_img_folder = "InputImages/*"
+
+model = RRDBNet(3, 3, 64, 23, gc=32)
+model.load_state_dict(torch.load(model_path), strict=True)
+model.eval()
+model = model.to(device)
+
+print("Model path {:s}. \nTesting...".format(model_path))
+
+if __name__ == "__main__":
+    idx = 0
+    for path in glob.glob(test_img_folder):
+        idx += 1
+        base = osp.splitext(osp.basename(path))[0]
+        print(idx, base)
+        # read images
+        img = cv2.imread(path, cv2.IMREAD_COLOR)
+        img = img * 1.0 / 255
+        img = torch.from_numpy(
+            np.transpose(img[:, :, [2, 1, 0]], (2, 0, 1))
+        ).float()
+        img_LR = img.unsqueeze(0)
+        img_LR = img_LR.to(device)
+
+        with torch.no_grad():
+            shark_module = compile_through_fx(inference, img_LR)
+            shark_output = shark_module.forward((img_LR,))
+            shark_output = torch.from_numpy(shark_output)
+            shark_output = (
+                shark_output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
+            )
+            esrgan_output = (
+                model(img_LR).data.squeeze().float().cpu().clamp_(0, 1).numpy()
+            )
+        # SHARK OUTPUT
+        shark_output = np.transpose(shark_output[[2, 1, 0], :, :], (1, 2, 0))
+        shark_output = (shark_output * 255.0).round()
+        cv2.imwrite(
+            "OutputImages/{:s}_rlt_shark_output.png".format(base), shark_output
+        )
+        print("Generated SHARK's output")
+        # ESRGAN OUTPUT
+        esrgan_output = np.transpose(esrgan_output[[2, 1, 0], :, :], (1, 2, 0))
+        esrgan_output = (esrgan_output * 255.0).round()
+        cv2.imwrite(
+            "OutputImages/{:s}_rlt_esrgan_output.png".format(base),
+            esrgan_output,
+        )
+        print("Generated ESRGAN's output")
--- a/shark/examples/shark_inference/albert_maskfill_tf.py
+++ b/shark/examples/shark_inference/albert_maskfill_tf.py
@@ -28,7 +28,7 @@ class AlbertModule(tf.Module):
        self.m = TFAutoModelForMaskedLM.from_pretrained("albert-base-v2")
        self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)

-    @tf.function(input_signature=t5_inputs)
+    @tf.function(input_signature=t5_inputs, jit_compile=True)
    def forward(self, input_ids, attention_mask):
        return self.m.predict(input_ids, attention_mask)

--- a/shark/examples/shark_inference/bloom_tank.py
+++ b/shark/examples/shark_inference/bloom_tank.py
@@ -1,7 +1,9 @@
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_torch_model
+from shark.shark_downloader import download_model

-mlir_model, func_name, inputs, golden_out = download_torch_model("bloom")
+mlir_model, func_name, inputs, golden_out = download_model(
+    "bloom", frontend="torch"
+)

 shark_module = SharkInference(
    mlir_model, func_name, device="cpu", mlir_dialect="tm_tensor"
--- a/shark/examples/shark_inference/gpt2_tf.py
+++ b/shark/examples/shark_inference/gpt2_tf.py
@@ -19,7 +19,7 @@ class GPT2Module(tf.Module):

        self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)

-    @tf.function(input_signature=gpt2_inputs)
+    @tf.function(input_signature=gpt2_inputs, jit_compile=True)
    def forward(self, input_ids, attention_mask):
        return self.m.predict(input_ids, attention_mask)

--- a/shark/examples/shark_inference/minilm_benchmark_tf.py
+++ b/shark/examples/shark_inference/minilm_benchmark_tf.py
@@ -26,7 +26,7 @@ class BertModule(tf.Module):
            input_ids=x, attention_mask=y, token_type_ids=z, training=False
        )

-    @tf.function(input_signature=bert_input)
+    @tf.function(input_signature=bert_input, jit_compile=True)
    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.m.predict(input_ids, attention_mask, token_type_ids)

--- a/shark/examples/shark_inference/minilm_jit.py
+++ b/shark/examples/shark_inference/minilm_jit.py
@@ -1,9 +1,10 @@
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_torch_model
+from shark.shark_downloader import download_model


-mlir_model, func_name, inputs, golden_out = download_torch_model(
-    "microsoft/MiniLM-L12-H384-uncased"
+mlir_model, func_name, inputs, golden_out = download_model(
+    "microsoft/MiniLM-L12-H384-uncased",
+    frontend="torch",
 )


--- a/shark/examples/shark_inference/minilm_tf.py
+++ b/shark/examples/shark_inference/minilm_tf.py
@@ -26,7 +26,7 @@ class BertModule(tf.Module):
            input_ids=x, attention_mask=y, token_type_ids=z, training=False
        )

-    @tf.function(input_signature=bert_input)
+    @tf.function(input_signature=bert_input, jit_compile=True)
    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.m.predict(input_ids, attention_mask, token_type_ids)

--- a/shark/examples/shark_inference/resnet50_script.py
+++ b/shark/examples/shark_inference/resnet50_script.py
@@ -5,7 +5,7 @@ import torchvision.models as models
 from torchvision import transforms
 import sys
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_torch_model
+from shark.shark_downloader import download_model


 ################################## Preprocessing inputs and model ############
@@ -66,7 +66,9 @@ labels = load_labels()


 ## Can pass any img or input to the forward module.
-mlir_model, func_name, inputs, golden_out = download_torch_model("resnet50")
+mlir_model, func_name, inputs, golden_out = download_model(
+    "resnet50", frontend="torch"
+)

 shark_module = SharkInference(mlir_model, func_name, mlir_dialect="linalg")
 shark_module.compile()
--- a/shark/examples/shark_inference/stable_diff_f16.py
+++ b/shark/examples/shark_inference/stable_diff_f16.py
@@ -37,10 +37,12 @@ args = p.parse_args()


 def fp16_unet():
-    from shark.shark_downloader import download_torch_model
+    from shark.shark_downloader import download_model

-    mlir_model, func_name, inputs, golden_out = download_torch_model(
-        "stable_diff_f16_18_OCT", tank_url="gs://shark_tank/prashant_nod"
+    mlir_model, func_name, inputs, golden_out = download_model(
+        "stable_diff_f16_18_OCT",
+        tank_url="gs://shark_tank/prashant_nod",
+        frontend="torch",
    )
    shark_module = SharkInference(
        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
--- a/shark/examples/shark_inference/stable_diff_tf.py
+++ b/shark/examples/shark_inference/stable_diff_tf.py
@@ -17,7 +17,7 @@ from keras_cv.models.generative.stable_diffusion.text_encoder import (
 )

 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_tf_model
+from shark.shark_downloader import download_model
 from PIL import Image

 # pip install "git+https://github.com/keras-team/keras-cv.git"
@@ -75,8 +75,8 @@ class SharkStableDiffusion:
        # Create models
        self.text_encoder = TextEncoder(MAX_PROMPT_LENGTH)

-        mlir_model, func_name, inputs, golden_out = download_tf_model(
-            "stable_diff", tank_url="gs://shark_tank/quinn"
+        mlir_model, func_name, inputs, golden_out = download_model(
+            "stable_diff", tank_url="gs://shark_tank/quinn", frontend="tf"
        )
        shark_module = SharkInference(
            mlir_model, func_name, device=device, mlir_dialect="mhlo"
--- a/shark/examples/shark_inference/stable_diffusion/README.md
+++ b/shark/examples/shark_inference/stable_diffusion/README.md
@@ -2,14 +2,55 @@

 ## Installation

-```shell
-pip install diffusers
-pip install scipy
-```
+Follow setup instructions in the main [README.md](https://github.com/nod-ai/SHARK#readme) for regular usage. 

-## RUN
+## Debug commands and other advanced usage follows.

 ```shell
-python main.py --precision="fp32"|"fp16" --prompt="enter the text" --device="cpu"|"cuda"|"vulkan" --import_mlir|--no-import_mlir
+python main.py --precision="fp32"|"fp16" --device="cpu"|"cuda"|"vulkan" --import_mlir|--no-import_mlir --prompt "enter the text" 

 ```
+
+## dump all dispatch .spv and isa using amdllpc
+
+```shell
+python main.py --precision="fp16" --device="vulkan" --iree-vulkan-target-triple=rdna3-unknown-linux --no-load_vmfb --dispatch_benchmarks="all" --dispatch_benchmarks_dir="SD_dispatches" --dump_isa
+```
+
+## Compile and save the .vmfb (using vulkan fp16 as an example):
+
+```shell
+python shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb
+```
+
+## Capture an RGP trace
+
+```shell
+python shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb --enable_rgp
+```
+
+## Run the vae module with iree-benchmark-module (NCHW, fp16, vulkan, for example):
+
+```shell
+iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --device=vulkan --function_input=1x4x64x64xf16  
+```
+
+## Run the unet module with iree-benchmark-module (same config as above):
+```shell
+##if you want to use .npz inputs:
+unzip ~/.local/shark_tank/<your unet>/inputs.npz
+
+iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --function_input=@arr_0.npy --function_input=1xf16 --function_input=@arr_2.npy --function_input=@arr_3.npy --function_input=@arr_4.npy  
+```
+
+## Using other supported Stable Diffusion variants with SHARK:
+
+Currently we support the following fine-tuned versions of Stable Diffusion:
+- [AnythingV3](https://huggingface.co/Linaqruf/anything-v3.0)
+- [Analog Diffusion](https://huggingface.co/wavymulder/Analog-Diffusion)
+
+use the flag `--variant=` to specify the model to be used.
+
+```shell
+python .\shark\examples\shark_inference\stable_diffusion\main.py --variant=anythingv3 --max_length=77 --prompt="1girl, brown hair, green eyes, colorful, autumn, cumulonimbus clouds, lighting, blue sky, falling leaves, garden"
+```
--- a/shark/examples/shark_inference/stable_diffusion/main.py
+++ b/shark/examples/shark_inference/stable_diffusion/main.py
@@ -1,45 +1,52 @@
+import os
+
+os.environ["AMD_ENABLE_LLPC"] = "1"
+
 from transformers import CLIPTextModel, CLIPTokenizer
 import torch
 from PIL import Image
-from diffusers import LMSDiscreteScheduler
+import torchvision.transforms as T
+from diffusers import (
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+)
 from tqdm.auto import tqdm
 import numpy as np
+from random import randint
 from stable_args import args
-from model_wrappers import (
-    get_vae32,
-    get_vae16,
-    get_unet16_wrapped,
-    get_unet32_wrapped,
-    get_clipped_text,
+
+# This has to come before importing cache objects
+if args.clear_all:
+    print("CLEARING ALL, EXPECT SEVERAL MINUTES TO RECOMPILE")
+    from glob import glob
+    import shutil
+
+    vmfbs = glob(os.path.join(os.getcwd(), "*.vmfb"))
+    for vmfb in vmfbs:
+        if os.path.exists(vmfb):
+            os.remove(vmfb)
+    home = os.path.expanduser("~")
+    if os.name == "nt":  # Windows
+        appdata = os.getenv("LOCALAPPDATA")
+        shutil.rmtree(os.path.join(appdata, "AMD/VkCache"), ignore_errors=True)
+        shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
+    elif os.name == "unix":
+        shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"))
+        shutil.rmtree(os.path.join(home, ".local/shark_tank"))
+
+
+from utils import set_init_device_flags
+
+from opt_params import get_unet, get_vae, get_clip
+from schedulers import (
+    SharkEulerDiscreteScheduler,
 )
-from utils import get_shark_model
 import time
-
-GCLOUD_BUCKET = "gs://shark_tank/prashant_nod"
-VAE_FP16 = "vae_fp16"
-VAE_FP32 = "vae_fp32"
-UNET_FP16 = "unet_fp16"
-UNET_FP32 = "unet_fp32"
-IREE_EXTRA_ARGS = []
-
-TUNED_GCLOUD_BUCKET = "gs://shark_tank/quinn"
-UNET_FP16_TUNED = "unet_fp16_tunedv2"
-
-BATCH_SIZE = len(args.prompts)
-
-if BATCH_SIZE not in [1, 2]:
-    import sys
-
-    sys.exit("Only batch size 1 and 2 are supported.")
-
-if BATCH_SIZE > 1 and args.precision != "fp16":
-    sys.exit("batch size > 1 is supported for fp16 model.")
-
-
-if BATCH_SIZE != 1:
-    TUNED_GCLOUD_BUCKET = "gs://shark_tank/prashant_nod"
-    UNET_FP16_TUNED = f"unet_fp16_{BATCH_SIZE}"
-    VAE_FP16 = f"vae_fp16_{BATCH_SIZE}"
+import sys
+from shark.iree_utils.compile_utils import dump_isas

 # Helper function to profile the vulkan device.
 def start_profiling(file_path="foo.rdc", profiling_mode="queue"):
@@ -58,112 +65,94 @@ def end_profiling(device):
        return device.end_profiling()


-def get_models():
-    global IREE_EXTRA_ARGS
-    if args.precision == "fp16":
-        IREE_EXTRA_ARGS += [
-            "--iree-flow-enable-padding-linalg-ops",
-            "--iree-flow-linalg-ops-padding-size=32",
-        ]
-        if args.use_tuned:
-            unet_gcloud_bucket = TUNED_GCLOUD_BUCKET
-            vae_gcloud_bucket = GCLOUD_BUCKET
-            unet_args = IREE_EXTRA_ARGS
-            vae_args = IREE_EXTRA_ARGS + [
-                "--iree-flow-enable-conv-nchw-to-nhwc-transform"
-            ]
-            unet_name = UNET_FP16_TUNED
-            vae_name = VAE_FP16
-        else:
-            unet_gcloud_bucket = GCLOUD_BUCKET
-            vae_gcloud_bucket = GCLOUD_BUCKET
-            IREE_EXTRA_ARGS += [
-                "--iree-flow-enable-conv-nchw-to-nhwc-transform"
-            ]
-            unet_args = IREE_EXTRA_ARGS
-            vae_args = IREE_EXTRA_ARGS
-            unet_name = UNET_FP16
-            vae_name = VAE_FP16
-
-        if batch_size > 1:
-            vae_args = []
-
-        if args.import_mlir == True:
-            return get_vae16(model_name=VAE_FP16), get_unet16_wrapped(
-                model_name=UNET_FP16
-            )
-        else:
-            return get_shark_model(
-                vae_gcloud_bucket,
-                vae_name,
-                vae_args,
-            ), get_shark_model(
-                unet_gcloud_bucket,
-                unet_name,
-                unet_args,
-            )
-
-    elif args.precision == "fp32":
-        IREE_EXTRA_ARGS += [
-            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
-            "--iree-flow-enable-padding-linalg-ops",
-            "--iree-flow-linalg-ops-padding-size=16",
-        ]
-        if args.import_mlir == True:
-            return get_vae32(model_name=VAE_FP32), get_unet32_wrapped(
-                model_name=UNET_FP32
-            )
-        else:
-            return get_shark_model(
-                GCLOUD_BUCKET,
-                VAE_FP32,
-                IREE_EXTRA_ARGS,
-            ), get_shark_model(
-                GCLOUD_BUCKET,
-                UNET_FP32,
-                IREE_EXTRA_ARGS,
-            )
-
-
 if __name__ == "__main__":

    dtype = torch.float32 if args.precision == "fp32" else torch.half
-    if len(args.iree_vulkan_target_triple) > 0:
-        IREE_EXTRA_ARGS.append(
-            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
-        )
-
-    clip_model = "clip_text"
-    clip_extra_args = [
-        "--iree-flow-linalg-ops-padding-size=16",
-        "--iree-flow-enable-padding-linalg-ops",
-    ]
-    clip = get_shark_model(GCLOUD_BUCKET, clip_model, clip_extra_args)

    prompt = args.prompts
+    neg_prompt = args.negative_prompts
    height = 512  # default height of Stable Diffusion
    width = 512  # default width of Stable Diffusion
+    if args.version == "v2_1":
+        height = 768
+        width = 768

    num_inference_steps = args.steps  # Number of denoising steps

-    guidance_scale = args.guidance_scale  # Scale for classifier-free guidance
+    # Scale for classifier-free guidance
+    guidance_scale = torch.tensor(args.guidance_scale).to(torch.float32)

+    # Handle out of range seeds.
+    uint32_info = np.iinfo(np.uint32)
+    uint32_min, uint32_max = uint32_info.min, uint32_info.max
+    seed = args.seed
+    if seed < uint32_min or seed >= uint32_max:
+        seed = randint(uint32_min, uint32_max)
    generator = torch.manual_seed(
-        args.seed
+        seed
    )  # Seed generator to create the inital latent noise

+    # TODO: Add support for batch_size > 1.
    batch_size = len(prompt)
+    if batch_size != 1:
+        sys.exit("More than one prompt is not supported yet.")
+    if batch_size != len(neg_prompt):
+        sys.exit("prompts and negative prompts must be of same length")

-    vae, unet = get_models()
+    set_init_device_flags()
+    clip = get_clip()
+    unet = get_unet()
+    vae = get_vae()
+    if args.dump_isa:
+        dump_isas(args.dispatch_benchmarks_dir)

    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-
-    scheduler = LMSDiscreteScheduler(
-        beta_start=0.00085,
-        beta_end=0.012,
-        beta_schedule="scaled_linear",
-        num_train_timesteps=1000,
+    scheduler = DPMSolverMultistepScheduler.from_pretrained(
+        "CompVis/stable-diffusion-v1-4",
+        subfolder="scheduler",
    )
+    cpu_scheduling = True
+    if args.version == "v2_1":
+        tokenizer = CLIPTokenizer.from_pretrained(
+            "stabilityai/stable-diffusion-2-1", subfolder="tokenizer"
+        )
+
+        scheduler = DPMSolverMultistepScheduler.from_pretrained(
+            "stabilityai/stable-diffusion-2-1",
+            subfolder="scheduler",
+        )
+
+    if args.version == "v2_1base" and args.variant == "stablediffusion":
+        tokenizer = CLIPTokenizer.from_pretrained(
+            "stabilityai/stable-diffusion-2-1-base", subfolder="tokenizer"
+        )
+
+        if args.use_compiled_scheduler:
+            scheduler = SharkEulerDiscreteScheduler.from_pretrained(
+                "stabilityai/stable-diffusion-2-1-base",
+                subfolder="scheduler",
+            )
+            scheduler.compile()
+            cpu_scheduling = False
+        else:
+            scheduler = EulerDiscreteScheduler.from_pretrained(
+                "stabilityai/stable-diffusion-2-1-base",
+                subfolder="scheduler",
+            )
+
+    # create a random initial latent.
+    latents = torch.randn(
+        (batch_size, 4, height // 8, width // 8),
+        generator=generator,
+        dtype=torch.float32,
+    ).to(dtype)
+    # Warmup phase to improve performance.
+    if args.warmup_count >= 1:
+        vae_warmup_input = torch.clone(latents).detach().numpy()
+        clip_warmup_input = torch.randint(1, 2, (2, args.max_length))
+    for i in range(args.warmup_count):
+        vae.forward((vae_warmup_input,))
+        clip.forward((clip_warmup_input,))

    start = time.time()

@@ -174,68 +163,91 @@ if __name__ == "__main__":
        truncation=True,
        return_tensors="pt",
    )
-
-    text_embeddings = clip.forward((text_input.input_ids,))
-    text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
    max_length = text_input.input_ids.shape[-1]
    uncond_input = tokenizer(
-        [""] * batch_size,
+        neg_prompt,
        padding="max_length",
        max_length=max_length,
+        truncation=True,
        return_tensors="pt",
    )
-    uncond_embeddings = clip.forward((uncond_input.input_ids,))
-    uncond_embeddings = torch.from_numpy(uncond_embeddings).to(dtype)
+    text_input = torch.cat([uncond_input.input_ids, text_input.input_ids])

-    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-    latents = torch.randn(
-        (batch_size, 4, height // 8, width // 8),
-        generator=generator,
-        dtype=torch.float32,
-    ).to(dtype)
+    clip_inf_start = time.time()
+    text_embeddings = clip.forward((text_input,))
+    clip_inf_end = time.time()
+    text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
+    text_embeddings_numpy = text_embeddings.detach().numpy()

    scheduler.set_timesteps(num_inference_steps)
    scheduler.is_scale_input_called = True

-    latents = latents * scheduler.sigmas[0]
-    text_embeddings_numpy = text_embeddings.detach().numpy()
-    avg_ms = 0
+    latents = latents * scheduler.init_noise_sigma

-    for i, t in tqdm(enumerate(scheduler.timesteps)):
+    avg_ms = 0
+    for i, t in tqdm(enumerate(scheduler.timesteps), disable=args.hide_steps):
        step_start = time.time()
-        print(f"i = {i} t = {t}", end="")
+        if not args.hide_steps:
+            print(f"i = {i} t = {t}", end="")
        timestep = torch.tensor([t]).to(dtype).detach().numpy()
-        latents_numpy = latents.detach().numpy()
-        sigma_numpy = np.array(scheduler.sigmas[i]).astype(np.float32)
+        latent_model_input = scheduler.scale_model_input(latents, t)
+        if cpu_scheduling:
+            latent_model_input = latent_model_input.detach().numpy()

        profile_device = start_profiling(file_path="unet.rdc")
+
        noise_pred = unet.forward(
-            (latents_numpy, timestep, text_embeddings_numpy, sigma_numpy)
+            (
+                latent_model_input,
+                timestep,
+                text_embeddings_numpy,
+                guidance_scale,
+            ),
+            send_to_host=False,
        )
+
        end_profiling(profile_device)
-        noise_pred = torch.from_numpy(noise_pred)
+
+        if cpu_scheduling:
+            noise_pred = torch.from_numpy(noise_pred.to_host())
+            latents = scheduler.step(noise_pred, t, latents).prev_sample
+        else:
+            latents = scheduler.step(noise_pred, t, latents)
        step_time = time.time() - step_start
        avg_ms += step_time
        step_ms = int((step_time) * 1000)
-        print(f" ({step_ms}ms)")
-
-        latents = scheduler.step(noise_pred, i, latents)["prev_sample"]
-    avg_ms = 1000 * avg_ms / args.steps
-    print(f"Average step time: {avg_ms}ms/it")
+        if not args.hide_steps:
+            print(f" ({step_ms}ms)")

    # scale and decode the image latents with vae
-    latents = 1 / 0.18215 * latents
-    latents_numpy = latents.detach().numpy()
+    if args.use_base_vae:
+        latents = 1 / 0.18215 * latents
+    latents_numpy = latents
+    if cpu_scheduling:
+        latents_numpy = latents.detach().numpy()
    profile_device = start_profiling(file_path="vae.rdc")
-    image = vae.forward((latents_numpy,))
+    vae_start = time.time()
+    images = vae.forward((latents_numpy,))
+    vae_end = time.time()
    end_profiling(profile_device)
-    image = torch.from_numpy(image)
-    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
-    images = (image * 255).round().astype("uint8")
+    if args.use_base_vae:
+        image = torch.from_numpy(images)
+        image = (image.detach().cpu() * 255.0).numpy()
+        images = image.round()
+    end_time = time.time()

-    print("Total image generation runtime (s): {}".format(time.time() - start))
+    avg_ms = 1000 * avg_ms / args.steps
+    clip_inf_time = (clip_inf_end - clip_inf_start) * 1000
+    vae_inf_time = (vae_end - vae_start) * 1000
+    total_time = end_time - start
+    print(f"\nAverage step time: {avg_ms}ms/it")
+    print(f"Clip Inference time (ms) = {clip_inf_time:.3f}")
+    print(f"VAE Inference time (ms): {vae_inf_time:.3f}")
+    print(f"\nTotal image generation time: {total_time}sec")

-    pil_images = [Image.fromarray(image) for image in images]
+    transform = T.ToPILImage()
+    pil_images = [
+        transform(image) for image in torch.from_numpy(images).to(torch.uint8)
+    ]
    for i in range(batch_size):
        pil_images[i].save(f"{args.prompts[i]}_{i}.jpg")
--- a/shark/examples/shark_inference/stable_diffusion/model_wrappers.py
+++ b/shark/examples/shark_inference/stable_diffusion/model_wrappers.py
@@ -1,223 +1,285 @@
-from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
+from diffusers import AutoencoderKL, UNet2DConditionModel
 from transformers import CLIPTextModel
 from utils import compile_through_fx
 from stable_args import args
 import torch

-YOUR_TOKEN = "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"
+model_config = {
+    "v2_1": "stabilityai/stable-diffusion-2-1",
+    "v2_1base": "stabilityai/stable-diffusion-2-1-base",
+    "v1_4": "CompVis/stable-diffusion-v1-4",
+}
+
+# clip has 2 variants of max length 77 or 64.
+model_clip_max_length = 64 if args.max_length == 64 else 77
+if args.variant in ["anythingv3", "analogdiffusion", "dreamlike"]:
+    model_clip_max_length = 77
+elif args.variant == "openjourney":
+    model_clip_max_length = 64
+
+model_variant = {
+    "stablediffusion": "SD",
+    "anythingv3": "Linaqruf/anything-v3.0",
+    "dreamlike": "dreamlike-art/dreamlike-diffusion-1.0",
+    "openjourney": "prompthero/openjourney",
+    "analogdiffusion": "wavymulder/Analog-Diffusion",
+}
+
+model_input = {
+    "v2_1": {
+        "clip": (torch.randint(1, 2, (2, model_clip_max_length)),),
+        "vae": (torch.randn(1, 4, 96, 96),),
+        "unet": (
+            torch.randn(1, 4, 96, 96),  # latents
+            torch.tensor([1]).to(torch.float32),  # timestep
+            torch.randn(2, model_clip_max_length, 1024),  # embedding
+            torch.tensor(1).to(torch.float32),  # guidance_scale
+        ),
+    },
+    "v2_1base": {
+        "clip": (torch.randint(1, 2, (2, model_clip_max_length)),),
+        "vae": (torch.randn(1, 4, 64, 64),),
+        "unet": (
+            torch.randn(1, 4, 64, 64),  # latents
+            torch.tensor([1]).to(torch.float32),  # timestep
+            torch.randn(2, model_clip_max_length, 1024),  # embedding
+            torch.tensor(1).to(torch.float32),  # guidance_scale
+        ),
+    },
+    "v1_4": {
+        "clip": (torch.randint(1, 2, (2, model_clip_max_length)),),
+        "vae": (torch.randn(1, 4, 64, 64),),
+        "unet": (
+            torch.randn(1, 4, 64, 64),
+            torch.tensor([1]).to(torch.float32),  # timestep
+            torch.randn(2, model_clip_max_length, 768),
+            torch.tensor(1).to(torch.float32),
+        ),
+    },
+}
+
+# revision param for from_pretrained defaults to "main" => fp32
+model_revision = {
+    "stablediffusion": "fp16" if args.precision == "fp16" else "main",
+    "anythingv3": "diffusers",
+    "analogdiffusion": "main",
+    "openjourney": "main",
+    "dreamlike": "main",
+}


-BATCH_SIZE = len(args.prompts)
+def get_clip_mlir(model_name="clip_text", extra_args=[]):

+    text_encoder = CLIPTextModel.from_pretrained(
+        "openai/clip-vit-large-patch14"
+    )
+    if args.variant == "stablediffusion":
+        if args.version != "v1_4":
+            text_encoder = CLIPTextModel.from_pretrained(
+                model_config[args.version], subfolder="text_encoder"
+            )
+
+    elif args.variant in [
+        "anythingv3",
+        "analogdiffusion",
+        "openjourney",
+        "dreamlike",
+    ]:
+        text_encoder = CLIPTextModel.from_pretrained(
+            model_variant[args.variant],
+            subfolder="text_encoder",
+            revision=model_revision[args.variant],
+        )
+    else:
+        raise ValueError(f"{args.variant} not yet added")

-def get_clipped_text(model_name="clip_text"):
    class CLIPText(torch.nn.Module):
        def __init__(self):
            super().__init__()
-            self.text_encoder = CLIPTextModel.from_pretrained(
-                "openai/clip-vit-large-patch14"
-            )
+            self.text_encoder = text_encoder

        def forward(self, input):
            return self.text_encoder(input)[0]

    clip_model = CLIPText()
-    clip_input = torch.randint(1, 2, (BATCH_SIZE, 77))
    shark_clip = compile_through_fx(
        clip_model,
-        (clip_input,),
+        model_input[args.version]["clip"],
        model_name=model_name,
+        extra_args=extra_args,
    )
    return shark_clip


-def get_vae32(model_name="vae_fp32"):
-    class VaeModel(torch.nn.Module):
+def get_base_vae_mlir(model_name="vae", extra_args=[]):
+    class BaseVaeModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.vae = AutoencoderKL.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
+                model_config[args.version]
+                if args.variant == "stablediffusion"
+                else model_variant[args.variant],
                subfolder="vae",
-                use_auth_token=YOUR_TOKEN,
+                revision=model_revision[args.variant],
            )

        def forward(self, input):
            x = self.vae.decode(input, return_dict=False)[0]
            return (x / 2 + 0.5).clamp(0, 1)

-    vae = VaeModel()
-    vae_input = torch.rand(BATCH_SIZE, 4, 64, 64)
+    vae = BaseVaeModel()
+    if args.variant == "stablediffusion":
+        if args.precision == "fp16":
+            vae = vae.half().cuda()
+            inputs = tuple(
+                [
+                    inputs.half().cuda()
+                    for inputs in model_input[args.version]["vae"]
+                ]
+            )
+        else:
+            inputs = model_input[args.version]["vae"]
+    elif args.variant in [
+        "anythingv3",
+        "analogdiffusion",
+        "openjourney",
+        "dreamlike",
+    ]:
+        if args.precision == "fp16":
+            vae = vae.half().cuda()
+            inputs = tuple(
+                [inputs.half().cuda() for inputs in model_input["v1_4"]["vae"]]
+            )
+        else:
+            inputs = model_input["v1_4"]["vae"]
+    else:
+        raise ValueError(f"{args.variant} not yet added")
+
    shark_vae = compile_through_fx(
        vae,
-        (vae_input,),
+        inputs,
        model_name=model_name,
+        extra_args=extra_args,
    )
    return shark_vae


-def get_vae16(model_name="vae_fp16"):
+def get_vae_mlir(model_name="vae", extra_args=[]):
    class VaeModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.vae = AutoencoderKL.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
+                model_config[args.version]
+                if args.variant == "stablediffusion"
+                else model_variant[args.variant],
                subfolder="vae",
-                use_auth_token=YOUR_TOKEN,
-                revision="fp16",
+                revision=model_revision[args.variant],
            )

        def forward(self, input):
+            input = 1 / 0.18215 * input
            x = self.vae.decode(input, return_dict=False)[0]
-            return (x / 2 + 0.5).clamp(0, 1)
+            x = (x / 2 + 0.5).clamp(0, 1)
+            x = x * 255.0
+            return x.round()

    vae = VaeModel()
-    vae = vae.half().cuda()
-    vae_input = torch.rand(BATCH_SIZE, 4, 64, 64, dtype=torch.half).cuda()
+    if args.variant == "stablediffusion":
+        if args.precision == "fp16":
+            vae = vae.half().cuda()
+            inputs = tuple(
+                [
+                    inputs.half().cuda()
+                    for inputs in model_input[args.version]["vae"]
+                ]
+            )
+        else:
+            inputs = model_input[args.version]["vae"]
+    elif args.variant in [
+        "anythingv3",
+        "analogdiffusion",
+        "openjourney",
+        "dreamlike",
+    ]:
+        if args.precision == "fp16":
+            vae = vae.half().cuda()
+            inputs = tuple(
+                [inputs.half().cuda() for inputs in model_input["v1_4"]["vae"]]
+            )
+        else:
+            inputs = model_input["v1_4"]["vae"]
+    else:
+        raise ValueError(f"{args.variant} not yet added")
+
    shark_vae = compile_through_fx(
        vae,
-        (vae_input,),
+        inputs,
        model_name=model_name,
+        extra_args=extra_args,
    )
    return shark_vae


-def get_unet32(model_name="unet_fp32"):
+def get_unet_mlir(model_name="unet", extra_args=[]):
    class UnetModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.unet = UNet2DConditionModel.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
+                model_config[args.version]
+                if args.variant == "stablediffusion"
+                else model_variant[args.variant],
                subfolder="unet",
-                use_auth_token=YOUR_TOKEN,
+                revision=model_revision[args.variant],
            )
            self.in_channels = self.unet.in_channels
            self.train(False)

-        def forward(self, x, y, z):
-            return self.unet.forward(x, y, z, return_dict=False)[0]
-
-    unet = UnetModel()
-    latent_model_input = torch.rand([2, 4, 64, 64])
-    text_embeddings = torch.rand([2, args.max_length, 768])
-    shark_unet = compile_through_fx(
-        unet,
-        (latent_model_input, torch.tensor([1.0]), text_embeddings),
-        model_name=model_name,
-    )
-    return shark_unet
-
-
-def get_unet16(model_name="unet_fp16"):
-    class UnetModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.unet = UNet2DConditionModel.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
-                subfolder="unet",
-                use_auth_token=YOUR_TOKEN,
-                revision="fp16",
-            )
-            self.in_channels = self.unet.in_channels
-            self.train(False)
-
-        def forward(self, x, y, z):
-            return self.unet.forward(x, y, z, return_dict=False)[0]
-
-    unet = UnetModel()
-    unet = unet.half().cuda()
-    latent_model_input = torch.rand([2, 4, 64, 64]).half().cuda()
-    text_embeddings = torch.rand([2, args.max_length, 768]).half().cuda()
-    shark_unet = compile_through_fx(
-        unet,
-        (
-            latent_model_input,
-            torch.tensor([1.0]).half().cuda(),
-            text_embeddings,
-        ),
-        model_name=model_name,
-    )
-    return shark_unet
-
-
-def get_unet16_wrapped(guidance_scale=7.5, model_name="unet_fp16_wrapped"):
-    class UnetModel(torch.nn.Module):
-        def __init__(self, guidance_scale=guidance_scale):
-            super().__init__()
-            self.unet = UNet2DConditionModel.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
-                subfolder="unet",
-                use_auth_token=YOUR_TOKEN,
-                revision="fp16",
-            )
-            self.in_channels = self.unet.in_channels
-            self.guidance_scale = guidance_scale
-            self.train(False)
-
-        def forward(self, latent, timestep, text_embedding, sigma):
+        def forward(self, latent, timestep, text_embedding, guidance_scale):
            # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
            latents = torch.cat([latent] * 2)
-            latents = latents / (torch.pow((torch.pow(sigma, 2) + 1), 0.5))
            unet_out = self.unet.forward(
                latents, timestep, text_embedding, return_dict=False
            )[0]
            noise_pred_uncond, noise_pred_text = unet_out.chunk(2)
-            noise_pred = noise_pred_uncond + self.guidance_scale * (
+            noise_pred = noise_pred_uncond + guidance_scale * (
                noise_pred_text - noise_pred_uncond
            )
            return noise_pred

    unet = UnetModel()
-    unet = unet.half().cuda()
-    latent_model_input = torch.rand([BATCH_SIZE, 4, 64, 64]).half().cuda()
-    text_embeddings = (
-        torch.rand([2 * BATCH_SIZE, args.max_length, 768]).half().cuda()
-    )
-    sigma = torch.tensor(1).to(torch.float32)
+    if args.variant == "stablediffusion":
+        if args.precision == "fp16":
+            unet = unet.half().cuda()
+            inputs = tuple(
+                [
+                    inputs.half().cuda() if len(inputs.shape) != 0 else inputs
+                    for inputs in model_input[args.version]["unet"]
+                ]
+            )
+        else:
+            inputs = model_input[args.version]["unet"]
+    elif args.variant in [
+        "anythingv3",
+        "analogdiffusion",
+        "openjourney",
+        "dreamlike",
+    ]:
+        if args.precision == "fp16":
+            unet = unet.half().cuda()
+            inputs = tuple(
+                [
+                    inputs.half().cuda() if len(inputs.shape) != 0 else inputs
+                    for inputs in model_input["v1_4"]["unet"]
+                ]
+            )
+        else:
+            inputs = model_input["v1_4"]["unet"]
+    else:
+        raise ValueError(f"{args.variant} is not yet added")
    shark_unet = compile_through_fx(
        unet,
-        (
-            latent_model_input,
-            torch.tensor([1.0]).half().cuda(),
-            text_embeddings,
-            sigma,
-        ),
-        model_name=model_name,
-    )
-    return shark_unet
-
-
-def get_unet32_wrapped(guidance_scale=7.5, model_name="unet_fp32_wrapped"):
-    class UnetModel(torch.nn.Module):
-        def __init__(self, guidance_scale=guidance_scale):
-            super().__init__()
-            self.unet = UNet2DConditionModel.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
-                subfolder="unet",
-                use_auth_token=YOUR_TOKEN,
-            )
-            self.in_channels = self.unet.in_channels
-            self.guidance_scale = guidance_scale
-            self.train(False)
-
-        def forward(self, latent, timestep, text_embedding, sigma):
-            latents = torch.cat([latent] * 2)
-            latents = latents / (torch.pow((torch.pow(sigma, 2) + 1), 0.5))
-            unet_out = self.unet.forward(
-                latents, timestep, text_embedding, return_dict=False
-            )[0]
-            noise_pred_uncond, noise_pred_text = unet_out.chunk(2)
-            noise_pred = noise_pred_uncond + self.guidance_scale * (
-                noise_pred_text - noise_pred_uncond
-            )
-            return noise_pred
-
-    unet = UnetModel()
-    latent_model_input = torch.rand([BATCH_SIZE, 4, 64, 64])
-    text_embeddings = torch.rand([2 * BATCH_SIZE, args.max_length, 768])
-    sigma = torch.tensor(1).to(torch.float32)
-    shark_unet = compile_through_fx(
-        unet,
-        (latent_model_input, torch.tensor([1.0]), text_embeddings, sigma),
+        inputs,
        model_name=model_name,
+        extra_args=extra_args,
    )
    return shark_unet
--- a/shark/examples/shark_inference/stable_diffusion/opt_params.py
+++ b/shark/examples/shark_inference/stable_diffusion/opt_params.py
@@ -0,0 +1,99 @@
+import sys
+from model_wrappers import (
+    get_base_vae_mlir,
+    get_vae_mlir,
+    get_unet_mlir,
+    get_clip_mlir,
+)
+from resources import models_db
+from stable_args import args
+from utils import get_shark_model
+
+BATCH_SIZE = len(args.prompts)
+if BATCH_SIZE != 1:
+    sys.exit("Only batch size 1 is supported.")
+
+
+def get_params(bucket_key, model_key, model, is_tuned, precision):
+    iree_flags = []
+    if len(args.iree_vulkan_target_triple) > 0:
+        iree_flags.append(
+            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
+        )
+
+    # Disable bindings fusion to work with moltenVK.
+    if sys.platform == "darwin":
+        iree_flags.append("-iree-stream-fuse-binding=false")
+
+    try:
+        bucket = models_db[0][bucket_key]
+        model_name = models_db[1][model_key]
+        iree_flags += models_db[2][model][is_tuned][precision][
+            "default_compilation_flags"
+        ]
+    except KeyError:
+        raise Exception(
+            f"{bucket}/{model_key} is not present in the models database"
+        )
+
+    if (
+        "specified_compilation_flags"
+        in models_db[2][model][is_tuned][precision]
+    ):
+        device = (
+            args.device
+            if "://" not in args.device
+            else args.device.split("://")[0]
+        )
+        if (
+            device
+            not in models_db[2][model][is_tuned][precision][
+                "specified_compilation_flags"
+            ]
+        ):
+            device = "default_device"
+        iree_flags += models_db[2][model][is_tuned][precision][
+            "specified_compilation_flags"
+        ][device]
+
+    return bucket, model_name, iree_flags
+
+
+def get_unet():
+    # Tuned model is present only for `fp16` precision.
+    is_tuned = "tuned" if args.use_tuned else "untuned"
+    bucket_key = f"{args.variant}/{is_tuned}"
+    model_key = f"{args.variant}/{args.version}/unet/{args.precision}/length_{args.max_length}/{is_tuned}"
+    bucket, model_name, iree_flags = get_params(
+        bucket_key, model_key, "unet", is_tuned, args.precision
+    )
+    if not args.use_tuned and args.import_mlir:
+        return get_unet_mlir(model_name, iree_flags)
+    return get_shark_model(bucket, model_name, iree_flags)
+
+
+def get_vae():
+    # Tuned model is present only for `fp16` precision.
+    is_tuned = "tuned" if args.use_tuned else "untuned"
+    is_base = "/base" if args.use_base_vae else ""
+    bucket_key = f"{args.variant}/{is_tuned}"
+    model_key = f"{args.variant}/{args.version}/vae/{args.precision}/length_77/{is_tuned}{is_base}"
+    bucket, model_name, iree_flags = get_params(
+        bucket_key, model_key, "vae", is_tuned, args.precision
+    )
+    if not args.use_tuned and args.import_mlir:
+        if args.use_base_vae:
+            return get_base_vae_mlir(model_name, iree_flags)
+        return get_vae_mlir(model_name, iree_flags)
+    return get_shark_model(bucket, model_name, iree_flags)
+
+
+def get_clip():
+    bucket_key = f"{args.variant}/untuned"
+    model_key = f"{args.variant}/{args.version}/clip/fp32/length_{args.max_length}/untuned"
+    bucket, model_name, iree_flags = get_params(
+        bucket_key, model_key, "clip", "untuned", "fp32"
+    )
+    if args.import_mlir:
+        return get_clip_mlir(model_name, iree_flags)
+    return get_shark_model(bucket, model_name, iree_flags)
--- a/shark/examples/shark_inference/stable_diffusion/profiling_with_iree.md
+++ b/shark/examples/shark_inference/stable_diffusion/profiling_with_iree.md
@@ -0,0 +1,44 @@
+Compile / Run Instructions:
+
+To compile .vmfb for SD (vae, unet, CLIP), run the following commands with the .mlir in your local shark_tank cache (default location for Linux users is `~/.local/shark_tank`). These will be available once the script from [this README](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md) is run once.
+Running the script mentioned above with the `--save_vmfb` flag will also save the .vmfb in your SHARK base directory if you want to skip straight to benchmarks.
+
+Compile Commands FP32/FP16: 
+
+```shell
+Vulkan AMD: 
+iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
+
+#  add --mlir-print-debuginfo --mlir-print-op-on-diagnostic=true for debug
+#  use –iree-input-type=mhlo for tf models
+
+CUDA NVIDIA:
+iree-compile --iree-input-type=none --iree-hal-target-backends=cuda --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
+
+CPU:
+iree-compile --iree-input-type=none --iree-hal-target-backends=llvm-cpu  --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
+```
+
+
+
+Run / Benchmark Command (FP32 - NCHW):
+(NEED to use BS=2 since we do two forward passes to unet as a result of classifier free guidance.)
+
+```shell
+## Vulkan AMD:
+iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --device=vulkan --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
+
+## CUDA:
+iree-benchmark-module --module_file=/path/to/vmfb --entry_function=forward --device=cuda  --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
+
+## CPU:
+iree-benchmark-module --module_file=/path/to/vmfb --entry_function=forward --device=local-task  --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
+
+```
+
+Run via vulkan_gui for RGP Profiling:
+
+To build the vulkan app for profiling UNet follow the instructions [here](https://github.com/nod-ai/SHARK/tree/main/cpp) and then run the following command from the cpp directory with your compiled stable_diff.vmfb
+```shell
+./build/vulkan_gui/iree-vulkan-gui --module_file=/path/to/unet.vmfb --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
+```
--- a/shark/examples/shark_inference/stable_diffusion/resources.py
+++ b/shark/examples/shark_inference/stable_diffusion/resources.py
@@ -0,0 +1,31 @@
+import os
+import json
+import sys
+
+
+def resource_path(relative_path):
+    """Get absolute path to resource, works for dev and for PyInstaller"""
+    base_path = getattr(
+        sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
+    )
+    return os.path.join(base_path, relative_path)
+
+
+prompt_examples = []
+prompts_loc = resource_path("resources/prompts.json")
+if os.path.exists(prompts_loc):
+    with open(prompts_loc, encoding="utf-8") as fopen:
+        prompt_examples = json.load(fopen)
+
+if not prompt_examples:
+    print("Unable to fetch prompt examples.")
+
+
+models_db = []
+models_loc = resource_path("resources/model_db.json")
+if os.path.exists(models_loc):
+    with open(models_loc, encoding="utf-8") as fopen:
+        models_db = json.load(fopen)
+
+if len(models_db) != 3:
+    sys.exit("Error: Unable to load models database.")
--- a/shark/examples/shark_inference/stable_diffusion/resources/model_db.json
+++ b/shark/examples/shark_inference/stable_diffusion/resources/model_db.json
@@ -0,0 +1,165 @@
+[
+  {
+    "stablediffusion/untuned":"gs://shark_tank/stable_diffusion",
+    "stablediffusion/tuned":"gs://shark_tank/sd_tuned",
+    "anythingv3/untuned":"gs://shark_tank/sd_anythingv3",
+    "anythingv3/tuned":"gs://shark_tank/sd_tuned",
+    "analogdiffusion/untuned":"gs://shark_tank/sd_analog_diffusion",
+    "analogdiffusion/tuned":"gs://shark_tank/sd_tuned",
+    "openjourney/untuned":"gs://shark_tank/sd_openjourney",
+    "openjourney/tuned":"gs://shark_tank/sd_tuned",
+    "dreamlike/untuned":"gs://shark_tank/sd_dreamlike_diffusion"
+  },
+  {
+    "stablediffusion/v1_4/unet/fp16/length_77/untuned":"unet_8dec_fp16",
+    "stablediffusion/v1_4/unet/fp16/length_77/tuned":"unet_1dec_fp16_tuned",
+    "stablediffusion/v1_4/unet/fp32/length_77/untuned":"unet_1dec_fp32",
+    "stablediffusion/v1_4/vae/fp16/length_77/untuned":"vae_19dec_fp16",
+    "stablediffusion/v1_4/vae/fp16/length_77/untuned/base":"vae_8dec_fp16",
+    "stablediffusion/v1_4/vae/fp32/length_77/untuned":"vae_1dec_fp32",
+    "stablediffusion/v1_4/clip/fp32/length_77/untuned":"clip_18dec_fp32",
+    "stablediffusion/v2_1base/unet/fp16/length_77/untuned":"unet2base_8dec_fp16",
+    "stablediffusion/v2_1base/unet/fp16/length_77/tuned":"unet2base_8dec_fp16_tuned_v2",
+    "stablediffusion/v2_1base/unet/fp16/length_64/untuned":"unet_19dec_v2p1base_fp16_64",
+    "stablediffusion/v2_1base/unet/fp16/length_64/tuned":"unet_19dec_v2p1base_fp16_64_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/untuned":"vae2base_19dec_fp16",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned":"vae2base_19dec_fp16_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/untuned/base":"vae2base_8dec_fp16",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base":"vae2base_8dec_fp16_tuned",
+    "stablediffusion/v2_1base/clip/fp32/length_77/untuned":"clip2base_18dec_fp32",
+    "stablediffusion/v2_1base/clip/fp32/length_64/untuned":"clip_19dec_v2p1base_fp32_64",
+    "stablediffusion/v2_1/unet/fp16/length_77/untuned":"unet2_14dec_fp16",
+    "stablediffusion/v2_1/vae/fp16/length_77/untuned":"vae2_19dec_fp16",
+    "stablediffusion/v2_1/vae/fp16/length_77/untuned/base":"vae2_8dec_fp16",
+    "stablediffusion/v2_1/clip/fp32/length_77/untuned":"clip2_18dec_fp32",
+    "anythingv3/v2_1base/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
+    "anythingv3/v2_1base/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
+    "anythingv3/v2_1base/unet/fp32/length_77/untuned":"av3_unet_19dec_fp32",
+    "anythingv3/v2_1base/vae/fp16/length_77/untuned":"av3_vae_19dec_fp16",
+    "anythingv3/v2_1base/vae/fp16/length_77/tuned":"av3_vae_19dec_fp16_tuned",
+    "anythingv3/v2_1base/vae/fp16/length_77/untuned/base":"av3_vaebase_22dec_fp16",
+    "anythingv3/v2_1base/vae/fp32/length_77/untuned":"av3_vae_19dec_fp32",
+    "anythingv3/v2_1base/vae/fp32/length_77/untuned/base":"av3_vaebase_22dec_fp32",
+    "anythingv3/v2_1base/clip/fp32/length_77/untuned":"av3_clip_19dec_fp32",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/untuned":"ad_unet_19dec_fp16",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/tuned":"ad_unet_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/unet/fp32/length_77/untuned":"ad_unet_19dec_fp32",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned":"ad_vae_19dec_fp16",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/tuned":"ad_vae_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned/base":"ad_vaebase_22dec_fp16",
+    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned":"ad_vae_19dec_fp32",
+    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned/base":"ad_vaebase_22dec_fp32",
+    "analogdiffusion/v2_1base/clip/fp32/length_77/untuned":"ad_clip_19dec_fp32",
+    "openjourney/v2_1base/unet/fp16/length_64/untuned":"oj_unet_22dec_fp16_64",
+    "openjourney/v2_1base/unet/fp32/length_64/untuned":"oj_unet_22dec_fp32_64",
+    "openjourney/v2_1base/vae/fp16/length_77/untuned":"oj_vae_22dec_fp16",
+    "openjourney/v2_1base/vae/fp16/length_77/untuned/base":"oj_vaebase_22dec_fp16",
+    "openjourney/v2_1base/vae/fp32/length_77/untuned":"oj_vae_22dec_fp32",
+    "openjourney/v2_1base/vae/fp32/length_77/untuned/base":"oj_vaebase_22dec_fp32",
+    "openjourney/v2_1base/clip/fp32/length_64/untuned":"oj_clip_22dec_fp32_64",
+    "dreamlike/v2_1base/unet/fp16/length_77/untuned":"dl_unet_23dec_fp16_77",
+    "dreamlike/v2_1base/unet/fp32/length_77/untuned":"dl_unet_23dec_fp32_77",
+    "dreamlike/v2_1base/vae/fp16/length_77/untuned":"dl_vae_23dec_fp16",
+    "dreamlike/v2_1base/vae/fp16/length_77/untuned/base":"dl_vaebase_23dec_fp16",
+    "dreamlike/v2_1base/vae/fp32/length_77/untuned":"dl_vae_23dec_fp32",
+    "dreamlike/v2_1base/vae/fp32/length_77/untuned/base":"dl_vaebase_23dec_fp32",
+    "dreamlike/v2_1base/clip/fp32/length_77/untuned":"dl_clip_23dec_fp32_77"
+  },
+  {
+    "unet": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": []
+        },
+        "fp32": {
+          "default_compilation_flags": []
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32"
+          ],
+          "specified_compilation_flags": {
+            "cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
+            "default_device": ["--iree-flow-enable-conv-img2col-transform"]
+          }
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "vae": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform",
+            "--iree-flow-enable-conv-winograd-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform",
+            "--iree-flow-enable-conv-winograd-transform"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "clip": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      }
+    }
+  }
+]
--- a/shark/examples/shark_inference/stable_diffusion/resources/prompts.json
+++ b/shark/examples/shark_inference/stable_diffusion/resources/prompts.json
--- a/shark/examples/shark_inference/stable_diffusion/schedulers.py
+++ b/shark/examples/shark_inference/stable_diffusion/schedulers.py
@@ -0,0 +1,131 @@
+import sys
+import numpy as np
+from typing import List, Optional, Tuple, Union
+from diffusers import (
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+)
+from diffusers.configuration_utils import register_to_config
+from utils import compile_through_fx, get_shark_model
+from stable_args import args
+import torch
+
+SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"
+
+model_input = {
+    "euler": {
+        "latent": torch.randn(1, 4, 64, 64),
+        "output": torch.randn(1, 4, 64, 64),
+        "sigma": torch.tensor(1).to(torch.float32),
+        "dt": torch.tensor(1).to(torch.float32),
+    },
+}
+
+
+class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        prediction_type: str = "epsilon",
+    ):
+        super().__init__(
+            num_train_timesteps,
+            beta_start,
+            beta_end,
+            beta_schedule,
+            trained_betas,
+            prediction_type,
+        )
+
+    def compile(self):
+        example_latent = model_input["euler"]["latent"]
+        example_output = model_input["euler"]["output"]
+        if args.precision == "fp16":
+            example_latent = example_latent.half()
+            example_output = example_output.half()
+        example_sigma = model_input["euler"]["sigma"]
+        example_dt = model_input["euler"]["dt"]
+
+        class ScalingModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, latent, sigma):
+                return latent / ((sigma**2 + 1) ** 0.5)
+
+        class SchedulerStepModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, noise_pred, sigma, latent, dt):
+                pred_original_sample = latent - sigma * noise_pred
+                derivative = (latent - pred_original_sample) / sigma
+                return latent + derivative * dt
+
+        iree_flags = []
+        if len(args.iree_vulkan_target_triple) > 0:
+            iree_flags.append(
+                f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
+            )
+        # Disable bindings fusion to work with moltenVK.
+        if sys.platform == "darwin":
+            iree_flags.append("-iree-stream-fuse-binding=false")
+
+        if args.import_mlir:
+            scaling_model = ScalingModel()
+            self.scaling_model = compile_through_fx(
+                scaling_model,
+                (example_latent, example_sigma),
+                model_name="euler_scale_model_input_" + args.precision,
+                extra_args=iree_flags,
+            )
+
+            step_model = SchedulerStepModel()
+            self.step_model = compile_through_fx(
+                step_model,
+                (example_output, example_sigma, example_latent, example_dt),
+                model_name="euler_step_" + args.precision,
+                extra_args=iree_flags,
+            )
+        else:
+            self.scaling_model = get_shark_model(
+                SCHEDULER_BUCKET,
+                "euler_scale_model_input_" + args.precision,
+                iree_flags,
+            )
+            self.step_model = get_shark_model(
+                SCHEDULER_BUCKET, "euler_step_" + args.precision, iree_flags
+            )
+
+    def scale_model_input(self, sample, timestep):
+        step_index = (self.timesteps == timestep).nonzero().item()
+        sigma = self.sigmas[step_index]
+        return self.scaling_model.forward(
+            (
+                sample,
+                sigma,
+            ),
+            send_to_host=False,
+        )
+
+    def step(self, noise_pred, timestep, latent):
+        step_index = (self.timesteps == timestep).nonzero().item()
+        sigma = self.sigmas[step_index]
+        dt = self.sigmas[step_index + 1] - sigma
+        return self.step_model.forward(
+            (
+                noise_pred,
+                sigma,
+                latent,
+                dt,
+            ),
+            send_to_host=False,
+        )
--- a/shark/examples/shark_inference/stable_diffusion/stable_args.py
+++ b/shark/examples/shark_inference/stable_diffusion/stable_args.py
@@ -4,27 +4,38 @@ p = argparse.ArgumentParser(
    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
 )

+##############################################################################
+### Stable Diffusion Params
+##############################################################################
+
 p.add_argument(
    "--prompts",
    nargs="+",
-    default=["a photograph of an astronaut riding a horse"],
+    default=["cyberpunk forest by Salvador Dali"],
    help="text of which images to be generated.",
 )
+
 p.add_argument(
-    "--device", type=str, default="cpu", help="device to run the model."
+    "--negative-prompts",
+    nargs="+",
+    default=[""],
+    help="text you don't want to see in the generated image.",
 )
+
 p.add_argument(
    "--steps",
    type=int,
-    default=10,
+    default=50,
    help="the no. of steps to do the sampling.",
 )
+
 p.add_argument(
    "--seed",
    type=int,
    default=42,
    help="the seed to use.",
 )
+
 p.add_argument(
    "--guidance_scale",
    type=float,
@@ -32,6 +43,32 @@ p.add_argument(
    help="the value to be used for guidance scaling.",
 )

+p.add_argument(
+    "--max_length",
+    type=int,
+    default=64,
+    help="max length of the tokenizer output, options are 64 and 77.",
+)
+
+##############################################################################
+### Model Config and Usage Params
+##############################################################################
+
+p.add_argument(
+    "--device", type=str, default="vulkan", help="device to run the model."
+)
+
+p.add_argument(
+    "--version",
+    type=str,
+    default="v2_1base",
+    help="Specify version of stable diffusion model",
+)
+
+p.add_argument(
+    "--precision", type=str, default="fp16", help="precision to run the model."
+)
+
 p.add_argument(
    "--import_mlir",
    default=False,
@@ -39,17 +76,6 @@ p.add_argument(
    help="imports the model from torch module to shark_module otherwise downloads the model from shark_tank.",
 )

-p.add_argument(
-    "--precision", type=str, default="fp32", help="precision to run the model."
-)
-
-p.add_argument(
-    "--max_length",
-    type=int,
-    default=77,
-    help="max length of the tokenizer output.",
-)
-
 p.add_argument(
    "--load_vmfb",
    default=True,
@@ -64,6 +90,37 @@ p.add_argument(
    help="saves the compiled flatbuffer to the local directory",
 )

+p.add_argument(
+    "--use_tuned",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="Download and use the tuned version of the model if available",
+)
+
+p.add_argument(
+    "--use_base_vae",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Do conversion from the VAE output to pixel space on cpu.",
+)
+
+p.add_argument(
+    "--variant",
+    default="stablediffusion",
+    help="We now support multiple vairants of SD finetuned for different dataset. you can use the following anythingv3, ...",  # TODO add more once supported
+)
+
+p.add_argument(
+    "--scheduler",
+    type=str,
+    default="SharkEulerDiscrete",
+    help="other supported schedulers are [PNDM, DDIM, LMSDiscrete, EulerDiscrete, DPMSolverMultistep]",
+)
+
+##############################################################################
+### IREE - Vulkan supported flags
+##############################################################################
+
 p.add_argument(
    "--iree-vulkan-target-triple",
    type=str,
@@ -79,10 +136,91 @@ p.add_argument(
 )

 p.add_argument(
-    "--use_tuned",
+    "--vulkan_large_heap_block_size",
+    default="4147483648",
+    help="flag for setting VMA preferredLargeHeapBlockSize for vulkan device, default is 4G",
+)
+
+p.add_argument(
+    "--vulkan_validation_layers",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for disabling vulkan validation layers when benchmarking",
+)
+
+##############################################################################
+### Misc. Debug and Optimization flags
+##############################################################################
+
+p.add_argument(
+    "--use_compiled_scheduler",
    default=True,
    action=argparse.BooleanOptionalAction,
-    help="Download and use the tuned version of the model if available",
+    help="use the default scheduler precompiled into the model if available",
+)
+
+p.add_argument(
+    "--local_tank_cache",
+    default="",
+    help="Specify where to save downloaded shark_tank artifacts. If this is not set, the default is ~/.local/shark_tank/.",
+)
+
+p.add_argument(
+    "--dump_isa",
+    default=False,
+    action="store_true",
+    help="When enabled call amdllpc to get ISA dumps. use with dispatch benchmarks.",
+)
+
+p.add_argument(
+    "--dispatch_benchmarks",
+    default=None,
+    help='dispatches to return benchamrk data on.  use "All" for all, and None for none.',
+)
+
+p.add_argument(
+    "--dispatch_benchmarks_dir",
+    default="temp_dispatch_benchmarks",
+    help='directory where you want to store dispatch data generated with "--dispatch_benchmarks"',
+)
+
+p.add_argument(
+    "--enable_rgp",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for inserting debug frames between iterations for use with rgp.",
+)
+
+p.add_argument(
+    "--hide_steps",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="flag for hiding the details of iteration/sec for each step.",
+)
+
+p.add_argument(
+    "--warmup_count",
+    type=int,
+    default=0,
+    help="flag setting warmup count for clip and vae [>= 0].",
+)
+
+p.add_argument(
+    "--clear_all",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag to clear all mlir and vmfb from common locations. Recompiling will take several minutes",
+)
+
+##############################################################################
+### Web UI flags
+##############################################################################
+
+p.add_argument(
+    "--progress_bar",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="flag for removing the pregress bar animation during image generation",
 )

 args = p.parse_args()
--- a/shark/examples/shark_inference/stable_diffusion/stable_diffusion_amd.md
+++ b/shark/examples/shark_inference/stable_diffusion/stable_diffusion_amd.md
@@ -0,0 +1,139 @@
+# Stable Diffusion optimized for AMD RDNA2/RDNA3 GPUs
+
+Before you start, please be aware that this is beta software that relies on a special AMD driver. Like all StableDiffusion GUIs published so far, you need some technical expertise to set it up. We apologize in advance if you bump into issues. If that happens, please don't hesitate to ask our Discord community for help! If you still can't get it to work, we're sorry, and please be assured that we (Nod and AMD) are working hard to improve the user experience in coming months.
+If it works well for you, please "star" the following GitHub projects... this is one of the best ways to help and spread the word!
+
+* https://github.com/nod-ai/SHARK
+* https://github.com/iree-org/iree
+
+## Install this specific AMD Drivers (AMD latest may not have all the fixes).
+
+### AMD KB Drivers for RDNA2 and RDNA3:
+
+*AMD Software: Adrenalin Edition 22.11.1 for MLIR/IREE Driver Version 22.20.29.09 for Windows® 10 and Windows® 11 (Windows Driver Store Version 31.0.12029.9003)*
+
+First, download this special driver in a folder of your choice. We recommend you keep that driver around since you may need to re-install it later, if Windows Update decides to overwrite it:
+https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mlir-iree
+
+KNOWN ISSUES with this special AMD driver:
+* `Windows Update` may (depending how it's configured) automatically install a new official AMD driver that overwrites this IREE-specific driver. If Stable Diffusion used to work, then a few days later, it slows down a lot or produces incorrect results (e.g. black images), this may be the cause. To fix this problem, please check the installed driver's version, and re-install the special driver if needed. (TODO: document how to prevent this `Windows Update` behavior!)
+* Some people using this special driver experience mouse pointer accuracy issues, if you use a larger-than-default mouse pointer. The clicked point isn't centered properly. One possible work-around is to reset the pointer size to "1" in "Change pointer size and color".
+
+## Installation
+
+Download the latest Windows SHARK SD binary [423 here](https://github.com/nod-ai/SHARK/releases/download/20230101.423/shark_sd_20230101_423.exe) in a folder of your choice. If you want nighly builds you can look for them in the github releases page. Please read carefully the following notes:
+
+Notes:
+* We recommend that you download this EXE in a new folder, whenever you download a new EXE version. If you download it in the same folder as a previous install, you must delete the old `*.vmfb` files. Those contain Vulkan dispatches compiled from MLIR, that can get outdated if you run multiple EXE from the same folder. You can use `--clean_all` flag once to clean all the old files. 
+* Your browser may warn you about downloading an .exe file
+* If you recently updated the driver or this binary (EXE file), we recommend you:
+  * clear all the local artifacts with `--clean_all` OR 
+  * clear the Vulkan shader cache: For Windows users this can be done by clearing the contents of `C:\Users\%username%\AppData\Local\AMD\VkCache\`. On Linux the same cache is typically located at `~/.cache/AMD/VkCache/`.
+  * clear the `huggingface` cache. In Windows, this is `C:\Users\%username%\.cache\huggingface`.
+
+## Running
+
+* Open a Command Prompt or Powershell terminal, change folder (`cd`) to the .exe folder. Then run the EXE from the command prompt. That way, if an error occurs, you'll be able to cut-and-paste it to ask for help. (if it always works for you without error, you may simply double-click the EXE to start the web browser)
+* The first run may take about 10-15 minutes when the models are downloaded and compiled. Your patience is appreciated. The download could be about 5GB.
+* If successful, you will likely see a Windows Defender message asking you to give permission to open a web server port. Accept it.
+* Open a browser to access the Stable Diffusion web server. By default, the port is 8080, so you can go to http://localhost:8080/?__theme=dark.
+
+## Stopping
+
+* Select the command prompt that's running the EXE. Press CTRL-C and wait a moment. The application should stop. 
+* Please make sure to do the above step before you attempt to update the EXE to a new version.
+
+# Results
+
+<img width="1607" alt="webui" src="https://user-images.githubusercontent.com/74956/204939260-b8308bc2-8dc4-47f6-9ac0-f60b66edab99.png">
+
+
+Here are some samples generated:
+
+![tajmahal, snow, sunflowers, oil on canvas_0](https://user-images.githubusercontent.com/74956/204934186-141f7e43-6eb2-4e89-a99c-4704d20444b3.jpg)
+
+![a photo of a crab playing a trumpet](https://user-images.githubusercontent.com/74956/204933258-252e7240-8548-45f7-8253-97647d38313d.jpg)
+
+
+<details>
+  <summary>Advanced Installation </summary>
+
+
+## Setup your Python VirtualEnvironment and Dependencies
+
+### Windows 10/11 Users
+
+* Install the latest Python 3.10.x version from [here](https://www.python.org/downloads/windows/)
+
+* Install Git for Windows from [here](https://git-scm.com/download/win)
+
+#### Allow the install script to run in Powershell
+```powershell
+set-executionpolicy remotesigned 
+```
+
+#### Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...)
+```powershell
+git clone https://github.com/nod-ai/SHARK.git
+cd SHARK
+./setup_venv.ps1 #You can re-run this script to get the latest version
+```
+
+### Linux
+
+```shell
+git clone https://github.com/nod-ai/SHARK.git
+cd SHARK
+./setup_venv.sh
+source shark.venv/bin/activate
+```
+
+### Run Stable Diffusion on your device - WebUI
+
+#### Windows 10/11 Users
+```powershell
+(shark.venv) PS C:\Users\nod\SHARK> cd web
+(shark.venv) PS C:\Users\nod\SHARK\web> python index.py
+```
+#### Linux Users
+```shell
+(shark.venv) > cd web
+(shark.venv) > python index.py
+```
+
+
+
+### Run Stable Diffusion on your device - Commandline
+
+#### Windows 10/11 Users
+```powershell
+(shark.venv) PS C:\g\shark> python .\shark\examples\shark_inference\stable_diffusion\main.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
+```
+
+#### Linux
+```shell
+python3.10 shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
+```
+
+The output on a 6900XT would like:
+
+```shell 
+44it [00:08,  5.14it/s]i = 44 t = 120 (191ms)
+45it [00:08,  5.15it/s]i = 45 t = 100 (191ms)
+46it [00:08,  5.16it/s]i = 46 t = 80 (191ms)
+47it [00:09,  5.16it/s]i = 47 t = 60 (193ms)
+48it [00:09,  5.15it/s]i = 48 t = 40 (195ms)
+49it [00:09,  5.12it/s]i = 49 t = 20 (196ms)
+50it [00:09,  5.14it/s]
+Average step time: 192.8154182434082ms/it
+Total image generation runtime (s): 10.390909433364868
+(shark.venv) PS C:\g\shark>
+```
+
+
+For more options to the Stable Diffusion model read [this](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md)
+</details>
+<details>
+  <summary>Discord link</summary>
+Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware. 
+</details>
--- a/shark/examples/shark_inference/stable_diffusion/utils.py
+++ b/shark/examples/shark_inference/stable_diffusion/utils.py
@@ -1,21 +1,26 @@
 import os
-
 import torch
 from shark.shark_inference import SharkInference
-from shark.shark_importer import SharkImporter
-from torch.fx.experimental.proxy_tensor import make_fx
 from stable_args import args
-from torch._decomp import get_decompositions
-import torch_mlir
+from shark.shark_importer import import_with_fx
+from shark.iree_utils.vulkan_utils import (
+    set_iree_vulkan_runtime_flags,
+    get_vulkan_target_triple,
+)


 def _compile_module(shark_module, model_name, extra_args=[]):
    if args.load_vmfb or args.save_vmfb:
-        extended_name = "{}_{}".format(model_name, args.device)
+        device = (
+            args.device
+            if "://" not in args.device
+            else "-".join(args.device.split("://"))
+        )
+        extended_name = "{}_{}".format(model_name, device)
        vmfb_path = os.path.join(os.getcwd(), extended_name + ".vmfb")
        if args.load_vmfb and os.path.isfile(vmfb_path) and not args.save_vmfb:
-            print("Loading flatbuffer from {}".format(vmfb_path))
-            shark_module.load_module(vmfb_path)
+            print(f"loading existing vmfb from: {vmfb_path}")
+            shark_module.load_module(vmfb_path, extra_args=extra_args)
        else:
            if args.save_vmfb:
                print("Saving to {}".format(vmfb_path))
@@ -28,7 +33,7 @@ def _compile_module(shark_module, model_name, extra_args=[]):
            path = shark_module.save_module(
                os.getcwd(), extended_name, extra_args
            )
-            shark_module.load_module(path)
+            shark_module.load_module(path, extra_args=extra_args)
    else:
        shark_module.compile(extra_args)
    return shark_module
@@ -36,10 +41,16 @@ def _compile_module(shark_module, model_name, extra_args=[]):

 # Downloads the model from shark_tank and returns the shark_module.
 def get_shark_model(tank_url, model_name, extra_args=[]):
-    from shark.shark_downloader import download_torch_model
+    from shark.shark_downloader import download_model
+    from shark.parser import shark_args

-    mlir_model, func_name, inputs, golden_out = download_torch_model(
-        model_name, tank_url=tank_url
+    # Set local shark_tank cache directory.
+    shark_args.local_tank_cache = args.local_tank_cache
+
+    mlir_model, func_name, inputs, golden_out = download_model(
+        model_name,
+        tank_url=tank_url,
+        frontend="torch",
    )
    shark_module = SharkInference(
        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
@@ -47,51 +58,10 @@ def get_shark_model(tank_url, model_name, extra_args=[]):
    return _compile_module(shark_module, model_name, extra_args)


-# Converts the torch-module into shark_module.
+# Converts the torch-module into a shark_module.
 def compile_through_fx(model, inputs, model_name, extra_args=[]):

-    fx_g = make_fx(
-        model,
-        decomposition_table=get_decompositions(
-            [
-                torch.ops.aten.embedding_dense_backward,
-                torch.ops.aten.native_layer_norm_backward,
-                torch.ops.aten.slice_backward,
-                torch.ops.aten.select_backward,
-                torch.ops.aten.norm.ScalarOpt_dim,
-                torch.ops.aten.native_group_norm,
-                torch.ops.aten.upsample_bilinear2d.vec,
-                torch.ops.aten.split.Tensor,
-                torch.ops.aten.split_with_sizes,
-            ]
-        ),
-    )(*inputs)
-
-    fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
-    fx_g.recompile()
-
-    def strip_overloads(gm):
-        """
-        Modifies the target of graph nodes in :attr:`gm` to strip overloads.
-        Args:
-            gm(fx.GraphModule): The input Fx graph module to be modified
-        """
-        for node in gm.graph.nodes:
-            if isinstance(node.target, torch._ops.OpOverload):
-                node.target = node.target.overloadpacket
-        gm.recompile()
-
-    strip_overloads(fx_g)
-
-    ts_g = torch.jit.trace(fx_g, inputs)
-
-    mlir_importer = SharkImporter(
-        ts_g,
-        inputs,
-        frontend="torch",
-    )
-
-    (mlir_module, func_name), _, _ = mlir_importer.import_debug()
+    mlir_module, func_name = import_with_fx(model, inputs)

    shark_module = SharkInference(
        mlir_module,
@@ -101,3 +71,162 @@ def compile_through_fx(model, inputs, model_name, extra_args=[]):
    )

    return _compile_module(shark_module, model_name, extra_args)
+
+
+def set_iree_runtime_flags():
+
+    vulkan_runtime_flags = [
+        f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}",
+        f"--vulkan_validation_layers={'true' if args.vulkan_validation_layers else 'false'}",
+    ]
+    if args.enable_rgp:
+        vulkan_runtime_flags += [
+            f"--enable_rgp=true",
+            f"--vulkan_debug_utils=true",
+        ]
+    set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)
+
+
+def get_all_devices(driver_name):
+    """
+    Inputs: driver_name
+    Returns a list of all the available devices for a given driver sorted by
+    the iree path names of the device as in --list_devices option in iree.
+    """
+    from iree.runtime import get_driver
+
+    driver = get_driver(driver_name)
+    device_list_src = driver.query_available_devices()
+    device_list_src.sort(key=lambda d: d["path"])
+    return device_list_src
+
+
+def get_device_mapping(driver, key_combination=3):
+    """This method ensures consistent device ordering when choosing
+    specific devices for execution
+    Args:
+        driver (str): execution driver (vulkan, cuda, rocm, etc)
+        key_combination (int, optional): choice for mapping value for device name.
+        1 : path
+        2 : name
+        3 : (name, path)
+        Defaults to 3.
+    Returns:
+        dict: map to possible device names user can input mapped to desired combination of name/path.
+    """
+    from shark.iree_utils._common import iree_device_map
+
+    driver = iree_device_map(driver)
+    device_list = get_all_devices(driver)
+    device_map = dict()
+
+    def get_output_value(dev_dict):
+        if key_combination == 1:
+            return f"{driver}://{dev_dict['path']}"
+        if key_combination == 2:
+            return dev_dict["name"]
+        if key_combination == 3:
+            return (dev_dict["name"], f"{driver}://{dev_dict['path']}")
+
+    # mapping driver name to default device (driver://0)
+    device_map[f"{driver}"] = get_output_value(device_list[0])
+    for i, device in enumerate(device_list):
+        # mapping with index
+        device_map[f"{driver}://{i}"] = get_output_value(device)
+        # mapping with full path
+        device_map[f"{driver}://{device['path']}"] = get_output_value(device)
+    return device_map
+
+
+def map_device_to_name_path(device, key_combination=3):
+    """Gives the appropriate device data (supported name/path) for user selected execution device
+    Args:
+        device (str): user
+        key_combination (int, optional): choice for mapping value for device name.
+        1 : path
+        2 : name
+        3 : (name, path)
+        Defaults to 3.
+    Raises:
+        ValueError:
+    Returns:
+        str / tuple: returns the mapping str or tuple of mapping str for the device depending on key_combination value
+    """
+    driver = device.split("://")[0]
+    device_map = get_device_mapping(driver, key_combination)
+    try:
+        device_mapping = device_map[device]
+    except KeyError:
+        raise ValueError(f"Device '{device}' is not a valid device.")
+    return device_mapping
+
+
+def set_init_device_flags():
+    if "vulkan" in args.device:
+        # set runtime flags for vulkan.
+        set_iree_runtime_flags()
+
+        # set triple flag to avoid multiple calls to get_vulkan_triple_flag
+        device_name, args.device = map_device_to_name_path(args.device)
+        if not args.iree_vulkan_target_triple:
+            triple = get_vulkan_target_triple(device_name)
+            if triple is not None:
+                args.iree_vulkan_target_triple = triple
+        print(
+            f"Found device {device_name}. Using target triple {args.iree_vulkan_target_triple}."
+        )
+    elif "cuda" in args.device:
+        args.device = "cuda"
+    elif "cpu" in args.device:
+        args.device = "cpu"
+
+    # set max_length based on availability.
+    if args.variant in ["anythingv3", "analogdiffusion", "dreamlike"]:
+        args.max_length = 77
+    elif args.variant == "openjourney":
+        args.max_length = 64
+
+    # use tuned models only in the case of stablediffusion/fp16 and rdna3 cards.
+    if (
+        args.variant in ["openjourney", "dreamlike"]
+        or args.precision != "fp16"
+        or "vulkan" not in args.device
+        or "rdna3" not in args.iree_vulkan_target_triple
+    ):
+        args.use_tuned = False
+        print("Tuned models are currently not supported for this setting.")
+
+    elif args.use_base_vae and args.variant != "stablediffusion":
+        args.use_tuned = False
+        print("Tuned models are currently not supported for this setting.")
+
+    if args.use_tuned:
+        print("Using tuned models for stablediffusion/fp16 and rdna3 card.")
+
+
+# Utility to get list of devices available.
+def get_available_devices():
+    def get_devices_by_name(driver_name):
+        from shark.iree_utils._common import iree_device_map
+
+        device_list = []
+        try:
+            driver_name = iree_device_map(driver_name)
+            device_list_dict = get_all_devices(driver_name)
+            print(f"{driver_name} devices are available.")
+        except:
+            print(f"{driver_name} devices are not available.")
+        else:
+            for i, device in enumerate(device_list_dict):
+                device_list.append(f"{driver_name}://{i} => {device['name']}")
+        return device_list
+
+    set_iree_runtime_flags()
+
+    available_devices = []
+    vulkan_devices = get_devices_by_name("vulkan")
+    available_devices.extend(vulkan_devices)
+    cuda_devices = get_devices_by_name("cuda")
+    available_devices.extend(cuda_devices)
+    available_devices.append("cpu")
+    return available_devices
--- a/shark/examples/shark_inference/t5_tf.py
+++ b/shark/examples/shark_inference/t5_tf.py
@@ -18,7 +18,7 @@ class T5Module(tf.Module):
        self.m = TFT5Model.from_pretrained("t5-small")
        self.m.predict = lambda x, y: self.m(input_ids=x, decoder_input_ids=y)

-    @tf.function(input_signature=t5_inputs)
+    @tf.function(input_signature=t5_inputs, jit_compile=True)
    def forward(self, input_ids, decoder_input_ids):
        return self.m.predict(input_ids, decoder_input_ids)

--- a/shark/examples/shark_inference/v_diffusion.py
+++ b/shark/examples/shark_inference/v_diffusion.py
@@ -1,8 +1,10 @@
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_torch_model
+from shark.shark_downloader import download_model


-mlir_model, func_name, inputs, golden_out = download_torch_model("v_diffusion")
+mlir_model, func_name, inputs, golden_out = download_model(
+    "v_diffusion", frontend="torch"
+)

 shark_module = SharkInference(
    mlir_model, func_name, device="vulkan", mlir_dialect="linalg"
--- a/shark/examples/shark_training/bert_training_tf.py
+++ b/shark/examples/shark_training/bert_training_tf.py
@@ -52,7 +52,8 @@ class BertModule(tf.Module):
        input_signature=[
            bert_input,  # inputs
            tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32),  # labels
-        ]
+        ],
+        jit_compile=True,
    )
    def forward(self, inputs, labels):
        with tf.GradientTape() as tape:
--- a/shark/iree_utils/_common.py
+++ b/shark/iree_utils/_common.py
@@ -37,7 +37,19 @@ def run_cmd(cmd):
        sys.exit("Exiting program due to error running:", cmd)


-IREE_DEVICE_MAP = {
+def iree_device_map(device):
+    uri_parts = device.split("://", 2)
+    if len(uri_parts) == 1:
+        return _IREE_DEVICE_MAP[uri_parts[0]]
+    else:
+        return f"{_IREE_DEVICE_MAP[uri_parts[0]]}://{uri_parts[1]}"
+
+
+def get_supported_device_list():
+    return list(_IREE_DEVICE_MAP.keys())
+
+
+_IREE_DEVICE_MAP = {
    "cpu": "local-task",
    "cuda": "cuda",
    "vulkan": "vulkan",
@@ -46,7 +58,14 @@ IREE_DEVICE_MAP = {
    "intel-gpu": "level_zero",
 }

-IREE_TARGET_MAP = {
+
+def iree_target_map(device):
+    if "://" in device:
+        device = device.split("://")[0]
+    return _IREE_TARGET_MAP[device]
+
+
+_IREE_TARGET_MAP = {
    "cpu": "llvm-cpu",
    "cuda": "cuda",
    "vulkan": "vulkan",
@@ -55,9 +74,13 @@ IREE_TARGET_MAP = {
    "intel-gpu": "opencl-spirv",
 }

+
 # Finds whether the required drivers are installed for the given device.
 def check_device_drivers(device):
    """Checks necessary drivers present for gpu and vulkan devices"""
+    if "://" in device:
+        device = device.split("://")[0]
+
    if device == "cuda":
        try:
            subprocess.check_output("nvidia-smi")
--- a/shark/iree_utils/benchmark_utils.py
+++ b/shark/iree_utils/benchmark_utils.py
@@ -13,12 +13,13 @@
 # limitations under the License.

 import iree.runtime.scripts.iree_benchmark_module as benchmark_module
-from shark.iree_utils._common import run_cmd, IREE_DEVICE_MAP
+from shark.iree_utils._common import run_cmd, iree_device_map
+from shark.iree_utils.cpu_utils import get_cpu_count
 import numpy as np
 import os
 import re

-UNIT_TO_SECOND_MAP = {"ms": 0.001, "s": 1}
+UNIT_TO_SECOND_MAP = {"us": 1e-6, "ms": 0.001, "s": 1}


 def tensor_to_type_str(input_tensors: tuple, mlir_dialect: str):
@@ -69,10 +70,14 @@ def build_benchmark_args(
        # TODO: Replace name of train with actual train fn name.
        fn_name = "train"
    benchmark_cl.append(f"--entry_function={fn_name}")
-    benchmark_cl.append(f"--device={IREE_DEVICE_MAP[device]}")
+    benchmark_cl.append(f"--device={iree_device_map(device)}")
    mlir_input_types = tensor_to_type_str(input_tensors, mlir_dialect)
    for mlir_input in mlir_input_types:
        benchmark_cl.append(f"--function_input={mlir_input}")
+    if device == "cpu":
+        num_cpus = get_cpu_count()
+        if num_cpus is not None:
+            benchmark_cl.append(f"--task_topology_max_group_count={num_cpus}")
    time_extractor = "| awk 'END{{print $2 $3}}'"
    benchmark_cl.append(time_extractor)
    return benchmark_cl
@@ -94,8 +99,9 @@ def build_benchmark_args_non_tensor_input(
    benchmarker_path = os.path.join(path, "..", "..", "iree-benchmark-module")
    benchmark_cl = [benchmarker_path, f"--module_file={input_file}"]
    # TODO: The function named can be passed as one of the args.
-    benchmark_cl.append(f"--entry_function={function_name}")
-    benchmark_cl.append(f"--device={IREE_DEVICE_MAP[device]}")
+    if function_name:
+        benchmark_cl.append(f"--entry_function={function_name}")
+    benchmark_cl.append(f"--device={iree_device_map(device)}")
    for input in inputs:
        benchmark_cl.append(f"--function_input={input}")
    time_extractor = "| awk 'END{{print $2 $3}}'"
--- a/shark/iree_utils/compile_utils.py
+++ b/shark/iree_utils/compile_utils.py
@@ -13,27 +13,37 @@
 # limitations under the License.
 import iree.runtime as ireert
 import iree.compiler as ireec
-from shark.iree_utils._common import IREE_DEVICE_MAP, IREE_TARGET_MAP
+from shark.iree_utils._common import iree_device_map, iree_target_map
 from shark.iree_utils.benchmark_utils import *
+from shark.parser import shark_args
 import numpy as np
 import os
 import re

+
 # Get the iree-compile arguments given device.
 def get_iree_device_args(device, extra_args=[]):
-    if device == "cpu":
+    device_uri = device.split("://")
+    if len(device_uri) > 1:
+        if device_uri[0] not in ["vulkan"]:
+            print(
+                f"Specific device selection only supported for vulkan now."
+                f"Proceeding with {device} as device."
+            )
+
+    if device_uri[0] == "cpu":
        from shark.iree_utils.cpu_utils import get_iree_cpu_args

        return get_iree_cpu_args()
-    if device == "cuda":
+    if device_uri[0] == "cuda":
        from shark.iree_utils.gpu_utils import get_iree_gpu_args

        return get_iree_gpu_args()
-    if device in ["metal", "vulkan"]:
+    if device_uri[0] in ["metal", "vulkan"]:
        from shark.iree_utils.vulkan_utils import get_iree_vulkan_args

        return get_iree_vulkan_args(extra_args=extra_args)
-    if device == "rocm":
+    if device_uri[0] == "rocm":
        from shark.iree_utils.gpu_utils import get_iree_rocm_args

        return get_iree_rocm_args()
@@ -64,12 +74,23 @@ def get_iree_common_args():
    ]


+# Args that are suitable only for certain models or groups of models.
+# shark_args are passed down from pytests to control which models compile with these flags,
+# but they can also be set in shark/parser.py
+def get_model_specific_args():
+    ms_args = []
+    if shark_args.enable_conv_transform == True:
+        ms_args += ["--iree-flow-enable-conv-nchw-to-nhwc-transform"]
+    return ms_args
+
+
 def create_dispatch_dirs(bench_dir, device):
+    protected_files = ["ordered-dispatches.txt"]
    bench_dir_path = bench_dir.split("/")
    bench_dir_path[-1] = "temp_" + bench_dir_path[-1]
    tmp_bench_dir = "/".join(bench_dir_path)
    for f_ in os.listdir(bench_dir):
-        if os.path.isfile(f"{bench_dir}/{f_}"):
+        if os.path.isfile(f"{bench_dir}/{f_}") and f_ not in protected_files:
            dir_name = re.sub("\.\S*$", "", f_)
            if os.path.exists(f"{bench_dir}/{dir_name}"):
                os.system(f"rm -rf {bench_dir}/{dir_name}")
@@ -87,7 +108,19 @@ def create_dispatch_dirs(bench_dir, device):
                )


+def dump_isas(bench_dir):
+    for d_ in os.listdir(bench_dir):
+        if os.path.isdir(f"{bench_dir}/{d_}"):
+            for f_ in os.listdir(f"{bench_dir}/{d_}"):
+                if f_.endswith(".spv"):
+                    os.system(
+                        f"amdllpc -gfxip 11.0 {bench_dir}/{d_}/{f_} -v > \
+                         {bench_dir}/{d_}/isa.txt"
+                    )
+
+
 def compile_benchmark_dirs(bench_dir, device, dispatch_benchmarks):
+    benchmark_runtimes = {}
    dispatch_list = []
    all_dispatches = False

@@ -103,94 +136,115 @@ def compile_benchmark_dirs(bench_dir, device, dispatch_benchmarks):
            print("ERROR: Invalid dispatch benchmarks")
            return None
    for d_ in os.listdir(bench_dir):
-        in_dispatches = False
-        for dispatch in dispatch_list:
-            if str(dispatch) in d_:
-                in_dispatches = True
-        if all_dispatches or in_dispatches:
-            for f_ in os.listdir(f"{bench_dir}/{d_}"):
+        if os.path.isdir(f"{bench_dir}/{d_}"):
+            in_dispatches = False
+            for dispatch in dispatch_list:
+                if str(dispatch) in d_:
+                    in_dispatches = True
+            if all_dispatches or in_dispatches:
+                for f_ in os.listdir(f"{bench_dir}/{d_}"):

-                if "benchmark.mlir" in f_:
-                    dispatch_file = open(f"{bench_dir}/{d_}/{f_}", "r")
-                    module = dispatch_file.read()
-                    dispatch_file.close()
+                    if "benchmark.mlir" in f_:
+                        dispatch_file = open(f"{bench_dir}/{d_}/{f_}", "r")
+                        module = dispatch_file.read()
+                        dispatch_file.close()

-                    flatbuffer_blob = ireec.compile_str(
-                        module, target_backends=[IREE_TARGET_MAP[device]]
-                    )
+                        flatbuffer_blob = ireec.compile_str(
+                            module, target_backends=[iree_target_map(device)]
+                        )

-                    vmfb_file = open(
-                        f"{bench_dir}/{d_}/{d_}_benchmark.vmfb", "wb"
-                    )
-                    vmfb_file.write(flatbuffer_blob)
-                    vmfb_file.close()
+                        vmfb_file = open(
+                            f"{bench_dir}/{d_}/{d_}_benchmark.vmfb", "wb"
+                        )
+                        vmfb_file.write(flatbuffer_blob)
+                        vmfb_file.close()

-                    config = ireert.Config(IREE_DEVICE_MAP[device])
-                    vm_module = ireert.VmModule.from_flatbuffer(
-                        config.vm_instance, flatbuffer_blob
-                    )
+                        config = get_iree_runtime_config(device)
+                        vm_module = ireert.VmModule.from_flatbuffer(
+                            config.vm_instance, flatbuffer_blob
+                        )

-                    benchmark_cl = build_benchmark_args_non_tensor_input(
-                        input_file=f"{bench_dir}/{d_}/{d_}_benchmark.vmfb",
-                        device=device,
-                        inputs=(0,),
-                        mlir_dialect="linalg",
-                        function_name=vm_module.function_names[0],
-                    )
+                        benchmark_cl = build_benchmark_args_non_tensor_input(
+                            input_file=f"{bench_dir}/{d_}/{d_}_benchmark.vmfb",
+                            device=device,
+                            inputs=(0,),
+                            mlir_dialect="linalg",
+                            function_name="",
+                        )

-                    benchmark_bash = open(
-                        f"{bench_dir}/{d_}/{d_}_benchmark.sh", "w+"
-                    )
-                    benchmark_bash.write("#!/bin/bash\n")
-                    benchmark_bash.write(" ".join(benchmark_cl))
-                    benchmark_bash.close()
+                        benchmark_bash = open(
+                            f"{bench_dir}/{d_}/{d_}_benchmark.sh", "w+"
+                        )
+                        benchmark_bash.write("#!/bin/bash\n")
+                        benchmark_bash.write(" ".join(benchmark_cl))
+                        benchmark_bash.close()

-                    benchmark_data = run_benchmark_module(benchmark_cl)
+                        benchmark_data = run_benchmark_module(benchmark_cl)

-                    benchmark_file = open(
-                        f"{bench_dir}/{d_}/{d_}_data.txt", "w+"
-                    )
-                    benchmark_file.write(f"DISPATCH: {d_}\n")
-                    benchmark_file.write(str(benchmark_data) + "\n")
-                    benchmark_file.write(
-                        "SHARK BENCHMARK RESULT: "
-                        + str(1 / (benchmark_data * 0.001))
-                        + "\n"
-                    )
-                    benchmark_file.close()
+                        benchmark_file = open(
+                            f"{bench_dir}/{d_}/{d_}_data.txt", "w+"
+                        )
+                        benchmark_file.write(f"DISPATCH: {d_}\n")
+                        benchmark_file.write(str(benchmark_data) + "\n")
+                        benchmark_file.write(
+                            "SHARK BENCHMARK RESULT: "
+                            + str(1 / (benchmark_data * 0.001))
+                            + "\n"
+                        )
+                        benchmark_file.close()

-                elif ".mlir" in f_ and "benchmark" not in f_:
-                    dispatch_file = open(f"{bench_dir}/{d_}/{f_}", "r")
-                    module = dispatch_file.read()
-                    dispatch_file.close()
+                        benchmark_runtimes[d_] = 1 / (benchmark_data * 0.001)

-                    module = re.sub(
-                        "hal.executable private",
-                        "hal.executable public",
-                        module,
-                    )
+                    elif ".mlir" in f_ and "benchmark" not in f_:
+                        dispatch_file = open(f"{bench_dir}/{d_}/{f_}", "r")
+                        module = dispatch_file.read()
+                        dispatch_file.close()

-                    flatbuffer_blob = ireec.compile_str(
-                        module,
-                        target_backends=[IREE_TARGET_MAP[device]],
-                        extra_args=["--compile-mode=hal-executable"],
-                    )
+                        module = re.sub(
+                            "hal.executable private",
+                            "hal.executable public",
+                            module,
+                        )

-                    spirv_file = open(
-                        f"{bench_dir}/{d_}/{d_}_spirv.vmfb", "wb"
-                    )
-                    spirv_file.write(flatbuffer_blob)
-                    spirv_file.close()
+                        flatbuffer_blob = ireec.compile_str(
+                            module,
+                            target_backends=[iree_target_map(device)],
+                            extra_args=["--compile-mode=hal-executable"],
+                        )
+
+                        spirv_file = open(
+                            f"{bench_dir}/{d_}/{d_}_spirv.vmfb", "wb"
+                        )
+                        spirv_file.write(flatbuffer_blob)
+                        spirv_file.close()
+
+    ordered_dispatches = [
+        (k, v)
+        for k, v in sorted(
+            benchmark_runtimes.items(), key=lambda item: item[1]
+        )
+    ][::-1]
+    f_ = open(f"{bench_dir}/ordered-dispatches.txt", "w+")
+    for dispatch in ordered_dispatches:
+        f_.write(f"{dispatch[0]}: {dispatch[1]}ms\n")
+    f_.close()


 def compile_module_to_flatbuffer(
-    module, device, frontend, func_name, model_config_path, extra_args
+    module,
+    device,
+    frontend,
+    func_name,
+    model_config_path,
+    extra_args,
+    model_name="None",
 ):
    # Setup Compile arguments wrt to frontends.
    input_type = ""
    args = get_iree_frontend_args(frontend)
    args += get_iree_device_args(device, extra_args)
    args += get_iree_common_args()
+    args += get_model_specific_args()
    args += extra_args

    if frontend in ["tensorflow", "tf"]:
@@ -208,7 +262,7 @@ def compile_module_to_flatbuffer(
        # Currently for MHLO/TOSA.
        flatbuffer_blob = ireec.compile_str(
            module,
-            target_backends=[IREE_TARGET_MAP[device]],
+            target_backends=[iree_target_map(device)],
            extra_args=args,
            input_type=input_type,
        )
@@ -216,7 +270,7 @@ def compile_module_to_flatbuffer(
        # Currently for Torch.
        flatbuffer_blob = ireec.compile_str(
            module,
-            target_backends=[IREE_TARGET_MAP[device]],
+            target_backends=[iree_target_map(device)],
            extra_args=args,
        )

@@ -225,7 +279,7 @@ def compile_module_to_flatbuffer(

 def get_iree_module(flatbuffer_blob, device, func_name):
    # Returns the compiled module and the configs.
-    config = ireert.Config(IREE_DEVICE_MAP[device])
+    config = get_iree_runtime_config(device)
    vm_module = ireert.VmModule.from_flatbuffer(
        config.vm_instance, flatbuffer_blob
    )
@@ -275,7 +329,10 @@ def export_iree_module_to_vmfb(
        module, device, mlir_dialect, func_name, model_config_path, extra_args
    )
    if module_name is None:
-        module_name = f"{mlir_dialect}_{func_name}_{device}"
+        device_name = (
+            device if "://" not in device else "-".join(device.split("://"))
+        )
+        module_name = f"{mlir_dialect}_{func_name}_{device_name}"
    filename = os.path.join(directory, module_name + ".vmfb")
    print(f"Saved vmfb in {filename}.")
    with open(filename, "wb") as f:
@@ -297,18 +354,34 @@ def export_module_to_mlir_file(module, frontend, directory: str):
    return filename


-def get_results(compiled_vm, input, config, frontend="torch"):
+def get_results(
+    compiled_vm, input, config, frontend="torch", send_to_host=True
+):
    """Runs a .vmfb file given inputs and config and returns output."""
    device_inputs = [ireert.asdevicearray(config.device, a) for a in input]
    result = compiled_vm(*device_inputs)
    result_tensors = []
    if isinstance(result, tuple):
-        for val in result:
-            result_tensors.append(np.copy(np.asarray(val, val.dtype)))
+        if send_to_host:
+            for val in result:
+                result_tensors.append(np.asarray(val, val.dtype))
+        else:
+            for val in result:
+                result_tensors.append(val)
        return result_tensors
    elif isinstance(result, dict):
        data = list(result.items())
-        res = np.array(data, dtype=object)
-        return np.copy(res)
+        if send_to_host:
+            res = np.array(data, dtype=object)
+            return np.copy(res)
+        return data
    else:
-        return np.copy(np.asarray(result, dtype=result.dtype))
+        if send_to_host:
+            return result.to_host()
+        return result
+
+
+def get_iree_runtime_config(device):
+    device = iree_device_map(device)
+    config = ireert.Config(device=ireert.get_device(device))
+    return config
--- a/shark/iree_utils/cpu_utils.py
+++ b/shark/iree_utils/cpu_utils.py
@@ -16,6 +16,17 @@

 import subprocess

+
+def get_cpu_count():
+    import multiprocessing
+
+    try:
+        cpu_count = multiprocessing.cpu_count()
+        return cpu_count
+    except NotImplementedError:
+        return None
+
+
 # Get the default cpu args.
 def get_iree_cpu_args():
    find_triple_cmd = "uname -s -m"
--- a/shark/iree_utils/gpu_utils.py
+++ b/shark/iree_utils/gpu_utils.py
@@ -25,7 +25,8 @@ def get_iree_gpu_args():
    # TODO: Give the user_interface to pass the sm_arch.
    sm_arch = get_cuda_sm_cc()
    if (
-        sm_arch in ["sm_70", "sm_72", "sm_75", "sm_80", "sm_84", "sm_86"]
+        sm_arch
+        in ["sm_70", "sm_72", "sm_75", "sm_80", "sm_84", "sm_86", "sm_89"]
    ) and (shark_args.enable_tf32 == True):
        return [
            "--iree-hal-cuda-disable-loop-nounroll-wa",
@@ -56,7 +57,7 @@ CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36


 def get_cuda_sm_cc():
-    libnames = ("libcuda.so", "libcuda.dylib", "cuda.dll")
+    libnames = ("libcuda.so", "libcuda.dylib", "nvcuda.dll")
    for libname in libnames:
        try:
            cuda = ctypes.CDLL(libname)
--- a/shark/iree_utils/vulkan_utils.py
+++ b/shark/iree_utils/vulkan_utils.py
@@ -16,6 +16,8 @@

 from os import linesep
 from shark.iree_utils._common import run_cmd
+import iree.runtime as ireert
+from sys import platform


 def get_vulkan_device_name():
@@ -24,47 +26,108 @@ def get_vulkan_device_name():
    if len(vulkaninfo_list) == 0:
        raise ValueError("No device name found in VulkanInfo!")
    if len(vulkaninfo_list) > 1:
-        print(
-            f"Found {len(vulkaninfo_list)} device names. choosing first one: {vulkaninfo_list[0]}"
-        )
+        print("Following devices found:")
+        for i, dname in enumerate(vulkaninfo_list):
+            print(f"{i}. {dname}")
+        print(f"Choosing first one: {vulkaninfo_list[0]}")
    return vulkaninfo_list[0]


-def get_vulkan_triple_flag(extra_args=[]):
-    if "-iree-vulkan-target-triple=" in " ".join(extra_args):
-        print(f"Using target triple from command line args")
-        return None
-
-    vulkan_device = get_vulkan_device_name()
-    if all(x in vulkan_device for x in ("Apple", "M1")):
-        print(f"Found {vulkan_device} Device. Using m1-moltenvk-macos")
-        return "-iree-vulkan-target-triple=m1-moltenvk-macos"
-    elif all(x in vulkan_device for x in ("Apple", "M2")):
-        print("Found Apple M2 Device. Using m1-moltenvk-macos")
-        return "-iree-vulkan-target-triple=m1-moltenvk-macos"
-    elif all(x in vulkan_device for x in ("A100", "SXM4")):
-        print(f"Found {vulkan_device} Device. Using ampere-rtx3080-linux")
-        return "-iree-vulkan-target-triple=ampere-rtx3080-linux"
-    elif all(x in vulkan_device for x in ("RTX", "3090")):
-        print(f"Found {vulkan_device} Device. Using ampere-rtx3090-linux")
-        return "-iree-vulkan-target-triple=ampere-rtx3090-linux"
-    elif "AMD" in vulkan_device:
-        print("Found AMD device. Using rdna2-unknown-linux")
-        return "-iree-vulkan-target-triple=rdna2-unknown-linux"
+def get_os_name():
+    if platform.startswith("linux"):
+        return "linux"
+    elif platform == "darwin":
+        return "macos"
+    elif platform == "win32":
+        return "windows"
    else:
+        print("Cannot detect OS type, defaulting to linux.")
+        return "linux"
+
+
+def get_vulkan_target_triple(device_name):
+    """This method provides a target triple str for specified vulkan device.
+
+    Args:
+        device_name (str): name of the hardware device to be used with vulkan
+
+    Returns:
+        str or None: target triple or None if no match found for given name
+    """
+    system_os = get_os_name()
+    # Apple Targets
+    if all(x in device_name for x in ("Apple", "M1")):
+        triple = "m1-moltenvk-macos"
+    elif all(x in device_name for x in ("Apple", "M2")):
+        triple = "m1-moltenvk-macos"
+
+    # Nvidia Targets
+    elif all(x in device_name for x in ("RTX", "2080")):
+        triple = f"turing-rtx2080-{system_os}"
+    elif all(x in device_name for x in ("A100", "SXM4")):
+        triple = f"ampere-rtx3080-{system_os}"
+    elif all(x in device_name for x in ("RTX", "3090")):
+        triple = f"ampere-rtx3090-{system_os}"
+    elif all(x in device_name for x in ("RTX", "4090")):
+        triple = f"ampere-rtx3090-{system_os}"
+    elif all(x in device_name for x in ("RTX", "4000")):
+        triple = f"turing-rtx4000-{system_os}"
+    elif all(x in device_name for x in ("RTX", "5000")):
+        triple = f"turing-rtx5000-{system_os}"
+    elif all(x in device_name for x in ("RTX", "6000")):
+        triple = f"turing-rtx6000-{system_os}"
+    elif all(x in device_name for x in ("RTX", "8000")):
+        triple = f"turing-rtx8000-{system_os}"
+    elif all(x in device_name for x in ("GTX", "1060")):
+        triple = f"pascal-gtx1060-{system_os}"
+    elif all(x in device_name for x in ("GTX", "1070")):
+        triple = f"pascal-gtx1070-{system_os}"
+    elif all(x in device_name for x in ("GTX", "1080")):
+        triple = f"pascal-gtx1080-{system_os}"
+
+    # Amd Targets
+    elif all(x in device_name for x in ("AMD", "7900")):
+        triple = f"rdna3-7900-{system_os}"
+    elif any(x in device_name for x in ("AMD", "Radeon")):
+        triple = f"rdna2-unknown-{system_os}"
+    else:
+        triple = None
+    return triple
+
+
+def get_vulkan_triple_flag(device_name=None, extra_args=[]):
+    for flag in extra_args:
+        if "-iree-vulkan-target-triple=" in flag:
+            print(f"Using target triple {flag.split('=')[1]}")
+            return None
+
+    vulkan_device = (
+        device_name if device_name is not None else get_vulkan_device_name()
+    )
+    triple = get_vulkan_target_triple(vulkan_device)
+    if triple is not None:
        print(
-            """Optimized kernel for your target device is not added yet.
-            Contact SHARK Admin on discord[https://discord.com/invite/RUqY2h2s9u]
-            or pull up an issue."""
+            f"Found vulkan device {vulkan_device}. Using target triple {triple}"
        )
-        print(f"Target : {vulkan_device}")
-        return None
+        return f"-iree-vulkan-target-triple={triple}"
+    print(
+        """Optimized kernel for your target device is not added yet.
+        Contact SHARK Admin on discord[https://discord.com/invite/RUqY2h2s9u]
+        or pull up an issue."""
+    )
+    print(f"Target : {vulkan_device}")
+    return None


 def get_iree_vulkan_args(extra_args=[]):
-    # vulkan_flag = ["--iree-flow-demote-i64-to-i32"]
    vulkan_flag = []
-    vulkan_triple_flag = get_vulkan_triple_flag(extra_args)
+    vulkan_triple_flag = get_vulkan_triple_flag(extra_args=extra_args)
    if vulkan_triple_flag is not None:
        vulkan_flag.append(vulkan_triple_flag)
    return vulkan_flag
+
+
+def set_iree_vulkan_runtime_flags(flags):
+    for flag in flags:
+        ireert.flags.parse_flags(flag)
+    return
--- a/shark/model_annotation.py
+++ b/shark/model_annotation.py
@@ -12,6 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+"""
+Usage:
+This function takes the model mlir file and the tuned config file as input,
+and output a new mlir file with lowering configs annotated on certain ops.
+There are two ways to utilize the function:
+1. Call model_annotation function within another python script
+from shark.model_annotation import model_annotation
+with create_context() as ctx:
+   module = model_annotation(ctx, input_contents=..., config_path=..., search_op=...)
+2. Run model_annotation.py directly
+python model_annotation.py -model path_to_original_mlir -config_path path_to_config_file
+"""
+
 import json
 import os
 import sys
@@ -26,21 +39,18 @@ def model_annotation(
    *,
    input_contents: str,
    config_path: str,
-    search_op: str = "matmul",
+    search_op: str,
 ):
    if os.path.isfile(input_contents):
        with open(input_contents, "rb") as f:
            input_contents = f.read()
-
    module = ir.Module.parse(input_contents)

-    with open(config_path, "r") as f:
-        data = json.load(f)
-        configs = data["options"]
+    configs = load_model_configs(config_path)

    # The Python API does not expose a general walk() function, so we just
    # do it ourselves.
-    walk_children(module.operation, configs, 0, search_op)
+    walk_children(module.operation, configs, search_op)

    if not module.operation.verify():
        raise RuntimeError("Modified program does not verify!")
@@ -48,15 +58,49 @@ def model_annotation(
    return module


-def walk_children(
-    op: ir.Operation, configs: List[Dict], idx: int, search_op: str
-):
+def load_model_configs(config_path: str):
+    config = {}
+    with open(config_path, "r") as f:
+        for line in f:
+            data = json.loads(line)
+
+            if "identifier" not in data.keys():
+                continue
+            if data["identifier"] == "matmul":
+                matrix_size = [data["m"], data["n"], data["k"]]
+            elif data["identifier"] == "bmm":
+                matrix_size = [data["b"], data["m"], data["n"], data["k"]]
+            elif data["identifier"] == "generic":
+                matrix_size = [1, data["b"], data["m"], data["n"], data["k"]]
+            elif data["identifier"] == "conv":
+                matrix_size = [
+                    data["n"],
+                    data["ih"],
+                    data["iw"],
+                    data["c"],
+                    data["kh"],
+                    data["kw"],
+                    data["f"],
+                    data["oh"],
+                    data["ow"],
+                    data["d"],
+                    data["s"],
+                    data["p"],
+                ]
+            config[shape_list_to_string(matrix_size)] = data
+        f.close()
+        return config
+
+
+def walk_children(op: ir.Operation, configs: List[Dict], search_op: str):
    if search_op == "matmul":
        op_names = ["linalg.matmul", "mhlo.dot"]
    elif search_op == "bmm":
        op_names = ["linalg.batch_matmul", "mhlo.dot_general"]
    elif search_op == "conv":
        op_names = ["mhlo.convolution", "linalg.conv_2d_nhwc_hwcf"]
+    elif search_op == "generic":
+        op_names = ["linalg.generic"]
    elif search_op == "all":
        op_names = [
            "mhlo.dot",
@@ -65,6 +109,7 @@ def walk_children(
            "linalg.matmul",
            "linalg.batch_matmul",
            "linalg.conv_2d_nhwc_hwcf",
+            "linalg.generic",
        ]
    else:
        raise ValueError(f"{search_op} op is not tunable.")
@@ -76,36 +121,168 @@ def walk_children(
                # 'operation' and 'name' attributes.
                if isinstance(child_op, ir.OpView):
                    child_op = child_op.operation
-                if child_op.name in op_names and idx < len(configs):
-                    add_attributes(child_op, configs[idx])
-                    idx = idx + 1
+                if child_op.name in op_names:
+                    if child_op.name == "linalg.generic":
+                        # This is for generic op that has contractionOpInterface
+                        # which is basically einsum("mk,bkn->bmn")
+                        op_result = str(child_op.results[0])
+                        op_iterator = str(
+                            child_op.attributes["iterator_types"]
+                        )
+                        if len(child_op.operands) != 3:
+                            continue
+                        if "reduction" not in op_iterator:
+                            continue
+                        if (
+                            "arith.addf" not in op_result
+                            or "arith.mulf" not in op_result
+                        ):
+                            continue
+                        if "arith.subf" in op_result:
+                            continue
+
+                    child_op_shape = get_op_shape(child_op, search_op)
+                    if (
+                        child_op_shape in configs.keys()
+                        and configs[child_op_shape]["options"][0] != None
+                    ):
+                        add_attributes(
+                            child_op, configs[child_op_shape]["options"][0]
+                        )
                    print(f"Updated op {child_op}", file=sys.stderr)
-                walk_children(child_op, configs, idx, search_op)
+
+                walk_children(child_op, configs, search_op)


-def add_attributes(op: ir.Operation, config: Dict):
-    (
-        tile_sizes,
-        pipeline,
-        workgroup_size,
-        split_k,
-        pipeline_depth,
-    ) = parse_config(config)
+def get_op_shape(op: ir.Operation, search_op: str):
+    shape_list = []
+    if search_op in ["generic", "all"]:
+        if op.name in ["linalg.generic"]:
+            input1 = str(op.operands[0].type)
+            input2 = str(op.operands[1].type)
+            m = input1.split("tensor<")[1].split("x")[0]
+            b = input2.split("tensor<")[1].split("x")[0]
+            k = input2.split("tensor<")[1].split("x")[1]
+            n = input2.split("tensor<")[1].split("x")[2]
+            shape_list = [1, int(b), int(m), int(n), int(k)]

-    add_compilation_info(
-        op,
-        tile_sizes=tile_sizes,
-        pipeline=pipeline,
-        workgroup_size=workgroup_size,
-        pipeline_depth=pipeline_depth,
-    )
+    if search_op in ["matmul", "all"]:
+        if op.name in ["mhlo.dot"]:
+            op_result = str(op.results[0])
+            m = op_result.split("tensor<")[1].split("x")[0]
+            k = op_result.split("tensor<")[1].split("x")[1]
+            n = op_result.split("tensor<")[2].split("x")[1]
+            shape_list = [int(m), int(n), int(k)]
+        elif op.name in ["linalg.matmul"]:
+            op_result = str(op.results[0]).split("ins(")[1]
+            m = op_result.split("tensor<")[1].split("x")[0]
+            k = op_result.split("tensor<")[1].split("x")[1]
+            n = op_result.split("tensor<")[2].split("x")[1]
+            shape_list = [int(m), int(n), int(k)]

-    if split_k:
-        add_attribute_by_name(op, "iree_flow_split_k", split_k)
+    if search_op in ["bmm", "all"]:
+        if op.name in ["mhlo.dot_general"]:
+            op_result = str(op.results[0])
+            b = op_result.split("tensor<")[1].split("x")[1]
+            m = op_result.split("tensor<")[1].split("x")[2]
+            k = op_result.split("tensor<")[1].split("x")[3]
+            n = op_result.split("tensor<")[3].split("x")[3]
+            shape_list = [int(b), int(m), int(n), int(k)]
+        elif op.name in ["linalg.batch_matmul"]:
+            op_result = str(op.results[0]).split("ins(")[1]
+            b = op_result.split("tensor<")[1].split("x")[0]
+            m = op_result.split("tensor<")[1].split("x")[1]
+            k = op_result.split("tensor<")[1].split("x")[2]
+            n = op_result.split("tensor<")[3].split("x")[2]
+            shape_list = [int(b), int(m), int(n), int(k)]
+
+    if search_op in ["conv", "all"]:
+        if op.name in ["mhlo.convolution"]:
+            op_result = str(op.results[0])
+            dilation = (
+                str(op.attributes["rhs_dilation"])
+                .split("dense<")[1]
+                .split(">")[0]
+            )
+            stride = (
+                str(op.attributes["window_strides"])
+                .split("dense<")[1]
+                .split(">")[0]
+            )
+            pad = (
+                str(op.attributes["padding"]).split("dense<")[1].split(">")[0]
+            )
+            n = op_result.split("tensor<")[1].split("x")[0]
+            ih = op_result.split("tensor<")[1].split("x")[1]
+            iw = op_result.split("tensor<")[1].split("x")[2]
+            c = op_result.split("tensor<")[1].split("x")[3]
+            kh = op_result.split("tensor<")[2].split("x")[0]
+            kw = op_result.split("tensor<")[2].split("x")[1]
+            f = op_result.split("tensor<")[2].split("x")[3]
+            oh = op_result.split("tensor<")[3].split("x")[1]
+            ow = op_result.split("tensor<")[3].split("x")[2]
+            shape_list = [
+                int(n),
+                int(ih),
+                int(iw),
+                int(c),
+                int(kh),
+                int(kw),
+                int(f),
+                int(oh),
+                int(ow),
+                int(dilation),
+                int(stride),
+                int(pad),
+            ]
+
+        elif op.name in ["linalg.conv_2d_nhwc_hwcf"]:
+            op_result = str(op.results[0]).split("ins(")[1]
+            dilation = (
+                str(op.attributes["dilations"])
+                .split("dense<")[1]
+                .split(">")[0]
+            )
+            stride = (
+                str(op.attributes["strides"]).split("dense<")[1].split(">")[0]
+            )
+            pad = 0
+            n = op_result.split("tensor<")[1].split("x")[0]
+            ih = op_result.split("tensor<")[1].split("x")[1]
+            iw = op_result.split("tensor<")[1].split("x")[2]
+            c = op_result.split("tensor<")[1].split("x")[3]
+            kh = op_result.split("tensor<")[2].split("x")[0]
+            kw = op_result.split("tensor<")[2].split("x")[1]
+            f = op_result.split("tensor<")[2].split("x")[3]
+            oh = op_result.split("tensor<")[3].split("x")[1]
+            ow = op_result.split("tensor<")[3].split("x")[2]
+            shape_list = [
+                int(n),
+                int(ih),
+                int(iw),
+                int(c),
+                int(kh),
+                int(kw),
+                int(f),
+                int(oh),
+                int(ow),
+                int(dilation),
+                int(stride),
+                int(pad),
+            ]
+
+    shape_str = shape_list_to_string(shape_list)
+    return shape_str


-def parse_config(config: Dict):
-    if config["pipeline"] == "GPU" or config["pipeline"] == "GPU_TENSORCORE":
+def add_attributes(op: ir.Operation, config: List[Dict]):
+    # Parse the config file
+    split_k = None
+    pipeline_depth = None
+    store_stage = None
+    subgroup_size = None
+
+    if "GPU" in config["pipeline"]:
        pipeline = (
            "LLVMGPUMatmulSimt"
            if config["pipeline"] == "GPU"
@@ -113,58 +290,80 @@ def parse_config(config: Dict):
        )
        tile_sizes = [config["work_group_tile_sizes"]]
        workgroup_size = config["work_group_sizes"]
-        try:
+        if "pipeline_depth" in config.keys():
            pipeline_depth = config["pipeline_depth"]
-        except:
-            pipeline_depth = None
-        try:
+        if "split_k" in config.keys():
            split_k = config["split_k"]
-        except:
-            split_k = None
-    else:
+        if "devices" in config.keys():
+            devices = config["devices"]
+        if "shard_sizes" in config.keys():
+            shard_sizes = config["shard_sizes"]
+    elif "SPIRV" in config["pipeline"]:
        pipeline = config["pipeline"]
        tile_sizes = [
            config["work_group_tile_sizes"],
-            config["l1_tile_sizes"],
-            config["vector_tile_sizes"],
+            config["parallel_tile_sizes"],
+            config["reduction_tile_sizes"],
+        ]
+        workgroup_size = config["work_group_sizes"]
+        if "vector_tile_sizes" in config.keys():
+            tile_sizes += [config["vector_tile_sizes"]]
+        if "window_tile_sizes" in config.keys():
+            tile_sizes += [config["window_tile_sizes"]]
+        if "subgroup_size" in config.keys():
+            subgroup_size = config["subgroup_size"]
+        if "pipeline_depth" in config.keys():
+            pipeline_depth = config["pipeline_depth"]
+        if "store_stage" in config.keys():
+            store_stage = config["store_stage"]
+    else:
+        # For IREE CPU pipelines
+        pipeline = config["pipeline"]
+        tile_sizes = [
+            config["work_group_tile_sizes"],
+            config["parallel_tile_sizes"],
+            config["reduction_tile_sizes"],
        ]
        workgroup_size = []
-        split_k = None
-        pipeline_depth = None
-    return tile_sizes, pipeline, workgroup_size, split_k, pipeline_depth

-
-def add_compilation_info(
-    op: ir.Operation,
-    tile_sizes: List[List[int]],
-    pipeline: str,
-    workgroup_size: List[int],
-    pipeline_depth: int,
-):
-    # We don't have a Python binding for CompilationInfo, so we just parse
-    # its string form.
-    if pipeline_depth:
-        attr = ir.Attribute.parse(
-            f"#iree_codegen.compilation_info<"
-            f"lowering_config = <tile_sizes = {repr(tile_sizes)}>, "
-            f"translation_info = <{pipeline} pipeline_depth = {pipeline_depth}>, "
-            f"workgroup_size = {repr(workgroup_size)}>"
-        )
+    # Add compilation info as an attribute. We don't have a Python binding for CompilationInfo,
+    # so we just parse its string form.
+    if pipeline_depth != None:
+        translation_info = f"{pipeline} pipeline_depth = {pipeline_depth}"
+        if store_stage != None:
+            translation_info += f" store_stage = {store_stage}"
    else:
-        attr = ir.Attribute.parse(
-            f"#iree_codegen.compilation_info<"
-            f"lowering_config = <tile_sizes = {repr(tile_sizes)}>, "
-            f"translation_info = <{pipeline}>, "
-            f"workgroup_size = {repr(workgroup_size)}>"
-        )
+        translation_info = f"{pipeline}"
+
+    compilation_info = (
+        f"#iree_codegen.compilation_info<"
+        f"lowering_config = <tile_sizes = {repr(tile_sizes)}>, "
+        f"translation_info = <{translation_info}>, "
+        f"workgroup_size = {repr(workgroup_size)} "
+    )
+
+    if subgroup_size != None:
+        compilation_info += f", subgroup_size = {subgroup_size}>"
+    else:
+        compilation_info += ">"
+
+    attr = ir.Attribute.parse(compilation_info)
    op.attributes["compilation_info"] = attr

+    # Add other attributes if required.
+    if split_k:
+        add_attribute_by_name(op, "iree_flow_split_k", split_k)
+

 def add_attribute_by_name(op: ir.Operation, name: str, val: int):
    attr = ir.IntegerAttr.get(ir.IntegerType.get_signless(64), val)
    op.attributes[name] = attr


+def shape_list_to_string(input):
+    return "x".join([str(d) for d in input])
+
+
 def create_context() -> ir.Context:
    context = ir.Context()
    ireec_trans.register_all_dialects(context)
@@ -173,15 +372,48 @@ def create_context() -> ir.Context:


 if __name__ == "__main__":
+    import argparse
+    from pathlib import Path
+
+    def path_expand(s):
+        return Path(s).expanduser().resolve()
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-model",
+        type=path_expand,
+        default="model.mlir",
+        help="Path to the input mlir file",
+    )
+    parser.add_argument(
+        "-config_path",
+        type=path_expand,
+        default="best_configs.json",
+        help="Path where stores the op config file",
+    )
+    parser.add_argument(
+        "-output_path",
+        type=path_expand,
+        default="tuned_model.mlir",
+        help="Path to save the annotated mlir file",
+    )
+    parser.add_argument(
+        "-search_op",
+        type=str,
+        default="all",
+        help="Op to be optimized. options are matmul, bmm, conv.",
+    )
+
+    args = parser.parse_args()
+
    with create_context() as ctx:
        module = model_annotation(
            ctx,
-            input_contents=sys.argv[1],
-            config_path=sys.argv[2],
-            search_op="all",
+            input_contents=args.model,
+            config_path=args.config_path,
+            search_op=args.search_op,
        )
        mlir_str = str(module)
-        filename = "tuned_model.mlir"
-        with open(filename, "w") as f:
+        with open(args.output_path, "w") as f:
            f.write(mlir_str)
-        print(f"Saved mlir in {filename}.")
+        print(f"Saved mlir in {args.output_path}.")
--- a/shark/parser.py
+++ b/shark/parser.py
@@ -105,4 +105,11 @@ parser.add_argument(
    help='directory where you want to store dispatch data generated with "--dispatch_benchmarks"',
 )

+parser.add_argument(
+    "--enable_conv_transform",
+    default=False,
+    action="store_true",
+    help="Enables the --iree-flow-enable-conv-nchw-to-nhwc-transform flag.",
+)
+
 shark_args, unknown = parser.parse_known_args()
--- a/shark/shark_benchmark_runner.py
+++ b/shark/shark_benchmark_runner.py
@@ -39,6 +39,22 @@ class OnnxFusionOptions(object):
        self.no_attention_mask = False


+def check_requirements(frontend):
+    import importlib
+
+    has_pkgs = False
+    if frontend == "torch":
+        tv_spec = importlib.util.find_spec("torchvision")
+        has_pkgs = tv_spec is not None
+
+    elif frontend in ["tensorflow", "tf"]:
+        keras_spec = importlib.util.find_spec("keras")
+        tf_spec = importlib.util.find_spec("tensorflow")
+        has_pkgs = keras_spec is not None and tf_spec is not None
+
+    return has_pkgs
+
+
 class SharkBenchmarkRunner(SharkRunner):
    # SharkRunner derived class with Benchmarking capabilities.
    def __init__(
@@ -80,11 +96,11 @@ class SharkBenchmarkRunner(SharkRunner):
            input_tensors,
            mlir_dialect=self.mlir_dialect,
        )
-        print(self.benchmark_cl)

    def benchmark_frontend(self, modelname):
        if self.mlir_dialect in ["linalg", "torch"]:
            return self.benchmark_torch(modelname)
+
        elif self.mlir_dialect in ["mhlo", "tf"]:
            return self.benchmark_tf(modelname)

@@ -123,32 +139,45 @@ class SharkBenchmarkRunner(SharkRunner):

    def benchmark_tf(self, modelname):
        import tensorflow as tf
+
+        visible_default = tf.config.list_physical_devices("GPU")
+        try:
+            tf.config.set_visible_devices([], "GPU")
+            visible_devices = tf.config.get_visible_devices()
+            for device in visible_devices:
+                assert device.device_type != "GPU"
+        except:
+            # Invalid device or cannot modify virtual devices once initialized.
+            pass
+
        from tank.model_utils_tf import get_tf_model

-        model, input, = get_tf_model(
-            modelname
-        )[:2]
-        frontend_model = model
+        # tf_device = "/GPU:0" if self.device == "cuda" else "/CPU:0"
+        tf_device = "/CPU:0"
+        with tf.device(tf_device):
+            model, input, = get_tf_model(
+                modelname
+            )[:2]
+            frontend_model = model

-        for i in range(shark_args.num_warmup_iterations):
-            frontend_model.forward(*input)
+            for i in range(shark_args.num_warmup_iterations):
+                frontend_model.forward(*input)

-        begin = time.time()
-        for i in range(shark_args.num_iterations):
-            out = frontend_model.forward(*input)
-            if i == shark_args.num_iterations - 1:
-                end = time.time()
-                break
-        print(
-            f"TF benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
-        )
-        return [
-            f"{shark_args.num_iterations/(end-begin)}",
-            f"{((end-begin)/shark_args.num_iterations)*1000}",
-        ]
+            begin = time.time()
+            for i in range(shark_args.num_iterations):
+                out = frontend_model.forward(*input)
+                if i == shark_args.num_iterations - 1:
+                    end = time.time()
+                    break
+            print(
+                f"TF benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
+            )
+            return [
+                f"{shark_args.num_iterations/(end-begin)}",
+                f"{((end-begin)/shark_args.num_iterations)*1000}",
+            ]

    def benchmark_c(self):
-        print(self.benchmark_cl)
        result = run_benchmark_module(self.benchmark_cl)
        print(f"Shark-IREE-C benchmark:{result} iter/second")
        return [f"{result}", f"{1000/result}"]
@@ -258,19 +287,15 @@ for currently supported models. Exiting benchmark ONNX."
                    return [param_count, model_tags, model_notes]

    def compare_bench_results(self, baseline: str, result: str):
-        # Takes two numbers represented as strings and returns "<n>x slower/faster", as in "result is <n>x slower than baseline".
-        a = float(baseline)
-        b = float(result)
-        if a < b:
-            # result slower than baseline
-            comparison = (b - a) / a
-            comp_str = f"{round(comparison, 2)}x slower"
-        elif a > b:
-            # result faster than baseline
+        if baseline is not None:
+            # Takes a baseline and a result string and calculates a comparison, e.g. "1.04x baseline".
+            a = float(baseline)
+            b = float(result)
            comparison = a / b
-            comp_str = f"{round(comparison, 2)}x faster"
+            comp_str = f"{round(comparison, 2)}x baseline"
        else:
-            comp_str = "equal"
+            comp_str = "N/A"
+
        return comp_str

    def benchmark_all_csv(
@@ -320,17 +345,21 @@ for currently supported models. Exiting benchmark ONNX."
                ) = ["", "", ""]
                if e == "frontend":
                    bench_result["engine"] = frontend
-                    (
-                        bench_result["iter/sec"],
-                        bench_result["ms/iter"],
-                    ) = self.benchmark_frontend(modelname)
-                    self.frontend_result = bench_result["ms/iter"]
-                    bench_result["vs. PyTorch/TF"] = "="
-                    (
-                        bench_result["param_count"],
-                        bench_result["tags"],
-                        bench_result["notes"],
-                    ) = self.get_metadata(modelname)
+                    if check_requirements(frontend):
+                        (
+                            bench_result["iter/sec"],
+                            bench_result["ms/iter"],
+                        ) = self.benchmark_frontend(modelname)
+                        self.frontend_result = bench_result["ms/iter"]
+                        bench_result["vs. PyTorch/TF"] = "baseline"
+                        (
+                            bench_result["param_count"],
+                            bench_result["tags"],
+                            bench_result["notes"],
+                        ) = self.get_metadata(modelname)
+                    else:
+                        self.frontend_result = None
+                        continue

                elif e == "shark_python":
                    bench_result["engine"] = "shark_python"
--- a/shark/shark_downloader.py
+++ b/shark/shark_downloader.py
@@ -14,11 +14,58 @@

 import numpy as np
 import os
-import urllib.request
-import json
-import hashlib
+from tqdm.std import tqdm
+import sys
 from pathlib import Path
 from shark.parser import shark_args
+from google.cloud import storage
+
+
+def download_public_file(
+    full_gs_url, destination_folder_name, single_file=False
+):
+    """Downloads a public blob from the bucket."""
+    # bucket_name = "gs://your-bucket-name/path/to/file"
+    # destination_file_name = "local/path/to/file"
+
+    storage_client = storage.Client.create_anonymous_client()
+    bucket_name = full_gs_url.split("/")[2]
+    source_blob_name = None
+    dest_filename = None
+    desired_file = None
+    if single_file:
+
+        desired_file = full_gs_url.split("/")[-1]
+        source_blob_name = "/".join(full_gs_url.split("/")[3:-1])
+        destination_folder_name, dest_filename = os.path.split(
+            destination_folder_name
+        )
+    else:
+        source_blob_name = "/".join(full_gs_url.split("/")[3:])
+    bucket = storage_client.bucket(bucket_name)
+    blobs = bucket.list_blobs(prefix=source_blob_name)
+    if not os.path.exists(destination_folder_name):
+        os.mkdir(destination_folder_name)
+    for blob in blobs:
+        blob_name = blob.name.split("/")[-1]
+        if single_file:
+            if blob_name == desired_file:
+                destination_filename = os.path.join(
+                    destination_folder_name, dest_filename
+                )
+                with open(destination_filename, "wb") as f:
+                    with tqdm.wrapattr(
+                        f, "write", total=blob.size
+                    ) as file_obj:
+                        storage_client.download_blob_to_file(blob, file_obj)
+            else:
+                continue
+
+        destination_filename = os.path.join(destination_folder_name, blob_name)
+        with open(destination_filename, "wb") as f:
+            with tqdm.wrapattr(f, "write", total=blob.size) as file_obj:
+                storage_client.download_blob_to_file(blob, file_obj)
+

 input_type_to_np_dtype = {
    "float32": np.float32,
@@ -30,7 +77,6 @@ input_type_to_np_dtype = {
    "int8": np.int8,
 }

-
 # Save the model in the home local so it needn't be fetched everytime in the CI.
 home = str(Path.home())
 alt_path = os.path.join(os.path.dirname(__file__), "../gen_shark_tank/")
@@ -50,10 +96,10 @@ if custom_path:
 else:
    WORKDIR = os.path.join(home, ".local/shark_tank/")
    print(
-        f"shark_tank local cache is located at {WORKDIR} . You may change this by setting the --local_tank_cache="
-        " pytest flag"
+        f"shark_tank local cache is located at {WORKDIR} . You may change this by setting the --local_tank_cache= flag"
    )

+
 # Checks whether the directory and files exists.
 def check_dir_exists(model_name, frontend="torch", dynamic=""):
    model_dir = os.path.join(WORKDIR, model_name)
@@ -79,194 +125,62 @@ def check_dir_exists(model_name, frontend="torch", dynamic=""):
            and os.path.isfile(os.path.join(model_dir, "golden_out.npz"))
            and os.path.isfile(os.path.join(model_dir, "hash.npy"))
        ):
-            print(
-                f"""The models are present in the {WORKDIR}. If you want a fresh 
-                download, consider deleting the directory."""
-            )
+            print(f"""Using cached models from {WORKDIR}...""")
            return True
    return False


 # Downloads the torch model from gs://shark_tank dir.
-def download_torch_model(
-    model_name, dynamic=False, tank_url="gs://shark_tank/latest"
+def download_model(
+    model_name,
+    dynamic=False,
+    tank_url="gs://shark_tank/latest",
+    frontend=None,
+    tuned=None,
 ):
    model_name = model_name.replace("/", "_")
    dyn_str = "_dynamic" if dynamic else ""
    os.makedirs(WORKDIR, exist_ok=True)
-    model_dir_name = model_name + "_torch"
-
-    def gs_download_model():
-        gs_command = (
-            'gsutil -o "GSUtil:parallel_process_count=1" cp -r '
-            + tank_url
-            + "/"
-            + model_dir_name
-            + " "
-            + WORKDIR
-        )
-        if os.system(gs_command) != 0:
-            raise Exception("model not present in the tank. Contact Nod Admin")
-
-    if not check_dir_exists(model_dir_name, frontend="torch", dynamic=dyn_str):
-        gs_download_model()
-    else:
-        model_dir = os.path.join(WORKDIR, model_dir_name)
-        local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
-        gs_hash = (
-            'gsutil -o "GSUtil:parallel_process_count=1" cp '
-            + tank_url
-            + "/"
-            + model_dir_name
-            + "/hash.npy"
-            + " "
-            + os.path.join(model_dir, "upstream_hash.npy")
-        )
-        if os.system(gs_hash) != 0:
-            raise Exception("hash of the model not present in the tank.")
-        upstream_hash = str(
-            np.load(os.path.join(model_dir, "upstream_hash.npy"))
-        )
-        if local_hash != upstream_hash:
-            if shark_args.update_tank == True:
-                gs_download_model()
-            else:
-                print(
-                    "Hash does not match upstream in gs://shark_tank/. If you are using SHARK Downloader with locally generated artifacts, this is working as intended."
-                )
-
+    model_dir_name = model_name + "_" + frontend
    model_dir = os.path.join(WORKDIR, model_dir_name)
-    with open(
-        os.path.join(model_dir, model_name + dyn_str + "_torch.mlir"),
-        mode="rb",
-    ) as f:
-        mlir_file = f.read()
+    full_gs_url = tank_url.rstrip("/") + "/" + model_dir_name

-    function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
-    inputs = np.load(os.path.join(model_dir, "inputs.npz"))
-    golden_out = np.load(os.path.join(model_dir, "golden_out.npz"))
+    if shark_args.update_tank == True:
+        print(f"Updating artifacts for model {model_name}...")
+        download_public_file(full_gs_url, model_dir)

-    inputs_tuple = tuple([inputs[key] for key in inputs])
-    golden_out_tuple = tuple([golden_out[key] for key in golden_out])
-    return mlir_file, function_name, inputs_tuple, golden_out_tuple
-
-
-# Downloads the tflite model from gs://shark_tank dir.
-def download_tflite_model(
-    model_name, dynamic=False, tank_url="gs://shark_tank/latest"
-):
-    dyn_str = "_dynamic" if dynamic else ""
-    os.makedirs(WORKDIR, exist_ok=True)
-    model_dir_name = model_name + "_tflite"
-
-    def gs_download_model():
-        gs_command = (
-            'gsutil -o "GSUtil:parallel_process_count=1" cp -r '
-            + tank_url
-            + "/"
-            + model_dir_name
-            + " "
-            + WORKDIR
-        )
-        if os.system(gs_command) != 0:
-            raise Exception("model not present in the tank. Contact Nod Admin")
-
-    if not check_dir_exists(
-        model_dir_name, frontend="tflite", dynamic=dyn_str
+    elif not check_dir_exists(
+        model_dir_name, frontend=frontend, dynamic=dyn_str
    ):
-        gs_download_model()
+        print(f"Downloading artifacts for model {model_name}...")
+        download_public_file(full_gs_url, model_dir)
    else:
-        model_dir = os.path.join(WORKDIR, model_dir_name)
-        local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
-        gs_hash = (
-            'gsutil -o "GSUtil:parallel_process_count=1" cp '
-            + tank_url
-            + "/"
-            + model_dir_name
-            + "/hash.npy"
-            + " "
-            + os.path.join(model_dir, "upstream_hash.npy")
-        )
-        if os.system(gs_hash) != 0:
-            raise Exception("hash of the model not present in the tank.")
-        upstream_hash = str(
-            np.load(os.path.join(model_dir, "upstream_hash.npy"))
-        )
-        if local_hash != upstream_hash:
-            if shark_args.update_tank == True:
-                gs_download_model()
-            else:
+        if not _internet_connected():
+            print(
+                "No internet connection. Using the model already present in the tank."
+            )
+        else:
+            local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
+            gs_hash_url = (
+                tank_url.rstrip("/") + "/" + model_dir_name + "/hash.npy"
+            )
+            download_public_file(
+                gs_hash_url,
+                os.path.join(model_dir, "upstream_hash.npy"),
+                single_file=True,
+            )
+            upstream_hash = str(
+                np.load(os.path.join(model_dir, "upstream_hash.npy"))
+            )
+            if local_hash != upstream_hash:
                print(
-                    "Hash does not match upstream in gs://shark_tank/. If you are using SHARK Downloader with locally generated artifacts, this is working as intended."
+                    "Hash does not match upstream in gs://shark_tank/latest. If you want to use locally generated artifacts, this is working as intended. Otherwise, run with --update_tank."
                )

    model_dir = os.path.join(WORKDIR, model_dir_name)
-    with open(
-        os.path.join(model_dir, model_name + dyn_str + "_tflite.mlir"),
-        mode="rb",
-    ) as f:
-        mlir_file = f.read()
-
-    function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
-    inputs = np.load(os.path.join(model_dir, "inputs.npz"))
-    golden_out = np.load(os.path.join(model_dir, "golden_out.npz"))
-
-    inputs_tuple = tuple([inputs[key] for key in inputs])
-    golden_out_tuple = tuple([golden_out[key] for key in golden_out])
-    return mlir_file, function_name, inputs_tuple, golden_out_tuple
-
-
-def download_tf_model(
-    model_name, tuned=None, tank_url="gs://shark_tank/latest"
-):
-    model_name = model_name.replace("/", "_")
-    os.makedirs(WORKDIR, exist_ok=True)
-    model_dir_name = model_name + "_tf"
-
-    def gs_download_model():
-        gs_command = (
-            'gsutil -o "GSUtil:parallel_process_count=1" cp -r '
-            + tank_url
-            + "/"
-            + model_dir_name
-            + " "
-            + WORKDIR
-        )
-        if os.system(gs_command) != 0:
-            raise Exception("model not present in the tank. Contact Nod Admin")
-
-    if not check_dir_exists(model_dir_name, frontend="tf"):
-        gs_download_model()
-    else:
-        model_dir = os.path.join(WORKDIR, model_dir_name)
-        local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
-        gs_hash = (
-            'gsutil -o "GSUtil:parallel_process_count=1" cp '
-            + tank_url
-            + "/"
-            + model_dir_name
-            + "/hash.npy"
-            + " "
-            + os.path.join(model_dir, "upstream_hash.npy")
-        )
-        if os.system(gs_hash) != 0:
-            raise Exception("hash of the model not present in the tank.")
-        upstream_hash = str(
-            np.load(os.path.join(model_dir, "upstream_hash.npy"))
-        )
-        if local_hash != upstream_hash:
-            if shark_args.update_tank == True:
-                gs_download_model()
-            else:
-                print(
-                    "Hash does not match upstream in gs://shark_tank/. If you are using SHARK Downloader with locally generated artifacts, this is working as intended."
-                )
-
-    model_dir = os.path.join(WORKDIR, model_dir_name)
-    suffix = "_tf.mlir" if tuned is None else "_tf_" + tuned + ".mlir"
+    tuned_str = "" if tuned is None else "_" + tuned
+    suffix = f"{dyn_str}_{frontend}{tuned_str}.mlir"
    filename = os.path.join(model_dir, model_name + suffix)
-    if not os.path.isfile(filename):
-        filename = os.path.join(model_dir, model_name + "_tf.mlir")

    with open(filename, mode="rb") as f:
        mlir_file = f.read()
@@ -278,3 +192,13 @@ def download_tf_model(
    inputs_tuple = tuple([inputs[key] for key in inputs])
    golden_out_tuple = tuple([golden_out[key] for key in golden_out])
    return mlir_file, function_name, inputs_tuple, golden_out_tuple
+
+
+def _internet_connected():
+    import requests as req
+
+    try:
+        req.get("http://1.1.1.1")
+        return True
+    except:
+        return False
--- a/shark/shark_importer.py
+++ b/shark/shark_importer.py
@@ -87,7 +87,6 @@ class SharkImporter:

    def _tflite_mlir(self, func_name, save_dir="./shark_tmp/"):
        from iree.compiler import tflite as tflitec
-        from shark.iree_utils._common import IREE_TARGET_MAP

        self.mlir_model = tflitec.compile_file(
            self.raw_model_file,  # in tflite, it is a path to .tflite file, not a tflite interpreter
@@ -244,3 +243,59 @@ class SharkImporter:
                self.inputs,
                golden_out,
            )
+
+
+# Applies fx conversion to the model and imports the mlir.
+def import_with_fx(model, inputs, debug=False):
+    import torch
+    from torch.fx.experimental.proxy_tensor import make_fx
+    from torch._decomp import get_decompositions
+
+    # TODO: Control the decompositions.
+    fx_g = make_fx(
+        model,
+        decomposition_table=get_decompositions(
+            [
+                torch.ops.aten.embedding_dense_backward,
+                torch.ops.aten.native_layer_norm_backward,
+                torch.ops.aten.slice_backward,
+                torch.ops.aten.select_backward,
+                torch.ops.aten.norm.ScalarOpt_dim,
+                torch.ops.aten.native_group_norm,
+                torch.ops.aten.upsample_bilinear2d.vec,
+                torch.ops.aten.split.Tensor,
+                torch.ops.aten.split_with_sizes,
+                torch.ops.aten.native_layer_norm,
+            ]
+        ),
+    )(*inputs)
+
+    fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
+    fx_g.recompile()
+
+    def strip_overloads(gm):
+        """
+        Modifies the target of graph nodes in :attr:`gm` to strip overloads.
+        Args:
+            gm(fx.GraphModule): The input Fx graph module to be modified
+        """
+        for node in gm.graph.nodes:
+            if isinstance(node.target, torch._ops.OpOverload):
+                node.target = node.target.overloadpacket
+        gm.recompile()
+
+    strip_overloads(fx_g)
+
+    mlir_importer = SharkImporter(
+        fx_g,
+        inputs,
+        frontend="torch",
+    )
+
+    if debug:
+        (mlir_module, func_name), _, _ = mlir_importer.import_debug()
+        return mlir_module, func_name
+
+    mlir_module, func_name = mlir_importer.import_mlir()
+
+    return mlir_module, func_name
--- a/shark/shark_inference.py
+++ b/shark/shark_inference.py
@@ -97,6 +97,9 @@ class SharkInference:
            extra_args.append(
                f"--iree-hal-dump-executable-sources-to={self.dispatch_benchmarks_dir}"
            )
+            extra_args.append(
+                f"--iree-hal-dump-executable-binaries-to={self.dispatch_benchmarks_dir}"
+            )
            temp_dir = self.dispatch_benchmarks_dir.split("/")
            temp_dir[-1] = "temp_" + temp_dir[-1]
            temp_dir = "/".join(temp_dir)
@@ -135,8 +138,8 @@ class SharkInference:
            os.system(f"rm -rf {self.temp_dispatch_benchmarks_dir}")

    # inputs are considered to be tuple of np.array.
-    def forward(self, inputs: tuple):
-        return self.shark_runner.run(inputs)
+    def forward(self, inputs: tuple, send_to_host=True):
+        return self.shark_runner.run(inputs, send_to_host)

    # Captures the static input information from the mlir_module.
    # TODO(pashu123): Generate the input information for dynamic shapes.
@@ -193,11 +196,12 @@ class SharkInference:
        )

    # load and return the module.
-    def load_module(self, path):
+    def load_module(self, path, extra_args=[]):
        self.shark_runner = SharkRunner(
            function_name=self.function_name,
            device=self.device,
            compile_vmfb=False,
+            extra_args=extra_args,
        )
        (
            self.shark_runner.iree_compilation_module,
--- a/shark/shark_runner.py
+++ b/shark/shark_runner.py
@@ -75,7 +75,7 @@ class SharkRunner:
        self.extra_args = extra_args

        if check_device_drivers(self.device):
-            device_driver_info(self.device)
+            print(device_driver_info(self.device))
            sys.exit(1)

        if compile_vmfb == True:
@@ -91,10 +91,11 @@ class SharkRunner:
                extra_args=self.extra_args,
            )

-    def run(self, inputs: tuple):
+    def run(self, inputs: tuple, send_to_host=False):
        return get_results(
            self.iree_compilation_module,
            inputs,
            self.iree_config,
            self.mlir_dialect,
+            send_to_host,
        )
--- a/shark/stress_test.py
+++ b/shark/stress_test.py
@@ -0,0 +1,315 @@
+# Copyright 2022 The Nod Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from iree.runtime import query_available_drivers, get_driver
+from shark.shark_downloader import download_model
+from shark.shark_inference import SharkInference
+from typing import List, Optional, Tuple
+import numpy as np
+import argparse
+from shark.iree_utils._common import _IREE_DEVICE_MAP
+import multiprocessing
+from shark.shark_runner import supported_dialects
+import logging
+from concurrent.futures import ProcessPoolExecutor
+from concurrent.futures.thread import ThreadPoolExecutor
+import time
+import numpy as np
+
+IREE_TO_SHARK_DRIVER_MAP = {v: k for k, v in _IREE_DEVICE_MAP.items()}
+
+
+def stress_test_compiled_model(
+    shark_module_path: str,
+    function_name: str,
+    device: str,
+    inputs: List[np.ndarray],
+    golden_out: List[np.ndarray],
+    batch_size: int,
+    max_iterations: int,
+    max_duration_seconds: float,
+    inference_timeout_seconds: float,
+    tolerance_nulp: int,
+    stress_test_index: int,
+):
+    logging.info(
+        f"Running stress test {stress_test_index} on device {device}."
+    )
+    # All interactions with the module must run in a single thread.
+    # We are using execution in a sperate thread in order to be able
+    # to wait with a timeout on the inference operation.
+    module_executor = ThreadPoolExecutor(1)
+    shark_module = module_executor.submit(
+        SharkInference,
+        mlir_module=bytes(),
+        function_name=function_name,
+        device=device,
+    ).result()
+    module_executor.submit(
+        shark_module.load_module, shark_module_path
+    ).result()
+    input_batches = [np.repeat(arr, batch_size, axis=0) for arr in inputs]
+    golden_output_batches = np.repeat(golden_out, batch_size, axis=0)
+    report_interval_seconds = 10
+    start_time = time.time()
+    previous_report_time = start_time
+    first_iteration_output = None
+    for i in range(max_iterations):
+        output = module_executor.submit(
+            shark_module.forward, input_batches
+        ).result(inference_timeout_seconds)
+        if first_iteration_output is None:
+            np.testing.assert_array_almost_equal_nulp(
+                golden_output_batches, output, nulp=tolerance_nulp
+            )
+            first_iteration_output = output
+        else:
+            np.testing.assert_array_equal(output, first_iteration_output)
+        current_time = time.time()
+        if report_interval_seconds < current_time - previous_report_time:
+            logging.info(
+                f"Stress test {stress_test_index} on device "
+                f"{device} at iteration {i+1}"
+            )
+            previous_report_time = current_time
+        if max_duration_seconds < current_time - start_time:
+            return
+    logging.info(f"Stress test {stress_test_index} on device {device} done.")
+
+
+def get_device_type(device_name: str):
+    return device_name.split("://", 1)[0]
+
+
+def get_device_types(device_names: str):
+    return [get_device_type(device_name) for device_name in device_names]
+
+
+def query_devices(device_types: Optional[List[str]] = None) -> List[str]:
+    devices = []
+    if device_types is None:
+        device_types = [
+            IREE_TO_SHARK_DRIVER_MAP[name]
+            for name in query_available_drivers()
+            if name in IREE_TO_SHARK_DRIVER_MAP
+        ]
+    for device_type in device_types:
+        driver = get_driver(_IREE_DEVICE_MAP[device_type])
+        device_infos = driver.query_available_devices()
+        for device_info in device_infos:
+            uri_path = (
+                device_info["path"]
+                if device_info["path"] != ""
+                else str(device_info["device_id"])
+            )
+            device_uri = f"{device_type}://{uri_path}"
+            devices.append(device_uri)
+    return devices
+
+
+def compile_stress_test_module(
+    device_types: List[str], mlir_model: str, func_name: str, mlir_dialect: str
+) -> List[str]:
+    shark_module_paths = []
+    for device_type in device_types:
+        logging.info(
+            f"Compiling stress test model for device type {device_type}."
+        )
+        shark_module = SharkInference(
+            mlir_model,
+            func_name,
+            mlir_dialect=mlir_dialect,
+            device=device_type,
+        )
+        shark_module_paths.append(shark_module.save_module())
+    return shark_module_paths
+
+
+def stress_test(
+    model_name: str,
+    dynamic_model: bool = False,
+    device_types: Optional[List[str]] = None,
+    device_names: Optional[List[str]] = None,
+    batch_size: int = 1,
+    max_iterations: int = 10**7,
+    max_duration_seconds: float = 3600,
+    inference_timeout_seconds: float = 60,
+    mlir_dialect: str = "linalg",
+    frontend: str = "torch",
+    oversubscription_factor: int = 1,
+    tolerance_nulp: int = 50000,
+):
+    logging.info(f"Downloading stress test model {model_name}.")
+    mlir_model, func_name, inputs, golden_out = download_model(
+        model_name=model_name, dynamic=dynamic_model, frontend=frontend
+    )
+
+    if device_names is None or device_types is not None:
+        device_names = [] if device_names is None else device_names
+        with ProcessPoolExecutor() as executor:
+            # query_devices needs to run in a separate process,
+            # because it will interfere with other processes that are forked later.
+            device_names.extend(
+                executor.submit(query_devices, device_types).result()
+            )
+
+    device_types_set = list(set(get_device_types(device_names)))
+    with ProcessPoolExecutor() as executor:
+        # This needs to run in a subprocess because when compiling for CUDA,
+        # some stuff get intialized and cuInit will fail in a forked process
+        # later. It should be just compiling, but alas.
+        shark_module_paths_set = executor.submit(
+            compile_stress_test_module,
+            device_types_set,
+            mlir_model,
+            func_name,
+            mlir_dialect,
+        ).result()
+    device_type_shark_module_path_map = {
+        device_type: module_path
+        for device_type, module_path in zip(
+            device_types_set, shark_module_paths_set
+        )
+    }
+    device_name_shark_module_path_map = {
+        device_name: device_type_shark_module_path_map[
+            get_device_type(device_name)
+        ]
+        for device_name in device_names
+    }
+
+    # This needs to run in a spearate process, because it uses the drvier chache
+    # in IREE and a subsequent call to `iree.runtime.SystemContext.add_vm_module`
+    # in a forked process will hang.
+    with multiprocessing.Pool(
+        len(device_name_shark_module_path_map) * oversubscription_factor
+    ) as process_pool:
+        process_pool.starmap(
+            stress_test_compiled_model,
+            [
+                (
+                    module_path,
+                    func_name,
+                    device_name,
+                    inputs,
+                    golden_out,
+                    batch_size,
+                    max_iterations,
+                    max_duration_seconds,
+                    inference_timeout_seconds,
+                    tolerance_nulp,
+                    stress_test_index,
+                )
+                for stress_test_index, (device_name, module_path) in enumerate(
+                    list(device_name_shark_module_path_map.items())
+                    * oversubscription_factor
+                )
+            ],
+        )
+
+
+if __name__ == "__main__":
+    logging.basicConfig(encoding="utf-8", level=logging.INFO)
+    parser = argparse.ArgumentParser(
+        description="Downloads, compiles and runs a model from the tank to stress test the system."
+    )
+    parser.add_argument(
+        "--model", type=str, help="Model name in the tank.", default="alexnet"
+    )
+    parser.add_argument(
+        "--dynamic",
+        help="Use dynamic version of the model.",
+        action="store_true",
+        default=False,
+    )
+    parser.add_argument(
+        "--frontend", type=str, help="Frontend of the model.", default="torch"
+    )
+    parser.add_argument(
+        "--mlir-dialect",
+        type=str,
+        help="MLIR dialect of the model.",
+        default="linalg",
+        choices=supported_dialects,
+    )
+    parser.add_argument(
+        "--device-types",
+        type=str,
+        nargs="*",
+        choices=_IREE_DEVICE_MAP.keys(),
+        help="Runs the stress test on all devices with that type. "
+        "If absent and no deveices are specified "
+        "will run against all available devices.",
+    )
+    parser.add_argument(
+        "--devices",
+        type=str,
+        nargs="*",
+        help="List of devices to run the stress test on. "
+        "If device-types is specified will run against the union of the two.",
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        help="Number of inputs to feed into the model",
+        default=1,
+    )
+    parser.add_argument(
+        "--oversubscription",
+        type=int,
+        help="Oversubscrption factor. Each device will execute the model simultaneously "
+        "this many number of times.",
+        default=1,
+    )
+    parser.add_argument(
+        "--max-iterations",
+        type=int,
+        help="Maximum number of iterations to run the stress test per device.",
+        default=10**7,
+    )
+    parser.add_argument(
+        "--max-duration",
+        type=float,
+        help="Maximum number of seconds to run the stress test.",
+        default=3600,
+    )
+    parser.add_argument(
+        "--inference-timeout",
+        type=float,
+        help="Timeout in seconds for a single model inference operation.",
+        default=60,
+    )
+    parser.add_argument(
+        "--tolerance-nulp",
+        type=int,
+        help="The maximum number of unit in the last place for tolerance "
+        "when verifing results with the golden reference output.",
+        default=50000,
+    )
+
+    args = parser.parse_known_args()[0]
+    stress_test(
+        model_name=args.model,
+        dynamic_model=args.dynamic,
+        frontend=args.frontend,
+        mlir_dialect=args.mlir_dialect,
+        device_types=args.device_types,
+        device_names=args.devices,
+        batch_size=args.batch_size,
+        oversubscription_factor=args.oversubscription,
+        max_iterations=args.max_iterations,
+        max_duration_seconds=args.max_duration,
+        inference_timeout_seconds=args.inference_timeout,
+        tolerance_nulp=args.tolerance_nulp,
+    )
--- a/shark/tests/test_stress_test.py
+++ b/shark/tests/test_stress_test.py
@@ -0,0 +1,31 @@
+# Copyright 2022 The Nod Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import subprocess
+import sys
+import importlib.util
+
+
+def test_stress_test():
+    subprocess.check_call(
+        [
+            sys.executable,
+            importlib.util.find_spec("shark.stress_test").origin,
+            "--model=squeezenet1_0",
+            "--devices",
+            "cpu",
+            "--max-iterations=1",
+        ]
+    )
--- a/shark/torch_mlir_utils.py
+++ b/shark/torch_mlir_utils.py
@@ -56,9 +56,8 @@ def get_torch_mlir_module(
    input: tuple,
    dynamic: bool,
    jit_trace: bool,
-    from_torchscript: bool = False,
 ):
-    """Get the MLIR's linalg-on-tensors module from torchscipt module."""
+    """Get the MLIR's linalg-on-tensors module from the torchscipt module."""
    ignore_traced_shapes = False
    if dynamic:
        input = create_dynamic_placeholders(input)
--- a/tank/README.md
+++ b/tank/README.md
@@ -193,10 +193,10 @@ cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | awk -F, '{print
 echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo

 # Benchmark canonical Resnet50 on CPU via pytest
-pytest --benchmark tank/test_models -k "resnet50 and tf_static_cpu"
+pytest --benchmark tank/test_models.py -k "resnet50 and tf_static_cpu"

 # Benchmark canonical MiniLM on CPU via pytest
-pytest --benchmark tank/test_models -k "MiniLM and cpu"
+pytest --benchmark tank/test_models.py -k "MiniLM and cpu"

 # Benchmark MiniLM on CPU via transformer-benchmarks:
 git clone --recursive https://github.com/nod-ai/transformer-benchmarks.git
--- a/tank/all_models.csv
+++ b/tank/all_models.csv
@@ -1,34 +1,34 @@
-resnet50,mhlo,tf,1e-02,1e-3,default
-albert-base-v2,mhlo,tf,1e-02,1e-3,default
-roberta-base,mhlo,tf,1e-02,1e-3,default
-bert-base-uncased,mhlo,tf,1e-2,1e-3,default
-camembert-base,mhlo,tf,1e-2,1e-3,default
-dbmdz/convbert-base-turkish-cased,mhlo,tf,1e-2,1e-3,default
-distilbert-base-uncased,mhlo,tf,1e-2,1e-3,default
-facebook/convnext-tiny-224,mhlo,tf,1e-2,1e-3,tf_vit
-funnel-transformer/small,mhlo,tf,1e-2,1e-3,default
-google/electra-small-discriminator,mhlo,tf,1e-2,1e-3,default
-google/mobilebert-uncased,mhlo,tf,1e-2,1e-3,default
-google/vit-base-patch16-224,mhlo,tf,1e-2,1e-3,tf_vit
-hf-internal-testing/tiny-random-flaubert,mhlo,tf,1e-2,1e-3,default
-microsoft/MiniLM-L12-H384-uncased,mhlo,tf,1e-2,1e-3,tf_hf
-microsoft/layoutlm-base-uncased,mhlo,tf,1e-2,1e-3,default
-microsoft/mpnet-base,mhlo,tf,1e-2,1e-3,default
-albert-base-v2,linalg,torch,1e-2,1e-3,default
-alexnet,linalg,torch,1e-2,1e-3,default
-bert-base-cased,linalg,torch,1e-2,1e-3,default
-bert-base-uncased,linalg,torch,1e-2,1e-3,default
-distilbert-base-uncased,linalg,torch,1e-2,1e-3,default
-facebook/deit-small-distilled-patch16-224,linalg,torch,1e-2,1e-3,default
-google/vit-base-patch16-224,linalg,torch,1e-2,1e-3,default
-microsoft/beit-base-patch16-224-pt22k-ft22k,linalg,torch,1e-2,1e-3,default
-microsoft/MiniLM-L12-H384-uncased,linalg,torch,1e-2,1e-3,default
-microsoft/resnet-50,linalg,torch,1e-2,1e-3,default
-google/mobilebert-uncased,linalg,torch,1e-2,1e-3,default
-mobilenet_v3_small,linalg,torch,1e-2,1e-3,default
-nvidia/mit-b0,linalg,torch,1e-2,1e-3,default
-resnet101,linalg,torch,1e-2,1e-3,default
-resnet18,linalg,torch,1e-2,1e-3,default
-resnet50,linalg,torch,1e-2,1e-3,default
-squeezenet1_0,linalg,torch,1e-2,1e-3,default
-wide_resnet50_2,linalg,torch,1e-2,1e-3,default
+resnet50,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error: mostly conv"
+albert-base-v2,mhlo,tf,1e-2,1e-2,default,None,False,False,False,""
+roberta-base,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,False,""
+bert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
+camembert-base,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
+dbmdz/convbert-base-turkish-cased,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,True,True,True,"https://github.com/iree-org/iree/issues/9971"
+distilbert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
+facebook/convnext-tiny-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,True,True,True,"https://github.com/nod-ai/SHARK/issues/311 & https://github.com/nod-ai/SHARK/issues/342"
+funnel-transformer/small,mhlo,tf,1e-2,1e-3,default,None,True,True,True,"https://github.com/nod-ai/SHARK/issues/201"
+google/electra-small-discriminator,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
+google/mobilebert-uncased,mhlo,tf,1e-2,1e-3,default,None,True,False,False,"Fails during iree-compile."
+google/vit-base-patch16-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
+microsoft/MiniLM-L12-H384-uncased,mhlo,tf,1e-2,1e-3,tf_hf,None,True,False,False,"Fails during iree-compile."
+microsoft/layoutlm-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,""
+microsoft/mpnet-base,mhlo,tf,1e-2,1e-2,default,None,False,False,False,""
+albert-base-v2,linalg,torch,1e-2,1e-3,default,None,True,True,True,"issue with aten.tanh in torch-mlir"
+alexnet,linalg,torch,1e-2,1e-3,default,None,False,False,True,"Assertion Error: Zeros Output"
+bert-base-cased,linalg,torch,1e-2,1e-3,default,None,False,False,False,""
+bert-base-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,""
+facebook/deit-small-distilled-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"Fails during iree-compile."
+google/vit-base-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/311"
+microsoft/beit-base-patch16-224-pt22k-ft22k,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/390"
+microsoft/MiniLM-L12-H384-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,True,""
+microsoft/resnet-50,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
+google/mobilebert-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,True,"https://github.com/nod-ai/SHARK/issues/344"
+mobilenet_v3_small,linalg,torch,1e-1,1e-2,default,nhcw-nhwc,False,True,True,"https://github.com/nod-ai/SHARK/issues/388"
+nvidia/mit-b0,linalg,torch,1e-2,1e-3,default,None,True,True,True,"https://github.com/nod-ai/SHARK/issues/343"
+resnet101,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
+resnet18,linalg,torch,1e-2,1e-3,default,None,True,True,True,""
+resnet50,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
+squeezenet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"https://github.com/nod-ai/SHARK/issues/388"
+wide_resnet50_2,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"Vulkan Numerical Error (mostly conv)"
+efficientnet-v2-s,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,True,"https://github.com/nod-ai/SHARK/issues/575"
+mnasnet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,True,"https://github.com/nod-ai/SHARK/issues/388"
--- a/tank/examples/MiniLM_tf/huggingface_MiniLM_gen.py
+++ b/tank/examples/MiniLM_tf/huggingface_MiniLM_gen.py
@@ -32,7 +32,7 @@ class BertModule(tf.Module):
            input_ids=x, attention_mask=y, token_type_ids=z, training=False
        )

-    @tf.function(input_signature=bert_input)
+    @tf.function(input_signature=bert_input, jit_compile=True)
    def predict(self, input_word_ids, input_mask, segment_ids):
        return self.m.predict(input_word_ids, input_mask, segment_ids)

--- a/tank/examples/MiniLM_tf/huggingface_MiniLM_run.py
+++ b/tank/examples/MiniLM_tf/huggingface_MiniLM_run.py
@@ -33,7 +33,7 @@ class BertModule(tf.Module):
            input_ids=x, attention_mask=y, token_type_ids=z, training=False
        )

-    @tf.function(input_signature=bert_input)
+    @tf.function(input_signature=bert_input, jit_compile=True)
    def predict(self, input_ids, attention_mask, token_type_ids):
        return self.m.predict(input_ids, attention_mask, token_type_ids)

--- a/tank/examples/MiniLM_tf/seq_classification.py
+++ b/tank/examples/MiniLM_tf/seq_classification.py
@@ -52,7 +52,7 @@ class SeqClassification(tf.Module):
        )
        self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)[0]

-    @tf.function(input_signature=inputs_signature)
+    @tf.function(input_signature=inputs_signature, jit_compile=True)
    def forward(self, input_ids, attention_mask):
        return tf.math.softmax(
            self.m.predict(input_ids, attention_mask), axis=-1
--- a/tank/examples/bert-base-uncased_tosa_torch/bert_base_uncased_tosa.py
+++ b/tank/examples/bert-base-uncased_tosa_torch/bert_base_uncased_tosa.py
@@ -1,8 +1,9 @@
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_torch_model
+from shark.shark_downloader import download_model

-mlir_model, func_name, inputs, golden_out = download_torch_model(
-    "bert-base-uncased_tosa"
+mlir_model, func_name, inputs, golden_out = download_model(
+    "bert-base-uncased_tosa",
+    frontend="torch",
 )

 shark_module = SharkInference(
--- a/tank/examples/bert_fine_tuning/bert_fine_tune_tf.py
+++ b/tank/examples/bert_fine_tuning/bert_fine_tune_tf.py
@@ -72,7 +72,8 @@ class BertModule(tf.Module):
        input_signature=[
            bert_input,  # inputs
            tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32),  # labels
-        ]
+        ],
+        jit_compile=True,
    )
    def learn(self, inputs, labels):
        with tf.GradientTape() as tape:
--- a/tank/examples/bert_tf/bert_large_gen.py
+++ b/tank/examples/bert_tf/bert_large_gen.py
@@ -60,7 +60,8 @@ class BertModule(tf.Module):
                shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32
            ),  # input2: segment_ids
            tf.TensorSpec([BATCH_SIZE], tf.int32),  # input3: labels
-        ]
+        ],
+        jit_compile=True,
    )
    def learn(self, input_word_ids, input_mask, segment_ids, labels):
        with tf.GradientTape() as tape:
@@ -75,7 +76,7 @@ class BertModule(tf.Module):
        self.optimizer.apply_gradients(zip(gradients, variables))
        return loss

-    @tf.function(input_signature=bert_input)
+    @tf.function(input_signature=bert_input, jit_compile=True)
    def predict(self, input_word_ids, input_mask, segment_ids):
        inputs = [input_word_ids, input_mask, segment_ids]
        return self.m.predict(inputs)
--- a/tank/examples/bert_tf/bert_large_run.py
+++ b/tank/examples/bert_tf/bert_large_run.py
@@ -57,7 +57,8 @@ class BertModule(tf.Module):
        input_signature=[
            bert_input,  # inputs
            tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32),  # labels
-        ]
+        ],
+        jit_compile=True,
    )
    def learn(self, inputs, labels):
        with tf.GradientTape() as tape:
--- a/tank/examples/bert_tf/bert_large_tf.py
+++ b/tank/examples/bert_tf/bert_large_tf.py
@@ -50,7 +50,8 @@ class BertModule(tf.Module):
        input_signature=[
            bert_input,  # inputs
            tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32),  # labels
-        ]
+        ],
+        jit_compile=True,
    )
    def learn(self, inputs, labels):
        with tf.GradientTape() as tape:
--- a/tank/examples/bert_tf/bert_small_gen.py
+++ b/tank/examples/bert_tf/bert_small_gen.py
@@ -57,7 +57,8 @@ class BertModule(tf.Module):
                shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32
            ),  # input2: segment_ids
            tf.TensorSpec([BATCH_SIZE], tf.int32),  # input3: labels
-        ]
+        ],
+        jit_compile=True,
    )
    def learn(self, input_word_ids, input_mask, segment_ids, labels):
        with tf.GradientTape() as tape:
@@ -72,7 +73,7 @@ class BertModule(tf.Module):
        self.optimizer.apply_gradients(zip(gradients, variables))
        return loss

-    @tf.function(input_signature=bert_input)
+    @tf.function(input_signature=bert_input, jit_compile=True)
    def predict(self, input_word_ids, input_mask, segment_ids):
        inputs = [input_word_ids, input_mask, segment_ids]
        return self.m.predict(inputs)
--- a/tank/examples/bert_tf/bert_small_run.py
+++ b/tank/examples/bert_tf/bert_small_run.py
@@ -53,7 +53,8 @@ class BertModule(tf.Module):
        input_signature=[
            bert_input,  # inputs
            tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32),  # labels
-        ]
+        ],
+        jit_compile=True,
    )
    def learn(self, inputs, labels):
        with tf.GradientTape() as tape:
--- a/tank/examples/bert_tf/bert_small_tf_run.py
+++ b/tank/examples/bert_tf/bert_small_tf_run.py
@@ -46,7 +46,8 @@ class BertModule(tf.Module):
        input_signature=[
            bert_input,  # inputs
            tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32),  # labels
-        ]
+        ],
+        jit_compile=True,
    )
    def learn(self, inputs, labels):
        with tf.GradientTape() as tape:
--- a/tank/examples/bert_tf/seq_classification.py
+++ b/tank/examples/bert_tf/seq_classification.py
@@ -52,7 +52,7 @@ class SeqClassification(tf.Module):
        )
        self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)[0]

-    @tf.function(input_signature=inputs_signature)
+    @tf.function(input_signature=inputs_signature, jit_compile=True)
    def forward(self, input_ids, attention_mask):
        return tf.math.softmax(
            self.m.predict(input_ids, attention_mask), axis=-1
--- a/tank/examples/deberta-base_tf/deberta-base_tf_test.py
+++ b/tank/examples/deberta-base_tf/deberta-base_tf_test.py
@@ -1,6 +1,6 @@
 from shark.iree_utils._common import check_device_drivers, device_driver_info
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_tf_model
+from shark.shark_downloader import download_model
 from shark.parser import shark_args
 from tank.test_utils import get_valid_test_params, shark_test_name_func
 from parameterized import parameterized
@@ -21,8 +21,8 @@ class DebertaBaseModuleTester:
        self.benchmark = benchmark

    def create_and_check_module(self, dynamic, device):
-        model, func_name, inputs, golden_out = download_tf_model(
-            "microsoft/deberta-base"
+        model, func_name, inputs, golden_out = download_model(
+            "microsoft/deberta-base", frontend="tf"
        )

        shark_module = SharkInference(
--- a/tank/examples/gpt2-64/gpt2-64_tflite_test.py
+++ b/tank/examples/gpt2-64/gpt2-64_tflite_test.py
@@ -1,5 +1,5 @@
 import numpy as np
-from shark.shark_downloader import download_tflite_model
+from shark.shark_downloader import download_model
 from shark.shark_inference import SharkInference
 import pytest
 import unittest
@@ -58,8 +58,8 @@ class GptTfliteModuleTester:
        shark_args.save_vmfb = self.save_vmfb

        # Preprocess to get SharkImporter input args
-        mlir_model, func_name, inputs, tflite_results = download_tflite_model(
-            model_name="gpt2-64"
+        mlir_model, func_name, inputs, tflite_results = download_model(
+            model_name="gpt2-64", backend="tflite"
        )
        shark_module = SharkInference(
            mlir_module=mlir_model,
--- a/tank/examples/opt/opt_torch_test.py
+++ b/tank/examples/opt/opt_torch_test.py
@@ -20,10 +20,6 @@ class OPTModuleTester:
        self.benchmark = benchmark

    def create_and_check_module(self, dynamic, device, model_name):
-        # model_mlir, func_name, input, act_out = download_torch_model(
-        #     "opt", dynamic
-        # )
-
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        # config = OPTConfig()
        # opt_model = OPTModel(config)
--- a/tank/examples/rembert_tf/rembert_tf_test.py
+++ b/tank/examples/rembert_tf/rembert_tf_test.py
@@ -1,6 +1,6 @@
 from shark.iree_utils._common import check_device_drivers, device_driver_info
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_tf_model
+from shark.shark_downloader import download_model
 from tank.test_utils import get_valid_test_params, shark_test_name_func
 from parameterized import parameterized

@@ -18,8 +18,8 @@ class RemBertModuleTester:
        self.benchmark = benchmark

    def create_and_check_module(self, dynamic, device):
-        model, func_name, inputs, golden_out = download_tf_model(
-            "google/rembert"
+        model, func_name, inputs, golden_out = download_model(
+            "google/rembert", frontend="tf"
        )

        shark_module = SharkInference(
--- a/tank/examples/tapas-base_tf/tapas-base_tf_test.py
+++ b/tank/examples/tapas-base_tf/tapas-base_tf_test.py
@@ -1,6 +1,6 @@
 from shark.iree_utils._common import check_device_drivers, device_driver_info
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_tf_model
+from shark.shark_downloader import download_model

 import iree.compiler as ireec
 import unittest
@@ -16,8 +16,9 @@ class TapasBaseModuleTester:
        self.benchmark = benchmark

    def create_and_check_module(self, dynamic, device):
-        model, func_name, inputs, golden_out = download_tf_model(
-            "google/tapas-base"
+        model, func_name, inputs, golden_out = download_model(
+            "google/tapas-base",
+            frontend="tf",
        )

        shark_module = SharkInference(
--- a/tank/examples/v_diffusion_pytorch/cfg_sample_from_mlir.py
+++ b/tank/examples/v_diffusion_pytorch/cfg_sample_from_mlir.py
@@ -15,7 +15,7 @@ from torchvision.transforms import functional as TF
 from tqdm import trange

 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_torch_model
+from shark.shark_downloader import download_model
 import numpy as np

 import sys
@@ -191,7 +191,9 @@ x_in = x[0:min_batch_size, :, :, :]
 ts = x_in.new_ones([x_in.shape[0]])
 t_in = t[0] * ts

-mlir_model, func_name, inputs, golden_out = download_torch_model("v_diffusion")
+mlir_model, func_name, inputs, golden_out = download_model(
+    "v_diffusion", frontend="torch"
+)

 shark_module = SharkInference(
    mlir_model, func_name, device=args.runtime_device, mlir_dialect="linalg"
--- a/tank/model_metadata.csv
+++ b/tank/model_metadata.csv
@@ -27,3 +27,5 @@ microsoft/mpnet-base,False,False,-,-,-
 roberta-base,False,False,-,-,-
 xlm-roberta-base,False,False,-,-,-
 facebook/convnext-tiny-224,False,False,-,-,-
+efficientnet-v2-s,False,False,22M,"image-classification,cnn","Includes MBConv and Fused-MBConv"
+mnasnet1_0,False,True,-,"cnn, torchvision, mobile, architecture-search","Outperforms other mobile CNNs on Accuracy vs. Latency"
--- a/tank/model_utils.py
+++ b/tank/model_utils.py
@@ -15,6 +15,7 @@ vision_models = [
    "squeezenet1_0",
    "wide_resnet50_2",
    "mobilenet_v3_small",
+    "mnasnet1_0",
 ]
 hf_img_cls_models = [
    "google/vit-base-patch16-224",
@@ -142,13 +143,14 @@ def get_vision_model(torch_model):
    import torchvision.models as models

    vision_models_dict = {
-        "alexnet": models.alexnet(pretrained=True),
-        "resnet18": models.resnet18(pretrained=True),
-        "resnet50": models.resnet50(pretrained=True),
-        "resnet101": models.resnet101(pretrained=True),
-        "squeezenet1_0": models.squeezenet1_0(pretrained=True),
-        "wide_resnet50_2": models.wide_resnet50_2(pretrained=True),
-        "mobilenet_v3_small": models.mobilenet_v3_small(pretrained=True),
+        "alexnet": models.alexnet(weights="DEFAULT"),
+        "resnet18": models.resnet18(weights="DEFAULT"),
+        "resnet50": models.resnet50(weights="DEFAULT"),
+        "resnet101": models.resnet101(weights="DEFAULT"),
+        "squeezenet1_0": models.squeezenet1_0(weights="DEFAULT"),
+        "wide_resnet50_2": models.wide_resnet50_2(weights="DEFAULT"),
+        "mobilenet_v3_small": models.mobilenet_v3_small(weights="DEFAULT"),
+        "mnasnet1_0": models.mnasnet1_0(weights="DEFAULT"),
    }
    if isinstance(torch_model, str):
        torch_model = vision_models_dict[torch_model]
@@ -160,6 +162,8 @@ def get_vision_model(torch_model):

 ################################################################################

+####################### Other PyTorch HF Models ###############################
+
 # Utility function for comparing two tensors (torch).
 def compare_tensors(torch_tensor, numpy_tensor, rtol=1e-02, atol=1e-03):
    # torch_to_numpy = torch_tensor.detach().numpy()
--- a/tank/model_utils_tf.py
+++ b/tank/model_utils_tf.py
@@ -6,24 +6,12 @@ from transformers import (
    TFBertModel,
 )

-visible_default = tf.config.list_physical_devices("GPU")
-try:
-    tf.config.set_visible_devices([], "GPU")
-    visible_devices = tf.config.get_visible_devices()
-    for device in visible_devices:
-        assert device.device_type != "GPU"
-except:
-    # Invalid device or cannot modify virtual devices once initialized.
-    pass
-
 BATCH_SIZE = 1
 MAX_SEQUENCE_LENGTH = 128

 ################################## MHLO/TF models #########################################
 # TODO : Generate these lists or fetch model source from tank/tf/tf_model_list.csv
-keras_models = [
-    "resnet50",
-]
+keras_models = ["resnet50", "efficientnet-v2-s"]
 maskedlm_models = [
    "albert-base-v2",
    "bert-base-uncased",
@@ -87,7 +75,7 @@ class TFHuggingFaceLanguage(tf.Module):
            input_ids=x, attention_mask=y, token_type_ids=z, training=False
        )

-    @tf.function(input_signature=tf_bert_input)
+    @tf.function(input_signature=tf_bert_input, jit_compile=True)
    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.m.predict(input_ids, attention_mask, token_type_ids)

@@ -162,7 +150,7 @@ class MaskedLM(tf.Module):
        )
        self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)[0]

-    @tf.function(input_signature=input_signature_maskedlm)
+    @tf.function(input_signature=input_signature_maskedlm, jit_compile=True)
    def forward(self, input_ids, attention_mask):
        return self.m.predict(input_ids, attention_mask)

@@ -178,42 +166,81 @@ def get_causal_lm_model(hf_name, text="Hello, this is the default text."):
 ##################### TensorFlow Keras Resnet Models #########################################################
 # Static shape, including batch size (1).
 # Can be dynamic once dynamic shape support is ready.
-INPUT_SHAPE = [1, 224, 224, 3]
-
-tf_model = tf.keras.applications.resnet50.ResNet50(
-    weights="imagenet", include_top=True, input_shape=tuple(INPUT_SHAPE[1:])
-)
+RESNET_INPUT_SHAPE = [1, 224, 224, 3]
+EFFICIENTNET_INPUT_SHAPE = [1, 384, 384, 3]


 class ResNetModule(tf.Module):
    def __init__(self):
        super(ResNetModule, self).__init__()
-        self.m = tf_model
+        self.m = tf.keras.applications.resnet50.ResNet50(
+            weights="imagenet",
+            include_top=True,
+            input_shape=tuple(RESNET_INPUT_SHAPE[1:]),
+        )
        self.m.predict = lambda x: self.m.call(x, training=False)

-    @tf.function(input_signature=[tf.TensorSpec(INPUT_SHAPE, tf.float32)])
+    @tf.function(
+        input_signature=[tf.TensorSpec(RESNET_INPUT_SHAPE, tf.float32)],
+        jit_compile=True,
+    )
    def forward(self, inputs):
        return self.m.predict(inputs)

+    def input_shape(self):
+        return RESNET_INPUT_SHAPE

-def load_image(path_to_image):
+    def preprocess_input(self, image):
+        return tf.keras.applications.resnet50.preprocess_input(image)
+
+
+class EfficientNetModule(tf.Module):
+    def __init__(self):
+        super(EfficientNetModule, self).__init__()
+        self.m = tf.keras.applications.efficientnet_v2.EfficientNetV2S(
+            weights="imagenet",
+            include_top=True,
+            input_shape=tuple(EFFICIENTNET_INPUT_SHAPE[1:]),
+        )
+        self.m.predict = lambda x: self.m.call(x, training=False)
+
+    @tf.function(
+        input_signature=[tf.TensorSpec(EFFICIENTNET_INPUT_SHAPE, tf.float32)],
+        jit_compile=True,
+    )
+    def forward(self, inputs):
+        return self.m.predict(inputs)
+
+    def input_shape(self):
+        return EFFICIENTNET_INPUT_SHAPE
+
+    def preprocess_input(self, image):
+        return tf.keras.applications.efficientnet_v2.preprocess_input(image)
+
+
+def load_image(path_to_image, width, height, channels):
    image = tf.io.read_file(path_to_image)
-    image = tf.image.decode_image(image, channels=3)
-    image = tf.image.resize(image, (224, 224))
+    image = tf.image.decode_image(image, channels=channels)
+    image = tf.image.resize(image, (width, height))
    image = image[tf.newaxis, :]
    return image


 def get_keras_model(modelname):
-    model = ResNetModule()
+    if modelname == "efficientnet-v2-s":
+        model = EfficientNetModule()
+    else:
+        model = ResNetModule()
+
    content_path = tf.keras.utils.get_file(
        "YellowLabradorLooking_new.jpg",
        "https://storage.googleapis.com/download.tensorflow.org/example_images/YellowLabradorLooking_new.jpg",
    )
-    content_image = load_image(content_path)
-    input_tensor = tf.keras.applications.resnet50.preprocess_input(
-        content_image
+    input_shape = model.input_shape()
+    content_image = load_image(
+        content_path, input_shape[1], input_shape[2], input_shape[3]
    )
+    input_tensor = model.preprocess_input(content_image)
    input_data = tf.expand_dims(input_tensor, 0)
    actual_out = model.forward(*input_data)
    return model, input_data, actual_out
@@ -240,7 +267,7 @@ class AutoModelImageClassfication(tf.Module):
        )
        self.m.predict = lambda x: self.m(x)

-    @tf.function(input_signature=input_signature_img_cls)
+    @tf.function(input_signature=input_signature_img_cls, jit_compile=True)
    def forward(self, inputs):
        return self.m.predict(inputs)

--- a/tank/test_models.py
+++ b/tank/test_models.py
@@ -1,15 +1,11 @@
 from shark.iree_utils._common import (
    check_device_drivers,
    device_driver_info,
-    IREE_DEVICE_MAP,
+    get_supported_device_list,
 )
 from shark.iree_utils.vulkan_utils import get_vulkan_triple_flag
 from parameterized import parameterized
-from shark.shark_downloader import (
-    download_tf_model,
-    download_torch_model,
-    download_tflite_model,
-)
+from shark.shark_downloader import download_model
 from shark.shark_inference import SharkInference
 from shark.parser import shark_args
 import iree.compiler as ireec
@@ -20,6 +16,7 @@ import csv
 import tempfile
 import os
 import shutil
+import multiprocessing


 def load_csv_and_convert(filename, gen=False):
@@ -41,6 +38,11 @@ def load_csv_and_convert(filename, gen=False):
                    "rtol": float(row[3]),
                    "atol": float(row[4]),
                    "out_type": row[5],
+                    "flags": row[6],
+                    "xfail_cpu": row[7],
+                    "xfail_cuda": row[8],
+                    "xfail_vkm": row[9],
+                    "xfail_reason": row[10],
                }
            )
    # This is a pytest workaround
@@ -59,7 +61,7 @@ def get_valid_test_params():
    """
    device_list = [
        device
-        for device in IREE_DEVICE_MAP.keys()
+        for device in get_supported_device_list()
        if not check_device_drivers(device)
    ]
    dynamic_list = (True, False)
@@ -130,25 +132,19 @@ class SharkModuleTester:
        self.config = config

    def create_and_check_module(self, dynamic, device):
+
        shark_args.local_tank_cache = self.local_tank_cache
        shark_args.update_tank = self.update_tank
-        if self.config["framework"] == "tf":
-            model, func_name, inputs, golden_out = download_tf_model(
-                self.config["model_name"],
-                tank_url=self.tank_url,
-            )
-        elif self.config["framework"] == "torch":
-            model, func_name, inputs, golden_out = download_torch_model(
-                self.config["model_name"],
-                tank_url=self.tank_url,
-            )
-        elif self.config["framework"] == "tflite":
-            model, func_name, inputs, golden_out = download_tflite_model(
-                model_name=self.config["model_name"],
-                tank_url=self.tank_url,
-            )
-        else:
-            model, func_name, inputs, golden_out = None, None, None, None
+        if "nhcw-nhwc" in self.config["flags"] and not os.path.isfile(
+            ".use-iree"
+        ):
+            shark_args.enable_conv_transform = True
+
+        model, func_name, inputs, golden_out = download_model(
+            self.config["model_name"],
+            tank_url=self.tank_url,
+            frontend=self.config["framework"],
+        )

        shark_module = SharkInference(
            model,
@@ -176,16 +172,32 @@ class SharkModuleTester:
                rtol=self.config["rtol"],
                atol=self.config["atol"],
            )
-        except AssertionError:
+        except AssertionError as msg:
            if any([self.ci, self.save_repro, self.save_fails]) == True:
                self.save_reproducers()
            if self.ci == True:
                self.upload_repro()
            if self.benchmark == True:
+                # p = multiprocessing.Process(
+                #    target=self.benchmark_module,
+                #    args=(shark_module, inputs, dynamic, device),
+                # )
+                # p.start()
+                # p.join()
                self.benchmark_module(shark_module, inputs, dynamic, device)
-            raise
+                print(msg)
+                pytest.xfail(reason="Numerics Issue")

        if self.benchmark == True:
+            # We must create a new process each time we benchmark a model to allow
+            # for Tensorflow to release GPU resources. Using the same process to
+            # benchmark multiple models leads to OOM.
+            # p = multiprocessing.Process(
+            #    target=self.benchmark_module,
+            #    args=(shark_module, inputs, dynamic, device),
+            # )
+            # p.start()
+            # p.join()
            self.benchmark_module(shark_module, inputs, dynamic, device)

        if self.save_repro == True:
@@ -271,21 +283,17 @@ class SharkModuleTest(unittest.TestCase):
            "update_tank"
        )
        self.module_tester.tank_url = self.pytestconfig.getoption("tank_url")
-        if (
-            config["model_name"] == "distilbert-base-uncased"
-            and config["framework"] == "torch"
-        ):
-            pytest.xfail(reason="https://github.com/nod-ai/SHARK/issues/354")
-        if (
-            config["model_name"] == "facebook/convnext-tiny-224"
-            and device == "cuda"
-        ):
-            pytest.xfail(reason="https://github.com/nod-ai/SHARK/issues/311")
-        if (
-            config["model_name"] == "google/vit-base-patch16-224"
-            and device == "cuda"
-        ):
-            pytest.xfail(reason="https://github.com/nod-ai/SHARK/issues/311")
+
+        if config["xfail_cpu"] == "True" and device == "cpu":
+            pytest.xfail(reason=config["xfail_reason"])
+
+        if config["xfail_cuda"] == "True" and device == "cuda":
+            pytest.xfail(reason=config["xfail_reason"])
+
+        if config["xfail_vkm"] == "True" and device in ["metal", "vulkan"]:
+            pytest.xfail(reason=config["xfail_reason"])
+
+        # Special cases that need to be marked.
        if config["model_name"] == "resnet50" and device in [
            "metal",
            "vulkan",
@@ -295,83 +303,6 @@ class SharkModuleTest(unittest.TestCase):
                    pytest.xfail(
                        reason="M2: Assert Error & M1: CompilerToolError"
                    )
-        if config["model_name"] == "google/rembert":
-            pytest.skip(reason="Model too large to convert.")
-        if config[
-            "model_name"
-        ] == "dbmdz/convbert-base-turkish-cased" and device in [
-            "metal",
-            "vulkan",
-        ]:
-            pytest.xfail(
-                reason="Issue: https://github.com/iree-org/iree/issues/9971"
-            )
-        if config["model_name"] == "facebook/convnext-tiny-224" and device in [
-            "cuda",
-            "metal",
-            "vulkan",
-        ]:
-            pytest.xfail(
-                reason="https://github.com/nod-ai/SHARK/issues/311, https://github.com/nod-ai/SHARK/issues/342"
-            )
-        if config["model_name"] == "funnel-transformer/small" and device in [
-            "cpu",
-            "cuda",
-            "metal",
-            "vulkan",
-        ]:
-            pytest.xfail(
-                reason="failing in the iree-compiler passes, see https://github.com/nod-ai/SHARK/issues/201"
-            )
-        if (
-            config["model_name"] == "google/vit-base-patch16-224"
-            and device == "cuda"
-        ):
-            pytest.xfail(reason="https://github.com/nod-ai/SHARK/issues/311")
-        if config["model_name"] == "microsoft/mpnet-base":
-            pytest.xfail(reason="https://github.com/nod-ai/SHARK/issues/203")
-        if config["model_name"] == "nvidia/mit-b0":
-            pytest.xfail(reason="https://github.com/nod-ai/SHARK/issues/343")
-        if (
-            config["model_name"] == "google/mobilebert-uncased"
-            and device in ["metal", "vulkan"]
-            and config["framework"] == "torch"
-        ):
-            pytest.xfail(
-                reason="Numerics issues -- https://github.com/nod-ai/SHARK/issues/344"
-            )
-        if (
-            config["model_name"] == "facebook/deit-small-distilled-patch16-224"
-            and device == "cuda"
-        ):
-            pytest.xfail(
-                reason="Fails during iree-compile without reporting diagnostics."
-            )
-        if (
-            config["model_name"]
-            == "microsoft/beit-base-patch16-224-pt22k-ft22k"
-            and device == "cuda"
-        ):
-            pytest.xfail(reason="https://github.com/nod-ai/SHARK/issues/390")
-        if config["model_name"] == "squeezenet1_0" and device in [
-            "cpu",
-            "metal",
-            "vulkan",
-        ]:
-            pytest.xfail(
-                reason="Numerics Issues: https://github.com/nod-ai/SHARK/issues/388"
-            )
-        if config["model_name"] == "mobilenet_v3_small" and device in [
-            "metal",
-            "vulkan",
-        ]:
-            pytest.xfail(
-                reason="Numerics Issues: https://github.com/nod-ai/SHARK/issues/388"
-            )
-        if config["model_name"] == "hf-internal-testing/tiny-random-flaubert":
-            pytest.xfail(reason="Transformers API mismatch")
-        if config["model_name"] == "alexnet" and device in ["metal", "vulkan"]:
-            pytest.xfail(reason="Assertion Error: Zeros Output")
        if (
            config["model_name"] == "camembert-base"
            and dynamic == False
@@ -388,25 +319,28 @@ class SharkModuleTest(unittest.TestCase):
            pytest.xfail(
                reason="chlo.broadcast_compare failed to satify constraint"
            )
-        if config["model_name"] in [
-            "microsoft/MiniLM-L12-H384-uncased",
-            "wide_resnet50_2",
-            "resnet50",
-            "resnet18",
-            "resnet101",
-            "microsoft/resnet-50",
-        ] and device in ["metal", "vulkan"]:
-            pytest.xfail(reason="Vulkan Numerical Error (mostly conv)")
-        if config["model_name"] == "mobilenet_v3_small" and device in [
-            "cuda",
-            "cpu",
-        ]:
-            pytest.xfail(reason="https://github.com/nod-ai/SHARK/issues/424")
-        if config["framework"] == "tf" and dynamic == True:
-            pytest.skip(
-                reason="Dynamic shapes not supported for this framework."
+        if (
+            config["model_name"]
+            in [
+                "facebook/convnext-tiny-224",
+                "squeezenet1_0",
+            ]
+            and device == "rocm"
+        ):
+            pytest.xfail(
+                reason="iree-compile buffer limit issue: https://github.com/nod-ai/SHARK/issues/475"
+            )
+        if (
+            config["model_name"]
+            in [
+                "funnel-transformer/small",
+                "mobilenet_v3_small",
+            ]
+            and device == "rocm"
+        ):
+            pytest.xfail(
+                reason="Numerics issues: https://github.com/nod-ai/SHARK/issues/476"
            )
-
        safe_name = (
            f"{config['model_name']}_{config['framework']}_{dynamic}_{device}"
        )
--- a/tank/tf_model_list.csv
+++ b/tank/tf_model_list.csv
@@ -17,3 +17,4 @@ funnel-transformer/small,hf
 microsoft/mpnet-base,hf
 facebook/convnext-tiny-224,img
 google/vit-base-patch16-224,img
+efficientnet-v2-s,keras
--- a/tank/torch_model_list.csv
+++ b/tank/torch_model_list.csv
@@ -1,6 +1,5 @@
 model_name, use_tracing, model_type, dynamic, param_count, tags, notes
 microsoft/MiniLM-L12-H384-uncased,True,hf,True,66M,"nlp;bert-variant;transformer-encoder","Large version has 12 layers; 384 hidden size; Smaller than BERTbase (66M params vs 109M params)"
-albert-base-v2,True,hf,True,11M,"nlp;bert-variant;transformer-encoder","12 layers; 128 embedding dim; 768 hidden dim; 12 attention heads; Smaller than BERTbase (11M params vs 109M params); Uses weight sharing to reduce # params but computational cost is similar to BERT."
 bert-base-uncased,True,hf,True,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
 bert-base-cased,True,hf,True,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
 google/mobilebert-uncased,True,hf,True,25M,"nlp,bert-variant,transformer-encoder,mobile","24 layers, 512 hidden size, 128 embedding"
@@ -16,3 +15,4 @@ microsoft/resnet-50,True,hf_img_cls,False,23M,"image-classification,cnn,residual
 facebook/deit-small-distilled-patch16-224,True,hf_img_cls,False,22M,"image-classification,vision-transformer,cnn",N/A
 microsoft/beit-base-patch16-224-pt22k-ft22k,True,hf_img_cls,False,86M,"image-classification,transformer-encoder,bert-variant,vision-transformer",N/A
 nvidia/mit-b0,True,hf_img_cls,False,3.7M,"image-classification,transformer-encoder",SegFormer
+mnasnet1_0,False,vision,True,-,"cnn, torchvision, mobile, architecture-search","Outperforms other mobile CNNs on Accuracy vs. Latency"
--- a/web/index.py
+++ b/web/index.py
@@ -1,23 +1,15 @@
-# from models.resnet50 import resnet_inf
-# from models.albert_maskfill import albert_maskfill_inf
-from models.stable_diffusion.main import stable_diff_inf
-
-# from models.diffusion.v_diffusion import vdiff_inf
-import gradio as gr
-from PIL import Image
-import json
 import os

+os.environ["AMD_ENABLE_LLPC"] = "1"
+import gradio as gr
+from PIL import Image
+from models.stable_diffusion.resources import resource_path, prompt_examples
+from models.stable_diffusion.main import stable_diff_inf
+from models.stable_diffusion.stable_args import args
+from models.stable_diffusion.utils import get_available_devices

-def debug_event(debug):
-    return gr.Textbox.update(visible=debug)
-
-
-prompt_examples = []
-prompt_loc = "./prompts.json"
-if os.path.exists(prompt_loc):
-    with open("./prompts.json", encoding="utf-8") as fopen:
-        prompt_examples = json.load(fopen)
+nodlogo_loc = resource_path("logos/nod-logo.png")
+sdlogo_loc = resource_path("logos/sd-demo-logo.png")


 demo_css = """
@@ -41,201 +33,150 @@ demo_css = """
 footer {display: none !important;}
 """

-with gr.Blocks(css=demo_css) as shark_web:
-    # load prompt examples.
+
+with gr.Blocks(title="Stable Diffusion", css=demo_css) as shark_web:

    with gr.Row(elem_id="ui_title"):
-        with gr.Column(scale=1, elem_id="demo_title_outer"):
-            logo2 = Image.open("./logos/sd-demo-logo.png")
-            gr.Image(
-                value=logo2,
-                show_label=False,
-                interactive=False,
-                elem_id="demo_title",
-            ).style(width=230)
-            # with gr.Column(scale=1):
-            #    gr.Label(value="Ultra fast Stable Diffusion")
+        nod_logo = Image.open(nodlogo_loc)
+        logo2 = Image.open(sdlogo_loc)
+        with gr.Row():
+            with gr.Column(scale=1, elem_id="demo_title_outer"):
+                gr.Image(
+                    value=nod_logo,
+                    show_label=False,
+                    interactive=False,
+                    elem_id="top_logo",
+                ).style(width=150, height=100)
+            with gr.Column(scale=5, elem_id="demo_title_outer"):
+                gr.Image(
+                    value=logo2,
+                    show_label=False,
+                    interactive=False,
+                    elem_id="demo_title",
+                ).style(width=150, height=100)

    with gr.Row(elem_id="ui_body"):
-        prompt = (
-            scheduler
-        ) = (
-            iters_count
-        ) = (
-            batch_size
-        ) = (
-            steps
-        ) = (
-            guidance
-        ) = (
-            height
-        ) = (
-            width
-        ) = (
-            seed
-        ) = (
-            precision
-        ) = (
-            device
-        ) = (
-            cache
-        ) = (
-            iree_vulkan_target_triple
-        ) = (
-            live_preview
-        ) = (
-            debug
-        ) = save_img = stable_diffusion = generated_img = std_output = None
-        # load prompts.

        with gr.Row():
            with gr.Column(scale=1, min_width=600):
                with gr.Group(elem_id="prompt_box_outer"):
                    prompt = gr.Textbox(
                        label="Prompt",
-                        value="A photograph of an astronaut riding a horse",
+                        value="cyberpunk forest by Salvador Dali",
                        lines=1,
                        elem_id="prompt_box",
                    )
-                with gr.Group():
+                    negative_prompt = gr.Textbox(
+                        label="Negative Prompt",
+                        value="trees, green",
+                        lines=1,
+                        elem_id="prompt_box",
+                    )
+                with gr.Row():
+                    variant = gr.Dropdown(
+                        label="Model Variant",
+                        value="stablediffusion",
+                        choices=[
+                            "stablediffusion",
+                            "anythingv3",
+                            "analogdiffusion",
+                            "openjourney",
+                            "dreamlike",
+                        ],
+                    )
+                    scheduler_key = gr.Dropdown(
+                        label="Scheduler",
+                        value="SharkEulerDiscrete",
+                        choices=[
+                            "DDIM",
+                            "PNDM",
+                            "LMSDiscrete",
+                            "DPMSolverMultistep",
+                            "EulerDiscrete",
+                            "EulerAncestralDiscrete",
+                            "SharkEulerDiscrete",
+                        ],
+                    )
+                with gr.Row():
+                    steps = gr.Slider(1, 100, value=50, step=1, label="Steps")
+                    guidance_scale = gr.Slider(
+                        0,
+                        50,
+                        value=7.5,
+                        step=0.1,
+                        label="CFG Scale",
+                    )
+                with gr.Row():
+                    seed = gr.Number(value=-1, precision=0, label="Seed")
+                    available_devices = get_available_devices()
+                    device_key = gr.Dropdown(
+                        label="Device",
+                        value=available_devices[0],
+                        choices=available_devices,
+                    )
+                with gr.Row():
+                    random_seed = gr.Button("Randomize Seed")
+                    random_seed.click(
+                        None,
+                        inputs=[],
+                        outputs=[seed],
+                        _js="() => Math.floor(Math.random() * 4294967295)",
+                    )
+                    stable_diffusion = gr.Button("Generate Image")
+                with gr.Accordion(label="Prompt Examples!"):
                    ex = gr.Examples(
-                        label="Examples",
                        examples=prompt_examples,
                        inputs=prompt,
                        cache_examples=False,
                        elem_id="prompt_examples",
                    )
-                with gr.Row():
-                    steps = gr.Slider(1, 100, value=50, step=1, label="Steps")
-                    guidance = gr.Slider(
-                        0,
-                        50,
-                        value=7.5,
-                        step=0.1,
-                        label="Guidance Scale",
-                        interactive=False,
-                    )
-                with gr.Row():
-                    height = gr.Slider(
-                        384,
-                        768,
-                        value=512,
-                        step=64,
-                        label="Height",
-                        interactive=False,
-                    )
-                    width = gr.Slider(
-                        384,
-                        768,
-                        value=512,
-                        step=64,
-                        label="Width",
-                        interactive=False,
-                    )
-                with gr.Row():
-                    precision = gr.Radio(
-                        label="Precision",
-                        value="fp16",
-                        choices=["fp16", "fp32"],
-                    )
-                    seed = gr.Textbox(value="42", max_lines=1, label="Seed")
-                with gr.Row():
-                    cache = gr.Checkbox(label="Cache", value=True)
-                    # debug = gr.Checkbox(label="DEBUG", value=False)
-                    save_img = gr.Checkbox(label="Save Image", value=False)
-                    live_preview = gr.Checkbox(
-                        label="Live Preview", value=False
-                    )
-                    # Hidden Items.
-                    scheduler = gr.Radio(
-                        label="Scheduler",
-                        value="LMS",
-                        choices=["PNDM", "LMS", "DDIM"],
-                        interactive=False,
-                        visible=False,
-                    )
-                    device = gr.Radio(
-                        label="Device",
-                        value="vulkan",
-                        choices=["cpu", "cuda", "vulkan"],
-                        interactive=False,
-                        visible=False,
-                        elem_id="ugly_line",
-                    )
-                    iters_count = gr.Slider(
-                        1,
-                        24,
-                        value=1,
-                        step=1,
-                        label="Iteration Count",
-                        visible=False,
-                    )
-                    batch_size = gr.Slider(
-                        1,
-                        4,
-                        value=1,
-                        step=1,
-                        label="Batch Size",
-                        visible=False,
-                    )
-                    iree_vulkan_target_triple = gr.Textbox(
-                        value="",
-                        max_lines=1,
-                        label="IREE VULKAN TARGET TRIPLE",
-                        visible=False,
-                        elem_id="ugly_line",
-                    )
-                stable_diffusion = gr.Button("Generate Image")
-                # logo
-                nod_logo = Image.open("./logos/amd-nod-logo.png")
-                gr.Image(
-                    value=nod_logo,
-                    show_label=False,
-                    interactive=False,
-                    elem_id="top_logo",
-                ).style(width=230)
-            with gr.Column(scale=1, min_width=600):
-                generated_img = gr.Image(
-                    type="pil", elem_id="img_result", interactive=False
-                ).style(height=768, width=768)
-                std_output = gr.Textbox(
-                    label="Std Output",
-                    value="Nothing.",
-                    lines=5,
-                    visible=False,
-                    elem_id="ugly_line",
-                )
-        """
-        debug.change(
-            debug_event,
-            inputs=[debug],
-            outputs=[std_output],
-            show_progress=False,
-        )
-        """

+            with gr.Column(scale=1, min_width=600):
+                with gr.Group():
+                    generated_img = gr.Image(
+                        type="pil", interactive=False
+                    ).style(height=512)
+                    std_output = gr.Textbox(
+                        value="Nothing to show.",
+                        lines=4,
+                        show_label=False,
+                    )
+
+        prompt.submit(
+            stable_diff_inf,
+            inputs=[
+                prompt,
+                negative_prompt,
+                steps,
+                guidance_scale,
+                seed,
+                scheduler_key,
+                variant,
+                device_key,
+            ],
+            outputs=[generated_img, std_output],
+            show_progress=args.progress_bar,
+        )
        stable_diffusion.click(
            stable_diff_inf,
            inputs=[
                prompt,
-                scheduler,
-                iters_count,
-                batch_size,
+                negative_prompt,
                steps,
-                guidance,
-                height,
-                width,
+                guidance_scale,
                seed,
-                precision,
-                device,
-                cache,
-                iree_vulkan_target_triple,
-                live_preview,
-                save_img,
+                scheduler_key,
+                variant,
+                device_key,
            ],
            outputs=[generated_img, std_output],
-            show_progress=False,
+            show_progress=args.progress_bar,
        )

 shark_web.queue()
-shark_web.launch(server_name="0.0.0.0", server_port=8080, enable_queue=True)
+shark_web.launch(
+    share=False,
+    inbrowser=True,
+    server_name="0.0.0.0",
+    server_port=8080,
+)
--- a/web/logos/amd-nod-logo.png
+++ b/web/logos/amd-nod-logo.png
--- a/web/logos/other_logo.png
+++ b/web/logos/other_logo.png
--- a/web/models/resnet50.py
+++ b/web/models/resnet50.py
@@ -3,7 +3,7 @@ import requests
 import torch
 from torchvision import transforms
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_torch_model
+from shark.shark_downloader import download_model

 ################################## Preprocessing inputs and helper functions ########

@@ -69,8 +69,8 @@ def resnet_inf(numpy_img, device):
    if device not in compiled_module.keys():
        if DEBUG:
            log_write.write("Compiling the Resnet50 module.\n")
-        mlir_model, func_name, inputs, golden_out = download_torch_model(
-            "resnet50"
+        mlir_model, func_name, inputs, golden_out = download_model(
+            "resnet50", frontend="torch"
        )
        shark_module = SharkInference(
            mlir_model, func_name, device=device, mlir_dialect="linalg"
--- a/web/models/stable_diff.py
+++ b/web/models/stable_diff.py
@@ -1,266 +0,0 @@
-from transformers import CLIPTextModel, CLIPTokenizer
-from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
-import torch
-from PIL import Image
-from diffusers import LMSDiscreteScheduler
-from tqdm.auto import tqdm
-from shark.shark_inference import SharkInference
-from torch.fx.experimental.proxy_tensor import make_fx
-from torch._decomp import get_decompositions
-import torch_mlir
-import tempfile
-import numpy as np
-import os
-
-##############################################################################
-
-
-def load_mlir(mlir_loc):
-    if mlir_loc == None:
-        return None
-    with open(os.path.join(mlir_loc)) as f:
-        mlir_module = f.read()
-    return mlir_module
-
-
-def compile_through_fx(model, inputs, device, mlir_loc=None, extra_args=[]):
-
-    module = load_mlir(mlir_loc)
-    if mlir_loc == None:
-        fx_g = make_fx(
-            model,
-            decomposition_table=get_decompositions(
-                [
-                    torch.ops.aten.embedding_dense_backward,
-                    torch.ops.aten.native_layer_norm_backward,
-                    torch.ops.aten.slice_backward,
-                    torch.ops.aten.select_backward,
-                    torch.ops.aten.norm.ScalarOpt_dim,
-                    torch.ops.aten.native_group_norm,
-                    torch.ops.aten.upsample_bilinear2d.vec,
-                    torch.ops.aten.split.Tensor,
-                    torch.ops.aten.split_with_sizes,
-                ]
-            ),
-        )(*inputs)
-
-        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
-        fx_g.recompile()
-
-        def strip_overloads(gm):
-            """
-            Modifies the target of graph nodes in :attr:`gm` to strip overloads.
-            Args:
-                gm(fx.GraphModule): The input Fx graph module to be modified
-            """
-            for node in gm.graph.nodes:
-                if isinstance(node.target, torch._ops.OpOverload):
-                    node.target = node.target.overloadpacket
-            gm.recompile()
-
-        strip_overloads(fx_g)
-
-        ts_g = torch.jit.script(fx_g)
-
-        module = torch_mlir.compile(
-            ts_g,
-            inputs,
-            torch_mlir.OutputType.LINALG_ON_TENSORS,
-            use_tracing=False,
-            verbose=False,
-        )
-
-    mlir_model = module
-    func_name = "forward"
-
-    shark_module = SharkInference(
-        mlir_model,
-        func_name,
-        device=device,
-        mlir_dialect="tm_tensor",
-    )
-    shark_module.compile(extra_args)
-
-    return shark_module
-
-
-##############################################################################
-
-DEBUG = False
-compiled_module = {}
-
-
-def stable_diff_inf(prompt: str, steps, device: str):
-
-    args = {}
-    args["prompt"] = [prompt]
-    args["steps"] = steps
-    args["device"] = device
-    args["mlir_loc"] = "./stable_diffusion.mlir"
-    output_loc = (
-        f"stored_results/stable_diffusion/{prompt}_{int(steps)}_{device}.jpg"
-    )
-
-    global DEBUG
-    global compiled_module
-
-    DEBUG = False
-    log_write = open(r"logs/stable_diffusion_log.txt", "w")
-    if log_write:
-        DEBUG = True
-
-    if args["device"] not in compiled_module.keys():
-        YOUR_TOKEN = "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"
-
-        # 1. Load the autoencoder model which will be used to decode the latents into image space.
-        compiled_module["vae"] = AutoencoderKL.from_pretrained(
-            "CompVis/stable-diffusion-v1-4",
-            subfolder="vae",
-            use_auth_token=YOUR_TOKEN,
-        )
-
-        # 2. Load the tokenizer and text encoder to tokenize and encode the text.
-        compiled_module["tokenizer"] = CLIPTokenizer.from_pretrained(
-            "openai/clip-vit-large-patch14"
-        )
-        compiled_module["text_encoder"] = CLIPTextModel.from_pretrained(
-            "openai/clip-vit-large-patch14"
-        )
-        if DEBUG:
-            log_write.write("Compiling the Unet module.\n")
-
-        # Wrap the unet model to return tuples.
-        class UnetModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.unet = UNet2DConditionModel.from_pretrained(
-                    "CompVis/stable-diffusion-v1-4",
-                    subfolder="unet",
-                    use_auth_token=YOUR_TOKEN,
-                )
-                self.in_channels = self.unet.in_channels
-                self.train(False)
-
-            def forward(self, x, y, z):
-                return self.unet.forward(x, y, z, return_dict=False)[0]
-
-        # 3. The UNet model for generating the latents.
-        unet = UnetModel()
-        latent_model_input = torch.rand([2, 4, 64, 64])
-        text_embeddings = torch.rand([2, 77, 768])
-        shark_unet = compile_through_fx(
-            unet,
-            (latent_model_input, torch.tensor([1.0]), text_embeddings),
-            args["device"],
-            args["mlir_loc"],
-            ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
-        )
-        compiled_module[args["device"]] = shark_unet
-        if DEBUG:
-            log_write.write("Compilation successful.\n")
-
-        compiled_module["unet"] = unet
-        compiled_module["scheduler"] = LMSDiscreteScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            num_train_timesteps=1000,
-        )
-
-    shark_unet = compiled_module[args["device"]]
-    vae = compiled_module["vae"]
-    unet = compiled_module["unet"]
-    tokenizer = compiled_module["tokenizer"]
-    text_encoder = compiled_module["text_encoder"]
-    scheduler = compiled_module["scheduler"]
-
-    height = 512  # default height of Stable Diffusion
-    width = 512  # default width of Stable Diffusion
-
-    num_inference_steps = int(args["steps"])  # Number of denoising steps
-
-    guidance_scale = 7.5  # Scale for classifier-free guidance
-
-    generator = torch.manual_seed(
-        42
-    )  # Seed generator to create the inital latent noise
-
-    batch_size = len(args["prompt"])
-
-    text_input = tokenizer(
-        args["prompt"],
-        padding="max_length",
-        max_length=tokenizer.model_max_length,
-        truncation=True,
-        return_tensors="pt",
-    )
-
-    text_embeddings = text_encoder(text_input.input_ids)[0]
-
-    max_length = text_input.input_ids.shape[-1]
-    uncond_input = tokenizer(
-        [""] * batch_size,
-        padding="max_length",
-        max_length=max_length,
-        return_tensors="pt",
-    )
-    uncond_embeddings = text_encoder(uncond_input.input_ids)[0]
-
-    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-    latents = torch.randn(
-        (batch_size, unet.in_channels, height // 8, width // 8),
-        generator=generator,
-    )
-    scheduler.set_timesteps(num_inference_steps)
-    latents = latents * scheduler.sigmas[0]
-
-    for i, t in tqdm(enumerate(scheduler.timesteps)):
-
-        if DEBUG:
-            log_write.write(f"i = {i} t = {t}\n")
-        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
-        latent_model_input = torch.cat([latents] * 2)
-        sigma = scheduler.sigmas[i]
-        latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
-
-        # predict the noise residual
-        latent_model_input_numpy = latent_model_input.detach().numpy()
-        text_embeddings_numpy = text_embeddings.detach().numpy()
-
-        noise_pred = shark_unet.forward(
-            (
-                latent_model_input_numpy,
-                np.array([t]).astype(np.float32),
-                text_embeddings_numpy,
-            )
-        )
-        noise_pred = torch.from_numpy(noise_pred)
-
-        # perform guidance
-        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-        noise_pred = noise_pred_uncond + guidance_scale * (
-            noise_pred_text - noise_pred_uncond
-        )
-
-        # compute the previous noisy sample x_t -> x_t-1
-        latents = scheduler.step(noise_pred, i, latents)["prev_sample"]
-
-    # scale and decode the image latents with vae
-    latents = 1 / 0.18215 * latents
-    image = vae.decode(latents).sample
-
-    image = (image / 2 + 0.5).clamp(0, 1)
-    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
-    images = (image * 255).round().astype("uint8")
-    pil_images = [Image.fromarray(image) for image in images]
-    output = pil_images[0]
-    # save the output image with the prompt name.
-    output.save(os.path.join(output_loc))
-    log_write.close()
-
-    std_output = ""
-    with open(r"logs/stable_diffusion_log.txt", "r") as log_read:
-        std_output = log_read.read()
-
-    return output, std_output
--- a/web/models/stable_diffusion/cache_objects.py
+++ b/web/models/stable_diffusion/cache_objects.py
@@ -0,0 +1,101 @@
+from transformers import CLIPTokenizer
+from diffusers import (
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+    EulerAncestralDiscreteScheduler,
+)
+from models.stable_diffusion.opt_params import get_unet, get_vae, get_clip
+from models.stable_diffusion.utils import (
+    set_init_device_flags,
+    set_iree_runtime_flags,
+)
+from models.stable_diffusion.stable_args import args
+from models.stable_diffusion.schedulers import (
+    SharkEulerDiscreteScheduler,
+)
+
+
+model_config = {
+    "v2_1": "stabilityai/stable-diffusion-2-1",
+    "v2_1base": "stabilityai/stable-diffusion-2-1-base",
+    "v1_4": "CompVis/stable-diffusion-v1-4",
+}
+
+
+def get_schedulers(version):
+    schedulers = dict()
+    schedulers["PNDM"] = PNDMScheduler.from_pretrained(
+        model_config[version],
+        subfolder="scheduler",
+    )
+    schedulers["LMSDiscrete"] = LMSDiscreteScheduler.from_pretrained(
+        model_config[version],
+        subfolder="scheduler",
+    )
+    schedulers["DDIM"] = DDIMScheduler.from_pretrained(
+        model_config[version],
+        subfolder="scheduler",
+    )
+    schedulers[
+        "DPMSolverMultistep"
+    ] = DPMSolverMultistepScheduler.from_pretrained(
+        model_config[version],
+        subfolder="scheduler",
+    )
+    schedulers["EulerDiscrete"] = EulerDiscreteScheduler.from_pretrained(
+        model_config[version],
+        subfolder="scheduler",
+    )
+    schedulers[
+        "EulerAncestralDiscrete"
+    ] = EulerAncestralDiscreteScheduler.from_pretrained(
+        model_config[version],
+        subfolder="scheduler",
+    )
+    schedulers[
+        "SharkEulerDiscrete"
+    ] = SharkEulerDiscreteScheduler.from_pretrained(
+        model_config[version],
+        subfolder="scheduler",
+    )
+    schedulers["SharkEulerDiscrete"].compile()
+    return schedulers
+
+
+def get_tokenizer(version):
+    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+    if version != "v1_4":
+        tokenizer = CLIPTokenizer.from_pretrained(
+            model_config[version], subfolder="tokenizer"
+        )
+    return tokenizer
+
+
+class ModelCache:
+    def __init__(self):
+        self.device = None
+        self.variant = None
+        self.version = None
+        self.schedulers = None
+        self.tokenizer = None
+
+    def set_models(self, device_key):
+        if self.device != device_key or self.variant != args.variant:
+            self.device = device_key
+            self.variant = args.variant
+            self.version = args.version
+            args.device = device_key.split("=>", 1)[0].strip()
+            args.max_length = 64
+            args.use_tuned = True
+            set_init_device_flags()
+            self.schedulers = get_schedulers(args.version)
+            self.tokenizer = get_tokenizer(args.version)
+            self.vae = get_vae()
+            self.unet = get_unet()
+            self.clip = get_clip()
+
+
+model_cache = ModelCache()
--- a/web/models/stable_diffusion/logos/Nod_logo.png
+++ b/web/models/stable_diffusion/logos/Nod_logo.png
--- a/web/models/stable_diffusion/logos/nod-logo.png
+++ b/web/models/stable_diffusion/logos/nod-logo.png
--- a/web/models/stable_diffusion/logos/sd-demo-logo.png
+++ b/web/models/stable_diffusion/logos/sd-demo-logo.png
--- a/web/models/stable_diffusion/main.py
+++ b/web/models/stable_diffusion/main.py
@@ -1,365 +1,227 @@
-from transformers import CLIPTextModel, CLIPTokenizer
 import torch
-from PIL import Image
-from diffusers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from tqdm.auto import tqdm
-import numpy as np
-from models.stable_diffusion.model_wrappers import (
-    get_vae32,
-    get_vae16,
-    get_unet16_wrapped,
-    get_unet32_wrapped,
-)
-from models.stable_diffusion.utils import get_shark_model
-import time
 import os
-
-GCLOUD_BUCKET = "gs://shark_tank/prashant_nod"
-VAE_FP16 = "vae_fp16"
-VAE_FP32 = "vae_fp32"
-UNET_FP16 = "unet_fp16"
-UNET_FP32 = "unet_fp32"
-
-TUNED_GCLOUD_BUCKET = "gs://shark_tank/quinn"
-UNET_FP16_TUNED = "unet_fp16_tunedv2"
-
-args = None
+from PIL import Image
+import torchvision.transforms as T
+from tqdm.auto import tqdm
+from models.stable_diffusion.cache_objects import model_cache
+from models.stable_diffusion.stable_args import args
+from random import randint
+import numpy as np
+import time
+import sys


-class Arguments:
-    def __init__(
-        self,
-        prompt: str,
-        scheduler: str,
-        iteration_count: int,
-        batch_size: int,
-        steps: int,
-        guidance: float,
-        height: int,
-        width: int,
-        seed: int,
-        precision: str,
-        device: str,
-        cache: bool,
-        iree_vulkan_target_triple: str,
-        live_preview: bool,
-        save_img: bool,
-        import_mlir: bool = False,
-        max_length: int = 77,
-        use_tuned: bool = True,
-    ):
-        self.prompt = prompt
-        self.scheduler = scheduler
-        self.iteration_count = iteration_count
-        self.batch_size = batch_size
-        self.steps = steps
-        self.guidance = guidance
-        self.height = height
-        self.width = width
-        self.seed = seed
-        self.precision = precision
-        self.device = device
-        self.cache = cache
-        self.iree_vulkan_target_triple = iree_vulkan_target_triple
-        self.live_preview = live_preview
-        self.save_img = save_img
-        self.import_mlir = import_mlir
-        self.max_length = max_length
-        self.use_tuned = use_tuned
+if args.clear_all:
+    print("CLEARING ALL, EXPECT SEVERAL MINUTES TO RECOMPILE")
+    from glob import glob
+    import shutil
+
+    vmfbs = glob(os.path.join(os.getcwd(), "*.vmfb"))
+    for vmfb in vmfbs:
+        if os.path.exists(vmfb):
+            os.remove(vmfb)
+    home = os.path.expanduser("~")
+    if os.name == "nt":  # Windows
+        appdata = os.getenv("LOCALAPPDATA")
+        shutil.rmtree(os.path.join(appdata, "AMD/VkCache"), ignore_errors=True)
+        shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
+    elif os.name == "unix":
+        shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"))
+        shutil.rmtree(os.path.join(home, ".local/shark_tank"))


-def get_models():
+# Helper function to profile the vulkan device.
+def start_profiling(file_path="foo.rdc", profiling_mode="queue"):
+    if args.vulkan_debug_utils and "vulkan" in args.device:
+        import iree

-    global args
-
-    IREE_EXTRA_ARGS = []
-    if args.precision == "fp16":
-        IREE_EXTRA_ARGS += [
-            "--iree-flow-enable-padding-linalg-ops",
-            "--iree-flow-linalg-ops-padding-size=32",
-        ]
-        if args.use_tuned:
-            unet_gcloud_bucket = TUNED_GCLOUD_BUCKET
-            vae_gcloud_bucket = GCLOUD_BUCKET
-            unet_args = IREE_EXTRA_ARGS
-            vae_args = IREE_EXTRA_ARGS + [
-                "--iree-flow-enable-conv-nchw-to-nhwc-transform"
-            ]
-            unet_name = UNET_FP16_TUNED
-            vae_name = VAE_FP16
-        else:
-            unet_gcloud_bucket = GCLOUD_BUCKET
-            vae_gcloud_bucket = GCLOUD_BUCKET
-            IREE_EXTRA_ARGS += [
-                "--iree-flow-enable-conv-nchw-to-nhwc-transform"
-            ]
-            unet_args = IREE_EXTRA_ARGS
-            vae_args = IREE_EXTRA_ARGS
-            unet_name = UNET_FP16
-            vae_name = VAE_FP16
-
-        if args.import_mlir == True:
-            return get_vae16(args, model_name=VAE_FP16), get_unet16_wrapped(
-                args, model_name=UNET_FP16
-            )
-        else:
-            return get_shark_model(
-                args,
-                vae_gcloud_bucket,
-                vae_name,
-                vae_args,
-            ), get_shark_model(
-                args,
-                unet_gcloud_bucket,
-                unet_name,
-                unet_args,
-            )
-
-    elif args.precision == "fp32":
-        IREE_EXTRA_ARGS += [
-            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
-            "--iree-flow-enable-padding-linalg-ops",
-            "--iree-flow-linalg-ops-padding-size=16",
-        ]
-        if args.import_mlir == True:
-            return get_vae32(args, model_name=VAE_FP32), get_unet32_wrapped(
-                args, model_name=UNET_FP32
-            )
-        else:
-            return get_shark_model(
-                args,
-                GCLOUD_BUCKET,
-                VAE_FP32,
-                IREE_EXTRA_ARGS,
-            ), get_shark_model(
-                args,
-                GCLOUD_BUCKET,
-                UNET_FP32,
-                IREE_EXTRA_ARGS,
-            )
+        print(f"Profiling and saving to {file_path}.")
+        vulkan_device = iree.runtime.get_device(args.device)
+        vulkan_device.begin_profiling(mode=profiling_mode, file_path=file_path)
+        return vulkan_device
+    return None


-schedulers = dict()
-# set scheduler value
-schedulers["PNDM"] = PNDMScheduler(
-    beta_start=0.00085,
-    beta_end=0.012,
-    beta_schedule="scaled_linear",
-    num_train_timesteps=1000,
-)
-schedulers["LMS"] = LMSDiscreteScheduler(
-    beta_start=0.00085,
-    beta_end=0.012,
-    beta_schedule="scaled_linear",
-    num_train_timesteps=1000,
-)
-schedulers["DDIM"] = DDIMScheduler(
-    beta_start=0.00085,
-    beta_end=0.012,
-    beta_schedule="scaled_linear",
-    clip_sample=False,
-    set_alpha_to_one=False,
-)
+def end_profiling(device):
+    if device:
+        return device.end_profiling()

-cache_obj = dict()
-# cache tokenizer and text_encoder
-cache_obj["tokenizer"] = CLIPTokenizer.from_pretrained(
-    "openai/clip-vit-large-patch14"
-)
-cache_obj["text_encoder"] = CLIPTextModel.from_pretrained(
-    "openai/clip-vit-large-patch14"
-)

-# cache vae and unet.
-args = Arguments(
-    prompt="load unet/vmfb",
-    scheduler="LMS",
-    iteration_count=1,
-    batch_size=1,
-    steps=50,
-    guidance=7.5,
-    height=512,
-    width=512,
-    seed=42,
-    precision="fp16",
-    device="vulkan",
-    cache=True,
-    iree_vulkan_target_triple="",
-    live_preview=False,
-    save_img=False,
-    import_mlir=False,
-    max_length=77,
-    use_tuned=True,
-)
-cache_obj["vae_fp16_vulkan"], cache_obj["unet_fp16_vulkan"] = get_models()
-args.precision = "fp32"
-cache_obj["vae_fp32_vulkan"], cache_obj["unet_fp32_vulkan"] = get_models()
-
-output_dir = "./stored_results/stable_diffusion"
-os.makedirs(output_dir, exist_ok=True)
+def set_ui_params(
+    prompt,
+    negative_prompt,
+    steps,
+    guidance_scale,
+    seed,
+    scheduler_key,
+    variant,
+):
+    args.prompts = [prompt]
+    args.negative_prompts = [negative_prompt]
+    args.steps = steps
+    args.guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
+    args.seed = seed
+    args.scheduler = scheduler_key
+    args.variant = variant


 def stable_diff_inf(
    prompt: str,
-    scheduler: str,
-    iteration_count: int,
-    batch_size: int,
+    negative_prompt: str,
    steps: int,
-    guidance: float,
-    height: int,
-    width: int,
-    seed: str,
-    precision: str,
-    device: str,
-    cache: bool,
-    iree_vulkan_target_triple: str,
-    live_preview: bool,
-    save_img: bool,
+    guidance_scale: float,
+    seed: int,
+    scheduler_key: str,
+    variant: str,
+    device_key: str,
 ):
+    # Handle out of range seeds.
+    uint32_info = np.iinfo(np.uint32)
+    uint32_min, uint32_max = uint32_info.min, uint32_info.max
+    if seed < uint32_min or seed >= uint32_max:
+        seed = randint(uint32_min, uint32_max)

-    global args
-    global schedulers
-    global cache_obj
-    global output_dir
-
-    start = time.time()
-
-    # set seed value
-    if seed == "":
-        seed = int(torch.randint(low=25, high=100, size=()))
-    else:
-        try:
-            seed = int(seed)
-            if seed < 0 or seed > 10000:
-                seed = hash(seed)
-        except (ValueError, OverflowError) as error:
-            seed = hash(seed)
-
-    scheduler = schedulers[scheduler]
-    args = Arguments(
+    set_ui_params(
        prompt,
-        scheduler,
-        iteration_count,
-        batch_size,
+        negative_prompt,
        steps,
-        guidance,
-        height,
-        width,
+        guidance_scale,
        seed,
-        precision,
-        device,
-        cache,
-        iree_vulkan_target_triple,
-        live_preview,
-        save_img,
+        scheduler_key,
+        variant,
    )
    dtype = torch.float32 if args.precision == "fp32" else torch.half
-    num_inference_steps = int(args.steps)  # Number of denoising steps
    generator = torch.manual_seed(
        args.seed
    )  # Seed generator to create the inital latent noise

-    # Initialize vae and unet models.
-    is_model_initialized = False
-    if (
-        args.cache
-        and args.use_tuned
-        and args.device == "vulkan"
-        and not args.import_mlir
-    ):
-        vae_key = f"vae_{args.precision}_vulkan"
-        unet_key = f"unet_{args.precision}_vulkan"
-        cached_keys = cache_obj.keys()
-        if vae_key in cached_keys and unet_key in cached_keys:
-            vae, unet = cache_obj[vae_key], cache_obj[unet_key]
-            is_model_initialized = True
-    if not is_model_initialized:
-        vae, unet = get_models()
+    # set height and width.
+    height = 512  # default height of Stable Diffusion
+    width = 512  # default width of Stable Diffusion
+    if args.version == "v2_1":
+        height = 768
+        width = 768

-    tokenizer = cache_obj["tokenizer"]
-    text_encoder = cache_obj["text_encoder"]
+    # get all cached data.
+    model_cache.set_models(device_key)
+    tokenizer = model_cache.tokenizer
+    scheduler = model_cache.schedulers[args.scheduler]
+    vae, unet, clip = model_cache.vae, model_cache.unet, model_cache.clip
+    cpu_scheduling = not args.scheduler.startswith("Shark")
+
+    # create a random initial latent.
+    latents = torch.randn(
+        (1, 4, height // 8, width // 8),
+        generator=generator,
+        dtype=torch.float32,
+    ).to(dtype)
+
+    # Warmup phase to improve performance.
+    if args.warmup_count >= 1:
+        vae_warmup_input = torch.clone(latents).detach().numpy()
+        clip_warmup_input = torch.randint(1, 2, (2, args.max_length))
+    for i in range(args.warmup_count):
+        vae.forward((vae_warmup_input,))
+        clip.forward((clip_warmup_input,))
+
+    start = time.time()
    text_input = tokenizer(
-        [args.prompt],
+        args.prompts,
        padding="max_length",
        max_length=args.max_length,
        truncation=True,
        return_tensors="pt",
    )
-
-    text_embeddings = text_encoder(text_input.input_ids)[0].to(dtype)
    max_length = text_input.input_ids.shape[-1]
    uncond_input = tokenizer(
-        [""] * batch_size,
+        args.negative_prompts,
        padding="max_length",
        max_length=max_length,
+        truncation=True,
        return_tensors="pt",
    )
-    uncond_embeddings = text_encoder(uncond_input.input_ids)[0].to(dtype)
+    text_input = torch.cat([uncond_input.input_ids, text_input.input_ids])

-    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-    latents = torch.randn(
-        (batch_size, 4, args.height // 8, args.width // 8),
-        generator=generator,
-        dtype=torch.float32,
-    ).to(dtype)
-
-    scheduler.set_timesteps(num_inference_steps)
-    scheduler.is_scale_input_called = True
-
-    latents = latents * scheduler.sigmas[0]
+    clip_inf_start = time.time()
+    text_embeddings = clip.forward((text_input,))
+    clip_inf_end = time.time()
+    text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
    text_embeddings_numpy = text_embeddings.detach().numpy()

+    scheduler.set_timesteps(args.steps)
+    scheduler.is_scale_input_called = True
+
+    latents = latents * scheduler.init_noise_sigma
+
    avg_ms = 0
-    out_img = None
-    text_output = ""
    for i, t in tqdm(enumerate(scheduler.timesteps)):

-        text_output += f"\n Iteration = {i} | Timestep = {t} | "
        step_start = time.time()
        timestep = torch.tensor([t]).to(dtype).detach().numpy()
-        latents_numpy = latents.detach().numpy()
-        sigma_numpy = np.array(scheduler.sigmas[i]).astype(np.float32)
+        latent_model_input = scheduler.scale_model_input(latents, t)
+        if cpu_scheduling:
+            latent_model_input = latent_model_input.detach().numpy()

+        profile_device = start_profiling(file_path="unet.rdc")
        noise_pred = unet.forward(
-            (latents_numpy, timestep, text_embeddings_numpy, sigma_numpy)
+            (
+                latent_model_input,
+                timestep,
+                text_embeddings_numpy,
+                args.guidance_scale,
+            ),
+            send_to_host=False,
        )
-        noise_pred = torch.from_numpy(noise_pred)
+        end_profiling(profile_device)
+
+        if cpu_scheduling:
+            noise_pred = torch.from_numpy(noise_pred.to_host())
+            latents = scheduler.step(noise_pred, t, latents).prev_sample
+        else:
+            latents = scheduler.step(noise_pred, t, latents)
        step_time = time.time() - step_start
        avg_ms += step_time
        step_ms = int((step_time) * 1000)
-        text_output += f"Time = {step_ms}ms."
-        latents = scheduler.step(noise_pred, i, latents)["prev_sample"]
-
-        if live_preview and i % 5 == 0:
-            scaled_latents = 1 / 0.18215 * latents
-            latents_numpy = scaled_latents.detach().numpy()
-            image = vae.forward((latents_numpy,))
-            image = torch.from_numpy(image)
-            image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
-            images = (image * 255).round().astype("uint8")
-            pil_images = [Image.fromarray(image) for image in images]
-            out_img = pil_images[0]
-            yield out_img, text_output
+        if not args.hide_steps:
+            print(f" \nIteration = {i}, Time = {step_ms}ms")

    # scale and decode the image latents with vae
-    latents = 1 / 0.18215 * latents
-    latents_numpy = latents.detach().numpy()
-    image = vae.forward((latents_numpy,))
-    image = torch.from_numpy(image)
-    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
-    images = (image * 255).round().astype("uint8")
-    pil_images = [Image.fromarray(image) for image in images]
-    out_img = pil_images[0]
+    if args.use_base_vae:
+        latents = 1 / 0.18215 * latents
+    latents_numpy = latents
+    if cpu_scheduling:
+        latents_numpy = latents.detach().numpy()
+    profile_device = start_profiling(file_path="vae.rdc")
+    vae_start = time.time()
+    images = vae.forward((latents_numpy,))
+    vae_end = time.time()
+    end_profiling(profile_device)
+    if args.use_base_vae:
+        image = torch.from_numpy(images)
+        image = (image.detach().cpu() * 255.0).numpy()
+        images = image.round()
+    end_time = time.time()

    avg_ms = 1000 * avg_ms / args.steps
-    text_output += f"\n\nAverage step time: {avg_ms}ms/it"
+    clip_inf_time = (clip_inf_end - clip_inf_start) * 1000
+    vae_inf_time = (vae_end - vae_start) * 1000
+    total_time = end_time - start
+    print(f"\nAverage step time: {avg_ms}ms/it")
+    print(f"Clip Inference time (ms) = {clip_inf_time:.3f}")
+    print(f"VAE Inference time (ms): {vae_inf_time:.3f}")
+    print(f"\nTotal image generation time: {total_time}sec")

-    total_time = time.time() - start
-    text_output += f"\n\nTotal image generation time: {total_time}sec"
+    # generate outputs to web.
+    transform = T.ToPILImage()
+    pil_images = [
+        transform(image) for image in torch.from_numpy(images).to(torch.uint8)
+    ]

-    if args.save_img:
-        # save outputs.
-        output_loc = f"{output_dir}/{time.time()}_{int(args.steps)}_{args.precision}_{args.device}.jpg"
-        out_img.save(os.path.join(output_loc))
-    yield out_img, text_output
+    text_output = f"prompt={args.prompts}"
+    text_output += f"\nnegative prompt={args.negative_prompts}"
+    text_output += f"\nvariant={args.variant}, scheduler={args.scheduler}, device={device_key}"
+    text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={args.seed}, size={height}x{width}"
+    text_output += f"\nAverage step time: {avg_ms:.4f}ms/it"
+    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
+
+    return pil_images[0], text_output
--- a/web/models/stable_diffusion/model_wrappers.py
+++ b/web/models/stable_diffusion/model_wrappers.py
@@ -1,201 +1,285 @@
-from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
+from diffusers import AutoencoderKL, UNet2DConditionModel
+from transformers import CLIPTextModel
 from models.stable_diffusion.utils import compile_through_fx
+from models.stable_diffusion.stable_args import args
 import torch

-YOUR_TOKEN = "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"
+model_config = {
+    "v2_1": "stabilityai/stable-diffusion-2-1",
+    "v2_1base": "stabilityai/stable-diffusion-2-1-base",
+    "v1_4": "CompVis/stable-diffusion-v1-4",
+}

+# clip has 2 variants of max length 77 or 64.
+model_clip_max_length = 64 if args.max_length == 64 else 77
+if args.variant in ["anythingv3", "analogdiffusion", "dreamlike"]:
+    model_clip_max_length = 77
+elif args.variant == "openjourney":
+    model_clip_max_length = 64

-def get_vae32(args, model_name="vae_fp32"):
-    class VaeModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.vae = AutoencoderKL.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
-                subfolder="vae",
-                use_auth_token=YOUR_TOKEN,
-            )
+model_variant = {
+    "stablediffusion": "SD",
+    "anythingv3": "Linaqruf/anything-v3.0",
+    "dreamlike": "dreamlike-art/dreamlike-diffusion-1.0",
+    "openjourney": "prompthero/openjourney",
+    "analogdiffusion": "wavymulder/Analog-Diffusion",
+}

-        def forward(self, input):
-            x = self.vae.decode(input, return_dict=False)[0]
-            return (x / 2 + 0.5).clamp(0, 1)
-
-    vae = VaeModel()
-    vae_input = torch.rand(1, 4, 64, 64)
-    shark_vae = compile_through_fx(
-        args,
-        vae,
-        (vae_input,),
-        model_name,
-    )
-    return shark_vae
-
-
-def get_vae16(args, model_name="vae_fp16"):
-    class VaeModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.vae = AutoencoderKL.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
-                subfolder="vae",
-                use_auth_token=YOUR_TOKEN,
-                revision="fp16",
-            )
-
-        def forward(self, input):
-            x = self.vae.decode(input, return_dict=False)[0]
-            return (x / 2 + 0.5).clamp(0, 1)
-
-    vae = VaeModel()
-    vae = vae.half().cuda()
-    vae_input = torch.rand(1, 4, 64, 64, dtype=torch.half).cuda()
-    shark_vae = compile_through_fx(
-        args,
-        vae,
-        (vae_input,),
-        model_name,
-    )
-    return shark_vae
-
-
-def get_unet32(args, model_name="unet_fp32"):
-    class UnetModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.unet = UNet2DConditionModel.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
-                subfolder="unet",
-                use_auth_token=YOUR_TOKEN,
-            )
-            self.in_channels = self.unet.in_channels
-            self.train(False)
-
-        def forward(self, x, y, z):
-            return self.unet.forward(x, y, z, return_dict=False)[0]
-
-    unet = UnetModel()
-    latent_model_input = torch.rand([2, 4, 64, 64])
-    text_embeddings = torch.rand([2, args.max_length, 768])
-    shark_unet = compile_through_fx(
-        args,
-        unet,
-        (latent_model_input, torch.tensor([1.0]), text_embeddings),
-        model_name,
-    )
-    return shark_unet
-
-
-def get_unet16(args, model_name="unet_fp16"):
-    class UnetModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.unet = UNet2DConditionModel.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
-                subfolder="unet",
-                use_auth_token=YOUR_TOKEN,
-                revision="fp16",
-            )
-            self.in_channels = self.unet.in_channels
-            self.train(False)
-
-        def forward(self, x, y, z):
-            return self.unet.forward(x, y, z, return_dict=False)[0]
-
-    unet = UnetModel()
-    unet = unet.half().cuda()
-    latent_model_input = torch.rand([2, 4, 64, 64]).half().cuda()
-    text_embeddings = torch.rand([2, args.max_length, 768]).half().cuda()
-    shark_unet = compile_through_fx(
-        args,
-        unet,
-        (
-            latent_model_input,
-            torch.tensor([1.0]).half().cuda(),
-            text_embeddings,
+model_input = {
+    "v2_1": {
+        "clip": (torch.randint(1, 2, (2, model_clip_max_length)),),
+        "vae": (torch.randn(1, 4, 96, 96),),
+        "unet": (
+            torch.randn(1, 4, 96, 96),  # latents
+            torch.tensor([1]).to(torch.float32),  # timestep
+            torch.randn(2, model_clip_max_length, 1024),  # embedding
+            torch.tensor(1).to(torch.float32),  # guidance_scale
        ),
-        model_name,
+    },
+    "v2_1base": {
+        "clip": (torch.randint(1, 2, (2, model_clip_max_length)),),
+        "vae": (torch.randn(1, 4, 64, 64),),
+        "unet": (
+            torch.randn(1, 4, 64, 64),  # latents
+            torch.tensor([1]).to(torch.float32),  # timestep
+            torch.randn(2, model_clip_max_length, 1024),  # embedding
+            torch.tensor(1).to(torch.float32),  # guidance_scale
+        ),
+    },
+    "v1_4": {
+        "clip": (torch.randint(1, 2, (2, model_clip_max_length)),),
+        "vae": (torch.randn(1, 4, 64, 64),),
+        "unet": (
+            torch.randn(1, 4, 64, 64),
+            torch.tensor([1]).to(torch.float32),  # timestep
+            torch.randn(2, model_clip_max_length, 768),
+            torch.tensor(1).to(torch.float32),
+        ),
+    },
+}
+
+# revision param for from_pretrained defaults to "main" => fp32
+model_revision = {
+    "stablediffusion": "fp16" if args.precision == "fp16" else "main",
+    "anythingv3": "diffusers",
+    "analogdiffusion": "main",
+    "openjourney": "main",
+    "dreamlike": "main",
+}
+
+
+def get_clip_mlir(model_name="clip_text", extra_args=[]):
+
+    text_encoder = CLIPTextModel.from_pretrained(
+        "openai/clip-vit-large-patch14"
    )
-    return shark_unet
+    if args.variant == "stablediffusion":
+        if args.version != "v1_4":
+            text_encoder = CLIPTextModel.from_pretrained(
+                model_config[args.version], subfolder="text_encoder"
+            )
+
+    elif args.variant in [
+        "anythingv3",
+        "analogdiffusion",
+        "openjourney",
+        "dreamlike",
+    ]:
+        text_encoder = CLIPTextModel.from_pretrained(
+            model_variant[args.variant],
+            subfolder="text_encoder",
+            revision=model_revision[args.variant],
+        )
+    else:
+        raise ValueError(f"{args.variant} not yet added")
+
+    class CLIPText(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.text_encoder = text_encoder
+
+        def forward(self, input):
+            return self.text_encoder(input)[0]
+
+    clip_model = CLIPText()
+    shark_clip = compile_through_fx(
+        clip_model,
+        model_input[args.version]["clip"],
+        model_name=model_name,
+        extra_args=extra_args,
+    )
+    return shark_clip


-def get_unet16_wrapped(args, model_name="unet_fp16_wrapped"):
+def get_base_vae_mlir(model_name="vae", extra_args=[]):
+    class BaseVaeModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.vae = AutoencoderKL.from_pretrained(
+                model_config[args.version]
+                if args.variant == "stablediffusion"
+                else model_variant[args.variant],
+                subfolder="vae",
+                revision=model_revision[args.variant],
+            )
+
+        def forward(self, input):
+            x = self.vae.decode(input, return_dict=False)[0]
+            return (x / 2 + 0.5).clamp(0, 1)
+
+    vae = BaseVaeModel()
+    if args.variant == "stablediffusion":
+        if args.precision == "fp16":
+            vae = vae.half().cuda()
+            inputs = tuple(
+                [
+                    inputs.half().cuda()
+                    for inputs in model_input[args.version]["vae"]
+                ]
+            )
+        else:
+            inputs = model_input[args.version]["vae"]
+    elif args.variant in [
+        "anythingv3",
+        "analogdiffusion",
+        "openjourney",
+        "dreamlike",
+    ]:
+        if args.precision == "fp16":
+            vae = vae.half().cuda()
+            inputs = tuple(
+                [inputs.half().cuda() for inputs in model_input["v1_4"]["vae"]]
+            )
+        else:
+            inputs = model_input["v1_4"]["vae"]
+    else:
+        raise ValueError(f"{args.variant} not yet added")
+
+    shark_vae = compile_through_fx(
+        vae,
+        inputs,
+        model_name=model_name,
+        extra_args=extra_args,
+    )
+    return shark_vae
+
+
+def get_vae_mlir(model_name="vae", extra_args=[]):
+    class VaeModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.vae = AutoencoderKL.from_pretrained(
+                model_config[args.version]
+                if args.variant == "stablediffusion"
+                else model_variant[args.variant],
+                subfolder="vae",
+                revision=model_revision[args.variant],
+            )
+
+        def forward(self, input):
+            input = 1 / 0.18215 * input
+            x = self.vae.decode(input, return_dict=False)[0]
+            x = (x / 2 + 0.5).clamp(0, 1)
+            x = x * 255.0
+            return x.round()
+
+    vae = VaeModel()
+    if args.variant == "stablediffusion":
+        if args.precision == "fp16":
+            vae = vae.half().cuda()
+            inputs = tuple(
+                [
+                    inputs.half().cuda()
+                    for inputs in model_input[args.version]["vae"]
+                ]
+            )
+        else:
+            inputs = model_input[args.version]["vae"]
+    elif args.variant in [
+        "anythingv3",
+        "analogdiffusion",
+        "openjourney",
+        "dreamlike",
+    ]:
+        if args.precision == "fp16":
+            vae = vae.half().cuda()
+            inputs = tuple(
+                [inputs.half().cuda() for inputs in model_input["v1_4"]["vae"]]
+            )
+        else:
+            inputs = model_input["v1_4"]["vae"]
+    else:
+        raise ValueError(f"{args.variant} not yet added")
+
+    shark_vae = compile_through_fx(
+        vae,
+        inputs,
+        model_name=model_name,
+        extra_args=extra_args,
+    )
+    return shark_vae
+
+
+def get_unet_mlir(model_name="unet", extra_args=[]):
    class UnetModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.unet = UNet2DConditionModel.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
+                model_config[args.version]
+                if args.variant == "stablediffusion"
+                else model_variant[args.variant],
                subfolder="unet",
-                use_auth_token=YOUR_TOKEN,
-                revision="fp16",
+                revision=model_revision[args.variant],
            )
            self.in_channels = self.unet.in_channels
-            self.guidance_scale = args.guidance_scale
            self.train(False)

-        def forward(self, latent, timestep, text_embedding, sigma):
+        def forward(self, latent, timestep, text_embedding, guidance_scale):
            # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
            latents = torch.cat([latent] * 2)
-            latents = latents / (torch.pow((torch.pow(sigma, 2) + 1), 0.5))
            unet_out = self.unet.forward(
                latents, timestep, text_embedding, return_dict=False
            )[0]
            noise_pred_uncond, noise_pred_text = unet_out.chunk(2)
-            noise_pred = noise_pred_uncond + self.guidance_scale * (
+            noise_pred = noise_pred_uncond + guidance_scale * (
                noise_pred_text - noise_pred_uncond
            )
            return noise_pred

    unet = UnetModel()
-    unet = unet.half().cuda()
-    latent_model_input = torch.rand([1, 4, 64, 64]).half().cuda()
-    text_embeddings = torch.rand([2, args.max_length, 768]).half().cuda()
-    sigma = torch.tensor(1).to(torch.float32)
+    if args.variant == "stablediffusion":
+        if args.precision == "fp16":
+            unet = unet.half().cuda()
+            inputs = tuple(
+                [
+                    inputs.half().cuda() if len(inputs.shape) != 0 else inputs
+                    for inputs in model_input[args.version]["unet"]
+                ]
+            )
+        else:
+            inputs = model_input[args.version]["unet"]
+    elif args.variant in [
+        "anythingv3",
+        "analogdiffusion",
+        "openjourney",
+        "dreamlike",
+    ]:
+        if args.precision == "fp16":
+            unet = unet.half().cuda()
+            inputs = tuple(
+                [
+                    inputs.half().cuda() if len(inputs.shape) != 0 else inputs
+                    for inputs in model_input["v1_4"]["unet"]
+                ]
+            )
+        else:
+            inputs = model_input["v1_4"]["unet"]
+    else:
+        raise ValueError(f"{args.variant} is not yet added")
    shark_unet = compile_through_fx(
-        args,
        unet,
-        (
-            latent_model_input,
-            torch.tensor([1.0]).half().cuda(),
-            text_embeddings,
-            sigma,
-        ),
-        model_name,
-    )
-    return shark_unet
-
-
-def get_unet32_wrapped(args, model_name="unet_fp32_wrapped"):
-    class UnetModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.unet = UNet2DConditionModel.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
-                subfolder="unet",
-                use_auth_token=YOUR_TOKEN,
-            )
-            self.in_channels = self.unet.in_channels
-            self.guidance_scale = args.guidance_scale
-            self.train(False)
-
-        def forward(self, latent, timestep, text_embedding, sigma):
-            latents = torch.cat([latent] * 2)
-            latents = latents / (torch.pow((torch.pow(sigma, 2) + 1), 0.5))
-            unet_out = self.unet.forward(
-                latents, timestep, text_embedding, return_dict=False
-            )[0]
-            noise_pred_uncond, noise_pred_text = unet_out.chunk(2)
-            noise_pred = noise_pred_uncond + self.guidance_scale * (
-                noise_pred_text - noise_pred_uncond
-            )
-            return noise_pred
-
-    unet = UnetModel()
-    latent_model_input = torch.rand([1, 4, 64, 64])
-    text_embeddings = torch.rand([2, args.max_length, 768])
-    sigma = torch.tensor(1).to(torch.float32)
-    shark_unet = compile_through_fx(
-        args,
-        unet,
-        (latent_model_input, torch.tensor([1.0]), text_embeddings, sigma),
-        model_name,
+        inputs,
+        model_name=model_name,
+        extra_args=extra_args,
    )
    return shark_unet
--- a/web/models/stable_diffusion/opt_params.py
+++ b/web/models/stable_diffusion/opt_params.py
@@ -0,0 +1,99 @@
+import sys
+from models.stable_diffusion.model_wrappers import (
+    get_base_vae_mlir,
+    get_vae_mlir,
+    get_unet_mlir,
+    get_clip_mlir,
+)
+from models.stable_diffusion.resources import models_db
+from models.stable_diffusion.stable_args import args
+from models.stable_diffusion.utils import get_shark_model
+
+BATCH_SIZE = len(args.prompts)
+if BATCH_SIZE != 1:
+    sys.exit("Only batch size 1 is supported.")
+
+
+def get_params(bucket_key, model_key, model, is_tuned, precision):
+    iree_flags = []
+    if len(args.iree_vulkan_target_triple) > 0:
+        iree_flags.append(
+            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
+        )
+
+    # Disable bindings fusion to work with moltenVK.
+    if sys.platform == "darwin":
+        iree_flags.append("-iree-stream-fuse-binding=false")
+
+    try:
+        bucket = models_db[0][bucket_key]
+        model_name = models_db[1][model_key]
+        iree_flags += models_db[2][model][is_tuned][precision][
+            "default_compilation_flags"
+        ]
+    except KeyError:
+        raise Exception(
+            f"{bucket}/{model_key} is not present in the models database"
+        )
+
+    if (
+        "specified_compilation_flags"
+        in models_db[2][model][is_tuned][precision]
+    ):
+        device = (
+            args.device
+            if "://" not in args.device
+            else args.device.split("://")[0]
+        )
+        if (
+            device
+            not in models_db[2][model][is_tuned][precision][
+                "specified_compilation_flags"
+            ]
+        ):
+            device = "default_device"
+        iree_flags += models_db[2][model][is_tuned][precision][
+            "specified_compilation_flags"
+        ][device]
+
+    return bucket, model_name, iree_flags
+
+
+def get_unet():
+    # Tuned model is present only for `fp16` precision.
+    is_tuned = "tuned" if args.use_tuned else "untuned"
+    bucket_key = f"{args.variant}/{is_tuned}"
+    model_key = f"{args.variant}/{args.version}/unet/{args.precision}/length_{args.max_length}/{is_tuned}"
+    bucket, model_name, iree_flags = get_params(
+        bucket_key, model_key, "unet", is_tuned, args.precision
+    )
+    if not args.use_tuned and args.import_mlir:
+        return get_unet_mlir(model_name, iree_flags)
+    return get_shark_model(bucket, model_name, iree_flags)
+
+
+def get_vae():
+    # Tuned model is present only for `fp16` precision.
+    is_tuned = "tuned" if args.use_tuned else "untuned"
+    is_base = "/base" if args.use_base_vae else ""
+    bucket_key = f"{args.variant}/{is_tuned}"
+    model_key = f"{args.variant}/{args.version}/vae/{args.precision}/length_77/{is_tuned}{is_base}"
+    bucket, model_name, iree_flags = get_params(
+        bucket_key, model_key, "vae", is_tuned, args.precision
+    )
+    if not args.use_tuned and args.import_mlir:
+        if args.use_base_vae:
+            return get_base_vae_mlir(model_name, iree_flags)
+        return get_vae_mlir(model_name, iree_flags)
+    return get_shark_model(bucket, model_name, iree_flags)
+
+
+def get_clip():
+    bucket_key = f"{args.variant}/untuned"
+    model_key = f"{args.variant}/{args.version}/clip/fp32/length_{args.max_length}/untuned"
+    bucket, model_name, iree_flags = get_params(
+        bucket_key, model_key, "clip", "untuned", "fp32"
+    )
+    if args.import_mlir:
+        return get_clip_mlir(model_name, iree_flags)
+    return get_shark_model(bucket, model_name, iree_flags)
--- a/web/models/stable_diffusion/resources.py
+++ b/web/models/stable_diffusion/resources.py
@@ -0,0 +1,31 @@
+import os
+import json
+import sys
+
+
+def resource_path(relative_path):
+    """Get absolute path to resource, works for dev and for PyInstaller"""
+    base_path = getattr(
+        sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
+    )
+    return os.path.join(base_path, relative_path)
+
+
+prompt_examples = []
+prompts_loc = resource_path("resources/prompts.json")
+if os.path.exists(prompts_loc):
+    with open(prompts_loc, encoding="utf-8") as fopen:
+        prompt_examples = json.load(fopen)
+
+if not prompt_examples:
+    print("Unable to fetch prompt examples.")
+
+
+models_db = []
+models_loc = resource_path("resources/model_db.json")
+if os.path.exists(models_loc):
+    with open(models_loc, encoding="utf-8") as fopen:
+        models_db = json.load(fopen)
+
+if len(models_db) != 3:
+    sys.exit("Error: Unable to load models database.")
--- a/web/models/stable_diffusion/resources/model_db.json
+++ b/web/models/stable_diffusion/resources/model_db.json
@@ -0,0 +1,165 @@
+[
+  {
+    "stablediffusion/untuned":"gs://shark_tank/stable_diffusion",
+    "stablediffusion/tuned":"gs://shark_tank/sd_tuned",
+    "anythingv3/untuned":"gs://shark_tank/sd_anythingv3",
+    "anythingv3/tuned":"gs://shark_tank/sd_tuned",
+    "analogdiffusion/untuned":"gs://shark_tank/sd_analog_diffusion",
+    "analogdiffusion/tuned":"gs://shark_tank/sd_tuned",
+    "openjourney/untuned":"gs://shark_tank/sd_openjourney",
+    "openjourney/tuned":"gs://shark_tank/sd_tuned",
+    "dreamlike/untuned":"gs://shark_tank/sd_dreamlike_diffusion"
+  },
+  {
+    "stablediffusion/v1_4/unet/fp16/length_77/untuned":"unet_8dec_fp16",
+    "stablediffusion/v1_4/unet/fp16/length_77/tuned":"unet_1dec_fp16_tuned",
+    "stablediffusion/v1_4/unet/fp32/length_77/untuned":"unet_1dec_fp32",
+    "stablediffusion/v1_4/vae/fp16/length_77/untuned":"vae_19dec_fp16",
+    "stablediffusion/v1_4/vae/fp16/length_77/untuned/base":"vae_8dec_fp16",
+    "stablediffusion/v1_4/vae/fp32/length_77/untuned":"vae_1dec_fp32",
+    "stablediffusion/v1_4/clip/fp32/length_77/untuned":"clip_18dec_fp32",
+    "stablediffusion/v2_1base/unet/fp16/length_77/untuned":"unet2base_8dec_fp16",
+    "stablediffusion/v2_1base/unet/fp16/length_77/tuned":"unet2base_8dec_fp16_tuned_v2",
+    "stablediffusion/v2_1base/unet/fp16/length_64/untuned":"unet_19dec_v2p1base_fp16_64",
+    "stablediffusion/v2_1base/unet/fp16/length_64/tuned":"unet_19dec_v2p1base_fp16_64_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/untuned":"vae2base_19dec_fp16",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned":"vae2base_19dec_fp16_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/untuned/base":"vae2base_8dec_fp16",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base":"vae2base_8dec_fp16_tuned",
+    "stablediffusion/v2_1base/clip/fp32/length_77/untuned":"clip2base_18dec_fp32",
+    "stablediffusion/v2_1base/clip/fp32/length_64/untuned":"clip_19dec_v2p1base_fp32_64",
+    "stablediffusion/v2_1/unet/fp16/length_77/untuned":"unet2_14dec_fp16",
+    "stablediffusion/v2_1/vae/fp16/length_77/untuned":"vae2_19dec_fp16",
+    "stablediffusion/v2_1/vae/fp16/length_77/untuned/base":"vae2_8dec_fp16",
+    "stablediffusion/v2_1/clip/fp32/length_77/untuned":"clip2_18dec_fp32",
+    "anythingv3/v2_1base/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
+    "anythingv3/v2_1base/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
+    "anythingv3/v2_1base/unet/fp32/length_77/untuned":"av3_unet_19dec_fp32",
+    "anythingv3/v2_1base/vae/fp16/length_77/untuned":"av3_vae_19dec_fp16",
+    "anythingv3/v2_1base/vae/fp16/length_77/tuned":"av3_vae_19dec_fp16_tuned",
+    "anythingv3/v2_1base/vae/fp16/length_77/untuned/base":"av3_vaebase_22dec_fp16",
+    "anythingv3/v2_1base/vae/fp32/length_77/untuned":"av3_vae_19dec_fp32",
+    "anythingv3/v2_1base/vae/fp32/length_77/untuned/base":"av3_vaebase_22dec_fp32",
+    "anythingv3/v2_1base/clip/fp32/length_77/untuned":"av3_clip_19dec_fp32",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/untuned":"ad_unet_19dec_fp16",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/tuned":"ad_unet_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/unet/fp32/length_77/untuned":"ad_unet_19dec_fp32",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned":"ad_vae_19dec_fp16",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/tuned":"ad_vae_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned/base":"ad_vaebase_22dec_fp16",
+    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned":"ad_vae_19dec_fp32",
+    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned/base":"ad_vaebase_22dec_fp32",
+    "analogdiffusion/v2_1base/clip/fp32/length_77/untuned":"ad_clip_19dec_fp32",
+    "openjourney/v2_1base/unet/fp16/length_64/untuned":"oj_unet_22dec_fp16_64",
+    "openjourney/v2_1base/unet/fp32/length_64/untuned":"oj_unet_22dec_fp32_64",
+    "openjourney/v2_1base/vae/fp16/length_77/untuned":"oj_vae_22dec_fp16",
+    "openjourney/v2_1base/vae/fp16/length_77/untuned/base":"oj_vaebase_22dec_fp16",
+    "openjourney/v2_1base/vae/fp32/length_77/untuned":"oj_vae_22dec_fp32",
+    "openjourney/v2_1base/vae/fp32/length_77/untuned/base":"oj_vaebase_22dec_fp32",
+    "openjourney/v2_1base/clip/fp32/length_64/untuned":"oj_clip_22dec_fp32_64",
+    "dreamlike/v2_1base/unet/fp16/length_77/untuned":"dl_unet_23dec_fp16_77",
+    "dreamlike/v2_1base/unet/fp32/length_77/untuned":"dl_unet_23dec_fp32_77",
+    "dreamlike/v2_1base/vae/fp16/length_77/untuned":"dl_vae_23dec_fp16",
+    "dreamlike/v2_1base/vae/fp16/length_77/untuned/base":"dl_vaebase_23dec_fp16",
+    "dreamlike/v2_1base/vae/fp32/length_77/untuned":"dl_vae_23dec_fp32",
+    "dreamlike/v2_1base/vae/fp32/length_77/untuned/base":"dl_vaebase_23dec_fp32",
+    "dreamlike/v2_1base/clip/fp32/length_77/untuned":"dl_clip_23dec_fp32_77"
+  },
+  {
+    "unet": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": []
+        },
+        "fp32": {
+          "default_compilation_flags": []
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32"
+          ],
+          "specified_compilation_flags": {
+            "cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
+            "default_device": ["--iree-flow-enable-conv-img2col-transform"]
+          }
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "vae": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform",
+            "--iree-flow-enable-conv-winograd-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform",
+            "--iree-flow-enable-conv-winograd-transform"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "clip": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      }
+    }
+  }
+]
--- a/web/models/stable_diffusion/resources/prompts.json
+++ b/web/models/stable_diffusion/resources/prompts.json
@@ -0,0 +1,8 @@
+[["A high tech solarpunk utopia in the Amazon rainforest"],
+["A pikachu fine dining with a view to the Eiffel Tower"],
+["A mecha robot in a favela in expressionist style"],
+["an insect robot preparing a delicious meal"],
+["A digital Illustration of the Babel tower, 4k, detailed, trending in artstation, fantasy vivid colors"],
+["Cluttered house in the woods, anime, oil painting, high resolution, cottagecore, ghibli inspired, 4k"],
+["A beautiful mansion beside a waterfall in the woods, by josef thoma, matte painting, trending on artstation HQ"],
+["portrait photo of a asia old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes"]]
--- a/web/models/stable_diffusion/schedulers.py
+++ b/web/models/stable_diffusion/schedulers.py
@@ -0,0 +1,131 @@
+import sys
+import numpy as np
+from typing import List, Optional, Tuple, Union
+from diffusers import (
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+)
+from diffusers.configuration_utils import register_to_config
+from models.stable_diffusion.utils import compile_through_fx, get_shark_model
+from models.stable_diffusion.stable_args import args
+import torch
+
+SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"
+
+model_input = {
+    "euler": {
+        "latent": torch.randn(1, 4, 64, 64),
+        "output": torch.randn(1, 4, 64, 64),
+        "sigma": torch.tensor(1).to(torch.float32),
+        "dt": torch.tensor(1).to(torch.float32),
+    },
+}
+
+
+class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        prediction_type: str = "epsilon",
+    ):
+        super().__init__(
+            num_train_timesteps,
+            beta_start,
+            beta_end,
+            beta_schedule,
+            trained_betas,
+            prediction_type,
+        )
+
+    def compile(self):
+        example_latent = model_input["euler"]["latent"]
+        example_output = model_input["euler"]["output"]
+        if args.precision == "fp16":
+            example_latent = example_latent.half()
+            example_output = example_output.half()
+        example_sigma = model_input["euler"]["sigma"]
+        example_dt = model_input["euler"]["dt"]
+
+        class ScalingModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, latent, sigma):
+                return latent / ((sigma**2 + 1) ** 0.5)
+
+        class SchedulerStepModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, noise_pred, sigma, latent, dt):
+                pred_original_sample = latent - sigma * noise_pred
+                derivative = (latent - pred_original_sample) / sigma
+                return latent + derivative * dt
+
+        iree_flags = []
+        if len(args.iree_vulkan_target_triple) > 0:
+            iree_flags.append(
+                f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
+            )
+        # Disable bindings fusion to work with moltenVK.
+        if sys.platform == "darwin":
+            iree_flags.append("-iree-stream-fuse-binding=false")
+
+        if args.import_mlir:
+            scaling_model = ScalingModel()
+            self.scaling_model = compile_through_fx(
+                scaling_model,
+                (example_latent, example_sigma),
+                model_name="euler_scale_model_input_" + args.precision,
+                extra_args=iree_flags,
+            )
+
+            step_model = SchedulerStepModel()
+            self.step_model = compile_through_fx(
+                step_model,
+                (example_output, example_sigma, example_latent, example_dt),
+                model_name="euler_step_" + args.precision,
+                extra_args=iree_flags,
+            )
+        else:
+            self.scaling_model = get_shark_model(
+                SCHEDULER_BUCKET,
+                "euler_scale_model_input_" + args.precision,
+                iree_flags,
+            )
+            self.step_model = get_shark_model(
+                SCHEDULER_BUCKET, "euler_step_" + args.precision, iree_flags
+            )
+
+    def scale_model_input(self, sample, timestep):
+        step_index = (self.timesteps == timestep).nonzero().item()
+        sigma = self.sigmas[step_index]
+        return self.scaling_model.forward(
+            (
+                sample,
+                sigma,
+            ),
+            send_to_host=False,
+        )
+
+    def step(self, noise_pred, timestep, latent):
+        step_index = (self.timesteps == timestep).nonzero().item()
+        sigma = self.sigmas[step_index]
+        dt = self.sigmas[step_index + 1] - sigma
+        return self.step_model.forward(
+            (
+                noise_pred,
+                sigma,
+                latent,
+                dt,
+            ),
+            send_to_host=False,
+        )
--- a/web/models/stable_diffusion/stable_args.py
+++ b/web/models/stable_diffusion/stable_args.py
@@ -0,0 +1,226 @@
+import argparse
+
+p = argparse.ArgumentParser(
+    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+
+##############################################################################
+### Stable Diffusion Params
+##############################################################################
+
+p.add_argument(
+    "--prompts",
+    nargs="+",
+    default=["cyberpunk forest by Salvador Dali"],
+    help="text of which images to be generated.",
+)
+
+p.add_argument(
+    "--negative-prompts",
+    nargs="+",
+    default=[""],
+    help="text you don't want to see in the generated image.",
+)
+
+p.add_argument(
+    "--steps",
+    type=int,
+    default=50,
+    help="the no. of steps to do the sampling.",
+)
+
+p.add_argument(
+    "--seed",
+    type=int,
+    default=42,
+    help="the seed to use.",
+)
+
+p.add_argument(
+    "--guidance_scale",
+    type=float,
+    default=7.5,
+    help="the value to be used for guidance scaling.",
+)
+
+p.add_argument(
+    "--max_length",
+    type=int,
+    default=64,
+    help="max length of the tokenizer output, options are 64 and 77.",
+)
+
+##############################################################################
+### Model Config and Usage Params
+##############################################################################
+
+p.add_argument(
+    "--device", type=str, default="vulkan", help="device to run the model."
+)
+
+p.add_argument(
+    "--version",
+    type=str,
+    default="v2_1base",
+    help="Specify version of stable diffusion model",
+)
+
+p.add_argument(
+    "--precision", type=str, default="fp16", help="precision to run the model."
+)
+
+p.add_argument(
+    "--import_mlir",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="imports the model from torch module to shark_module otherwise downloads the model from shark_tank.",
+)
+
+p.add_argument(
+    "--load_vmfb",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="attempts to load the model from a precompiled flatbuffer and compiles + saves it if not found.",
+)
+
+p.add_argument(
+    "--save_vmfb",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="saves the compiled flatbuffer to the local directory",
+)
+
+p.add_argument(
+    "--use_tuned",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="Download and use the tuned version of the model if available",
+)
+
+p.add_argument(
+    "--use_base_vae",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Do conversion from the VAE output to pixel space on cpu.",
+)
+
+p.add_argument(
+    "--variant",
+    default="stablediffusion",
+    help="We now support multiple vairants of SD finetuned for different dataset. you can use the following anythingv3, ...",  # TODO add more once supported
+)
+
+p.add_argument(
+    "--scheduler",
+    type=str,
+    default="SharkEulerDiscrete",
+    help="other supported schedulers are [PNDM, DDIM, LMSDiscrete, EulerDiscrete, DPMSolverMultistep]",
+)
+
+##############################################################################
+### IREE - Vulkan supported flags
+##############################################################################
+
+p.add_argument(
+    "--iree-vulkan-target-triple",
+    type=str,
+    default="",
+    help="Specify target triple for vulkan",
+)
+
+p.add_argument(
+    "--vulkan_debug_utils",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Profiles vulkan device and collects the .rdc info",
+)
+
+p.add_argument(
+    "--vulkan_large_heap_block_size",
+    default="4147483648",
+    help="flag for setting VMA preferredLargeHeapBlockSize for vulkan device, default is 4G",
+)
+
+p.add_argument(
+    "--vulkan_validation_layers",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for disabling vulkan validation layers when benchmarking",
+)
+
+##############################################################################
+### Misc. Debug and Optimization flags
+##############################################################################
+
+p.add_argument(
+    "--use_compiled_scheduler",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="use the default scheduler precompiled into the model if available",
+)
+
+p.add_argument(
+    "--local_tank_cache",
+    default="",
+    help="Specify where to save downloaded shark_tank artifacts. If this is not set, the default is ~/.local/shark_tank/.",
+)
+
+p.add_argument(
+    "--dump_isa",
+    default=False,
+    action="store_true",
+    help="When enabled call amdllpc to get ISA dumps. use with dispatch benchmarks.",
+)
+
+p.add_argument(
+    "--dispatch_benchmarks",
+    default=None,
+    help='dispatches to return benchamrk data on.  use "All" for all, and None for none.',
+)
+
+p.add_argument(
+    "--dispatch_benchmarks_dir",
+    default="temp_dispatch_benchmarks",
+    help='directory where you want to store dispatch data generated with "--dispatch_benchmarks"',
+)
+
+p.add_argument(
+    "--enable_rgp",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for inserting debug frames between iterations for use with rgp.",
+)
+
+p.add_argument(
+    "--hide_steps",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="flag for hiding the details of iteration/sec for each step.",
+)
+
+p.add_argument(
+    "--warmup_count",
+    type=int,
+    default=0,
+    help="flag setting warmup count for clip and vae [>= 0].",
+)
+
+p.add_argument(
+    "--clear_all",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag to clear all mlir and vmfb from common locations. Recompiling will take several minutes",
+)
+
+##############################################################################
+### Web UI flags
+##############################################################################
+
+p.add_argument(
+    "--progress_bar",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="flag for removing the pregress bar animation during image generation",
+)
+
+args = p.parse_args()
--- a/web/models/stable_diffusion/utils.py
+++ b/web/models/stable_diffusion/utils.py
@@ -1,84 +1,67 @@
+import os
 import torch
 from shark.shark_inference import SharkInference
-from shark.shark_importer import SharkImporter
-from torch.fx.experimental.proxy_tensor import make_fx
-from torch._decomp import get_decompositions
-import torch_mlir
-import os
+from models.stable_diffusion.stable_args import args
+from shark.shark_importer import import_with_fx
+from shark.iree_utils.vulkan_utils import (
+    set_iree_vulkan_runtime_flags,
+    get_vulkan_target_triple,
+)


-def _compile_module(args, shark_module, model_name, extra_args=[]):
-    extended_name = "{}_{}".format(model_name, args.device)
-    if args.cache:
+def _compile_module(shark_module, model_name, extra_args=[]):
+    if args.load_vmfb or args.save_vmfb:
+        device = (
+            args.device
+            if "://" not in args.device
+            else "-".join(args.device.split("://"))
+        )
+        extended_name = "{}_{}".format(model_name, device)
        vmfb_path = os.path.join(os.getcwd(), extended_name + ".vmfb")
-        if os.path.isfile(vmfb_path):
-            print("Loading flatbuffer from {}".format(vmfb_path))
-            shark_module.load_module(vmfb_path)
-            return shark_module
-        print("No vmfb found. Compiling and saving to {}".format(vmfb_path))
-    path = shark_module.save_module(os.getcwd(), extended_name, extra_args)
-    shark_module.load_module(path)
+        if args.load_vmfb and os.path.isfile(vmfb_path) and not args.save_vmfb:
+            print(f"loading existing vmfb from: {vmfb_path}")
+            shark_module.load_module(vmfb_path, extra_args=extra_args)
+        else:
+            if args.save_vmfb:
+                print("Saving to {}".format(vmfb_path))
+            else:
+                print(
+                    "No vmfb found. Compiling and saving to {}".format(
+                        vmfb_path
+                    )
+                )
+            path = shark_module.save_module(
+                os.getcwd(), extended_name, extra_args
+            )
+            shark_module.load_module(path, extra_args=extra_args)
+    else:
+        shark_module.compile(extra_args)
    return shark_module


 # Downloads the model from shark_tank and returns the shark_module.
-def get_shark_model(args, tank_url, model_name, extra_args=[]):
-    from shark.shark_downloader import download_torch_model
+def get_shark_model(tank_url, model_name, extra_args=[]):
+    from shark.shark_downloader import download_model
+    from shark.parser import shark_args

-    mlir_model, func_name, inputs, golden_out = download_torch_model(
-        model_name, tank_url=tank_url
+    # Set local shark_tank cache directory.
+    shark_args.local_tank_cache = args.local_tank_cache
+
+    mlir_model, func_name, inputs, golden_out = download_model(
+        model_name,
+        tank_url=tank_url,
+        frontend="torch",
    )
    shark_module = SharkInference(
        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
    )
-    return _compile_module(args, shark_module, model_name, extra_args)
+    return _compile_module(shark_module, model_name, extra_args)


-# Converts the torch-module into shark_module.
-def compile_through_fx(args, model, inputs, model_name, extra_args=[]):
+# Converts the torch-module into a shark_module.
+def compile_through_fx(model, inputs, model_name, extra_args=[]):

-    fx_g = make_fx(
-        model,
-        decomposition_table=get_decompositions(
-            [
-                torch.ops.aten.embedding_dense_backward,
-                torch.ops.aten.native_layer_norm_backward,
-                torch.ops.aten.slice_backward,
-                torch.ops.aten.select_backward,
-                torch.ops.aten.norm.ScalarOpt_dim,
-                torch.ops.aten.native_group_norm,
-                torch.ops.aten.upsample_bilinear2d.vec,
-                torch.ops.aten.split.Tensor,
-                torch.ops.aten.split_with_sizes,
-            ]
-        ),
-    )(*inputs)
-
-    fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
-    fx_g.recompile()
-
-    def strip_overloads(gm):
-        """
-        Modifies the target of graph nodes in :attr:`gm` to strip overloads.
-        Args:
-            gm(fx.GraphModule): The input Fx graph module to be modified
-        """
-        for node in gm.graph.nodes:
-            if isinstance(node.target, torch._ops.OpOverload):
-                node.target = node.target.overloadpacket
-        gm.recompile()
-
-    strip_overloads(fx_g)
-
-    ts_g = torch.jit.trace(fx_g, inputs)
-
-    mlir_importer = SharkImporter(
-        ts_g,
-        inputs,
-        frontend="torch",
-    )
-
-    (mlir_module, func_name), _, _ = mlir_importer.import_debug()
+    mlir_module, func_name = import_with_fx(model, inputs)

    shark_module = SharkInference(
        mlir_module,
@@ -87,4 +70,163 @@ def compile_through_fx(args, model, inputs, model_name, extra_args=[]):
        mlir_dialect="linalg",
    )

-    return _compile_module(args, shark_module, model_name, extra_args)
+    return _compile_module(shark_module, model_name, extra_args)
+
+
+def set_iree_runtime_flags():
+
+    vulkan_runtime_flags = [
+        f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}",
+        f"--vulkan_validation_layers={'true' if args.vulkan_validation_layers else 'false'}",
+    ]
+    if args.enable_rgp:
+        vulkan_runtime_flags += [
+            f"--enable_rgp=true",
+            f"--vulkan_debug_utils=true",
+        ]
+    set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)
+
+
+def get_all_devices(driver_name):
+    """
+    Inputs: driver_name
+    Returns a list of all the available devices for a given driver sorted by
+    the iree path names of the device as in --list_devices option in iree.
+    """
+    from iree.runtime import get_driver
+
+    driver = get_driver(driver_name)
+    device_list_src = driver.query_available_devices()
+    device_list_src.sort(key=lambda d: d["path"])
+    return device_list_src
+
+
+def get_device_mapping(driver, key_combination=3):
+    """This method ensures consistent device ordering when choosing
+    specific devices for execution
+    Args:
+        driver (str): execution driver (vulkan, cuda, rocm, etc)
+        key_combination (int, optional): choice for mapping value for device name.
+        1 : path
+        2 : name
+        3 : (name, path)
+        Defaults to 3.
+    Returns:
+        dict: map to possible device names user can input mapped to desired combination of name/path.
+    """
+    from shark.iree_utils._common import iree_device_map
+
+    driver = iree_device_map(driver)
+    device_list = get_all_devices(driver)
+    device_map = dict()
+
+    def get_output_value(dev_dict):
+        if key_combination == 1:
+            return f"{driver}://{dev_dict['path']}"
+        if key_combination == 2:
+            return dev_dict["name"]
+        if key_combination == 3:
+            return (dev_dict["name"], f"{driver}://{dev_dict['path']}")
+
+    # mapping driver name to default device (driver://0)
+    device_map[f"{driver}"] = get_output_value(device_list[0])
+    for i, device in enumerate(device_list):
+        # mapping with index
+        device_map[f"{driver}://{i}"] = get_output_value(device)
+        # mapping with full path
+        device_map[f"{driver}://{device['path']}"] = get_output_value(device)
+    return device_map
+
+
+def map_device_to_name_path(device, key_combination=3):
+    """Gives the appropriate device data (supported name/path) for user selected execution device
+    Args:
+        device (str): user
+        key_combination (int, optional): choice for mapping value for device name.
+        1 : path
+        2 : name
+        3 : (name, path)
+        Defaults to 3.
+    Raises:
+        ValueError:
+    Returns:
+        str / tuple: returns the mapping str or tuple of mapping str for the device depending on key_combination value
+    """
+    driver = device.split("://")[0]
+    device_map = get_device_mapping(driver, key_combination)
+    try:
+        device_mapping = device_map[device]
+    except KeyError:
+        raise ValueError(f"Device '{device}' is not a valid device.")
+    return device_mapping
+
+
+def set_init_device_flags():
+    if "vulkan" in args.device:
+        # set runtime flags for vulkan.
+        set_iree_runtime_flags()
+
+        # set triple flag to avoid multiple calls to get_vulkan_triple_flag
+        device_name, args.device = map_device_to_name_path(args.device)
+        if not args.iree_vulkan_target_triple:
+            triple = get_vulkan_target_triple(device_name)
+            if triple is not None:
+                args.iree_vulkan_target_triple = triple
+        print(
+            f"Found device {device_name}. Using target triple {args.iree_vulkan_target_triple}."
+        )
+    elif "cuda" in args.device:
+        args.device = "cuda"
+    elif "cpu" in args.device:
+        args.device = "cpu"
+
+    # set max_length based on availability.
+    if args.variant in ["anythingv3", "analogdiffusion", "dreamlike"]:
+        args.max_length = 77
+    elif args.variant == "openjourney":
+        args.max_length = 64
+
+    # use tuned models only in the case of stablediffusion/fp16 and rdna3 cards.
+    if (
+        args.variant in ["openjourney", "dreamlike"]
+        or args.precision != "fp16"
+        or "vulkan" not in args.device
+        or "rdna3" not in args.iree_vulkan_target_triple
+    ):
+        args.use_tuned = False
+        print("Tuned models are currently not supported for this setting.")
+
+    elif args.use_base_vae and args.variant != "stablediffusion":
+        args.use_tuned = False
+        print("Tuned models are currently not supported for this setting.")
+
+    if args.use_tuned:
+        print("Using tuned models for stablediffusion/fp16 and rdna3 card.")
+
+
+# Utility to get list of devices available.
+def get_available_devices():
+    def get_devices_by_name(driver_name):
+        from shark.iree_utils._common import iree_device_map
+
+        device_list = []
+        try:
+            driver_name = iree_device_map(driver_name)
+            device_list_dict = get_all_devices(driver_name)
+            print(f"{driver_name} devices are available.")
+        except:
+            print(f"{driver_name} devices are not available.")
+        else:
+            for i, device in enumerate(device_list_dict):
+                device_list.append(f"{driver_name}://{i} => {device['name']}")
+        return device_list
+
+    set_iree_runtime_flags()
+
+    available_devices = []
+    vulkan_devices = get_devices_by_name("vulkan")
+    available_devices.extend(vulkan_devices)
+    cuda_devices = get_devices_by_name("cuda")
+    available_devices.extend(cuda_devices)
+    available_devices.append("cpu")
+    return available_devices
--- a/Show More
+++ b/Show More