Revert "move beta to release (#898 )" (#905 )

This reverts commit 7edcaf5a06.
Revert "replace new model_db.json (#902 )" (#904 )
2026-01-11 23:08:19 -05:00 · 2023-01-31 20:31:41 -08:00 · 2023-01-31 20:29:40 -08:00 · 2023-02-01 09:12:45 +05:30 · 2023-01-31 18:55:22 -08:00 · 2023-01-31 17:14:08 -06:00
198 changed files with 14759 additions and 1756 deletions
--- a/.github/workflows/gh-pages-releases.yml
+++ b/.github/workflows/gh-pages-releases.yml
@@ -23,7 +23,7 @@ jobs:
      - run: git fetch --all
      - run: git switch github-pages
      - run: git config --global user.email "none@none.com"
-      - run: git config --global user.name "nod-team"
+      - run: git config --global user.name "nod-ai"
      - run: mv /tmp/index.html package-index/index.html
      - run: git add package-index/index.html

--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -9,7 +9,84 @@ on:
  workflow_dispatch:

 jobs:
-  build:
+  windows-build:
+    runs-on: 7950X
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.10"]
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Compute version
+      shell: powershell
+      run: |
+        $package_version = $(Get-Date -UFormat "%Y%m%d")+"."+${{ github.run_number }}
+        $package_version_ = $(Get-Date -UFormat "%Y%m%d")+"_"+${{ github.run_number }}
+        $tag_name=$package_version
+        echo "package_version=$package_version" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
+        echo "package_version_=$package_version_" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
+        echo "tag_name=$tag_name" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
+
+    - name: Create Release
+      id: create_release
+      uses: actions/create-release@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
+      with:
+        tag_name: ${{ env.tag_name }}
+        release_name: nod.ai SHARK ${{ env.tag_name }}
+        body: |
+          Automatic snapshot release of nod.ai SHARK.
+        draft: true
+        prerelease: false
+
+    - name: Build Package 
+      shell: powershell
+      run: |
+        ./setup_venv.ps1
+        pyinstaller web/shark_sd.spec
+        mv ./dist/shark_sd.exe ./dist/shark_sd_${{ env.package_version_ }}.exe
+        signtool sign /f C:\shark_2023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/shark_sd_${{ env.package_version_ }}.exe
+        pyinstaller .\shark\examples\shark_inference\stable_diffusion\shark_sd_cli.spec
+        mv ./dist/shark_sd_cli.exe ./dist/shark_sd_cli_${{ env.package_version_ }}.exe
+        signtool sign /f C:\shark_2023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/shark_sd_cli_${{ env.package_version_ }}.exe
+
+        
+    # GHA windows VM OOMs so disable for now
+    #- name: Build and validate the SHARK Runtime package
+    #  shell: powershell
+    #  run: |
+    #    $env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
+    #    pip wheel -v -w dist . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
+
+    - uses: actions/upload-artifact@v2
+      with:
+        path: dist/*
+    
+    - name: Upload Release Assets
+      id: upload-release-assets
+      uses: dwenegar/upload-release-assets@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
+      with:
+        release_id: ${{ steps.create_release.outputs.id }}
+        assets_path: ./dist/*
+
+    - name: Publish Release
+      id: publish_release
+      uses: eregon/publish-release@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
+      with:
+        release_id: ${{ steps.create_release.outputs.id }}
+
+  linux-build:

    runs-on: a100
    strategy:
@@ -32,40 +109,13 @@ jobs:
        key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
        restore-keys: |
          ${{ runner.os }}-pip-
-    
-    - name: Compute version
-      run: |
-        package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
-        tag_name="${package_version}"
-        echo "package_version=${package_version}" >> $GITHUB_ENV
-        echo "tag_name=${tag_name}" >> $GITHUB_ENV    
-    - name: Set Environment Variables
-      run: |
-        echo "SHORT_SHA=`git rev-parse --short=4 HEAD`" >> $GITHUB_ENV
-        echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
-    - name: Create Release
-      id: create_release
-      uses: actions/create-release@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      with:
-        tag_name: ${{ env.tag_name }}
-        release_name: nod.ai SHARK ${{ env.tag_name }}
-        body: |
-          Automatic snapshot release of nod.ai SHARK.
-        draft: true
-        prerelease: false
-    - name: Find Torch-MLIR Release
-      run: |
-        TM_HTML_URL="$(python3 -c "import urllib.request, json, sys; u=json.loads(urllib.request.urlopen('https://api.github.com/repos/llvm/torch-mlir/releases/latest').read().decode()).get('html_url', False); print(u) if u else sys.exit(1);")"
-        TM_RELEASE_DIR=${TM_HTML_URL/"tag"/"expanded_assets"}
-        echo "TM_RELEASE_DIR=${TM_RELEASE_DIR}" >> $GITHUB_ENV
+
    - name: Install dependencies
      run: |
-        echo "Torch-MLIR Release DIR is ${{ env.TM_RELEASE_DIR }}"
+        echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
        python -m pip install --upgrade pip
        python -m pip install flake8 pytest toml
-        if [ -f requirements.txt ]; then pip install -r requirements.txt -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/nod-ai/SHARK-Runtime/releases; fi
+        if [ -f requirements.txt ]; then pip install -r requirements.txt -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
@@ -74,25 +124,26 @@ jobs:
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude shark.venv,lit.cfg.py 
    - name: Build and validate the IREE package
      if: ${{ matrix.backend == 'IREE' }}
+      continue-on-error: true
      run: |
        cd $GITHUB_WORKSPACE
        USE_IREE=1 VENV_DIR=iree.venv ./setup_venv.sh
        source iree.venv/bin/activate
        package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
        SHARK_PACKAGE_VERSION=${package_version} \
-        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/iree-org/iree/releases
+        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://iree-org.github.io/iree/pip-release-links.html
        # Install the built wheel
        pip install ./wheelhouse/nodai*
        # Validate the Models
        /bin/bash "$GITHUB_WORKSPACE/build_tools/populate_sharktank_ci.sh"
-        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" tank/test_models.py |
+        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" -k "not metal" |
          tail -n 1 |
          tee -a pytest_results.txt
        if !(grep -Fxq " failed" pytest_results.txt) 
          then 
            export SHA=$(git log -1 --format='%h')
-            gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/$SHA
-            gsutil -m cp -r gs://shark_tank/$SHA/* gs://shark_tank/latest/
+            gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/${DATE}_$SHA
+            gsutil -m cp -r gs://shark_tank/${DATE}_$SHA/* gs://shark_tank/latest/
        fi
        rm -rf ./wheelhouse/nodai*

@@ -104,29 +155,10 @@ jobs:
        source shark.venv/bin/activate
        package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
        SHARK_PACKAGE_VERSION=${package_version} \
-        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/nod-ai/SHARK-Runtime/releases
+        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
        # Install the built wheel
        pip install ./wheelhouse/nodai*
        # Validate the Models
-        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" tank/test_models.py |
+        pytest --ci --ci_sha=${SHORT_SHA} -k "not metal" |
          tail -n 1 |
          tee -a pytest_results.txt
-    
-    - name: Upload Release Assets
-      if: ${{ matrix.backend == 'SHARK' }}
-      id: upload-release-assets
-      uses: dwenegar/upload-release-assets@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      with:
-        release_id: ${{ steps.create_release.outputs.id }}
-        assets_path: ${GITHUB_WORKSPACE}/wheelhouse/nodai_*.whl
-
-    - name: Publish Release
-      if: ${{ matrix.backend == 'SHARK' }}
-      id: publish_release
-      uses: eregon/publish-release@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      with:
-        release_id: ${{ steps.create_release.outputs.id }}
--- a/.github/workflows/test-models.yml
+++ b/.github/workflows/test-models.yml
@@ -6,10 +6,24 @@ name: Validate Models on Shark Runtime
 on:
  push:
    branches: [ main ]
+    paths-ignore:
+      - '**.md'
+      - 'shark/examples/**'
  pull_request:
    branches: [ main ]
+    paths-ignore:
+      - '**.md'
+      - 'shark/examples/**'
  workflow_dispatch:

+# Ensure that only a single job or workflow using the same
+# concurrency group will run at a time. This would cancel
+# any in-progress jobs in the same github workflow and github
+# ref (e.g. refs/heads/main or refs/pull/<pr_number>/merge).
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
  build-validate:
    strategy:
@@ -32,8 +46,6 @@ jobs:
            suite: cuda
          - os: MacStudio
            suite: cpu
-          - os: MacStudio
-            suite: vulkan
          - os: icelake
            suite: vulkan
          - os: icelake
@@ -88,9 +100,9 @@ jobs:
      if: matrix.suite == 'cpu'
      run: |
        cd $GITHUB_WORKSPACE
-        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
+        PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k cpu
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="${GITHUB_WORKSPACE}/shark_tmp/shark_cache" -k cpu
        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv

@@ -100,14 +112,28 @@ jobs:
        cd $GITHUB_WORKSPACE
        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k cuda
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="${GITHUB_WORKSPACE}/shark_tmp/shark_cache" -k cuda
        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
+        # Disabled due to black image bug
+        # python build_tools/stable_diffusion_testing.py --device=cuda 

-    - name: Validate Vulkan Models
-      if: matrix.suite == 'vulkan'
+    - name: Validate Vulkan Models (MacOS)
+      if: matrix.suite == 'vulkan' && matrix.os == 'MacStudio'
      run: |
        cd $GITHUB_WORKSPACE
-        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
+        PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k vulkan
+        export DYLD_LIBRARY_PATH=/usr/local/lib/
+        echo $PATH
+        pip list | grep -E "torch|iree"
+        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" -k vulkan --update_tank
+
+    - name: Validate Vulkan Models (a100)
+      if: matrix.suite == 'vulkan' && matrix.os != 'MacStudio'
+      run: |
+        cd $GITHUB_WORKSPACE
+        PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
+        source shark.venv/bin/activate
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="${GITHUB_WORKSPACE}/shark_tmp/shark_cache" -k vulkan
+        python build_tools/stable_diffusion_testing.py --device=vulkan
--- a/.gitignore
+++ b/.gitignore
@@ -31,7 +31,6 @@ MANIFEST
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
-*.spec

 # Installer logs
 pip-log.txt
@@ -163,7 +162,14 @@ cython_debug/
 # Shark related artefacts
 *venv/
 shark_tmp/
+*.vmfb
+.use-iree
+tank/dict_configs.py

 # ORT related artefacts
 cache_models/
 onnx_models/
+
+#web logging
+web/logs/
+web/stored_results/stable_diffusion/
--- a/README.md
+++ b/README.md
@@ -5,25 +5,119 @@ High Performance Machine Learning and Data Analytics for CPUs, GPUs, Accelerator
 [![Nightly Release](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml)
 [![Validate torch-models on Shark Runtime](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml)

-## Communication Channels

-*   [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real time discussions with the SHARK team and other users
-*   [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bugs etc
+## Installation (Windows, Linux and macOS)
+
+## Check out the code
+
+```shell
+git clone https://github.com/nod-ai/SHARK.git
+cd SHARK
+```
+
+## Setup your Python VirtualEnvironment and Dependencies
+
+### Windows 10/11 Users
+
+* Install the latest Python 3.10.x version from [here](https://www.python.org/downloads/windows/)
+
+* Install Git for Windows from [here](https://git-scm.com/download/win)
+
+#### Allow the install script to run in Powershell
+```powershell
+set-executionpolicy remotesigned
+```
+
+#### Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...)
+```powershell
+./setup_venv.ps1 #You can re-run this script to get the latest version
+```
+
+### Linux / macOS Users
+
+```shell
+./setup_venv.sh
+source shark.venv/bin/activate
+```


-## Installation
+### Run Stable Diffusion on your device - WebUI
+
+#### Windows 10/11 Users
+```powershell
+(shark.venv) PS C:\Users\nod\SHARK> cd web
+(shark.venv) PS C:\Users\nod\SHARK\web> python index.py
+```
+#### Linux Users
+```shell
+(shark.venv) > cd web
+(shark.venv) > python index.py
+```
+
+#### Access Stable Diffusion on http://localhost:8080/?__theme=dark
+
+
+<img width="1607" alt="webui" src="https://user-images.githubusercontent.com/74956/204939260-b8308bc2-8dc4-47f6-9ac0-f60b66edab99.png">
+
+
+
+### Run Stable Diffusion on your device - Commandline
+
+#### Install your hardware drivers
+* [AMD RDNA Users] Download the latest driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mril-iree)
+* [macOS Users] Download and install the 1.3.216 Vulkan SDK from [here](https://sdk.lunarg.com/sdk/download/1.3.216.0/mac/vulkansdk-macos-1.3.216.0.dmg). Newer versions of the SDK will not work. 
+* [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)
+
+Other users: please ensure you have the latest vendor drivers and the Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home), and if you are using Vulkan, check that `vulkaninfo` works in a terminal window.
+
+
+#### Windows 10/11 Users
+```powershell
+(shark.venv) PS C:\g\shark> python .\shark\examples\shark_inference\stable_diffusion\main.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
+```
+
+#### Linux / macOS Users
+```shell
+python3.10 shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
+```
+
+You can replace `vulkan` with `cpu` to run on your CPU or with `cuda` to run on CUDA devices. If you have multiple vulkan devices you can address them with `--device=vulkan://1` etc
+
+The output on a 7900XTX would look like:
+
+```shell 
+Stats for run 0:
+Average step time: 47.19188690185547ms/it
+Clip Inference time (ms) = 109.531
+VAE Inference time (ms): 78.590
+
+Total image generation time: 2.5788655281066895sec
+```
+
+Here are some samples generated:
+
+![tajmahal, snow, sunflowers, oil on canvas_0](https://user-images.githubusercontent.com/74956/204934186-141f7e43-6eb2-4e89-a99c-4704d20444b3.jpg)
+
+![a photo of a crab playing a trumpet](https://user-images.githubusercontent.com/74956/204933258-252e7240-8548-45f7-8253-97647d38313d.jpg)
+
+
+
+For more options to the Stable Diffusion model read [this](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md)
+
+Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware. 
+

 <details>
-  <summary>Installation (Linux and macOS)</summary>
+  <summary>Binary Installation</summary>

 ### Setup a new pip Virtual Environment

 This step sets up a new VirtualEnv for Python

 ```shell
-python --version #Check you have 3.7->3.10 on Linux or 3.10 on macOS
+python --version #Check you have 3.10 on Linux, macOS or Windows Powershell
 python -m venv shark_venv
-source shark_venv/bin/activate
+source shark_venv/bin/activate   # Use shark_venv/Scripts/activate on Windows

 # If you are using conda create and activate a new conda env

@@ -38,9 +132,14 @@ python -m pip install --upgrade pip
 This step pip installs SHARK and related packages on Linux Python 3.7, 3.8, 3.9, 3.10 and macOS Python 3.10

 ```shell
-pip install nodai-shark -f https://nod-ai.github.io/SHARK/package-index/ -f https://llvm.github.io/torch-mlir/package-index/ -f https://github.com/nod-ai/shark-runtime/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+pip install nodai-shark -f https://nod-ai.github.io/SHARK/package-index/ -f https://llvm.github.io/torch-mlir/package-index/ -f  https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
 ```
-If you are on an Intel macOS machine you need this [workaround](https://github.com/nod-ai/SHARK/issues/102) for an upstream issue.
+
+### Run shark tank model tests.
+```shell
+pytest tank/test_models.py
+```
+See tank/README.md for a more detailed walkthrough of our pytest suite and CLI.

 ### Download and run Resnet50 sample

@@ -61,29 +160,27 @@ python ./minilm_jit.py --device="cpu"  #use cuda or vulkan or metal
 </details>


+
 <details>
-  <summary>Source Installation</summary>
+  <summary>Development, Testing and Benchmarks</summary>

-## Check out the code
+If you want to use Python 3.10 with the TF import tools, you can set environment variables like:
+Set `USE_IREE=1` to use upstream IREE
+```
+# PYTHON=python3.10 VENV_DIR=0617_venv IMPORTER=1 ./setup_venv.sh 
+```

+### Run any of the hundreds of SHARK tank models via the test framework
 ```shell
-git clone https://github.com/nod-ai/SHARK.git
-```
-
-## Setup your Python VirtualEnvironment and Dependencies
-```shell
-# Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...).
-./setup_venv.sh
-source shark.venv/bin/activate
-```
-For example if you want to use Python3.10 and upstream IREE with TF Import tools you can use the environment variables like:
-```
-# PYTHON=python3.10 VENV_DIR=0617_venv IMPORTER=1 USE_IREE=1 ./setup_venv.sh 
+python -m  shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
+# Or a pytest
+pytest tank/test_models.py -k "MiniLM"
 ```
+  

 If you are a *Torch-mlir developer or an IREE developer* and want to test local changes you can uninstall
 the provided packages with `pip uninstall torch-mlir` and / or `pip uninstall iree-compiler iree-runtime` and build locally
-with Python bindings and set your PYTHONPATH as mentioned [here](https://google.github.io/iree/bindings/python/)
+with Python bindings and set your PYTHONPATH as mentioned [here](https://github.com/iree-org/iree/tree/main/docs/api_docs/python#install-iree-binaries)
 for IREE and [here](https://github.com/llvm/torch-mlir/blob/main/development.md#setup-python-environment-to-export-the-built-python-packages)
 for Torch-MLIR.

@@ -102,82 +199,39 @@ for Torch-MLIR.
 ```
 Now the SHARK will use your locally build Torch-MLIR repo.

-### Run a demo script
-```shell
-python -m  shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
-# Or a pytest
-pytest tank/test_models.py -k "MiniLM"
+
+## Benchmarking Dispatches
+
+To produce benchmarks of individual dispatches, you can add `--dispatch_benchmarks=All --dispatch_benchmarks_dir=<output_dir>` to your command-line arguments.  
+If you only want to compile specific dispatches, you can specify them with a space-separated string instead of `"All"`.  E.g. `--dispatch_benchmarks="0 1 2 10"`
+
+If you instead want to incorporate this into a Python script, you can pass the `dispatch_benchmarks` and `dispatch_benchmarks_dir` arguments when initializing `SharkInference`, and the benchmarks will be generated when the module is compiled.  E.g.:
+
 ```
+shark_module = SharkInference(
+        mlir_model,
+        func_name,
+        device=args.device,
+        mlir_dialect="tm_tensor",
+        dispatch_benchmarks="all",
+        dispatch_benchmarks_dir="results"
+    )
+```
+
+Output will include:
+- An ordered list ordered-dispatches.txt of all the dispatches with their runtime
+- Inside the specified directory, there will be a directory for each dispatch (there will be mlir files for all dispatches, but only compiled binaries and benchmark data for the specified dispatches)
+- An .mlir file containing the dispatch benchmark 
+- A compiled .vmfb file containing the dispatch benchmark
+- An .mlir file containing just the hal executable
+- A compiled .vmfb file of the hal executable
+- A .txt file containing benchmark output
+
+
+See tank/README.md for instructions on how to run model tests and benchmarks from the SHARK tank.

 </details>

-<details>
-  <summary>Testing and Benchmarks</summary>
-
-### Run all model tests on CPU/GPU/VULKAN/Metal
-```shell
-pytest tank/test_models.py
-
-# If on Linux for multithreading on CPU (faster results):
-pytest tank/test_models.py -n auto
-```
-
-### Running specific tests
-```shell
-
-# Search for test cases by including a keyword that matches all or part of the test case's name;
-pytest tank/test_models.py -k "keyword" 
-
-# Test cases are named uniformly by format test_module_<model_name_underscores_only>_<torch/tf>_<static/dynamic>_<device>.
-
-# Example: Test all models on nvidia gpu:
-pytest tank/test_models.py -k "cuda"
-
-# Example: Test all tensorflow resnet models on Vulkan backend:
-pytest tank/test_models.py -k "resnet and tf and vulkan"
-
-# Exclude a test case:
-pytest tank/test_models.py -k "not ..."
-
-### Run benchmarks on SHARK tank pytests and generate bench_results.csv with results.
-
-(the following requires source installation with `IMPORTER=1 ./setup_venv.sh`)
-
-```shell
-pytest --benchmark tank/test_models.py
-  
-# Just do static GPU benchmarks for PyTorch tests:
-pytest --benchmark tank/test_models.py -k "pytorch and static and cuda"
-
-```
-  
-### Benchmark Resnet50, MiniLM on CPU
-
-(requires source installation with `IMPORTER=1 ./setup_venv.sh`)  
-  
-```shell
-# We suggest running the following commands as root before running benchmarks on CPU:
-  
-cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | awk -F, '{print $2}' | sort -n | uniq | ( while read X ; do echo $X ; echo 0 > /sys/devices/system/cpu/cpu$X/online ; done )
-echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
-
-# Benchmark canonical Resnet50 on CPU via pytest
-pytest --benchmark tank/test_models -k "resnet50 and tf_static_cpu"
-
-# Benchmark canonical MiniLM on CPU via pytest
-pytest --benchmark tank/test_models -k "MiniLM and cpu"
-
-# Benchmark MiniLM on CPU via transformer-benchmarks:
-git clone --recursive https://github.com/nod-ai/transformer-benchmarks.git
-cd transformer-benchmarks
-./perf-ci.sh -n
-# Check detail.csv for MLIR/IREE results.
-
-```
-
-</details>
-
-
 <details>
  <summary>API Reference</summary>

@@ -228,160 +282,26 @@ result = shark_module.forward((arg0, arg1))
 ```
 </details>

-
 ## Supported and Validated Models

-<details>
-  <summary>PyTorch Models</summary>
+SHARK is maintained to support the latest innovations in ML Models: 

-### Huggingface PyTorch Models
+| TF HuggingFace Models | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
+|---------------------|----------|----------|-------------|
+| BERT                | :green_heart:         | :green_heart:         | :green_heart:            |
+| DistilBERT         | :green_heart:         | :green_heart:         | :green_heart:            |
+| GPT2         | :green_heart:         | :green_heart:         | :green_heart:            |
+| BLOOM         | :green_heart:         | :green_heart:         | :green_heart:            |
+| Stable Diffusion         | :green_heart:         | :green_heart:         | :green_heart:            |
+| Vision Transformer       | :green_heart:         | :green_heart:         | :green_heart:            |
+| ResNet50         | :green_heart:         | :green_heart:         | :green_heart:            |

-| Hugging Face Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
-|---------------------|----------------------|----------|----------|-------------|
-| BERT                | :green_heart: (JIT)          | :green_heart:         | :green_heart:         | :green_heart:            |
-| Albert              | :green_heart: (JIT)            | :green_heart:         | :green_heart:         | :green_heart:            |
-| BigBird             | :green_heart: (AOT)            |          |          |             |
-| DistilBERT          | :green_heart: (JIT)            | :green_heart:         | :green_heart:         | :green_heart:            |
-| GPT2                | :broken_heart: (AOT)            |          |          |             |
-| MobileBert          | :green_heart: (JIT)            | :green_heart:         | :green_heart:         | :green_heart:            |
+For a complete list of the models supported in SHARK, please refer to [tank/README.md](https://github.com/nod-ai/SHARK/blob/main/tank/README.md).

-### Torchvision  Models
+## Communication Channels

-| TORCHVISION Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
-|--------------------|----------------------|----------|----------|-------------|
-| AlexNet            | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
-| DenseNet121        | :green_heart: (Script)         |          |          |             |
-| MNasNet1_0         | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
-| MobileNetV2        | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
-| MobileNetV3        | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
-| Unet               | :broken_heart: (Script)         |          |          |             |
-| Resnet18           | :green_heart: (Script)         | :green_heart:         |  :green_heart:        | :green_heart:            |
-| Resnet50           | :green_heart: (Script)         | :green_heart:         |   :green_heart:       | :green_heart:            |
-| Resnet101           | :green_heart: (Script)         | :green_heart:         |   :green_heart:       | :green_heart:            |
-| Resnext50_32x4d    | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
-| ShuffleNet_v2      | :broken_heart: (Script)         |          |          |             |
-| SqueezeNet         | :green_heart: (Script)         | :green_heart:         |   :green_heart:       | :green_heart:            |
-| EfficientNet       | :green_heart: (Script)         |          |          |             |
-| Regnet             | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
-| Resnest            | :broken_heart: (Script)         |          |          |             |
-| Vision Transformer | :green_heart: (Script)         |          |          |             |
-| VGG 16             | :green_heart: (Script)         | :green_heart:         |   :green_heart:       |             |
-| Wide Resnet        | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
-| RAFT               | :broken_heart: (JIT)            |          |          |             |
-
-For more information refer to [MODEL TRACKING SHEET](https://docs.google.com/spreadsheets/d/15PcjKeHZIrB5LfDyuw7DGEEE8XnQEX2aX8lm8qbxV8A/edit#gid=0)
-
-### PyTorch Training Models
-
-| Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
-|---------------------|----------------------|----------|----------|-------------|
-| BERT                | :broken_heart:           | :broken_heart:         |          |             |
-| FullyConnected                | :green_heart:           | :green_heart:         |          |             |
-
-</details>
-
-<details>
-  <summary>JAX Models</summary>
-
-
-### JAX  Models
-
-| Models | JAX-MHLO lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
-|---------------------|----------------------|----------|----------|-------------|
-| DALL-E                | :broken_heart:           | :broken_heart:         |          |             |
-| FullyConnected                | :green_heart:           | :green_heart:         |          |             |
-
-</details>
-
-<details>
-  <summary>TFLite Models</summary>
-
-### TFLite Models
-
-| Models | TOSA/LinAlg  | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
-|---------------------|----------------------|----------|----------|-------------|
-| BERT                | :broken_heart:           | :broken_heart:         |          |             |
-| FullyConnected      | :green_heart:           | :green_heart:         |          |             |
-| albert | :green_heart:           | :green_heart:         |          |             |
-| asr_conformer | :green_heart:           | :green_heart:         |          |             |
-| bird_classifier | :green_heart:           | :green_heart:         |          |             |
-| cartoon_gan | :green_heart:           | :green_heart:         |          |             |
-| craft_text | :green_heart:           | :green_heart:         |          |             |
-| deeplab_v3 | :green_heart:           | :green_heart:         |          |             |
-| densenet | :green_heart:           | :green_heart:         |          |             |
-| east_text_detector | :green_heart:           | :green_heart:         |          |             |
-| efficientnet_lite0_int8 | :green_heart:           | :green_heart:         |          |             |
-| efficientnet | :green_heart:           | :green_heart:         |          |             |
-| gpt2 | :green_heart:           | :green_heart:         |          |             |
-| image_stylization | :green_heart:           | :green_heart:         |          |             |
-| inception_v4 | :green_heart:           | :green_heart:         |          |             |
-| inception_v4_uint8 | :green_heart:           | :green_heart:         |          |             |
-| lightning_fp16 | :green_heart:           | :green_heart:         |          |             |
-| lightning_i8 | :green_heart:           | :green_heart:         |          |             |
-| lightning | :green_heart:           | :green_heart:         |          |             |
-| magenta | :green_heart:           | :green_heart:         |          |             |
-| midas | :green_heart:           | :green_heart:         |          |             |
-| mirnet | :green_heart:           | :green_heart:         |          |             |
-| mnasnet | :green_heart:           | :green_heart:         |          |             |
-| mobilebert_edgetpu_s_float | :green_heart:           | :green_heart:         |          |             |
-| mobilebert_edgetpu_s_quant | :green_heart:           | :green_heart:         |          |             |
-| mobilebert | :green_heart:           | :green_heart:         |          |             |
-| mobilebert_tf2_float | :green_heart:           | :green_heart:         |          |             |
-| mobilebert_tf2_quant | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_ssd_quant | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_v1 | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_v1_uint8 | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_v2_int8 | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_v2 | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_v2_uint8 | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_v3-large | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_v3-large_uint8 | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_v35-int8 | :green_heart:           | :green_heart:         |          |             |
-| nasnet | :green_heart:           | :green_heart:         |          |             |
-| person_detect | :green_heart:           | :green_heart:         |          |             |
-| posenet | :green_heart:           | :green_heart:         |          |             |
-| resnet_50_int8 | :green_heart:           | :green_heart:         |          |             |
-| rosetta | :green_heart:           | :green_heart:         |          |             |
-| spice | :green_heart:           | :green_heart:         |          |             |
-| squeezenet | :green_heart:           | :green_heart:         |          |             |
-| ssd_mobilenet_v1 | :green_heart:           | :green_heart:         |          |             |
-| ssd_mobilenet_v1_uint8 | :green_heart:           | :green_heart:         |          |             |
-| ssd_mobilenet_v2_fpnlite | :green_heart:           | :green_heart:         |          |             |
-| ssd_mobilenet_v2_fpnlite_uint8 | :green_heart:           | :green_heart:         |          |             |
-| ssd_mobilenet_v2_int8 | :green_heart:           | :green_heart:         |          |             |
-| ssd_mobilenet_v2 | :green_heart:           | :green_heart:         |          |             |
-| ssd_spaghettinet_large | :green_heart:           | :green_heart:         |          |             |
-| ssd_spaghettinet_large_uint8 | :green_heart:           | :green_heart:         |          |             |
-| visual_wake_words_i8 | :green_heart:           | :green_heart:         |          |             |
-
-</details>
-
-<details>
-  <summary>TF Models</summary>
-
-### Tensorflow Models (Inference)
-
-| Hugging Face Models | tf-mhlo lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
-|---------------------|----------------------|----------|----------|-------------|
-| BERT                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
-| albert-base-v2              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
-| DistilBERT          | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
-| CamemBert                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
-| ConvBert              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
-| Deberta              |            |         |          |             |
-| electra          | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
-| funnel              |            |         |          |             |
-| layoutlm              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
-| longformer              |            |         |          |             |
-| mobile-bert                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
-| remembert              |            |         |          |             |
-| tapas              |            |         |          |             |
-| flaubert                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
-| roberta                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
-| xlm-roberta              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
-| mpnet              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
-
-</details>
+*   [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real-time discussions with the SHARK team and other users
+*   [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bugs, etc.

 ## Related Projects

--- a/web/logs/albert_maskfill_log.txt
+++ b/web/logs/albert_maskfill_log.txt
--- a/apps/stable_diffusion/init.py
+++ b/apps/stable_diffusion/init.py
--- a/apps/stable_diffusion/resources/base_model.json
+++ b/apps/stable_diffusion/resources/base_model.json
@@ -0,0 +1,98 @@
+{
+    "stabilityai/stable-diffusion-2-1": {
+        "unet": {
+            "latents": {
+                "shape": [
+                    "1*batch_size",
+                    4,
+                    "height",
+                    "width"
+                ],
+                "dtype": "f32"
+            },
+            "timesteps": {
+                "shape": [
+                    1
+                ],
+                "dtype": "f32"
+            },
+            "embedding": {
+                "shape": [
+                    "2*batch_size",
+                    "max_len",
+                    1024
+                ],
+                "dtype": "f32"
+            },
+            "guidance_scale": {
+                "shape": 2,
+                "dtype": "f32"
+            }
+        },
+        "vae": {
+            "latents" : {
+                "shape" : [
+                    "1*batch_size",4,"height","width"
+                ],
+                "dtype":"f32"
+            }
+        },
+        "clip": {
+            "token" : {
+                "shape" : [
+                    "2*batch_size",
+                    "max_len"
+                ],
+                "dtype":"i64"
+            }
+        }
+    },
+    "CompVis/stable-diffusion-v1-4": {
+        "unet": {
+            "latents": {
+                "shape": [
+                    "1*batch_size",
+                    4,
+                    "height",
+                    "width"
+                ],
+                "dtype": "f32"
+            },
+            "timesteps": {
+                "shape": [
+                    1
+                ],
+                "dtype": "f32"
+            },
+            "embedding": {
+                "shape": [
+                    "2*batch_size",
+                    "max_len",
+                    768
+                ],
+                "dtype": "f32"
+            },
+            "guidance_scale": {
+                "shape": 2,
+                "dtype": "f32"
+            }
+        },
+        "vae": {
+            "latents" : {
+                "shape" : [
+                    "1*batch_size",4,"height","width"
+                ],
+                "dtype":"f32"
+            }
+        },
+        "clip": {
+            "token" : {
+                "shape" : [
+                    "2*batch_size",
+                    "max_len"
+                ],
+                "dtype":"i64"
+            }
+        }
+    }
+}
--- a/apps/stable_diffusion/resources/model_db.json
+++ b/apps/stable_diffusion/resources/model_db.json
@@ -0,0 +1,177 @@
+[
+  {
+    "stablediffusion/untuned":"gs://shark_tank/stable_diffusion",
+    "stablediffusion/tuned":"gs://shark_tank/sd_tuned",
+    "stablediffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
+    "anythingv3/untuned":"gs://shark_tank/sd_anythingv3",
+    "anythingv3/tuned":"gs://shark_tank/sd_tuned",
+    "anythingv3/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
+    "analogdiffusion/untuned":"gs://shark_tank/sd_analog_diffusion",
+    "analogdiffusion/tuned":"gs://shark_tank/sd_tuned",
+    "analogdiffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
+    "openjourney/untuned":"gs://shark_tank/sd_openjourney",
+    "openjourney/tuned":"gs://shark_tank/sd_tuned",
+    "dreamlike/untuned":"gs://shark_tank/sd_dreamlike_diffusion"
+  },
+  {
+    "stablediffusion/v1_4/unet/fp16/length_77/untuned":"unet_8dec_fp16",
+    "stablediffusion/v1_4/unet/fp16/length_77/tuned":"unet_8dec_fp16_tuned",
+    "stablediffusion/v1_4/unet/fp16/length_77/tuned/cuda":"unet_8dec_fp16_cuda_tuned",
+    "stablediffusion/v1_4/unet/fp32/length_77/untuned":"unet_1dec_fp32",
+    "stablediffusion/v1_4/vae/fp16/length_77/untuned":"vae_19dec_fp16",
+    "stablediffusion/v1_4/vae/fp16/length_77/tuned":"vae_19dec_fp16_tuned",
+    "stablediffusion/v1_4/vae/fp16/length_77/tuned/cuda":"vae_19dec_fp16_cuda_tuned",
+    "stablediffusion/v1_4/vae/fp16/length_77/untuned/base":"vae_8dec_fp16",
+    "stablediffusion/v1_4/vae/fp32/length_77/untuned":"vae_1dec_fp32",
+    "stablediffusion/v1_4/clip/fp32/length_77/untuned":"clip_18dec_fp32",
+    "stablediffusion/v2_1base/unet/fp16/length_77/untuned":"unet2base_8dec_fp16",
+    "stablediffusion/v2_1base/unet/fp16/length_77/tuned":"unet2base_8dec_fp16_tuned_v2",
+    "stablediffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"unet2base_8dec_fp16_cuda_tuned",
+    "stablediffusion/v2_1base/unet/fp16/length_64/untuned":"unet_19dec_v2p1base_fp16_64",
+    "stablediffusion/v2_1base/unet/fp16/length_64/tuned":"unet_19dec_v2p1base_fp16_64_tuned",
+    "stablediffusion/v2_1base/unet/fp16/length_64/tuned/cuda":"unet_19dec_v2p1base_fp16_64_cuda_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/untuned":"vae2base_19dec_fp16",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned":"vae2base_19dec_fp16_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"vae2base_19dec_fp16_cuda_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/untuned/base":"vae2base_8dec_fp16",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base":"vae2base_8dec_fp16_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base/cuda":"vae2base_8dec_fp16_cuda_tuned",
+    "stablediffusion/v2_1base/clip/fp32/length_77/untuned":"clip2base_18dec_fp32",
+    "stablediffusion/v2_1base/clip/fp32/length_64/untuned":"clip_19dec_v2p1base_fp32_64",
+    "stablediffusion/v2_1/unet/fp16/length_77/untuned":"unet2_14dec_fp16",
+    "stablediffusion/v2_1/vae/fp16/length_77/untuned":"vae2_19dec_fp16",
+    "stablediffusion/v2_1/vae/fp16/length_77/untuned/base":"vae2_8dec_fp16",
+    "stablediffusion/v2_1/clip/fp32/length_77/untuned":"clip2_18dec_fp32",
+    "anythingv3/v2_1base/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
+    "anythingv3/v2_1base/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
+    "anythingv3/v2_1base/unet/fp16/length_77/tuned/cuda":"av3_unet_19dec_fp16_cuda_tuned",
+    "anythingv3/v2_1base/unet/fp32/length_77/untuned":"av3_unet_19dec_fp32",
+    "anythingv3/v2_1base/vae/fp16/length_77/untuned":"av3_vae_19dec_fp16",
+    "anythingv3/v2_1base/vae/fp16/length_77/tuned":"av3_vae_19dec_fp16_tuned",
+    "anythingv3/v2_1base/vae/fp16/length_77/tuned/cuda":"av3_vae_19dec_fp16_cuda_tuned",
+    "anythingv3/v2_1base/vae/fp16/length_77/untuned/base":"av3_vaebase_22dec_fp16",
+    "anythingv3/v2_1base/vae/fp32/length_77/untuned":"av3_vae_19dec_fp32",
+    "anythingv3/v2_1base/vae/fp32/length_77/untuned/base":"av3_vaebase_22dec_fp32",
+    "anythingv3/v2_1base/clip/fp32/length_77/untuned":"av3_clip_19dec_fp32",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/untuned":"ad_unet_19dec_fp16",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/tuned":"ad_unet_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"ad_unet_19dec_fp16_cuda_tuned",
+    "analogdiffusion/v2_1base/unet/fp32/length_77/untuned":"ad_unet_19dec_fp32",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned":"ad_vae_19dec_fp16",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/tuned":"ad_vae_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"ad_vae_19dec_fp16_cuda_tuned",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned/base":"ad_vaebase_22dec_fp16",
+    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned":"ad_vae_19dec_fp32",
+    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned/base":"ad_vaebase_22dec_fp32",
+    "analogdiffusion/v2_1base/clip/fp32/length_77/untuned":"ad_clip_19dec_fp32",
+    "openjourney/v2_1base/unet/fp16/length_64/untuned":"oj_unet_22dec_fp16_64",
+    "openjourney/v2_1base/unet/fp32/length_64/untuned":"oj_unet_22dec_fp32_64",
+    "openjourney/v2_1base/vae/fp16/length_77/untuned":"oj_vae_22dec_fp16",
+    "openjourney/v2_1base/vae/fp16/length_77/untuned/base":"oj_vaebase_22dec_fp16",
+    "openjourney/v2_1base/vae/fp32/length_77/untuned":"oj_vae_22dec_fp32",
+    "openjourney/v2_1base/vae/fp32/length_77/untuned/base":"oj_vaebase_22dec_fp32",
+    "openjourney/v2_1base/clip/fp32/length_64/untuned":"oj_clip_22dec_fp32_64",
+    "dreamlike/v2_1base/unet/fp16/length_77/untuned":"dl_unet_23dec_fp16_77",
+    "dreamlike/v2_1base/unet/fp32/length_77/untuned":"dl_unet_23dec_fp32_77",
+    "dreamlike/v2_1base/vae/fp16/length_77/untuned":"dl_vae_23dec_fp16",
+    "dreamlike/v2_1base/vae/fp16/length_77/untuned/base":"dl_vaebase_23dec_fp16",
+    "dreamlike/v2_1base/vae/fp32/length_77/untuned":"dl_vae_23dec_fp32",
+    "dreamlike/v2_1base/vae/fp32/length_77/untuned/base":"dl_vaebase_23dec_fp32",
+    "dreamlike/v2_1base/clip/fp32/length_77/untuned":"dl_clip_23dec_fp32_77"
+  },
+  {
+    "unet": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": []
+        },
+        "fp32": {
+          "default_compilation_flags": []
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32"
+          ],
+          "specified_compilation_flags": {
+            "cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
+            "default_device": ["--iree-flow-enable-conv-img2col-transform"]
+          }
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "vae": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "clip": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      }
+    }
+  }
+]
--- a/apps/stable_diffusion/resources/opt_flags.json
+++ b/apps/stable_diffusion/resources/opt_flags.json
@@ -0,0 +1,95 @@
+  {
+    "unet": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": []
+        },
+        "fp32": {
+          "default_compilation_flags": []
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32"
+          ],
+          "specified_compilation_flags": {
+            "cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
+            "default_device": ["--iree-flow-enable-conv-img2col-transform"]
+          }
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "vae": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "clip": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      }
+    }
+  }
--- a/apps/stable_diffusion/resources/prompts.json
+++ b/apps/stable_diffusion/resources/prompts.json
@@ -0,0 +1,8 @@
+[["A high tech solarpunk utopia in the Amazon rainforest"],
+["A pikachu fine dining with a view to the Eiffel Tower"],
+["A mecha robot in a favela in expressionist style"],
+["an insect robot preparing a delicious meal"],
+["A digital Illustration of the Babel tower, 4k, detailed, trending in artstation, fantasy vivid colors"],
+["Cluttered house in the woods, anime, oil painting, high resolution, cottagecore, ghibli inspired, 4k"],
+["A beautiful mansion beside a waterfall in the woods, by josef thoma, matte painting, trending on artstation HQ"],
+["portrait photo of a asia old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes"]]
--- a/apps/stable_diffusion/scripts/init.py
+++ b/apps/stable_diffusion/scripts/init.py
@@ -0,0 +1 @@
+from .txt2img import txt2img_inf
--- a/apps/stable_diffusion/scripts/img2img.py
+++ b/apps/stable_diffusion/scripts/img2img.py
--- a/apps/stable_diffusion/scripts/txt2img.py
+++ b/apps/stable_diffusion/scripts/txt2img.py
@@ -0,0 +1,241 @@
+import os
+
+os.environ["AMD_ENABLE_LLPC"] = "1"
+
+import torch
+import re
+import time
+from pathlib import Path
+from datetime import datetime as dt
+from dataclasses import dataclass
+from csv import DictWriter
+from apps.stable_diffusion.src import (
+    args,
+    Text2ImagePipeline,
+    get_schedulers,
+    set_init_device_flags,
+)
+
+
+@dataclass
+class Config:
+    # Settings that decide whether txt2img_inf must rebuild its pipeline.
+    # txt2img_inf builds a fresh Config on every call and compares it with
+    # the previous one; the pipeline is recreated only when they differ.
+    model_id: str  # args.hf_model_id (custom model id if given, else model_id)
+    ckpt_loc: str  # args.ckpt_loc ("" when no checkpoint file is supplied)
+    precision: str  # "fp32" selects torch.float32; anything else torch.half
+    batch_size: int
+    max_length: int
+    height: int
+    width: int
+    device: str  # raw device string exactly as passed to txt2img_inf
+
+
+# This has to come before importing cache objects
+if args.clear_all:
+    print("CLEARING ALL, EXPECT SEVERAL MINUTES TO RECOMPILE")
+    from glob import glob
+    import shutil
+
+    # Remove compiled modules (*.vmfb) left in the current working directory.
+    for vmfb in glob(os.path.join(os.getcwd(), "*.vmfb")):
+        if os.path.exists(vmfb):
+            os.remove(vmfb)
+    home = os.path.expanduser("~")
+    if os.name == "nt":  # Windows
+        appdata = os.getenv("LOCALAPPDATA")
+        # LOCALAPPDATA can be unset; os.path.join(None, ...) would raise
+        # TypeError, so skip the Vulkan cache in that case.
+        if appdata:
+            shutil.rmtree(
+                os.path.join(appdata, "AMD/VkCache"), ignore_errors=True
+            )
+        shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
+    elif os.name == "posix":  # Linux/macOS; os.name is never "unix"
+        # ignore_errors mirrors the Windows branch: a cache directory that
+        # does not exist yet must not abort start-up.
+        shutil.rmtree(
+            os.path.join(home, ".cache/AMD/VkCache"), ignore_errors=True
+        )
+        shutil.rmtree(
+            os.path.join(home, ".local/shark_tank"), ignore_errors=True
+        )
+
+
+# Save the output image and record the inputs corresponding to it.
+def save_output_img(output_img):
+    """Save ``output_img`` as a JPEG and append its settings to a CSV log.
+
+    The image is written to ``<output dir>/generated_imgs/``, where the
+    output dir is ``args.output_dir`` or, when that is unset, the current
+    working directory. One row with the generation settings read from
+    ``args`` is appended to ``imgs_details.csv`` in the same folder; no
+    header row is written.
+
+    Args:
+        output_img: object exposing ``save(path, quality=..., subsampling=...)``
+            (presumably a PIL image -- confirm against the pipeline).
+    """
+    output_path = args.output_dir if args.output_dir else Path.cwd()
+    generated_imgs_path = Path(output_path, "generated_imgs")
+    generated_imgs_path.mkdir(parents=True, exist_ok=True)
+    csv_path = Path(generated_imgs_path, "imgs_details.csv")
+
+    # File name: first 15 characters of the prompt with every
+    # non-alphanumeric character replaced by "_", then seed and timestamp.
+    prompt_slice = re.sub("[^a-zA-Z0-9]", "_", args.prompts[0][:15])
+    out_img_name = (
+        f"{prompt_slice}_{args.seed}_{dt.now().strftime('%y%m%d_%H%M%S')}"
+    )
+    out_img_path = Path(generated_imgs_path, f"{out_img_name}.jpg")
+    output_img.save(out_img_path, quality=95, subsampling=0)
+
+    new_entry = {
+        "VARIANT": args.hf_model_id,
+        "SCHEDULER": args.scheduler,
+        "PROMPT": args.prompts[0],
+        "NEG_PROMPT": args.negative_prompts[0],
+        "SEED": args.seed,
+        "CFG_SCALE": args.guidance_scale,
+        "PRECISION": args.precision,
+        "STEPS": args.steps,
+        "HEIGHT": args.height,
+        "WIDTH": args.width,
+        "MAX_LENGTH": args.max_length,
+        "OUTPUT": out_img_path,
+    }
+
+    # newline="" is what the csv module requires for files it writes to;
+    # without it every row is followed by a blank line on Windows. The
+    # ``with`` block closes the file, so no explicit close() is needed.
+    with open(csv_path, "a", newline="") as csv_obj:
+        dictwriter_obj = DictWriter(csv_obj, fieldnames=list(new_entry.keys()))
+        dictwriter_obj.writerow(new_entry)
+
+
+# State cached between txt2img_inf calls so the pipeline is only rebuilt
+# when the configuration changes.
+txt2img_obj = None
+config_obj = None
+schedulers = None
+
+
+# Exposed to UI.
+def txt2img_inf(
+    prompt: str,
+    negative_prompt: str,
+    height: int,
+    width: int,
+    steps: int,
+    guidance_scale: float,
+    seed: int,
+    batch_size: int,
+    scheduler: str,
+    model_id: str,
+    custom_model_id: str,
+    ckpt_file_obj,
+    precision: str,
+    device: str,
+    max_length: int,
+):
+    """Generate images for one prompt, reusing the cached pipeline if possible.
+
+    The per-call settings are copied onto the global ``args``. A ``Config``
+    is built from the model id, checkpoint, precision, batch size, max
+    length, height, width and device; the pipeline and scheduler table are
+    rebuilt only when it differs from the previous call's ``Config``. The
+    first generated image is saved through ``save_output_img``.
+
+    Args:
+        prompt: text prompt.
+        negative_prompt: negative text prompt.
+        height, width: requested image size.
+        steps: number of steps forwarded to ``generate_images``.
+        guidance_scale: guidance scale forwarded to ``generate_images``.
+        seed: seed forwarded to ``generate_images``.
+        batch_size: batch size forwarded to the pipeline.
+        scheduler: key into the table returned by ``get_schedulers``; names
+            not starting with "Shark" enable ``cpu_scheduling``.
+        model_id: model id used when ``custom_model_id`` is empty.
+        custom_model_id: overrides ``model_id`` when non-empty.
+        ckpt_file_obj: optional object whose ``.name`` is stored as the
+            checkpoint location; falsy means no checkpoint.
+        precision: "fp32" selects ``torch.float32``, anything else
+            ``torch.half``.
+        device: device string; when it contains "=>" only the part after
+            the first "=>" is used, otherwise the whole string.
+        max_length: maximum length forwarded to the pipeline.
+
+    Returns:
+        Tuple ``(generated_imgs, text_output)``: the value returned by
+        ``generate_images`` and a multi-line summary string.
+    """
+    global txt2img_obj
+    global config_obj
+    global schedulers
+
+    args.prompts = [prompt]
+    args.negative_prompts = [negative_prompt]
+    args.guidance_scale = guidance_scale
+    args.seed = seed
+    args.steps = steps
+    args.scheduler = scheduler
+    args.hf_model_id = custom_model_id if custom_model_id else model_id
+    args.ckpt_loc = ckpt_file_obj.name if ckpt_file_obj else ""
+    dtype = torch.float32 if precision == "fp32" else torch.half
+    cpu_scheduling = not scheduler.startswith("Shark")
+    new_config_obj = Config(
+        args.hf_model_id,
+        args.ckpt_loc,
+        precision,
+        batch_size,
+        max_length,
+        height,
+        width,
+        device,
+    )
+    if config_obj != new_config_obj:
+        config_obj = new_config_obj
+        args.precision = precision
+        args.batch_size = batch_size
+        args.max_length = max_length
+        args.height = height
+        args.width = width
+        # Take the text after the first "=>"; [-1] (instead of [1]) keeps
+        # the whole string when no "=>" is present rather than raising
+        # IndexError.
+        args.device = device.split("=>", 1)[-1].strip()
+        args.use_tuned = True
+        args.import_mlir = False
+        set_init_device_flags()
+        # Use the effective model id (custom id when given) so the
+        # schedulers match the model the pipeline loads, as the __main__
+        # path does.
+        schedulers = get_schedulers(args.hf_model_id)
+        scheduler_obj = schedulers[scheduler]
+        txt2img_obj = Text2ImagePipeline.from_pretrained(
+            scheduler_obj,
+            args.import_mlir,
+            args.hf_model_id,
+            args.ckpt_loc,
+            args.precision,
+            args.max_length,
+            args.batch_size,
+            args.height,
+            args.width,
+            args.use_base_vae,
+        )
+    # The scheduler is not part of Config, so it is (re)assigned on every
+    # call, including calls that reuse the cached pipeline.
+    txt2img_obj.scheduler = schedulers[scheduler]
+
+    start_time = time.time()
+    txt2img_obj.log = ""
+    generated_imgs = txt2img_obj.generate_images(
+        prompt,
+        negative_prompt,
+        batch_size,
+        height,
+        width,
+        steps,
+        guidance_scale,
+        seed,
+        args.max_length,
+        dtype,
+        args.use_base_vae,
+        cpu_scheduling,
+    )
+    total_time = time.time() - start_time
+    save_output_img(generated_imgs[0])
+    text_output = f"prompt={args.prompts}"
+    text_output += f"\nnegative prompt={args.negative_prompts}"
+    text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
+    text_output += f"\nscheduler={args.scheduler}, device={device}"
+    text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={args.seed}, size={args.height}x{args.width}"
+    text_output += (
+        f", batch size={args.batch_size}, max_length={args.max_length}"
+    )
+    text_output += txt2img_obj.log
+    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
+
+    return generated_imgs, text_output
+
+
+# Command-line entry point: build the pipeline once from the parsed args,
+# generate images, save the first one and print a summary.
+if __name__ == "__main__":
+    # dtype and cpu_scheduling are derived from args before
+    # set_init_device_flags() runs.
+    dtype = torch.float32 if args.precision == "fp32" else torch.half
+    cpu_scheduling = not args.scheduler.startswith("Shark")
+    set_init_device_flags()
+    schedulers = get_schedulers(args.hf_model_id)
+    scheduler_obj = schedulers[args.scheduler]
+
+    txt2img_obj = Text2ImagePipeline.from_pretrained(
+        scheduler_obj,
+        args.import_mlir,
+        args.hf_model_id,
+        args.ckpt_loc,
+        args.precision,
+        args.max_length,
+        args.batch_size,
+        args.height,
+        args.width,
+        args.use_base_vae,
+    )
+
+    # Only generate_images() is timed; pipeline construction is excluded.
+    start_time = time.time()
+    generated_imgs = txt2img_obj.generate_images(
+        args.prompts,
+        args.negative_prompts,
+        args.batch_size,
+        args.height,
+        args.width,
+        args.steps,
+        args.guidance_scale,
+        args.seed,
+        args.max_length,
+        dtype,
+        args.use_base_vae,
+        cpu_scheduling,
+    )
+    total_time = time.time() - start_time
+    # Human-readable summary of the run settings.
+    text_output = f"prompt={args.prompts}"
+    text_output += f"\nnegative prompt={args.negative_prompts}"
+    text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
+    text_output += f"\nscheduler={args.scheduler}, device={args.device}"
+    text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={args.seed}, size={args.height}x{args.width}"
+    text_output += (
+        f", batch size={args.batch_size}, max_length={args.max_length}"
+    )
+    # NOTE(review): unlike txt2img_inf, this path never assigns
+    # txt2img_obj.log itself -- confirm the pipeline always sets it.
+    text_output += txt2img_obj.log
+    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
+
+    # Only the first generated image is written to disk.
+    save_output_img(generated_imgs[0])
+    print(text_output)
--- a/apps/stable_diffusion/src/init.py
+++ b/apps/stable_diffusion/src/init.py
@@ -0,0 +1,8 @@
+from .utils import (
+    args,
+    set_init_device_flags,
+    prompt_examples,
+    get_available_devices,
+)
+from .pipelines import Text2ImagePipeline
+from .schedulers import get_schedulers
--- a/apps/stable_diffusion/src/models/init.py
+++ b/apps/stable_diffusion/src/models/init.py
@@ -0,0 +1,2 @@
+from .model_wrappers import SharkifyStableDiffusionModel
+from .opt_params import get_vae, get_unet, get_clip, get_tokenizer
--- a/apps/stable_diffusion/src/models/model_wrappers.py
+++ b/apps/stable_diffusion/src/models/model_wrappers.py
@@ -0,0 +1,228 @@
+from diffusers import AutoencoderKL, UNet2DConditionModel
+from transformers import CLIPTextModel
+from collections import defaultdict
+import torch
+import sys
+import traceback
+import re
+from ..utils import compile_through_fx, get_opt_flags, base_models, args
+
+
+# These shapes are parameter dependent.
+def replace_shape_str(shape, max_len, width, height, batch_size):
+    """Resolve the symbolic entries of a shape template to concrete values.
+
+    Args:
+        shape: Sequence whose entries are either non-string values (copied
+            through unchanged) or one of the placeholders "max_len",
+            "height", "width", "batch_size" or "N*batch_size" where N is an
+            integer multiplier.
+        max_len: Value substituted for "max_len".
+        width: Value substituted for "width".
+        height: Value substituted for "height".
+        batch_size: Value substituted for "batch_size", scaled by N when
+            the placeholder carries a multiplier.
+
+    Returns:
+        A new list with exactly one entry per entry of `shape`.
+
+    Raises:
+        ValueError: If a string entry is not a recognised placeholder, or
+            if the multiplier of a batch-size placeholder is not an
+            integer. Unknown strings used to be dropped silently, which
+            produced a shape of the wrong rank.
+    """
+    named_dims = {"max_len": max_len, "height": height, "width": width}
+    new_shape = []
+    for dim in shape:
+        if not isinstance(dim, str):
+            new_shape.append(dim)
+        elif dim in named_dims:
+            new_shape.append(named_dims[dim])
+        elif dim == "batch_size":
+            # A bare placeholder means a multiplier of one.
+            new_shape.append(batch_size)
+        elif "batch_size" in dim:
+            # The multiplier is the text before the first "*".
+            mul_val = int(dim.split("*")[0])
+            new_shape.append(batch_size * mul_val)
+        else:
+            raise ValueError(f"unrecognised shape placeholder: {dim!r}")
+    return new_shape
+
+
+# Get the input info for various models i.e. "unet", "clip", "vae".
+def get_input_info(model_info, max_len, width, height, batch_size):
+    """Create example input tensors for every sub-model in `model_info`.
+
+    `model_info` maps a sub-model name to a mapping of input names, each of
+    which carries a "shape" (a list template or a single int) and a "dtype"
+    ("f32" or "i64"). List templates are resolved with `replace_shape_str`.
+
+    Returns:
+        A defaultdict(list) mapping each sub-model name to its example
+        tensors, in the order the inputs are listed.
+    """
+    torch_dtypes = {"f32": torch.float32, "i64": torch.int64}
+    example_inputs = defaultdict(list)
+    for submodel, input_specs in model_info.items():
+        for spec in input_specs.values():
+            dims = spec["shape"]
+            torch_dtype = torch_dtypes[spec["dtype"]]
+            if isinstance(dims, list):
+                resolved = replace_shape_str(
+                    dims, max_len, width, height, batch_size
+                )
+                if torch_dtype == torch.int64:
+                    # Integer inputs are filled with random values in [1, 3).
+                    example = torch.randint(1, 3, tuple(resolved))
+                else:
+                    example = torch.randn(*resolved).to(torch_dtype)
+            elif isinstance(dims, int):
+                # An int "shape" becomes a scalar tensor holding that value.
+                example = torch.tensor(dims).to(torch_dtype)
+            else:
+                sys.exit("shape isn't specified correctly.")
+            example_inputs[submodel].append(example)
+    return example_inputs
+
+
+class SharkifyStableDiffusionModel:
+    """Compiles the CLIP, UNet and VAE sub-models of a Stable Diffusion model.
+
+    Calling an instance walks the configurations in `base_models`, builds
+    example inputs for each one and returns the first (clip, unet, vae)
+    triple that compiles without raising.
+    """
+
+    def __init__(
+        self,
+        model_id: str,
+        custom_weights: str,
+        precision: str,
+        max_len: int = 64,
+        width: int = 512,
+        height: int = 512,
+        batch_size: int = 1,
+        use_base_vae: bool = False,
+    ):
+        """Validate the sizes and derive the name used for compiled modules.
+
+        Args:
+            model_id: Identifier passed to the `from_pretrained` loaders.
+            custom_weights: Used in place of `model_id` when not empty.
+            precision: Precision tag; "fp16" enables the f16 compile path.
+            max_len: Sequence length, must lie in [32, 77].
+            width: Width, a multiple of 8 that is at least 384.
+            height: Height, a multiple of 8 that is at least 384.
+            batch_size: Batch size used when building example inputs.
+            use_base_vae: Selects the VAE wrapper variant that returns
+                values clamped to [0, 1] without the 255 scaling.
+        """
+        self.check_params(max_len, width, height)
+        self.max_len = max_len
+        # Stored divided by 8; these are the values handed to
+        # get_input_info for the "height"/"width" placeholders.
+        self.height = height // 8
+        self.width = width // 8
+        self.batch_size = batch_size
+        self.model_id = model_id if custom_weights == "" else custom_weights
+        self.precision = precision
+        self.base_vae = use_base_vae
+        # Note that the name uses the unscaled height and width.
+        self.model_name = "_".join(
+            [str(batch_size), str(max_len), str(height), str(width), precision]
+        )
+        # We need a better naming convention for the .vmfbs because despite
+        # using the custom model variant the .vmfb names remain the same and
+        # it'll always pick up the compiled .vmfb instead of compiling the
+        # custom model.
+        # So, currently, we add `self.model_id` in the `self.model_name` of
+        # .vmfb file.
+        # TODO: Have a better way of naming the vmfbs using self.model_name.
+
+        model_name = re.sub(r"\W+", "_", self.model_id)
+        # startswith() stays safe when the substitution yields "".
+        if model_name.startswith("_"):
+            model_name = model_name[1:]
+        self.model_name = self.model_name + "_" + model_name
+
+    def check_params(self, max_len, width, height):
+        """Exit the process with a message when a size is unsupported."""
+        if not (max_len >= 32 and max_len <= 77):
+            sys.exit("please specify max_len in the range [32, 77].")
+        if not (width % 8 == 0 and width >= 384):
+            sys.exit("width should be at least 384 and a multiple of 8")
+        if not (height % 8 == 0 and height >= 384):
+            sys.exit("height should be at least 384 and a multiple of 8")
+
+    def get_vae(self):
+        """Wrap the VAE decoder in a module and compile it.
+
+        Requires `self.inputs`, which is only assigned inside `__call__`.
+        """
+
+        class VaeModel(torch.nn.Module):
+            def __init__(self, model_id=self.model_id, base_vae=self.base_vae):
+                super().__init__()
+                self.vae = AutoencoderKL.from_pretrained(
+                    model_id,
+                    subfolder="vae",
+                )
+                self.base_vae = base_vae
+
+            def forward(self, input):
+                # The non-base variant applies the 1 / 0.18215 latent
+                # scaling and the 0-255 conversion inside the module; the
+                # base variant returns the clamped [0, 1] tensor as is.
+                if not self.base_vae:
+                    input = 1 / 0.18215 * input
+                x = self.vae.decode(input, return_dict=False)[0]
+                x = (x / 2 + 0.5).clamp(0, 1)
+                if self.base_vae:
+                    return x
+                x = x * 255.0
+                return x.round()
+
+        vae = VaeModel()
+        inputs = tuple(self.inputs["vae"])
+        is_f16 = self.precision == "fp16"
+        vae_name = "base_vae" if self.base_vae else "vae"
+        shark_vae = compile_through_fx(
+            vae,
+            inputs,
+            is_f16=is_f16,
+            model_name=vae_name + self.model_name,
+            extra_args=get_opt_flags("vae", precision=self.precision),
+        )
+        return shark_vae
+
+    def get_unet(self):
+        """Wrap the UNet, with guidance folded into forward, and compile it.
+
+        Requires `self.inputs`, which is only assigned inside `__call__`.
+        """
+
+        class UnetModel(torch.nn.Module):
+            def __init__(self, model_id=self.model_id):
+                super().__init__()
+                self.unet = UNet2DConditionModel.from_pretrained(
+                    model_id,
+                    subfolder="unet",
+                )
+                self.in_channels = self.unet.in_channels
+                self.train(False)
+
+            def forward(
+                self, latent, timestep, text_embedding, guidance_scale
+            ):
+                # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+                latents = torch.cat([latent] * 2)
+                unet_out = self.unet.forward(
+                    latents, timestep, text_embedding, return_dict=False
+                )[0]
+                noise_pred_uncond, noise_pred_text = unet_out.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (
+                    noise_pred_text - noise_pred_uncond
+                )
+                return noise_pred
+
+        unet = UnetModel()
+        is_f16 = self.precision == "fp16"
+        inputs = tuple(self.inputs["unet"])
+        # One flag per forward() argument; the last one (guidance_scale) is
+        # the only input marked False.
+        input_mask = [True, True, True, False]
+        shark_unet = compile_through_fx(
+            unet,
+            inputs,
+            model_name="unet" + self.model_name,
+            is_f16=is_f16,
+            f16_input_mask=input_mask,
+            extra_args=get_opt_flags("unet", precision=self.precision),
+        )
+        return shark_unet
+
+    def get_clip(self):
+        """Wrap the CLIP text encoder and compile it with fp32 flags.
+
+        Requires `self.inputs`, which is only assigned inside `__call__`.
+        """
+
+        class CLIPText(torch.nn.Module):
+            def __init__(self, model_id=self.model_id):
+                super().__init__()
+                self.text_encoder = CLIPTextModel.from_pretrained(
+                    model_id,
+                    subfolder="text_encoder",
+                )
+
+            def forward(self, input):
+                return self.text_encoder(input)[0]
+
+        clip_model = CLIPText()
+
+        shark_clip = compile_through_fx(
+            clip_model,
+            tuple(self.inputs["clip"]),
+            model_name="clip" + self.model_name,
+            extra_args=get_opt_flags("clip", precision="fp32"),
+        )
+        return shark_clip
+
+    def __call__(self):
+        """Compile all three sub-models, retrying over base configurations.
+
+        Returns:
+            The tuple (compiled_clip, compiled_unet, compiled_vae) for the
+            first entry of `base_models` that compiles. Exits the process
+            when none does.
+        """
+        for model_id in base_models:
+            self.inputs = get_input_info(
+                base_models[model_id],
+                self.max_len,
+                self.width,
+                self.height,
+                self.batch_size,
+            )
+            try:
+                compiled_clip = self.get_clip()
+                compiled_unet = self.get_unet()
+                compiled_vae = self.get_vae()
+            except Exception:
+                if args.enable_stack_trace:
+                    traceback.print_exc()
+                print("Retrying with a different base model configuration")
+                continue
+            # This is done just because in main.py we are basing the choice of tokenizer and scheduler
+            # on `args.hf_model_id`. Since now, we don't maintain 1:1 mapping of variants and the base
+            # model and rely on retrying method to find the input configuration, we should also update
+            # the knowledge of base model id accordingly into `args.hf_model_id`.
+            if args.ckpt_loc != "":
+                args.hf_model_id = model_id
+            return compiled_clip, compiled_unet, compiled_vae
+        sys.exit(
+            "Cannot compile the model. Please use `enable_stack_trace` and create an issue at https://github.com/nod-ai/SHARK/issues"
+        )
--- a/apps/stable_diffusion/src/models/opt_params.py
+++ b/apps/stable_diffusion/src/models/opt_params.py
@@ -0,0 +1,113 @@
+import sys
+from transformers import CLIPTokenizer
+from ..utils import models_db, args, get_shark_model
+
+
+# Maps a Hugging Face model id to a [variant, version] pair. The getters in
+# this module unpack the pair to build the bucket and model keys they look
+# up in `models_db`.
+hf_model_variant_map = {
+    "Linaqruf/anything-v3.0": ["anythingv3", "v2_1base"],
+    "dreamlike-art/dreamlike-diffusion-1.0": ["dreamlike", "v2_1base"],
+    "prompthero/openjourney": ["openjourney", "v2_1base"],
+    "wavymulder/Analog-Diffusion": ["analogdiffusion", "v2_1base"],
+    "stabilityai/stable-diffusion-2-1": ["stablediffusion", "v2_1"],
+    "stabilityai/stable-diffusion-2-1-base": ["stablediffusion", "v2_1base"],
+    "CompVis/stable-diffusion-v1-4": ["stablediffusion", "v1_4"],
+}
+
+
+def get_params(bucket_key, model_key, model, is_tuned, precision):
+    """Look up the bucket, model name and compile flags for one model.
+
+    `models_db[0]` is indexed by `bucket_key`, `models_db[1]` by `model_key`
+    and `models_db[2]` by model / tuning / precision to obtain the flags.
+
+    Returns:
+        The tuple (bucket, model_name, iree_flags).
+
+    Raises:
+        Exception: If any of the database lookups raises KeyError.
+    """
+    flags = []
+    target_triple = args.iree_vulkan_target_triple
+    if len(target_triple) > 0:
+        flags.append(f"-iree-vulkan-target-triple={target_triple}")
+
+    # Disable bindings fusion to work with moltenVK.
+    if sys.platform == "darwin":
+        flags.append("-iree-stream-fuse-binding=false")
+
+    try:
+        bucket = models_db[0][bucket_key]
+        model_name = models_db[1][model_key]
+        flag_config = models_db[2][model][is_tuned][precision]
+        flags += flag_config["default_compilation_flags"]
+    except KeyError:
+        raise Exception(
+            f"{bucket_key}/{model_key} is not present in the models database"
+        )
+
+    if "specified_compilation_flags" in flag_config:
+        per_device_flags = flag_config["specified_compilation_flags"]
+        # Keep only the part before "://"; a device string without that
+        # separator is used as is.
+        device = args.device.split("://")[0]
+        if device not in per_device_flags:
+            device = "default_device"
+        flags += per_device_flags[device]
+
+    return bucket, model_name, flags
+
+
+def get_unet():
+    """Fetch the UNet selected by the current `args`.
+
+    Builds the bucket and model keys from the variant/version of
+    `args.hf_model_id`, the precision, the max length and the tuning mode,
+    then resolves them through `get_params` and `get_shark_model`.
+    """
+    variant, version = hf_model_variant_map[args.hf_model_id]
+    # Tuned model is present only for `fp16` precision.
+    tuning = "tuned" if args.use_tuned else "untuned"
+    bucket_key = f"{variant}/{tuning}"
+    model_key = (
+        f"{variant}/{version}/unet/{args.precision}"
+        f"/length_{args.max_length}/{tuning}"
+    )
+    # Tuned models on non-vulkan devices are keyed by device as well.
+    if "vulkan" not in args.device and args.use_tuned:
+        device_suffix = f"/{args.device}"
+        bucket_key += device_suffix
+        model_key += device_suffix
+
+    bucket, model_name, iree_flags = get_params(
+        bucket_key, model_key, "unet", tuning, args.precision
+    )
+    return get_shark_model(bucket, model_name, iree_flags)
+
+
+def get_vae():
+    """Fetch the VAE selected by the current `args`.
+
+    The model key always uses `length_77` and gains a "/base" component
+    when `args.use_base_vae` is set.
+    """
+    variant, version = hf_model_variant_map[args.hf_model_id]
+    # Tuned model is present only for `fp16` precision.
+    tuning = "tuned" if args.use_tuned else "untuned"
+    base_suffix = "/base" if args.use_base_vae else ""
+    bucket_key = f"{variant}/{tuning}"
+    model_key = (
+        f"{variant}/{version}/vae/{args.precision}/length_77/"
+        f"{tuning}{base_suffix}"
+    )
+    # Tuned models on non-vulkan devices are keyed by device as well.
+    if "vulkan" not in args.device and args.use_tuned:
+        device_suffix = f"/{args.device}"
+        bucket_key += device_suffix
+        model_key += device_suffix
+
+    bucket, model_name, iree_flags = get_params(
+        bucket_key, model_key, "vae", tuning, args.precision
+    )
+    return get_shark_model(bucket, model_name, iree_flags)
+
+
+def get_clip():
+    """Fetch the CLIP text encoder; it is always untuned and fp32."""
+    variant, version = hf_model_variant_map[args.hf_model_id]
+    tuning = "untuned"
+    precision = "fp32"
+    bucket_key = f"{variant}/{tuning}"
+    model_key = (
+        f"{variant}/{version}/clip/{precision}"
+        f"/length_{args.max_length}/{tuning}"
+    )
+    bucket, model_name, iree_flags = get_params(
+        bucket_key, model_key, "clip", tuning, precision
+    )
+    return get_shark_model(bucket, model_name, iree_flags)
+
+
+def get_tokenizer():
+    """Load the CLIP tokenizer from the "tokenizer" subfolder of the model."""
+    return CLIPTokenizer.from_pretrained(
+        args.hf_model_id, subfolder="tokenizer"
+    )
--- a/apps/stable_diffusion/src/pipelines/init.py
+++ b/apps/stable_diffusion/src/pipelines/init.py
@@ -0,0 +1 @@
+from .pipeline_shark_stable_diffusion_txt2img import Text2ImagePipeline
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_img2img.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_img2img.py
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_txt2img.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_txt2img.py
@@ -0,0 +1,132 @@
+import torch
+from tqdm.auto import tqdm
+import numpy as np
+from random import randint
+from transformers import CLIPTokenizer
+from typing import Union
+from shark.shark_inference import SharkInference
+from diffusers import (
+    DDIMScheduler,
+    PNDMScheduler,
+    LMSDiscreteScheduler,
+    EulerDiscreteScheduler,
+    EulerAncestralDiscreteScheduler,
+    DPMSolverMultistepScheduler,
+)
+from ..schedulers import SharkEulerDiscreteScheduler
+from .pipeline_shark_stable_diffusion_utils import StableDiffusionPipeline
+
+
+class Text2ImagePipeline(StableDiffusionPipeline):
+    """Text-to-image pipeline built on StableDiffusionPipeline.
+
+    Samples random initial latents, then uses the inherited helpers to
+    encode the prompts, run the denoising loop and decode the result.
+    """
+
+    def __init__(
+        self,
+        vae: SharkInference,
+        text_encoder: SharkInference,
+        tokenizer: CLIPTokenizer,
+        unet: SharkInference,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+            SharkEulerDiscreteScheduler,
+        ],
+    ):
+        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
+
+    def prepare_latents(
+        self,
+        batch_size,
+        height,
+        width,
+        generator,
+        num_inference_steps,
+        dtype,
+    ):
+        """Sample the initial latents and configure the scheduler.
+
+        Draws a (batch_size, 4, height // 8, width // 8) normal tensor in
+        float32 from `generator`, casts it to `dtype`, sets the scheduler
+        to `num_inference_steps` and multiplies the latents by the
+        scheduler's `init_noise_sigma`.
+        """
+        latents = torch.randn(
+            (
+                batch_size,
+                4,
+                height // 8,
+                width // 8,
+            ),
+            generator=generator,
+            dtype=torch.float32,
+        ).to(dtype)
+
+        self.scheduler.set_timesteps(num_inference_steps)
+        # NOTE(review): presumably set so the scheduler does not complain
+        # that scale_model_input was never called -- confirm.
+        self.scheduler.is_scale_input_called = True
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def generate_images(
+        self,
+        prompts,
+        neg_prompts,
+        batch_size,
+        height,
+        width,
+        num_inference_steps,
+        guidance_scale,
+        seed,
+        max_length,
+        dtype,
+        use_base_vae,
+        cpu_scheduling,
+    ):
+        """Generate images for the given prompts.
+
+        A string prompt (or negative prompt) is wrapped in a list, and both
+        lists are repeated `batch_size` times. A `seed` outside the uint32
+        range checked below is replaced by a random one.
+
+        Returns:
+            A list with the images produced by `decode_latents`.
+        """
+        # prompts and negative prompts must be a list.
+        if isinstance(prompts, str):
+            prompts = [prompts]
+
+        if isinstance(neg_prompts, str):
+            neg_prompts = [neg_prompts]
+
+        prompts = prompts * batch_size
+        neg_prompts = neg_prompts * batch_size
+
+        # Seed the generator that creates the initial latent noise. Also handle out-of-range seeds.
+        uint32_info = np.iinfo(np.uint32)
+        uint32_min, uint32_max = uint32_info.min, uint32_info.max
+        if seed < uint32_min or seed >= uint32_max:
+            seed = randint(uint32_min, uint32_max)
+        generator = torch.manual_seed(seed)
+
+        # Get initial latents
+        init_latents = self.prepare_latents(
+            batch_size=batch_size,
+            height=height,
+            width=width,
+            generator=generator,
+            num_inference_steps=num_inference_steps,
+            dtype=dtype,
+        )
+
+        # Get text embeddings from prompts
+        text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
+
+        # guidance scale as a float32 tensor.
+        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
+
+        # Get Image latents
+        latents = self.produce_img_latents(
+            latents=init_latents,
+            text_embeddings=text_embeddings,
+            guidance_scale=guidance_scale,
+            total_timesteps=self.scheduler.timesteps,
+            dtype=dtype,
+            cpu_scheduling=cpu_scheduling,
+        )
+
+        # Img latents -> PIL images
+        all_imgs = []
+        # Decode the latents in slices of `batch_size` along dimension 0.
+        for i in tqdm(range(0, latents.shape[0], batch_size)):
+            imgs = self.decode_latents(
+                latents=latents[i : i + batch_size],
+                use_base_vae=use_base_vae,
+                cpu_scheduling=cpu_scheduling,
+            )
+            all_imgs.extend(imgs)
+
+        return all_imgs
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py
@@ -0,0 +1,205 @@
+import torch
+from transformers import CLIPTokenizer
+import torchvision.transforms as T
+from tqdm.auto import tqdm
+import time
+from typing import Union
+from diffusers import (
+    DDIMScheduler,
+    PNDMScheduler,
+    LMSDiscreteScheduler,
+    EulerDiscreteScheduler,
+    EulerAncestralDiscreteScheduler,
+    DPMSolverMultistepScheduler,
+)
+from shark.shark_inference import SharkInference
+from ..schedulers import SharkEulerDiscreteScheduler
+from ..models import (
+    SharkifyStableDiffusionModel,
+    get_vae,
+    get_clip,
+    get_unet,
+    get_tokenizer,
+)
+from ..utils import start_profiling, end_profiling, preprocessCKPT
+
+
+class StableDiffusionPipeline:
+    """Shared plumbing for the SHARK Stable Diffusion pipelines.
+
+    Holds the compiled sub-models, the tokenizer and the scheduler, and
+    accumulates timing messages in `self.log`.
+    """
+
+    def __init__(
+        self,
+        vae: SharkInference,
+        text_encoder: SharkInference,
+        tokenizer: CLIPTokenizer,
+        unet: SharkInference,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+            SharkEulerDiscreteScheduler,
+        ],
+    ):
+        self.vae = vae
+        self.text_encoder = text_encoder
+        self.tokenizer = tokenizer
+        self.unet = unet
+        self.scheduler = scheduler
+        # TODO: Implement using logging python utility.
+        self.log = ""
+
+    def encode_prompts(self, prompts, neg_prompts, max_length):
+        """Tokenize both prompt lists and run them through the text encoder.
+
+        The negative-prompt token ids are concatenated ahead of the prompt
+        token ids so the encoder receives one batch. The inference time is
+        appended to `self.log`.
+
+        Returns:
+            The output of the compiled text encoder for that batch.
+        """
+        # Tokenize text and get embeddings
+        text_input = self.tokenizer(
+            prompts,
+            padding="max_length",
+            max_length=max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        # Get unconditional embeddings as well
+        uncond_input = self.tokenizer(
+            neg_prompts,
+            padding="max_length",
+            max_length=max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        text_input = torch.cat([uncond_input.input_ids, text_input.input_ids])
+
+        clip_inf_start = time.time()
+        text_embeddings = self.text_encoder("forward", (text_input,))
+        clip_inf_time = (time.time() - clip_inf_start) * 1000
+        self.log += f"\nClip Inference time (ms) = {clip_inf_time:.3f}"
+
+        return text_embeddings
+
+    def decode_latents(self, latents, use_base_vae, cpu_scheduling):
+        """Run the VAE on `latents` and convert the result to PIL images.
+
+        With `use_base_vae` the 1 / 0.18215 latent scaling and the
+        conversion to the 0-255 range are applied here; otherwise the VAE
+        output is converted to uint8 as is. The inference time is appended
+        to `self.log`.
+        """
+        if use_base_vae:
+            latents = 1 / 0.18215 * latents
+
+        latents_numpy = latents
+        if cpu_scheduling:
+            latents_numpy = latents.detach().numpy()
+
+        profile_device = start_profiling(file_path="vae.rdc")
+        vae_start = time.time()
+        images = self.vae("forward", (latents_numpy,))
+        vae_inf_time = (time.time() - vae_start) * 1000
+        end_profiling(profile_device)
+        self.log += f"\nVAE Inference time (ms): {vae_inf_time:.3f}"
+
+        if use_base_vae:
+            images = torch.from_numpy(images)
+            images = (images.detach().cpu() * 255.0).numpy()
+            images = images.round()
+
+        transform = T.ToPILImage()
+        pil_images = [
+            transform(image)
+            for image in torch.from_numpy(images).to(torch.uint8)
+        ]
+        return pil_images
+
+    def produce_img_latents(
+        self,
+        latents,
+        text_embeddings,
+        guidance_scale,
+        total_timesteps,
+        dtype,
+        cpu_scheduling,
+        return_all_latents=False,
+    ):
+        """Run the denoising loop over `total_timesteps`.
+
+        Each iteration scales the latents through the scheduler, calls the
+        UNet and steps the scheduler. With `cpu_scheduling` the tensors are
+        converted to and from numpy around the UNet call and the
+        scheduler's `prev_sample` is used; otherwise the scheduler results
+        are passed along unchanged. The average step time is appended to
+        `self.log` (reported as 0.0 when there are no timesteps).
+
+        Returns:
+            The final latents, or every intermediate latent concatenated
+            along dimension 0 when `return_all_latents` is true.
+        """
+        step_time_sum = 0
+        latent_history = [latents]
+        text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
+        text_embeddings_numpy = text_embeddings.detach().numpy()
+        # Wrap the sequence itself rather than the enumerate object so that
+        # tqdm can read its length and show the total.
+        for i, t in enumerate(tqdm(total_timesteps)):
+            step_start_time = time.time()
+            timestep = torch.tensor([t]).to(dtype).detach().numpy()
+            latent_model_input = self.scheduler.scale_model_input(latents, t)
+            if cpu_scheduling:
+                latent_model_input = latent_model_input.detach().numpy()
+
+            # Profiling Unet.
+            profile_device = start_profiling(file_path="unet.rdc")
+            noise_pred = self.unet(
+                "forward",
+                (
+                    latent_model_input,
+                    timestep,
+                    text_embeddings_numpy,
+                    guidance_scale,
+                ),
+                send_to_host=False,
+            )
+            end_profiling(profile_device)
+
+            if cpu_scheduling:
+                noise_pred = torch.from_numpy(noise_pred.to_host())
+                latents = self.scheduler.step(
+                    noise_pred, t, latents
+                ).prev_sample
+            else:
+                latents = self.scheduler.step(noise_pred, t, latents)
+
+            latent_history.append(latents)
+            step_time = (time.time() - step_start_time) * 1000
+            #  self.log += (
+            #      f"\nstep = {i} | timestep = {t} | time = {step_time:.2f}ms"
+            #  )
+            step_time_sum += step_time
+
+        # Guard against an empty schedule instead of dividing by zero.
+        num_steps = len(total_timesteps)
+        avg_step_time = step_time_sum / num_steps if num_steps else 0.0
+        self.log += f"\nAverage step time: {avg_step_time}ms/it"
+
+        if not return_all_latents:
+            return latents
+        all_latents = torch.cat(latent_history, dim=0)
+        return all_latents
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+            SharkEulerDiscreteScheduler,
+        ],
+        import_mlir: bool,
+        model_id: str,
+        ckpt_loc: str,
+        precision: str,
+        max_length: int,
+        batch_size: int,
+        height: int,
+        width: int,
+        use_base_vae: bool,
+    ):
+        """Build a pipeline from compiled or pre-built sub-models.
+
+        With `import_mlir` the sub-models are compiled through
+        SharkifyStableDiffusionModel (after `preprocessCKPT()` when
+        `ckpt_loc` is set); otherwise they come from `get_vae`, `get_clip`
+        and `get_unet`.
+        """
+        if import_mlir:
+            if ckpt_loc:
+                preprocessCKPT()
+            mlir_import = SharkifyStableDiffusionModel(
+                model_id,
+                ckpt_loc,
+                precision,
+                max_len=max_length,
+                batch_size=batch_size,
+                height=height,
+                width=width,
+                use_base_vae=use_base_vae,
+            )
+            clip, unet, vae = mlir_import()
+            return cls(vae, clip, get_tokenizer(), unet, scheduler)
+        return cls(
+            get_vae(), get_clip(), get_tokenizer(), get_unet(), scheduler
+        )
--- a/apps/stable_diffusion/src/schedulers/init.py
+++ b/apps/stable_diffusion/src/schedulers/init.py
@@ -0,0 +1,2 @@
+from .sd_schedulers import get_schedulers
+from .shark_eulerdiscrete import SharkEulerDiscreteScheduler
--- a/apps/stable_diffusion/src/schedulers/sd_schedulers.py
+++ b/apps/stable_diffusion/src/schedulers/sd_schedulers.py
@@ -0,0 +1,51 @@
+from diffusers import (
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+    EulerAncestralDiscreteScheduler,
+)
+from .shark_eulerdiscrete import (
+    SharkEulerDiscreteScheduler,
+)
+
+
+def get_schedulers(model_id):
+    """Load every supported scheduler from the model's "scheduler" folder.
+
+    Returns:
+        A dict mapping scheduler names to instances. The
+        "SharkEulerDiscrete" entry has `compile()` called on it before the
+        dict is returned.
+    """
+    scheduler_classes = (
+        ("PNDM", PNDMScheduler),
+        ("LMSDiscrete", LMSDiscreteScheduler),
+        ("DDIM", DDIMScheduler),
+        ("DPMSolverMultistep", DPMSolverMultistepScheduler),
+        ("EulerDiscrete", EulerDiscreteScheduler),
+        ("EulerAncestralDiscrete", EulerAncestralDiscreteScheduler),
+        ("SharkEulerDiscrete", SharkEulerDiscreteScheduler),
+    )
+    schedulers = {
+        name: scheduler_cls.from_pretrained(model_id, subfolder="scheduler")
+        for name, scheduler_cls in scheduler_classes
+    }
+    schedulers["SharkEulerDiscrete"].compile()
+    return schedulers
--- a/apps/stable_diffusion/src/schedulers/shark_eulerdiscrete.py
+++ b/apps/stable_diffusion/src/schedulers/shark_eulerdiscrete.py
@@ -0,0 +1,139 @@
+import sys
+import numpy as np
+from typing import List, Optional, Tuple, Union
+from diffusers import (
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+)
+from diffusers.configuration_utils import register_to_config
+from ..utils import compile_through_fx, get_shark_model, args
+import torch
+
+
+class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
+    """EulerDiscreteScheduler whose arithmetic runs in SHARK modules.
+
+    `compile()` must be called before `scale_model_input` or `step`, since
+    it is what assigns `self.scaling_model` and `self.step_model`.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        prediction_type: str = "epsilon",
+    ):
+        super().__init__(
+            num_train_timesteps,
+            beta_start,
+            beta_end,
+            beta_schedule,
+            trained_betas,
+            prediction_type,
+        )
+
+    def compile(self):
+        """Create the scaling and step modules used by this scheduler.
+
+        With `args.import_mlir` both modules are compiled from the small
+        torch modules defined below; otherwise they are fetched from
+        SCHEDULER_BUCKET by name.
+        """
+        SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"
+        BATCH_SIZE = args.batch_size
+
+        # Example tensors handed to the compiler.
+        model_input = {
+            "euler": {
+                "latent": torch.randn(
+                    BATCH_SIZE, 4, args.height // 8, args.width // 8
+                ),
+                "output": torch.randn(
+                    BATCH_SIZE, 4, args.height // 8, args.width // 8
+                ),
+                "sigma": torch.tensor(1).to(torch.float32),
+                "dt": torch.tensor(1).to(torch.float32),
+            },
+        }
+
+        example_latent = model_input["euler"]["latent"]
+        example_output = model_input["euler"]["output"]
+        # Only the latent and output examples are converted to half
+        # precision; sigma and dt stay float32.
+        if args.precision == "fp16":
+            example_latent = example_latent.half()
+            example_output = example_output.half()
+        example_sigma = model_input["euler"]["sigma"]
+        example_dt = model_input["euler"]["dt"]
+
+        class ScalingModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, latent, sigma):
+                # latent / sqrt(sigma^2 + 1)
+                return latent / ((sigma**2 + 1) ** 0.5)
+
+        class SchedulerStepModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, noise_pred, sigma, latent, dt):
+                # One Euler update: latent + derivative * dt.
+                pred_original_sample = latent - sigma * noise_pred
+                derivative = (latent - pred_original_sample) / sigma
+                return latent + derivative * dt
+
+        iree_flags = []
+        if len(args.iree_vulkan_target_triple) > 0:
+            iree_flags.append(
+                f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
+            )
+        # Disable bindings fusion to work with moltenVK.
+        if sys.platform == "darwin":
+            iree_flags.append("-iree-stream-fuse-binding=false")
+
+        if args.import_mlir:
+            scaling_model = ScalingModel()
+            self.scaling_model = compile_through_fx(
+                scaling_model,
+                (example_latent, example_sigma),
+                model_name=f"euler_scale_model_input_{BATCH_SIZE}_{args.height}_{args.width}"
+                + args.precision,
+                extra_args=iree_flags,
+            )
+
+            step_model = SchedulerStepModel()
+            self.step_model = compile_through_fx(
+                step_model,
+                (example_output, example_sigma, example_latent, example_dt),
+                model_name=f"euler_step_{BATCH_SIZE}_{args.height}_{args.width}"
+                + args.precision,
+                extra_args=iree_flags,
+            )
+        else:
+            # The fetched names carry only the precision, not the batch
+            # size or image size used in the compiled names above.
+            self.scaling_model = get_shark_model(
+                SCHEDULER_BUCKET,
+                "euler_scale_model_input_" + args.precision,
+                iree_flags,
+            )
+            self.step_model = get_shark_model(
+                SCHEDULER_BUCKET, "euler_step_" + args.precision, iree_flags
+            )
+
+    def scale_model_input(self, sample, timestep):
+        """Scale `sample` by the sigma of `timestep` via the scaling module.
+
+        Returns the module output directly.
+        """
+        # .item() requires exactly one entry of self.timesteps to match.
+        step_index = (self.timesteps == timestep).nonzero().item()
+        sigma = self.sigmas[step_index]
+        # NOTE(review): send_to_host=False presumably leaves the result on
+        # the device -- confirm against the module's call signature.
+        return self.scaling_model(
+            "forward",
+            (
+                sample,
+                sigma,
+            ),
+            send_to_host=False,
+        )
+
+    def step(self, noise_pred, timestep, latent):
+        """Advance `latent` by one step using the step module.
+
+        Returns the module output directly rather than an object with a
+        `prev_sample` attribute.
+        """
+        step_index = (self.timesteps == timestep).nonzero().item()
+        sigma = self.sigmas[step_index]
+        # dt is the difference between the next sigma and the current one.
+        dt = self.sigmas[step_index + 1] - sigma
+        return self.step_model(
+            "forward",
+            (
+                noise_pred,
+                sigma,
+                latent,
+                dt,
+            ),
+            send_to_host=False,
+        )
--- a/apps/stable_diffusion/src/utils/init.py
+++ b/apps/stable_diffusion/src/utils/init.py
@@ -0,0 +1,19 @@
+from .profiler import start_profiling, end_profiling
+from .resources import (
+    prompt_examples,
+    models_db,
+    base_models,
+    opt_flags,
+    resource_path,
+)
+from .stable_args import args
+from .utils import (
+    get_shark_model,
+    compile_through_fx,
+    set_iree_runtime_flags,
+    map_device_to_name_path,
+    set_init_device_flags,
+    get_available_devices,
+    get_opt_flags,
+    preprocessCKPT,
+)
--- a/apps/stable_diffusion/src/utils/profiler.py
+++ b/apps/stable_diffusion/src/utils/profiler.py
@@ -0,0 +1,18 @@
+from .stable_args import args
+
+
+# Helper function to profile the vulkan device.
+def start_profiling(file_path="foo.rdc", profiling_mode="queue"):
+    if args.vulkan_debug_utils and "vulkan" in args.device:
+        import iree
+
+        print(f"Profiling and saving to {file_path}.")
+        vulkan_device = iree.runtime.get_device(args.device)
+        vulkan_device.begin_profiling(mode=profiling_mode, file_path=file_path)
+        return vulkan_device
+    return None
+
+
+def end_profiling(device):
+    if device:
+        return device.end_profiling()
--- a/apps/stable_diffusion/src/utils/resources.py
+++ b/apps/stable_diffusion/src/utils/resources.py
@@ -0,0 +1,37 @@
+import os
+import json
+import sys
+
+
+def resource_path(relative_path):
+    """Get absolute path to resource, works for dev and for PyInstaller"""
+    base_path = getattr(
+        sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
+    )
+    return os.path.join(base_path, relative_path)
+
+
+def get_json_file(path):
+    json_var = []
+    loc_json = resource_path(path)
+    if os.path.exists(loc_json):
+        with open(loc_json, encoding="utf-8") as fopen:
+            json_var = json.load(fopen)
+
+    if not json_var:
+        print(f"Unable to fetch {path}")
+
+    return json_var
+
+
+# TODO: These loads run at import time, so every import of this module reads
+# all four JSON files. Move them behind an explicit loader.
+# Each path is resolved by `resource_path`; a missing file yields an empty
+# list and an "Unable to fetch" message instead of an exception.
+prompt_examples = get_json_file("../../resources/prompts.json")
+models_db = get_json_file("../../resources/model_db.json")
+
+# The base_model contains the input configuration for the different
+# models and also helps in providing information for the variants.
+base_models = get_json_file("../../resources/base_model.json")
+
+# Contains optimization flags for different models.
+opt_flags = get_json_file("../../resources/opt_flags.json")
--- a/apps/stable_diffusion/src/utils/stable_args.py
+++ b/apps/stable_diffusion/src/utils/stable_args.py
@@ -0,0 +1,323 @@
+import argparse
+from pathlib import Path
+
+
+def path_expand(s):
+    return Path(s).expanduser().resolve()
+
+
+# Shared command-line parser for the stable diffusion app.
+# This module has no docstring, so `__doc__` (the description) is None.
+p = argparse.ArgumentParser(
+    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+
+##############################################################################
+### Stable Diffusion Params
+##############################################################################
+
+# action="append": every -p/--prompts occurrence adds one entry to the list.
+p.add_argument(
+    "-p",
+    "--prompts",
+    action="append",
+    default=[],
+    help="text of which images to be generated.",
+)
+
+# nargs="+": one flag followed by one or more values.
+p.add_argument(
+    "--negative-prompts",
+    nargs="+",
+    default=[""],
+    help="text you don't want to see in the generated image.",
+)
+
+p.add_argument(
+    "--steps",
+    type=int,
+    default=50,
+    help="the no. of steps to do the sampling.",
+)
+
+p.add_argument(
+    "--seed",
+    type=int,
+    default=42,
+    help="the seed to use.",
+)
+
+# range(1, 4) accepts 1, 2 or 3.
+p.add_argument(
+    "--batch_size",
+    type=int,
+    default=1,
+    choices=range(1, 4),
+    help="the number of inferences to be made in a single `run`.",
+)
+
+p.add_argument(
+    "--height",
+    type=int,
+    default=512,
+    help="the height of the output image.",
+)
+
+p.add_argument(
+    "--width",
+    type=int,
+    default=512,
+    help="the width of the output image.",
+)
+
+p.add_argument(
+    "--guidance_scale",
+    type=float,
+    default=7.5,
+    help="the value to be used for guidance scaling.",
+)
+
+# The help names 64 and 77, but no `choices` is set, so the parser accepts
+# any integer here.
+p.add_argument(
+    "--max_length",
+    type=int,
+    default=64,
+    help="max length of the tokenizer output, options are 64 and 77.",
+)
+
+##############################################################################
+### Model Config and Usage Params
+##############################################################################
+
+p.add_argument(
+    "--device", type=str, default="vulkan", help="device to run the model."
+)
+
+p.add_argument(
+    "--precision", type=str, default="fp16", help="precision to run the model."
+)
+
+p.add_argument(
+    "--import_mlir",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="imports the model from torch module to shark_module otherwise downloads the model from shark_tank.",
+)
+
+p.add_argument(
+    "--load_vmfb",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="attempts to load the model from a precompiled flatbuffer and compiles + saves it if not found.",
+)
+
+p.add_argument(
+    "--save_vmfb",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="saves the compiled flatbuffer to the local directory",
+)
+
+p.add_argument(
+    "--use_tuned",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="Download and use the tuned version of the model if available",
+)
+
+p.add_argument(
+    "--use_base_vae",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Do conversion from the VAE output to pixel space on cpu.",
+)
+
+p.add_argument(
+    "--scheduler",
+    type=str,
+    default="SharkEulerDiscrete",
+    help="other supported schedulers are [PNDM, DDIM, LMSDiscrete, EulerDiscrete, DPMSolverMultistep]",
+)
+
+p.add_argument(
+    "--output_img_format",
+    type=str,
+    default="png",
+    help="specify the format in which output image is save. Supported options: jpg / png",
+)
+
+p.add_argument(
+    "--output_dir",
+    type=str,
+    default=None,
+    help="Directory path to save the output images and json",
+)
+
+p.add_argument(
+    "--runs",
+    type=int,
+    default=1,
+    help="number of images to be generated with random seeds in single execution",
+)
+
+p.add_argument(
+    "--ckpt_loc",
+    type=str,
+    default="",
+    help="Path to SD's .ckpt file.",
+)
+
+p.add_argument(
+    "--hf_model_id",
+    type=str,
+    default="stabilityai/stable-diffusion-2-1-base",
+    help="The repo-id of hugging face.",
+)
+
+p.add_argument(
+    "--enable_stack_trace",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Enable showing the stack trace when retrying the base model configuration",
+)
+
+##############################################################################
+### IREE - Vulkan supported flags
+##############################################################################
+
+# An empty string means "not specified".
+p.add_argument(
+    "--iree-vulkan-target-triple",
+    type=str,
+    default="",
+    help="Specify target triple for vulkan",
+)
+
+p.add_argument(
+    "--vulkan_debug_utils",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Profiles vulkan device and collects the .rdc info",
+)
+
+# NOTE(review): the help says "default is 4G", but 4147483648 is neither
+# 4 * 1024**3 (4294967296) nor 4 * 10**9 -- confirm the intended value.
+# The value is kept as a string (no `type=`).
+p.add_argument(
+    "--vulkan_large_heap_block_size",
+    default="4147483648",
+    help="flag for setting VMA preferredLargeHeapBlockSize for vulkan device, default is 4G",
+)
+
+# NOTE(review): the help speaks of "disabling", yet the default is already
+# False and passing the flag turns the value on -- confirm the wording.
+p.add_argument(
+    "--vulkan_validation_layers",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for disabling vulkan validation layers when benchmarking",
+)
+
+##############################################################################
+### Misc. Debug and Optimization flags
+##############################################################################
+
+p.add_argument(
+    "--use_compiled_scheduler",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="use the default scheduler precompiled into the model if available",
+)
+
+p.add_argument(
+    "--local_tank_cache",
+    default="",
+    help="Specify where to save downloaded shark_tank artifacts. If this is not set, the default is ~/.local/shark_tank/.",
+)
+
+p.add_argument(
+    "--dump_isa",
+    default=False,
+    action="store_true",
+    help="When enabled call amdllpc to get ISA dumps. use with dispatch benchmarks.",
+)
+
+p.add_argument(
+    "--dispatch_benchmarks",
+    default=None,
+    help='dispatches to return benchamrk data on.  use "All" for all, and None for none.',
+)
+
+p.add_argument(
+    "--dispatch_benchmarks_dir",
+    default="temp_dispatch_benchmarks",
+    help='directory where you want to store dispatch data generated with "--dispatch_benchmarks"',
+)
+
+p.add_argument(
+    "--enable_rgp",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for inserting debug frames between iterations for use with rgp.",
+)
+
+p.add_argument(
+    "--hide_steps",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="flag for hiding the details of iteration/sec for each step.",
+)
+
+p.add_argument(
+    "--warmup_count",
+    type=int,
+    default=0,
+    help="flag setting warmup count for clip and vae [>= 0].",
+)
+
+p.add_argument(
+    "--clear_all",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag to clear all mlir and vmfb from common locations. Recompiling will take several minutes",
+)
+
+##############################################################################
+### Web UI flags
+##############################################################################
+
+p.add_argument(
+    "--progress_bar",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="flag for removing the pregress bar animation during image generation",
+)
+
+p.add_argument(
+    "--share",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for generating a public URL",
+)
+
+p.add_argument(
+    "--server_port",
+    type=int,
+    default=8080,
+    help="flag for setting server port",
+)
+
+##############################################################################
+### SD model auto-annotation flags
+##############################################################################
+
+# `type=path_expand` is also applied to the string default, so the parsed
+# value is a resolved Path even when the flag is not given.
+p.add_argument(
+    "--annotation_output",
+    type=path_expand,
+    default="./",
+    help="Directory to save the annotated mlir file",
+)
+
+p.add_argument(
+    "--annotation_model",
+    type=str,
+    default="unet",
+    help="Options are unet and vae.",
+)
+
+p.add_argument(
+    "--use_winograd",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Apply Winograd on selected conv ops.",
+)
+
+# Parsed once, at import time, from the process command line (sys.argv);
+# other modules import this `args` namespace and read or mutate it.
+args = p.parse_args()
--- a/apps/stable_diffusion/src/utils/utils.py
+++ b/apps/stable_diffusion/src/utils/utils.py
@@ -0,0 +1,351 @@
+import os
+import torch
+from shark.shark_inference import SharkInference
+from shark.shark_importer import import_with_fx
+from shark.iree_utils.vulkan_utils import (
+    set_iree_vulkan_runtime_flags,
+    get_vulkan_target_triple,
+)
+from shark.iree_utils.gpu_utils import get_cuda_sm_cc
+from .stable_args import args
+from .resources import opt_flags
+import sys
+from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
+    load_pipeline_from_original_stable_diffusion_ckpt,
+)
+
+
+def _compile_module(shark_module, model_name, extra_args=[]):
+    if args.load_vmfb or args.save_vmfb:
+        device = (
+            args.device
+            if "://" not in args.device
+            else "-".join(args.device.split("://"))
+        )
+        extended_name = "{}_{}".format(model_name, device)
+        vmfb_path = os.path.join(os.getcwd(), extended_name + ".vmfb")
+        if args.load_vmfb and os.path.isfile(vmfb_path) and not args.save_vmfb:
+            print(f"loading existing vmfb from: {vmfb_path}")
+            shark_module.load_module(vmfb_path, extra_args=extra_args)
+        else:
+            if args.save_vmfb:
+                print("Saving to {}".format(vmfb_path))
+            else:
+                print(
+                    "No vmfb found. Compiling and saving to {}".format(
+                        vmfb_path
+                    )
+                )
+            path = shark_module.save_module(
+                os.getcwd(), extended_name, extra_args
+            )
+            shark_module.load_module(path, extra_args=extra_args)
+    else:
+        shark_module.compile(extra_args)
+    return shark_module
+
+
+# Downloads the model from shark_tank and returns the shark_module.
+def get_shark_model(tank_url, model_name, extra_args=[]):
+    from shark.shark_downloader import download_model
+    from shark.parser import shark_args
+
+    # Set local shark_tank cache directory.
+    shark_args.local_tank_cache = args.local_tank_cache
+    if "cuda" in args.device:
+        shark_args.enable_tf32 = True
+
+    mlir_model, func_name, inputs, golden_out = download_model(
+        model_name,
+        tank_url=tank_url,
+        frontend="torch",
+    )
+    shark_module = SharkInference(
+        mlir_model, device=args.device, mlir_dialect="linalg"
+    )
+    return _compile_module(shark_module, model_name, extra_args)
+
+
+# Converts the torch-module into a shark_module.
+def compile_through_fx(
+    model,
+    inputs,
+    model_name,
+    is_f16=False,
+    f16_input_mask=None,
+    extra_args=[],
+):
+    mlir_module, func_name = import_with_fx(
+        model, inputs, is_f16, f16_input_mask
+    )
+    shark_module = SharkInference(
+        mlir_module,
+        device=args.device,
+        mlir_dialect="linalg",
+    )
+
+    return _compile_module(shark_module, model_name, extra_args)
+
+
+def set_iree_runtime_flags():
+    vulkan_runtime_flags = [
+        f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}",
+        f"--vulkan_validation_layers={'true' if args.vulkan_validation_layers else 'false'}",
+    ]
+    if args.enable_rgp:
+        vulkan_runtime_flags += [
+            f"--enable_rgp=true",
+            f"--vulkan_debug_utils=true",
+        ]
+    set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)
+
+
+def get_all_devices(driver_name):
+    """
+    Inputs: driver_name
+    Returns a list of all the available devices for a given driver sorted by
+    the iree path names of the device as in --list_devices option in iree.
+    """
+    from iree.runtime import get_driver
+
+    driver = get_driver(driver_name)
+    device_list_src = driver.query_available_devices()
+    device_list_src.sort(key=lambda d: d["path"])
+    return device_list_src
+
+
+def get_device_mapping(driver, key_combination=3):
+    """This method ensures consistent device ordering when choosing
+    specific devices for execution
+    Args:
+        driver (str): execution driver (vulkan, cuda, rocm, etc)
+        key_combination (int, optional): choice for mapping value for device name.
+        1 : path
+        2 : name
+        3 : (name, path)
+        Defaults to 3.
+    Returns:
+        dict: map to possible device names user can input mapped to desired combination of name/path.
+    """
+    from shark.iree_utils._common import iree_device_map
+
+    driver = iree_device_map(driver)
+    device_list = get_all_devices(driver)
+    device_map = dict()
+
+    def get_output_value(dev_dict):
+        if key_combination == 1:
+            return f"{driver}://{dev_dict['path']}"
+        if key_combination == 2:
+            return dev_dict["name"]
+        if key_combination == 3:
+            return (dev_dict["name"], f"{driver}://{dev_dict['path']}")
+
+    # mapping driver name to default device (driver://0)
+    device_map[f"{driver}"] = get_output_value(device_list[0])
+    for i, device in enumerate(device_list):
+        # mapping with index
+        device_map[f"{driver}://{i}"] = get_output_value(device)
+        # mapping with full path
+        device_map[f"{driver}://{device['path']}"] = get_output_value(device)
+    return device_map
+
+
+def map_device_to_name_path(device, key_combination=3):
+    """Gives the appropriate device data (supported name/path) for user selected execution device
+    Args:
+        device (str): user
+        key_combination (int, optional): choice for mapping value for device name.
+        1 : path
+        2 : name
+        3 : (name, path)
+        Defaults to 3.
+    Raises:
+        ValueError:
+    Returns:
+        str / tuple: returns the mapping str or tuple of mapping str for the device depending on key_combination value
+    """
+    driver = device.split("://")[0]
+    device_map = get_device_mapping(driver, key_combination)
+    try:
+        device_mapping = device_map[device]
+    except KeyError:
+        raise ValueError(f"Device '{device}' is not a valid device.")
+    return device_mapping
+
+
+def set_init_device_flags():
+    """Normalize the global `args` for the selected device and model.
+
+    Mutates `args` in place: `device`, `iree_vulkan_target_triple`,
+    `max_length`, `use_tuned` and `import_mlir` may all be rewritten.
+    Prints the detected Vulkan device and whether tuned models are used.
+    """
+    if "vulkan" in args.device:
+        # set runtime flags for vulkan.
+        set_iree_runtime_flags()
+
+        # set triple flag to avoid multiple calls to get_vulkan_triple_flag
+        # args.device is replaced by the path form returned by the mapping.
+        device_name, args.device = map_device_to_name_path(args.device)
+        # A triple given on the command line is never overridden.
+        if not args.iree_vulkan_target_triple:
+            triple = get_vulkan_target_triple(device_name)
+            if triple is not None:
+                args.iree_vulkan_target_triple = triple
+        print(
+            f"Found device {device_name}. Using target triple {args.iree_vulkan_target_triple}."
+        )
+    elif "cuda" in args.device:
+        args.device = "cuda"
+    elif "cpu" in args.device:
+        args.device = "cpu"
+
+    # set max_length based on availability.
+    if args.hf_model_id in [
+        "Linaqruf/anything-v3.0",
+        "wavymulder/Analog-Diffusion",
+        "dreamlike-art/dreamlike-diffusion-1.0",
+    ]:
+        args.max_length = 77
+    elif args.hf_model_id == "prompthero/openjourney":
+        args.max_length = 64
+
+    # Use tuned models in the case of a specific setting.
+    # The chain below only ever switches use_tuned off; the first matching
+    # condition wins.
+    if (
+        args.hf_model_id
+        in ["prompthero/openjourney", "dreamlike-art/dreamlike-diffusion-1.0"]
+        or args.precision != "fp16"
+    ):
+        args.use_tuned = False
+
+    elif (
+        "vulkan" in args.device
+        and "rdna3" not in args.iree_vulkan_target_triple
+    ):
+        args.use_tuned = False
+
+    elif "cuda" in args.device and get_cuda_sm_cc() not in ["sm_80", "sm_89"]:
+        args.use_tuned = False
+
+    elif args.use_base_vae and args.hf_model_id not in [
+        "stabilityai/stable-diffusion-2-1-base",
+        "CompVis/stable-diffusion-v1-4",
+    ]:
+        args.use_tuned = False
+
+    if args.use_tuned:
+        # "fp16" is hard-coded in the message: any other precision has
+        # already cleared use_tuned above.
+        print(f"Using tuned models for {args.hf_model_id}/fp16/{args.device}.")
+    else:
+        print("Tuned models are currently not supported for this setting.")
+
+    # set import_mlir to True for unuploaded models.
+    if args.hf_model_id not in [
+        "Linaqruf/anything-v3.0",
+        "dreamlike-art/dreamlike-diffusion-1.0",
+        "prompthero/openjourney",
+        "wavymulder/Analog-Diffusion",
+        "stabilityai/stable-diffusion-2-1",
+        "stabilityai/stable-diffusion-2-1-base",
+        "CompVis/stable-diffusion-v1-4",
+    ]:
+        args.import_mlir = True
+
+    # Any non-default image size or batch size also forces a local import.
+    if args.height != 512 or args.width != 512 or args.batch_size != 1:
+        args.import_mlir = True
+
+
+# Utility to get list of devices available.
+def get_available_devices():
+    def get_devices_by_name(driver_name):
+        from shark.iree_utils._common import iree_device_map
+
+        device_list = []
+        try:
+            driver_name = iree_device_map(driver_name)
+            device_list_dict = get_all_devices(driver_name)
+            print(f"{driver_name} devices are available.")
+        except:
+            print(f"{driver_name} devices are not available.")
+        else:
+            for i, device in enumerate(device_list_dict):
+                device_list.append(f"{device['name']} => {driver_name}://{i}")
+        return device_list
+
+    set_iree_runtime_flags()
+
+    available_devices = []
+    vulkan_devices = get_devices_by_name("vulkan")
+    available_devices.extend(vulkan_devices)
+    cuda_devices = get_devices_by_name("cuda")
+    available_devices.extend(cuda_devices)
+    available_devices.append("cpu")
+    return available_devices
+
+
+def disk_space_check(path, lim=20):
+    from shutil import disk_usage
+
+    du = disk_usage(path)
+    free = du.free / (1024 * 1024 * 1024)
+    if free <= lim:
+        print(f"[WARNING] Only {free:.2f}GB space available in {path}.")
+
+
+def get_opt_flags(model, precision="fp16"):
+    iree_flags = []
+    is_tuned = "tuned" if args.use_tuned else "untuned"
+    if len(args.iree_vulkan_target_triple) > 0:
+        iree_flags.append(
+            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
+        )
+
+    # Disable bindings fusion to work with moltenVK.
+    if sys.platform == "darwin":
+        iree_flags.append("-iree-stream-fuse-binding=false")
+
+    if "specified_compilation_flags" in opt_flags[model][is_tuned][precision]:
+        device = (
+            args.device
+            if "://" not in args.device
+            else args.device.split("://")[0]
+        )
+        if (
+            device
+            not in opt_flags[model][is_tuned][precision][
+                "specified_compilation_flags"
+            ]
+        ):
+            device = "default_device"
+        iree_flags += opt_flags[model][is_tuned][precision][
+            "specified_compilation_flags"
+        ][device]
+
+    return iree_flags
+
+
+def preprocessCKPT():
+    from pathlib import Path
+
+    path = Path(args.ckpt_loc)
+    diffusers_path = path.parent.absolute()
+    diffusers_directory_name = path.stem
+    complete_path_to_diffusers = diffusers_path / diffusers_directory_name
+    complete_path_to_diffusers.mkdir(parents=True, exist_ok=True)
+    print(
+        "Created directory : ",
+        diffusers_directory_name,
+        " at -> ",
+        diffusers_path,
+    )
+    path_to_diffusers = complete_path_to_diffusers.as_posix()
+    from_safetensors = (
+        True if args.ckpt_loc.lower().endswith(".safetensors") else False
+    )
+    # EMA weights usually yield higher quality images for inference but non-EMA weights have
+    # been yielding better results in our case.
+    # TODO: Add an option `--ema` (`--no-ema`) for users to specify if they want to go for EMA
+    #       weight extraction or not.
+    extract_ema = False
+    print("Loading pipeline from original stable diffusion checkpoint")
+    pipe = load_pipeline_from_original_stable_diffusion_ckpt(
+        checkpoint_path=args.ckpt_loc,
+        extract_ema=extract_ema,
+        from_safetensors=from_safetensors,
+    )
+    pipe.save_pretrained(path_to_diffusers)
+    print("Loading complete")
+    args.ckpt_loc = path_to_diffusers
+    print("Custom model path is : ", args.ckpt_loc)
--- a/apps/stable_diffusion/web/css/sd_dark_theme.css
+++ b/apps/stable_diffusion/web/css/sd_dark_theme.css
@@ -0,0 +1,67 @@
+/* Page-level containers */
+.gradio-container {
+    background-color: black;
+}
+
+.container {
+    padding-top: 20px !important;
+    background-color: black !important;
+}
+
+/* Title row and logos */
+#ui_title {
+    padding: 10px !important;
+}
+
+#top_logo {
+    border: 0;
+    border-radius: 0 !important;
+    background-color: transparent;
+}
+
+#demo_title {
+    width: 460px !important;
+    padding-top: 50px;
+    padding-bottom: 0;
+    border: 0;
+    border-radius: 0 !important;
+    background-color: black;
+}
+
+#demo_title_outer {
+    border-radius: 0;
+}
+
+/* Prompt inputs and examples */
+#prompt_box_outer div:first-child {
+    border-radius: 0 !important;
+}
+
+#prompt_box textarea {
+    background-color: #1d1d1d !important;
+}
+
+#prompt_examples {
+    margin: 0 !important;
+}
+
+#prompt_examples svg {
+    display: none !important;
+}
+
+.gr-sample-textbox {
+    border-width: 2px !important;
+    border-color: rgb(31, 41, 55) !important;
+    border-radius: 1rem !important;
+}
+
+/* Main body panel */
+#ui_body {
+    padding: 10px !important;
+    border-radius: 0.5em !important;
+    background-color: #111111 !important;
+}
+
+/* Hide the element that directly follows #img_result */
+#img_result+div {
+    display: none !important;
+}
+
+/* Hide the page footer */
+footer {
+    display: none !important;
+}
--- a/apps/stable_diffusion/web/gradio/img2img_ui.py
+++ b/apps/stable_diffusion/web/gradio/img2img_ui.py
--- a/apps/stable_diffusion/web/gradio/txt2img_ui.py
+++ b/apps/stable_diffusion/web/gradio/txt2img_ui.py
--- a/apps/stable_diffusion/web/index.py
+++ b/apps/stable_diffusion/web/index.py
@@ -0,0 +1,247 @@
+import os
+import sys
+from pathlib import Path
+
+# Default AMD_ENABLE_LLPC to "1" unless the caller already set it.
+if "AMD_ENABLE_LLPC" not in os.environ:
+    os.environ["AMD_ENABLE_LLPC"] = "1"
+
+# On macOS, DYLD_LIBRARY_PATH is set unconditionally; an existing value is
+# overwritten, not extended.
+if sys.platform == "darwin":
+    os.environ["DYLD_LIBRARY_PATH"] = "/usr/local/lib"
+
+
+def resource_path(relative_path):
+    """Get absolute path to resource, works for dev and for PyInstaller"""
+    base_path = getattr(
+        sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
+    )
+    return os.path.join(base_path, relative_path)
+
+
+import gradio as gr
+from PIL import Image
+from apps.stable_diffusion.src import (
+    prompt_examples,
+    args,
+    get_available_devices,
+)
+from apps.stable_diffusion.scripts import txt2img_inf
+
+# Logo images shown in the title row of the UI.
+nodlogo_loc = resource_path("logos/nod-logo.png")
+sdlogo_loc = resource_path("logos/sd-demo-logo.png")
+
+
+# Stylesheet handed to gr.Blocks through its `css` argument.
+demo_css = resource_path("css/sd_dark_theme.css")
+
+
+# Build the txt2img page: a title row with the two logos, then a body with
+# the input controls in the left column and the results in the right one.
+with gr.Blocks(title="Stable Diffusion", css=demo_css) as shark_web:
+    with gr.Row(elem_id="ui_title"):
+        nod_logo = Image.open(nodlogo_loc)
+        logo2 = Image.open(sdlogo_loc)
+        with gr.Row():
+            # Both logo columns use the same elem_id "demo_title_outer".
+            with gr.Column(scale=1, elem_id="demo_title_outer"):
+                gr.Image(
+                    value=nod_logo,
+                    show_label=False,
+                    interactive=False,
+                    elem_id="top_logo",
+                ).style(width=150, height=100)
+            with gr.Column(scale=5, elem_id="demo_title_outer"):
+                gr.Image(
+                    value=logo2,
+                    show_label=False,
+                    interactive=False,
+                    elem_id="demo_title",
+                ).style(width=150, height=100)
+
+    with gr.Row(elem_id="ui_body"):
+        with gr.Row():
+            # Left column: model selection, prompts and generation settings.
+            with gr.Column(scale=1, min_width=600):
+                with gr.Row():
+                    with gr.Group():
+                        model_id = gr.Dropdown(
+                            label="Model ID",
+                            value="stabilityai/stable-diffusion-2-1-base",
+                            choices=[
+                                "Linaqruf/anything-v3.0",
+                                "prompthero/openjourney",
+                                "wavymulder/Analog-Diffusion",
+                                "stabilityai/stable-diffusion-2-1",
+                                "stabilityai/stable-diffusion-2-1-base",
+                                "CompVis/stable-diffusion-v1-4",
+                            ],
+                        )
+                        custom_model_id = gr.Textbox(
+                            placeholder="check here: https://huggingface.co/models eg. runwayml/stable-diffusion-v1-5",
+                            value="",
+                            label="HuggingFace Model ID",
+                        )
+                    with gr.Group():
+                        ckpt_loc = gr.File(
+                            label="Upload checkpoint",
+                            file_types=[".ckpt", ".safetensors"],
+                        )
+
+                # Both textboxes below share the elem_id "prompt_box".
+                with gr.Group(elem_id="prompt_box_outer"):
+                    prompt = gr.Textbox(
+                        label="Prompt",
+                        value="cyberpunk forest by Salvador Dali",
+                        lines=1,
+                        elem_id="prompt_box",
+                    )
+                    negative_prompt = gr.Textbox(
+                        label="Negative Prompt",
+                        value="trees, green",
+                        lines=1,
+                        elem_id="prompt_box",
+                    )
+                with gr.Accordion(label="Advance Options", open=False):
+                    with gr.Row():
+                        scheduler = gr.Dropdown(
+                            label="Scheduler",
+                            value="SharkEulerDiscrete",
+                            choices=[
+                                "DDIM",
+                                "PNDM",
+                                "LMSDiscrete",
+                                "DPMSolverMultistep",
+                                "EulerDiscrete",
+                                "EulerAncestralDiscrete",
+                                "SharkEulerDiscrete",
+                            ],
+                        )
+                        batch_size = gr.Slider(
+                            1, 4, value=1, step=1, label="Number of Images"
+                        )
+                    with gr.Row():
+                        # NOTE(review): 786 is not on the step-8 grid that
+                        # starts at 384 (..., 776, 784, 792); possibly 768
+                        # was intended -- confirm.
+                        height = gr.Slider(
+                            384, 786, value=512, step=8, label="Height"
+                        )
+                        width = gr.Slider(
+                            384, 786, value=512, step=8, label="Width"
+                        )
+                        # Precision and max length are created hidden
+                        # (visible=False); their values are still passed on.
+                        precision = gr.Radio(
+                            label="Precision",
+                            value="fp16",
+                            choices=[
+                                "fp16",
+                                "fp32",
+                            ],
+                            visible=False,
+                        )
+                        max_length = gr.Radio(
+                            label="Max Length",
+                            value=64,
+                            choices=[
+                                64,
+                                77,
+                            ],
+                            visible=False,
+                        )
+                    with gr.Row():
+                        steps = gr.Slider(
+                            1, 100, value=50, step=1, label="Steps"
+                        )
+                        guidance_scale = gr.Slider(
+                            0,
+                            50,
+                            value=7.5,
+                            step=0.1,
+                            label="CFG Scale",
+                        )
+                with gr.Row():
+                    seed = gr.Number(value=-1, precision=0, label="Seed")
+                    # Devices are queried once, while the page is built; the
+                    # first entry becomes the default selection.
+                    available_devices = get_available_devices()
+                    device = gr.Dropdown(
+                        label="Device",
+                        value=available_devices[0],
+                        choices=available_devices,
+                    )
+                with gr.Row():
+                    random_seed = gr.Button("Randomize Seed")
+                    # No Python callback: the seed is produced by the
+                    # JavaScript snippet and written to the seed field.
+                    random_seed.click(
+                        None,
+                        inputs=[],
+                        outputs=[seed],
+                        _js="() => Math.floor(Math.random() * 4294967295)",
+                    )
+                    stable_diffusion = gr.Button("Generate Image")
+                with gr.Accordion(label="Prompt Examples!", open=False):
+                    ex = gr.Examples(
+                        examples=prompt_examples,
+                        inputs=prompt,
+                        cache_examples=False,
+                        elem_id="prompt_examples",
+                    )
+
+            # Right column: generated images, text output, save location.
+            with gr.Column(scale=1, min_width=600):
+                with gr.Group():
+                    gallery = gr.Gallery(
+                        label="Generated images",
+                        show_label=False,
+                        elem_id="gallery",
+                    ).style(grid=[2], height="auto")
+                    std_output = gr.Textbox(
+                        value="Nothing to show.",
+                        lines=4,
+                        show_label=False,
+                    )
+                # Displayed location: "generated_imgs" under --output_dir, or
+                # under the current working directory when it is not set.
+                output_dir = args.output_dir if args.output_dir else Path.cwd()
+                output_dir = Path(output_dir, "generated_imgs")
+                output_loc = gr.Textbox(
+                    label="Saving Images at",
+                    value=output_dir,
+                    interactive=False,
+                )
+
+        prompt.submit(
+            txt2img_inf,
+            inputs=[
+                prompt,
+                negative_prompt,
+                height,
+                width,
+                steps,
+                guidance_scale,
+                seed,
+                batch_size,
+                scheduler,
+                model_id,
+                custom_model_id,
+                ckpt_loc,
+                precision,
+                device,
+                max_length,
+            ],
+            outputs=[gallery, std_output],
+            show_progress=args.progress_bar,
+        )
+        stable_diffusion.click(
+            txt2img_inf,
+            inputs=[
+                prompt,
+                negative_prompt,
+                height,
+                width,
+                steps,
+                guidance_scale,
+                seed,
+                batch_size,
+                scheduler,
+                model_id,
+                custom_model_id,
+                ckpt_loc,
+                precision,
+                device,
+                max_length,
+            ],
+            outputs=[gallery, std_output],
+            show_progress=args.progress_bar,
+        )
+
+# Enable request queuing, then start the server. This runs at module level,
+# so importing this file launches the app.
+shark_web.queue()
+shark_web.launch(
+    share=args.share,
+    inbrowser=True,
+    # NOTE(review): "0.0.0.0" listens on all network interfaces, so the UI
+    # is reachable from other machines on the network -- confirm intended.
+    server_name="0.0.0.0",
+    server_port=args.server_port,
+)
--- a/apps/stable_diffusion/web/logos/Nod_logo.png
+++ b/apps/stable_diffusion/web/logos/Nod_logo.png
--- a/apps/stable_diffusion/web/logos/nod-logo.png
+++ b/apps/stable_diffusion/web/logos/nod-logo.png
--- a/apps/stable_diffusion/web/logos/sd-demo-logo.png
+++ b/apps/stable_diffusion/web/logos/sd-demo-logo.png
--- a/benchmarks/tests/test_benchmark.py
+++ b/benchmarks/tests/test_benchmark.py
@@ -42,7 +42,7 @@ class TFHuggingFaceLanguage(tf.Module):
            input_ids=x, attention_mask=y, token_type_ids=z, training=False
        )

-    @tf.function(input_signature=tf_bert_input)
+    @tf.function(input_signature=tf_bert_input, jit_compile=True)
    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.m.predict(input_ids, attention_mask, token_type_ids)

--- a/build_tools/image_comparison.py
+++ b/build_tools/image_comparison.py
@@ -0,0 +1,45 @@
+import argparse
+from PIL import Image
+import numpy as np
+
+import requests
+import shutil
+import os
+import subprocess
+
+# Command-line interface: the image under test and the URL of the reference
+# ("golden") image it is compared against.
+parser = argparse.ArgumentParser()
+
+# Path of the newly generated image. No default is given, so the value is
+# None when the option is omitted.
+parser.add_argument("-n", "--newfile")
+# URL of the golden image that is downloaded before the comparison.
+parser.add_argument(
+    "-g",
+    "--golden_url",
+    default="https://storage.googleapis.com/shark_tank/testdata/cyberpunk_fores_42_0_230119_021148.png",
+)
+
+
+def get_image(url, local_filename):
+    res = requests.get(url, stream=True)
+    if res.status_code == 200:
+        with open(local_filename, "wb") as f:
+            shutil.copyfileobj(res.raw, f)
+
+
+def compare_images(new_filename, golden_filename):
+    new = np.array(Image.open(new_filename)) / 255.0
+    golden = np.array(Image.open(golden_filename)) / 255.0
+    diff = np.abs(new - golden)
+    mean = np.mean(diff)
+    if mean > 0.01:
+        subprocess.run(
+            ["gsutil", "cp", new_filename, "gs://shark_tank/testdata/builder/"]
+        )
+        raise SystemExit("new and golden not close")
+    else:
+        print("SUCCESS")
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    tempfile_name = os.path.join(os.getcwd(), "golden.png")
+    get_image(args.golden_url, tempfile_name)
+    compare_images(args.newfile, tempfile_name)
--- a/build_tools/populate_sharktank_ci.sh
+++ b/build_tools/populate_sharktank_ci.sh
@@ -1,5 +1,5 @@
 #!/bin/bash

-IMPORTER=1 ./setup_venv.sh
+IMPORTER=1 BENCHMARK=1 ./setup_venv.sh
 source $GITHUB_WORKSPACE/shark.venv/bin/activate
 python generate_sharktank.py --upload=False --ci_tank_dir=True
--- a/build_tools/stable_diff_main_test.sh
+++ b/build_tools/stable_diff_main_test.sh
@@ -0,0 +1,7 @@
+rm -rf ./test_images
+mkdir test_images
+python shark/examples/shark_inference/stable_diffusion/main.py --device=vulkan --output_dir=./test_images --no-load_vmfb --no-use_tuned
+python shark/examples/shark_inference/stable_diffusion/main.py --device=vulkan --output_dir=./test_images --no-load_vmfb --no-use_tuned --beta_models=True
+
+python build_tools/image_comparison.py -n ./test_images/*.png
+exit $?
--- a/build_tools/stable_diffusion_testing.py
+++ b/build_tools/stable_diffusion_testing.py
@@ -0,0 +1,77 @@
+import os
+import subprocess
+from shark.examples.shark_inference.stable_diffusion.resources import (
+    get_json_file,
+)
+from shark.shark_downloader import download_public_file
+from image_comparison import compare_images
+import argparse
+from glob import glob
+import shutil
+
+# Model configuration read from the stable diffusion resources directory.
+# The path is built from os.getcwd(), so this only resolves when the working
+# directory is the one that contains the "shark" directory.
+model_config_dicts = get_json_file(
+    os.path.join(
+        os.getcwd(),
+        "shark/examples/shark_inference/stable_diffusion/resources/model_config.json",
+    )
+)
+
+
+def test_loop(device="vulkan", beta=False, extra_flags=[]):
+    # Get golden values from tank
+    shutil.rmtree("./test_images", ignore_errors=True)
+    os.mkdir("./test_images")
+    os.mkdir("./test_images/golden")
+    hf_model_names = model_config_dicts[0].values()
+    tuned_options = ["--no-use_tuned"]  #'use_tuned']
+    devices = ["vulkan"]
+    if beta:
+        extra_flags.append("--beta_models=True")
+    for model_name in hf_model_names:
+        for use_tune in tuned_options:
+            command = [
+                "python",
+                "shark/examples/shark_inference/stable_diffusion/main.py",
+                "--device=" + device,
+                "--output_dir=./test_images/" + model_name,
+                "--hf_model_id=" + model_name,
+                use_tune,
+            ]
+            command += extra_flags
+            generated_image = not subprocess.call(
+                command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
+            )
+            if generated_image:
+                os.makedirs(
+                    "./test_images/golden/" + model_name, exist_ok=True
+                )
+                download_public_file(
+                    "gs://shark_tank/testdata/golden/" + model_name,
+                    "./test_images/golden/" + model_name,
+                )
+                comparison = [
+                    "python",
+                    "build_tools/image_comparison.py",
+                    "--golden_url=gs://shark_tank/testdata/golden/"
+                    + model_name
+                    + "/*.png",
+                    "--newfile=./test_images/" + model_name + "/*.png",
+                ]
+                test_file = glob("./test_images/" + model_name + "/*.png")[0]
+                golden_path = "./test_images/golden/" + model_name + "/*.png"
+                golden_file = glob(golden_path)[0]
+                compare_images(test_file, golden_file)
+
+
+# Command-line options used when this module is run as a script.
+parser = argparse.ArgumentParser()
+
+parser.add_argument("-d", "--device", default="vulkan")
+# BooleanOptionalAction accepts both --beta and --no-beta.
+parser.add_argument(
+    "-b", "--beta", action=argparse.BooleanOptionalAction, default=False
+)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    # Echo the parsed options before starting the test loop.
+    print(args)
+    test_loop(args.device, args.beta, [])
--- a/conftest.py
+++ b/conftest.py
@@ -36,6 +36,12 @@ def pytest_addoption(parser):
        default="False",
        help="Enables uploading of reproduction artifacts upon test case failure during iree-compile or validation. Must be passed with --ci_sha option ",
    )
+    parser.addoption(
+        "--update_tank",
+        action="store_true",
+        default="False",
+        help="Update local shark tank with latest artifacts.",
+    )
    parser.addoption(
        "--ci_sha",
        action="store",
--- a/cpp/.gitignore
+++ b/cpp/.gitignore
@@ -0,0 +1,3 @@
+*.mlir
+*.vmfb
+*.ini
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -54,5 +54,29 @@ python -m pip install tensorflow

 *Run the vulkan_gui*
 ```bash
-./build/vulkan_gui/iree-samples-vulkan-gui
+./build/vulkan_gui/iree-samples-resnet-vulkan-gui
+```
+
+## Other models
+A tool for benchmarking other models is built and can be invoked with a command like the following:
+```bash
+./build/vulkan_gui/iree-vulkan-gui --module-file=path/to/.vmfb --function_input=...
+```
+See `./build/vulkan_gui/iree-vulkan-gui --help` for an explanation of the function input format. For example, the stable diffusion UNet can be tested with the following commands:
+```bash
+wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/stable_diff_tf.mlir
+iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 stable_diff_tf.mlir -o stable_diff_tf.vmfb
+./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=2x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32
+```
+The VAE and the CLIP autoencoder are also available:
+```bash
+# VAE
+wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/vae_tf/vae.mlir
+iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 vae.mlir -o vae.vmfb
+./build/vulkan_gui/iree-vulkan-gui --module-file=vae.vmfb --function_input=1x4x64x64xf32
+
+# CLIP Autoencoder
+wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/clip_tf/clip_autoencoder.mlir
+iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 clip_autoencoder.mlir -o clip_autoencoder.vmfb
+./build/vulkan_gui/iree-vulkan-gui --module-file=clip_autoencoder.vmfb --function_input=1x77xi32 --function_input=1x77xi32
 ```
--- a/cpp/save_img.py
+++ b/cpp/save_img.py
@@ -1,7 +1,6 @@
 import numpy as np
 import tensorflow as tf
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_tf_model


 def load_and_preprocess_image(fname: str):
--- a/cpp/vulkan_gui/CMakeLists.txt
+++ b/cpp/vulkan_gui/CMakeLists.txt
@@ -40,45 +40,77 @@ set(IMGUI_DIR ${CMAKE_BINARY_DIR}/_deps/imgui-src)
 message("Looking for Imgui in ${IMGUI_DIR}")
 include_directories(${IMGUI_DIR} ${IMGUI_DIR}/backends ..)

-# Define the sample executable.
-set(_NAME "iree-samples-vulkan-gui")
-add_executable(${_NAME} "")
-target_sources(${_NAME}
-  PRIVATE
-    vulkan_inference_gui.cc
-    "${IMGUI_DIR}/backends/imgui_impl_sdl.cpp"
-    "${IMGUI_DIR}/backends/imgui_impl_vulkan.cpp"
-    "${IMGUI_DIR}/imgui.cpp"
-    "${IMGUI_DIR}/imgui_draw.cpp"
-    "${IMGUI_DIR}/imgui_demo.cpp"
-    "${IMGUI_DIR}/imgui_tables.cpp"
-    "${IMGUI_DIR}/imgui_widgets.cpp"
-)
-set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "iree-samples-vulkan-gui")
-target_include_directories(${_NAME} PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
-)
-target_link_libraries(${_NAME}
-  SDL2::SDL2
-  Vulkan::Vulkan
-  iree_runtime_runtime
-  iree_base_internal_main
-  iree_hal_drivers_vulkan_registration_registration
-  iree_modules_hal_hal
-  iree_vm_vm
-  iree_vm_bytecode_module
-  iree_vm_cc
+
+# iree_vulkan_sample(NAME <target> SRCS <source>...)
+#
+# Defines an executable target called NAME that is built from SRCS plus the
+# Dear ImGui core and SDL/Vulkan backend sources found under IMGUI_DIR, and
+# links it against SDL2, Vulkan and the IREE libraries listed below.
+function(iree_vulkan_sample)
+
+  # No boolean options; NAME takes a single value, SRCS takes a list.
+  cmake_parse_arguments(
+    _RULE
+    ""
+    "NAME"
+    "SRCS"
+    ${ARGN}
+  )
+
+
+  # Define the sample executable.
+  set(_NAME "${_RULE_NAME}")
+  set(SRCS "${_RULE_SRCS}")
+  # The target is created without sources; they are attached just below.
+  add_executable(${_NAME} "")
+  target_sources(${_NAME}
+    PRIVATE
+      ${SRCS}
+      "${IMGUI_DIR}/backends/imgui_impl_sdl.cpp"
+      "${IMGUI_DIR}/backends/imgui_impl_vulkan.cpp"
+      "${IMGUI_DIR}/imgui.cpp"
+      "${IMGUI_DIR}/imgui_draw.cpp"
+      "${IMGUI_DIR}/imgui_demo.cpp"
+      "${IMGUI_DIR}/imgui_tables.cpp"
+      "${IMGUI_DIR}/imgui_widgets.cpp"
+  )
+  # The produced binary carries the same name as the target.
+  set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "${_NAME}")
+  target_include_directories(${_NAME} PUBLIC
+      $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
+  )
+  target_link_libraries(${_NAME}
+    SDL2::SDL2
+    Vulkan::Vulkan
+    iree_runtime_runtime
+    iree_base_internal_main
+    iree_hal_drivers_vulkan_registration_registration
+    iree_modules_hal_hal
+    iree_vm_vm
+    iree_vm_bytecode_module
+    iree_vm_cc
+    iree_tooling_vm_util_cc
+    iree_tooling_context_util
+  )
+
+  # On Windows the executable is linked for the console subsystem; no extra
+  # link options are used on other systems.
+  if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
+    set(_GUI_LINKOPTS "-SUBSYSTEM:CONSOLE")
+  else()
+    set(_GUI_LINKOPTS "")
+  endif()
+
+  target_link_options(${_NAME}
+    PRIVATE
+      ${_GUI_LINKOPTS}
+  )
+endfunction()
+
+iree_vulkan_sample(
+    NAME
+      iree-samples-resnet-vulkan-gui
+
+    SRCS
+      vulkan_resnet_inference_gui.cc
 )

-if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
-  set(_GUI_LINKOPTS "-SUBSYSTEM:CONSOLE")
-else()
-  set(_GUI_LINKOPTS "")
-endif()
+iree_vulkan_sample(
+    NAME
+      iree-vulkan-gui

-target_link_options(${_NAME}
-  PRIVATE
-    ${_GUI_LINKOPTS}
+    SRCS
+      vulkan_inference_gui.cc
 )

 message(STATUS "Configured vulkan_gui sample successfully")
--- a/cpp/vulkan_gui/vulkan_inference_gui.cc
+++ b/cpp/vulkan_gui/vulkan_inference_gui.cc
@@ -18,6 +18,12 @@
 #include <set>
 #include <vector>
 #include <fstream>
+#include <array>
+#include <cstdio>
+#include <cstdlib>
+#include <iterator>
+#include <string>
+#include <utility>

 #include "iree/hal/drivers/vulkan/api.h"

@@ -30,6 +36,15 @@
 #include "iree/vm/bytecode_module.h"
 #include "iree/vm/ref_cc.h"

+// iree-run-module
+#include "iree/base/internal/flags.h"
+#include "iree/base/status_cc.h"
+#include "iree/base/tracing.h"
+#include "iree/modules/hal/types.h"
+#include "iree/tooling/comparison.h"
+#include "iree/tooling/context_util.h"
+#include "iree/tooling/vm_util_cc.h"
+
 // Other dependencies (helpers, etc.)
 #include "iree/base/internal/main.h"

@@ -38,6 +53,49 @@
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"

+IREE_FLAG(string, entry_function, "",
+          "Name of a function contained in the module specified by module_file "
+          "to run.");
+
+// TODO(benvanik): move --function_input= flag into a util.
+// Flag parse callback: appends |value| as a new string to the
+// std::vector<std::string> that |storage| points to. Always returns OK.
+static iree_status_t parse_function_io(iree_string_view_t flag_name,
+                                       void* storage,
+                                       iree_string_view_t value) {
+  auto* values = static_cast<std::vector<std::string>*>(storage);
+  values->emplace_back(value.data, value.size);
+  return iree_ok_status();
+}
+// Flag print callback: writes one `--<flag>="<value>"` line to |file| for
+// every string stored in the std::vector<std::string> behind |storage|, or a
+// single commented-out `# --<flag>=` line when the vector is empty.
+static void print_function_io(iree_string_view_t flag_name, void* storage,
+                              FILE* file) {
+  const auto* values =
+      static_cast<const std::vector<std::string>*>(storage);
+  const int name_length = (int)flag_name.size;
+  if (values->empty()) {
+    fprintf(file, "# --%.*s=\n", name_length, flag_name.data);
+    return;
+  }
+  for (const std::string& value : *values) {
+    fprintf(file, "--%.*s=\"%s\"\n", name_length, flag_name.data,
+            value.c_str());
+  }
+}
+// Raw text of every --function_input flag; parse_function_io appends one
+// entry per occurrence, so entries follow the parse order.
+static std::vector<std::string> FLAG_function_inputs;
+// Registers --function_input with the parse/print callbacks above, using
+// FLAG_function_inputs as their storage.
+IREE_FLAG_CALLBACK(
+    parse_function_io, print_function_io, &FLAG_function_inputs, function_input,
+    "An input (a) value or (b) buffer of the format:\n"
+    "  (a) scalar value\n"
+    "     value\n"
+    "     e.g.: --function_input=\"3.14\"\n"
+    "  (b) buffer:\n"
+    "     [shape]xtype=[value]\n"
+    "     e.g.: --function_input=\"2x2xi32=1 2 3 4\"\n"
+    "Optionally, brackets may be used to separate the element values:\n"
+    "  2x2xi32=[[1 2][3 4]]\n"
+    "Raw binary files can be read to provide buffer contents:\n"
+    "  2x2xi32=@some/file.bin\n"
+    "numpy npy files (from numpy.save) can be read to provide 1+ values:\n"
+    "  @some.npy\n"
+    "Each occurrence of the flag indicates an input in the order they were\n"
+    "specified on the command line.");
+
 typedef struct iree_file_toc_t {
  const char* name;             // the file's original name
  char* data;             // beginning of the file
@@ -87,225 +145,6 @@ static void check_vk_result(VkResult err) {
  abort();
 }

-// Helper function to find Vulkan memory type bits. See ImGui_ImplVulkan_MemoryType() in imgui_impl_vulkan.cpp
-uint32_t findMemoryType(uint32_t type_filter, VkMemoryPropertyFlags properties)
-{
-  VkPhysicalDeviceMemoryProperties mem_properties;
-  vkGetPhysicalDeviceMemoryProperties(g_PhysicalDevice, &mem_properties);
-
-  for (uint32_t i = 0; i < mem_properties.memoryTypeCount; i++)
-  {
-    if ((type_filter & (1 << i)) && (mem_properties.memoryTypes[i].propertyFlags & properties) == properties)
-    {
-      return i;
-    }
-  }
-
-  return 0xFFFFFFFF; // Unable to find memoryType
-}
-
-// Helper function to load an image with common settings and return a VkDescriptorSet as a sort of Vulkan pointer
-bool LoadTextureFromFile(const char* filename, VkDescriptorSet* img_ds, int* image_width, int* image_height)
-{
-  // Specifying 4 channels forces stb to load the image in RGBA which is an easy format for Vulkan
-  int image_channels = 4;
-  unsigned char* image_data = stbi_load(filename, image_width, image_height, 0, image_channels);
-
-  if (image_data == NULL)
-  {
-    return false;
-  }
-
-  // Calculate allocation size (in number of bytes)
-  size_t image_size = (*image_width)*(*image_height)*image_channels;
-
-  VkResult err;
-
-  // Create the Vulkan image.
-  VkImage texture_image;
-  VkDeviceMemory texture_image_memory;
-  {
-    VkImageCreateInfo info = {};
-    info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
-    info.imageType = VK_IMAGE_TYPE_2D;
-    info.format = VK_FORMAT_R8G8B8A8_UNORM;
-    info.extent.width = *image_width;
-    info.extent.height = *image_height;
-    info.extent.depth = 1;
-    info.mipLevels = 1;
-    info.arrayLayers = 1;
-    info.samples = VK_SAMPLE_COUNT_1_BIT;
-    info.tiling = VK_IMAGE_TILING_OPTIMAL;
-    info.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT;
-    info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
-    info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
-    err = vkCreateImage(g_Device, &info, g_Allocator, &texture_image);
-    check_vk_result(err);
-    VkMemoryRequirements req;
-    vkGetImageMemoryRequirements(g_Device, texture_image, &req);
-    VkMemoryAllocateInfo alloc_info = {};
-    alloc_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
-    alloc_info.allocationSize = req.size;
-    alloc_info.memoryTypeIndex = findMemoryType(req.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
-    err = vkAllocateMemory(g_Device, &alloc_info, g_Allocator, &texture_image_memory);
-    check_vk_result(err);
-    err = vkBindImageMemory(g_Device, texture_image, texture_image_memory, 0);
-    check_vk_result(err);
-  }
-
-  // Create the Image View
-  VkImageView image_view;
-  {
-    VkImageViewCreateInfo info = {};
-    info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
-    info.image = texture_image;
-    info.viewType = VK_IMAGE_VIEW_TYPE_2D;
-    info.format = VK_FORMAT_R8G8B8A8_UNORM;
-    info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
-    info.subresourceRange.levelCount = 1;
-    info.subresourceRange.layerCount = 1;
-    err = vkCreateImageView(g_Device, &info, g_Allocator, &image_view);
-    check_vk_result(err);
-  }
-
-  // Create Sampler
-  VkSampler sampler;
-  {
-    VkSamplerCreateInfo sampler_info{};
-    sampler_info.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
-    sampler_info.magFilter = VK_FILTER_LINEAR;
-    sampler_info.minFilter = VK_FILTER_LINEAR;
-    sampler_info.mipmapMode  = VK_SAMPLER_MIPMAP_MODE_LINEAR;
-    sampler_info.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT; // outside image bounds just use border color
-    sampler_info.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT;
-    sampler_info.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT;
-    sampler_info.minLod = -1000;
-    sampler_info.maxLod = 1000;
-    sampler_info.maxAnisotropy = 1.0f;
-    err = vkCreateSampler(g_Device, &sampler_info, g_Allocator, &sampler);
-    check_vk_result(err);
-  }
-
-  // Create Descriptor Set using ImGUI's implementation
-  *img_ds = ImGui_ImplVulkan_AddTexture(sampler, image_view, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
-
-  // Create Upload Buffer
-  VkBuffer upload_buffer;
-  VkDeviceMemory upload_buffer_memory;
-  {
-    VkBufferCreateInfo buffer_info = {};
-    buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
-    buffer_info.size = image_size;
-    buffer_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
-    buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
-    err = vkCreateBuffer(g_Device, &buffer_info, g_Allocator, &upload_buffer);
-    check_vk_result(err);
-    VkMemoryRequirements req;
-    vkGetBufferMemoryRequirements(g_Device, upload_buffer, &req);
-    VkMemoryAllocateInfo alloc_info = {};
-    alloc_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
-    alloc_info.allocationSize = req.size;
-    alloc_info.memoryTypeIndex = findMemoryType(req.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
-    err = vkAllocateMemory(g_Device, &alloc_info, g_Allocator, &upload_buffer_memory);
-    check_vk_result(err);
-    err = vkBindBufferMemory(g_Device, upload_buffer, upload_buffer_memory, 0);
-    check_vk_result(err);
-  }
-
-  // Upload to Buffer:
-  {
-    void* map = NULL;
-    err = vkMapMemory(g_Device, upload_buffer_memory, 0, image_size, 0, &map);
-    check_vk_result(err);
-    memcpy(map, image_data, image_size);
-    VkMappedMemoryRange range[1] = {};
-    range[0].sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
-    range[0].memory = upload_buffer_memory;
-    range[0].size = image_size;
-    err = vkFlushMappedMemoryRanges(g_Device, 1, range);
-    check_vk_result(err);
-    vkUnmapMemory(g_Device, upload_buffer_memory);
-  }
-
-  // Release image memory using stb
-  stbi_image_free(image_data);
-
-  // Create a command buffer that will perform following steps when hit in the command queue.
-  // TODO: this works in the example, but may need input if this is an acceptable way to access the pool/create the command buffer.
-  VkCommandPool command_pool = g_MainWindowData.Frames[g_MainWindowData.FrameIndex].CommandPool;
-  VkCommandBuffer command_buffer;
-  {
-    VkCommandBufferAllocateInfo alloc_info{};
-    alloc_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
-    alloc_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
-    alloc_info.commandPool = command_pool;
-    alloc_info.commandBufferCount = 1;
-
-    err = vkAllocateCommandBuffers(g_Device, &alloc_info, &command_buffer);
-    check_vk_result(err);
-
-    VkCommandBufferBeginInfo begin_info = {};
-    begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
-    begin_info.flags |= VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
-    err = vkBeginCommandBuffer(command_buffer, &begin_info);
-    check_vk_result(err);
-  }
-
-  // Copy to Image
-  {
-    VkImageMemoryBarrier copy_barrier[1] = {};
-    copy_barrier[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
-    copy_barrier[0].dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
-    copy_barrier[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
-    copy_barrier[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
-    copy_barrier[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-    copy_barrier[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-    copy_barrier[0].image = texture_image;
-    copy_barrier[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
-    copy_barrier[0].subresourceRange.levelCount = 1;
-    copy_barrier[0].subresourceRange.layerCount = 1;
-    vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, NULL, 0, NULL, 1, copy_barrier);
-
-    VkBufferImageCopy region = {};
-    region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
-    region.imageSubresource.layerCount = 1;
-    region.imageExtent.width = *image_width;
-    region.imageExtent.height = *image_height;
-    region.imageExtent.depth = 1;
-    vkCmdCopyBufferToImage(command_buffer, upload_buffer, texture_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region);
-
-    VkImageMemoryBarrier use_barrier[1] = {};
-    use_barrier[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
-    use_barrier[0].srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
-    use_barrier[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
-    use_barrier[0].oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
-    use_barrier[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
-    use_barrier[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-    use_barrier[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-    use_barrier[0].image = texture_image;
-    use_barrier[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
-    use_barrier[0].subresourceRange.levelCount = 1;
-    use_barrier[0].subresourceRange.layerCount = 1;
-    vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, 0, NULL, 0, NULL, 1, use_barrier);
-  }
-
-  // End command buffer
-  {
-    VkSubmitInfo end_info = {};
-    end_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
-    end_info.commandBufferCount = 1;
-    end_info.pCommandBuffers = &command_buffer;
-    err = vkEndCommandBuffer(command_buffer);
-    check_vk_result(err);
-    err = vkQueueSubmit(g_Queue, 1, &end_info, VK_NULL_HANDLE);
-    check_vk_result(err);
-    err = vkDeviceWaitIdle(g_Device);
-    check_vk_result(err);
-  }
-
-  return true;
-}
-
 // Returns the names of the Vulkan layers used for the given IREE
 // |extensibility_set| and |features|.
 std::vector<const char*> GetIreeLayers(
@@ -723,7 +562,16 @@ namespace iree {

 extern "C" int iree_main(int argc, char** argv) {

-  fprintf(stdout, "starting yo\n");
+  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv);
+  if (argc > 1) {
+    // Avoid iree-run-module spinning endlessly on stdin if the user uses single
+    // dashes for flags.
+    printf(
+        "[ERROR] unexpected positional argument (expected none)."
+        " Did you use pass a flag with a single dash ('-')?"
+        " Use '--' instead.\n");
+    return 1;
+  }

  // --------------------------------------------------------------------------
  // Create a window.
@@ -835,8 +683,6 @@ extern "C" int iree_main(int argc, char** argv) {

  // Demo state.
  bool show_iree_window = true;
-  // --------------------------------------------------------------------------
-
  // --------------------------------------------------------------------------
  // Setup IREE.

@@ -900,69 +746,44 @@ extern "C" int iree_main(int argc, char** argv) {


  // Load bytecode module
-  iree_file_toc_t module_file_toc;
-  const char network_model[] = "resnet50_tf.vmfb";
-  fprintf(stdout, "Loading: %s\n", network_model);
-  if (load_file(network_model, &module_file_toc.data, &module_file_toc.size) == false)
-  {
-      abort();
-      return 1;
-  }
-  fprintf(stdout, "module size: %zu\n", module_file_toc.size);
-
-  static float input_res50[224*224*3];
-  static float output_res50[1000];
-
-  char filename[] = "dog_imagenet.jpg";
-  fprintf(stdout, "loading: %s\n", filename);
-  int x,y,n;
-  //unsigned char *image_raw = stbi_load(filename, &x, &y, &n, 3);
-  stbi_load(filename, &x, &y, &n, 3);
-  fprintf(stdout, "res: %i x %i x %i\n", x, y, n);
-
-  /* Preprocessing needs to go here. For now use a buffer preprocessed in python.
-
-  //convert image into floating point format
-  for(int i=0;i<224*224*3;i++)
-  {
-    input_res50[i]= ((float)image_raw[i])/255.0f;
-  }*/
-
-  std::ifstream fin("dog.bin", std::ifstream::in | std::ifstream::binary);
-  fin.read((char*)input_res50, 224*224*3*sizeof(float));
-
-  // load image again so imgui can display it
-  int my_image_width = 0;
-  int my_image_height = 0;
-  VkDescriptorSet my_image_texture = 0;
-  bool ret = LoadTextureFromFile(filename, &my_image_texture, &my_image_width, &my_image_height);
-  fprintf(stdout, "creating vulkan image: %s\n", ret ?"OK":"FAIL");
-  IM_ASSERT(ret);
+  //iree_file_toc_t module_file_toc;
+  //const char network_model[] = "resnet50_tf.vmfb";
+  //fprintf(stdout, "Loading: %s\n", network_model);
+  //if (load_file(network_model, &module_file_toc.data, &module_file_toc.size) == false)
+  //{
+  //    abort();
+  //    return 1;
+  //}
+  //fprintf(stdout, "module size: %zu\n", module_file_toc.size);

  iree_vm_module_t* bytecode_module = nullptr;
-  IREE_CHECK_OK(iree_vm_bytecode_module_create(
-      iree_instance,
-      iree_const_byte_span_t{
-          reinterpret_cast<const uint8_t*>(module_file_toc.data),
-          module_file_toc.size},
-      iree_allocator_null(), iree_allocator_system(), &bytecode_module));
-  // Query for details about what is in the loaded module.
-  iree_vm_module_signature_t bytecode_module_signature =
-      iree_vm_module_signature(bytecode_module);
-  fprintf(stdout, "Module loaded, have <%" PRIhsz "> exported functions:\n",
-          bytecode_module_signature.export_function_count);
-  for (int i = 0; i < bytecode_module_signature.export_function_count; ++i) {
-    iree_vm_function_t function;
-    IREE_CHECK_OK(iree_vm_module_lookup_function_by_ordinal(
-        bytecode_module, IREE_VM_FUNCTION_LINKAGE_EXPORT, i, &function));
-    auto function_name = iree_vm_function_name(&function);
-    auto function_signature = iree_vm_function_signature(&function);
+  // Load the module selected on the command line.
+  iree_status_t module_status = iree_tooling_load_module_from_flags(
+      iree_instance, iree_allocator_system(), &bytecode_module);
+  if (!iree_status_is_ok(module_status)) {
+    // Report why loading failed and release the status; otherwise the tool
+    // would exit without any message and leak the status storage.
+    iree_status_fprint(stderr, module_status);
+    iree_status_free(module_status);
+    return -1;
+  }
+  //IREE_CHECK_OK(iree_vm_bytecode_module_create(
+  //    iree_instance,
+  //    iree_const_byte_span_t{
+  //        reinterpret_cast<const uint8_t*>(module_file_toc.data),
+  //        module_file_toc.size},
+  //    iree_allocator_null(), iree_allocator_system(), &bytecode_module));
+  //// Query for details about what is in the loaded module.
+  //iree_vm_module_signature_t bytecode_module_signature =
+  //    iree_vm_module_signature(bytecode_module);
+  //fprintf(stdout, "Module loaded, have <%" PRIhsz "> exported functions:\n",
+  //        bytecode_module_signature.export_function_count);
+  //for (int i = 0; i < bytecode_module_signature.export_function_count; ++i) {
+  //  iree_vm_function_t function;
+  //  IREE_CHECK_OK(iree_vm_module_lookup_function_by_ordinal(
+  //      bytecode_module, IREE_VM_FUNCTION_LINKAGE_EXPORT, i, &function));
+  //  auto function_name = iree_vm_function_name(&function);
+  //  auto function_signature = iree_vm_function_signature(&function);

-    fprintf(stdout, "  %d: '%.*s' with calling convention '%.*s'\n", i,
-            (int)function_name.size, function_name.data,
-            (int)function_signature.calling_convention.size,
-            function_signature.calling_convention.data);
-  }
+  //  fprintf(stdout, "  %d: '%.*s' with calling convention '%.*s'\n", i,
+  //          (int)function_name.size, function_name.data,
+  //          (int)function_signature.calling_convention.size,
+  //          function_signature.calling_convention.data);
+  //}

  // Allocate a context that will hold the module state across invocations.
  iree_vm_context_t* iree_context = nullptr;
@@ -988,33 +809,42 @@ extern "C" int iree_main(int argc, char** argv) {
        // Write inputs into mappable buffers.
        iree_hal_allocator_t* allocator =
            iree_hal_device_allocator(iree_vk_device);
-        iree_hal_memory_type_t input_memory_type =
-            static_cast<iree_hal_memory_type_t>(
-                IREE_HAL_MEMORY_TYPE_HOST_LOCAL |
-                IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE);
-        iree_hal_buffer_usage_t input_buffer_usage =
-            static_cast<iree_hal_buffer_usage_t>(IREE_HAL_BUFFER_USAGE_DEFAULT);
-        iree_hal_buffer_params_t buffer_params;
-        buffer_params.type = input_memory_type;
-        buffer_params.usage = input_buffer_usage;
-        buffer_params.access = IREE_HAL_MEMORY_ACCESS_READ | IREE_HAL_MEMORY_ACCESS_WRITE;
+        //iree_hal_memory_type_t input_memory_type =
+        //    static_cast<iree_hal_memory_type_t>(
+        //        IREE_HAL_MEMORY_TYPE_HOST_LOCAL |
+        //        IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE);
+        //iree_hal_buffer_usage_t input_buffer_usage =
+        //    static_cast<iree_hal_buffer_usage_t>(IREE_HAL_BUFFER_USAGE_DEFAULT);
+        //iree_hal_buffer_params_t buffer_params;
+        //buffer_params.type = input_memory_type;
+        //buffer_params.usage = input_buffer_usage;
+        //buffer_params.access = IREE_HAL_MEMORY_ACCESS_READ | IREE_HAL_MEMORY_ACCESS_WRITE;

       // Wrap input buffers in buffer views.

-        iree_hal_buffer_view_t* input0_buffer_view = nullptr;
-        constexpr iree_hal_dim_t input_buffer_shape[] = {1, 224, 224, 3};
-        IREE_CHECK_OK(iree_hal_buffer_view_allocate_buffer(
-            allocator,
-            /*shape_rank=*/4, /*shape=*/input_buffer_shape,
-            IREE_HAL_ELEMENT_TYPE_FLOAT_32,
-            IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params,
-            iree_make_const_byte_span(&input_res50, sizeof(input_res50)),
-            &input0_buffer_view));
-
        vm::ref<iree_vm_list_t> inputs;
-        IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, 6, iree_allocator_system(), &inputs));
-        auto input0_buffer_view_ref = iree_hal_buffer_view_move_ref(input0_buffer_view);
-        IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs.get(), &input0_buffer_view_ref));
+        // Build the input list from the --function_input flag values.
+        iree_status_t input_status = ParseToVariantList(
+            allocator,
+            iree::span<const std::string>{FLAG_function_inputs.data(),
+                                          FLAG_function_inputs.size()},
+            iree_allocator_system(), &inputs);
+        if (!iree_status_is_ok(input_status)) {
+            // Report the parse error and release the status instead of
+            // exiting without any message.
+            iree_status_fprint(stderr, input_status);
+            iree_status_free(input_status);
+            return -1;
+        }
+        //vm::ref<iree_vm_list_t> inputs;
+        //IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, 6, iree_allocator_system(), &inputs));
+
+        //iree_hal_buffer_view_t* input0_buffer_view = nullptr;
+        //constexpr iree_hal_dim_t input_buffer_shape[] = {1, 224, 224, 3};
+        //IREE_CHECK_OK(iree_hal_buffer_view_allocate_buffer(
+        //    allocator,
+        //    /*shape_rank=*/4, /*shape=*/input_buffer_shape,
+        //    IREE_HAL_ELEMENT_TYPE_FLOAT_32,
+        //    IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params,
+        //    iree_make_const_byte_span(&input_res50, sizeof(input_res50)),
+        //    &input0_buffer_view));
+
+        //auto input0_buffer_view_ref = iree_hal_buffer_view_move_ref(input0_buffer_view);
+        //IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs.get(), &input0_buffer_view_ref));

        // Prepare outputs list to accept results from the invocation.

@@ -1023,6 +853,7 @@ extern "C" int iree_main(int argc, char** argv) {
        IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, kOutputCount * sizeof(float), iree_allocator_system(), &outputs));

  // --------------------------------------------------------------------------
+
  // Main loop.
  bool done = false;
  while (!done) {
@@ -1076,46 +907,11 @@ extern "C" int iree_main(int argc, char** argv) {
                                     /*policy=*/nullptr, inputs.get(),
                                     outputs.get(), iree_allocator_system()));

-        // Read back the results.
-        auto* output_buffer_view = reinterpret_cast<iree_hal_buffer_view_t*>(
-            iree_vm_list_get_ref_deref(outputs.get(),
-            0,
-            iree_hal_buffer_view_get_descriptor()));
-        IREE_CHECK_OK(iree_hal_device_transfer_d2h(
-            iree_vk_device,
-            iree_hal_buffer_view_buffer(output_buffer_view),
-            0,
-            output_res50, sizeof(output_res50),
-            IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()));

        // we want to run continuously so we can use tools like RenderDoc, RGP, etc...
        dirty = true;
      }

-      // find maxarg from results
-      float max = 0.0f;
-      int max_idx = -1;
-      for(int i=0;i<1000;i++)
-      {
-        if (output_res50[i] > max)
-        {
-          max = output_res50[i];
-          max_idx = i;
-        }
-      }
-
-      ImGui::Text("pointer = %p", my_image_texture);
-      ImGui::Text("size = %d x %d", my_image_width, my_image_height);
-      ImGui::Image((ImTextureID)my_image_texture, ImVec2(my_image_width, my_image_height));
-
-      // Display the latest computation output.
-      ImGui::Text("Max   idx = [%i]", max_idx);
-      ImGui::Text("Max value = [%f]", max);
-
-      ImGui::Text("Resnet50 categories:");
-      ImGui::PlotHistogram("Histogram", output_res50, IM_ARRAYSIZE(output_res50), 0, NULL, 0.0f, 1.0f, ImVec2(0,80));
-      ImGui::Separator();
-
      // Framerate counter.
      ImGui::Text("Application average %.3f ms/frame (%.1f FPS)",
                  1000.0f / ImGui::GetIO().Framerate, ImGui::GetIO().Framerate);
@@ -1137,6 +933,7 @@ extern "C" int iree_main(int argc, char** argv) {
  iree_vm_module_release(bytecode_module);
  iree_vm_context_release(iree_context);
  iree_hal_device_release(iree_vk_device);
+  iree_hal_allocator_release(allocator);
  iree_hal_driver_release(iree_vk_driver);
  iree_hal_vulkan_syms_release(iree_vk_syms);
  iree_vm_instance_release(iree_instance);
--- a/cpp/vulkan_gui/vulkan_resnet_inference_gui.cc
+++ b/cpp/vulkan_gui/vulkan_resnet_inference_gui.cc
--- a/dataset/README.md
+++ b/dataset/README.md
@@ -0,0 +1,27 @@
+# Dataset annotation tool
+
+SHARK annotator for adding or modifying prompts of dataset images
+
+## Set up
+
+Activate SHARK Python virtual environment and install additional packages
+```shell
+source ../shark.venv/bin/activate
+pip install -r requirements.txt
+```
+
+## Run annotator
+
+```shell
+python annotation_tool.py
+```
+
+<img width="1280" alt="annotator" src="https://user-images.githubusercontent.com/49575973/214521137-7ef6ae10-7cd8-46e6-b270-b6c0445157f1.png">
+
+* Select a dataset from `Dataset` dropdown list
+* Select an image from `Image` dropdown list
+* Image and the existing prompt will be loaded
+* Select a prompt from the `Prompts` dropdown list to modify it, or select "Add new" to add a prompt
+* Click `Save` to save changes, click `Delete` to delete prompt
+* Click `Back` or `Next` to switch images; you can also select another image directly from the `Image` dropdown list
+* Click `Finish` when finishing annotation or before switching dataset
--- a/dataset/annotation_tool.py
+++ b/dataset/annotation_tool.py
@@ -0,0 +1,247 @@
+import gradio as gr
+import json
+import jsonlines
+import os
+from args import args
+from pathlib import Path
+from PIL import Image
+from utils import get_datasets
+
+
+# Two directory levels above this file; the web assets and the local
+# "dataset" working directory are addressed relative to it.
+shark_root = Path(__file__).parent.parent
+demo_css = shark_root.joinpath("web/demo.css").resolve()
+nodlogo_loc = shark_root.joinpath(
+    "web/models/stable_diffusion/logos/nod-logo.png"
+)
+
+
+# Build the annotation UI. Widgets are created first; the event handlers that
+# read and update them are defined further down inside this same block.
+with gr.Blocks(title="Dataset Annotation Tool", css=demo_css) as shark_web:
+    # Title row: static logo only.
+    with gr.Row(elem_id="ui_title"):
+        nod_logo = Image.open(nodlogo_loc)
+        with gr.Column(scale=1, elem_id="demo_title_outer"):
+            gr.Image(
+                value=nod_logo,
+                show_label=False,
+                interactive=False,
+                elem_id="top_logo",
+            ).style(width=150, height=100)
+
+    # Listing of the bucket given by --gs_url, fetched once at start-up:
+    # dataset names, image paths per dataset, and the datasets that already
+    # have a prompt file.
+    datasets, images, ds_w_prompts = get_datasets(args.gs_url)
+    # Prompts of the currently selected dataset: image file name -> list of
+    # prompt strings.
+    prompt_data = dict()
+
+    # Selection row: dataset first, then one of its images.
+    with gr.Row(elem_id="ui_body"):
+        # TODO: add multiselect dataset, there is a gradio version conflict
+        dataset = gr.Dropdown(label="Dataset", choices=datasets)
+        image_name = gr.Dropdown(label="Image", choices=[])
+
+    # Working row: image preview on the left, prompt editing on the right.
+    with gr.Row(elem_id="ui_body"):
+        # TODO: add ability to search image by typing
+        with gr.Column(scale=1, min_width=600):
+            image = gr.Image(type="filepath").style(height=512)
+
+        with gr.Column(scale=1, min_width=600):
+            # Existing prompts of the image plus an "Add new" entry.
+            prompts = gr.Dropdown(
+                label="Prompts",
+                choices=[],
+            )
+            # Text of the prompt being added or edited.
+            prompt = gr.Textbox(
+                label="Editor",
+                lines=3,
+            )
+            with gr.Row():
+                save = gr.Button("Save")
+                delete = gr.Button("Delete")
+            with gr.Row():
+                back_image = gr.Button("Back")
+                next_image = gr.Button("Next")
+            finish = gr.Button("Finish")
+
+    def filter_datasets(dataset):
+        """Switch the tool to `dataset`.
+
+        Creates the local working directory for the dataset, reloads
+        `prompt_data` from the dataset's `metadata.jsonl` when the bucket has
+        one, and returns a dropdown update listing the dataset's images.
+
+        Raises:
+            RuntimeError: if the prompt file cannot be downloaded.
+        """
+        if dataset is None:
+            return gr.Dropdown.update(value=None, choices=[])
+
+        # create the dataset dir if doesn't exist and download prompt file
+        dataset_path = str(shark_root) + "/dataset/" + dataset
+        os.makedirs(dataset_path, exist_ok=True)
+
+        # read prompt jsonlines file
+        prompt_data.clear()
+        if dataset in ds_w_prompts:
+            prompt_gs_path = args.gs_url + "/" + dataset + "/metadata.jsonl"
+            status = os.system(
+                f'gsutil cp "{prompt_gs_path}" "{dataset_path}"/'
+            )
+            if status != 0:
+                # Stop instead of reading a missing or stale local copy:
+                # prompts saved on top of it would be written to the local
+                # metadata file that is uploaded when annotation finishes.
+                raise RuntimeError(
+                    f"failed to download {prompt_gs_path} "
+                    f"(exit status {status})"
+                )
+            with jsonlines.open(dataset_path + "/metadata.jsonl") as reader:
+                for line in reader.iter(type=dict, skip_invalid=True):
+                    text = line["text"]
+                    # "text" is either one prompt or a list of prompts;
+                    # always store a list.
+                    prompt_data[line["file_name"]] = (
+                        [text] if isinstance(text, str) else text
+                    )
+
+        return gr.Dropdown.update(choices=images[dataset])
+
+    dataset.change(fn=filter_datasets, inputs=dataset, outputs=image_name)
+
+    def display_image(dataset, image_name):
+        """Download the selected image and list its prompts.
+
+        Returns a pair of updates: the image preview, and the prompt dropdown
+        filled with "Add new" followed by the image's existing prompts.
+        """
+        if dataset is None or image_name is None:
+            return gr.Image.update(value=None), gr.Dropdown.update(value=None)
+
+        # download and load the image
+        img_gs_path = args.gs_url + "/" + dataset + "/" + image_name
+        img_sub_path = "/".join(image_name.split("/")[:-1])
+        img_dst_path = (
+            str(shark_root) + "/dataset/" + dataset + "/" + img_sub_path + "/"
+        )
+        # makedirs, not mkdir: the image may sit several directories below
+        # the dataset root, and none of them need exist locally yet.
+        os.makedirs(img_dst_path, exist_ok=True)
+        os.system(f'gsutil cp "{img_gs_path}" "{img_dst_path}"')
+        img = Image.open(img_dst_path + image_name.split("/")[-1])
+
+        prompt_choices = ["Add new"] + prompt_data.setdefault(image_name, [])
+        # value=None drops a selection left over from the previous image,
+        # which is not one of this image's prompts.
+        return gr.Image.update(value=img), gr.Dropdown.update(
+            choices=prompt_choices, value=None
+        )
+
+    image_name.change(
+        fn=display_image,
+        inputs=[dataset, image_name],
+        outputs=[image, prompts],
+    )
+
+    def edit_prompt(prompts):
+        """Put the chosen prompt into the editor; blank it for "Add new"."""
+        editor_text = None if prompts == "Add new" else prompts
+        return gr.Textbox.update(value=editor_text)
+
+    prompts.change(fn=edit_prompt, inputs=prompts, outputs=prompt)
+
+    def save_prompt(dataset, image_name, prompts, prompt):
+        """Add or replace a prompt of `image_name` and rewrite the prompt file.
+
+        `prompts` is the dropdown selection ("Add new" or an existing prompt)
+        and `prompt` is the editor text. Returns a dropdown update with the
+        image's refreshed prompt list, or None when any input is missing.
+        """
+        if (
+            dataset is None
+            or image_name is None
+            or prompts is None
+            or prompt is None
+        ):
+            return
+
+        image_prompts = prompt_data.setdefault(image_name, [])
+        if prompts == "Add new":
+            image_prompts.append(prompt)
+        elif prompts in image_prompts:
+            image_prompts[image_prompts.index(prompts)] = prompt
+        else:
+            # The selection does not belong to this image (e.g. it was left
+            # over from another image). Change nothing, just refresh choices.
+            return gr.Dropdown.update(
+                choices=["Add new"] + image_prompts, value=None
+            )
+
+        prompt_path = (
+            str(shark_root) + "/dataset/" + dataset + "/metadata.jsonl"
+        )
+        # write prompt jsonlines file
+        with open(prompt_path, "w") as f:
+            for key, value in prompt_data.items():
+                if not value:
+                    continue
+                # a single prompt is stored as a plain string, several as a
+                # list
+                v = value if len(value) > 1 else value[0]
+                f.write(json.dumps({"file_name": key, "text": v}))
+                f.write("\n")
+
+        prompt_choices = ["Add new"]
+        prompt_choices += image_prompts
+        return gr.Dropdown.update(choices=prompt_choices, value=None)
+
+    save.click(
+        fn=save_prompt,
+        inputs=[dataset, image_name, prompts, prompt],
+        outputs=prompts,
+    )
+
+    def delete_prompt(dataset, image_name, prompts):
+        """Delete the selected prompt of `image_name` and rewrite the file.
+
+        Returns a dropdown update with the image's remaining prompts, or None
+        when nothing deletable is selected.
+        """
+        if dataset is None or image_name is None or prompts is None:
+            return
+        if prompts == "Add new":
+            return
+
+        image_prompts = prompt_data.setdefault(image_name, [])
+        if prompts not in image_prompts:
+            # The selection does not belong to this image (e.g. it was left
+            # over from another image). Change nothing, just refresh choices.
+            return gr.Dropdown.update(
+                choices=["Add new"] + image_prompts, value=None
+            )
+
+        image_prompts.remove(prompts)
+        prompt_path = (
+            str(shark_root) + "/dataset/" + dataset + "/metadata.jsonl"
+        )
+        # write prompt jsonlines file
+        with open(prompt_path, "w") as f:
+            for key, value in prompt_data.items():
+                if not value:
+                    continue
+                # a single prompt is stored as a plain string, several as a
+                # list
+                v = value if len(value) > 1 else value[0]
+                f.write(json.dumps({"file_name": key, "text": v}))
+                f.write("\n")
+
+        prompt_choices = ["Add new"]
+        prompt_choices += image_prompts
+        return gr.Dropdown.update(choices=prompt_choices, value=None)
+
+    delete.click(
+        fn=delete_prompt,
+        inputs=[dataset, image_name, prompts],
+        outputs=prompts,
+    )
+
+    def get_back_image(dataset, image_name):
+        """Move the image selection one step back within `dataset`.
+
+        Removes the local copy of the current image, then returns a dropdown
+        update selecting the previous image. The selection is cleared when
+        the current image is the first one or is not part of the dataset.
+        """
+        if dataset is None or image_name is None:
+            return
+
+        # remove local image
+        img_path = str(shark_root) + "/dataset/" + dataset + "/" + image_name
+        try:
+            os.remove(img_path)
+        except OSError:
+            # Best effort, as with the shell `rm` this replaces: the file
+            # may never have been downloaded.
+            pass
+        # get the index for the back image
+        dataset_images = images[dataset]
+        if image_name not in dataset_images:
+            return gr.Dropdown.update(value=None)
+        idx = dataset_images.index(image_name)
+        if idx == 0:
+            return gr.Dropdown.update(value=None)
+
+        return gr.Dropdown.update(value=dataset_images[idx - 1])
+
+    back_image.click(
+        fn=get_back_image, inputs=[dataset, image_name], outputs=image_name
+    )
+
+    def get_next_image(dataset, image_name):
+        """Move the image selection one step forward within `dataset`.
+
+        Removes the local copy of the current image, then returns a dropdown
+        update selecting the next image. The selection is cleared when the
+        current image is the last one or is not part of the dataset.
+        """
+        if dataset is None or image_name is None:
+            return
+
+        # remove local image
+        img_path = str(shark_root) + "/dataset/" + dataset + "/" + image_name
+        try:
+            os.remove(img_path)
+        except OSError:
+            # Best effort, as with the shell `rm` this replaces: the file
+            # may never have been downloaded.
+            pass
+        # get the index for the next image
+        dataset_images = images[dataset]
+        if image_name not in dataset_images:
+            return gr.Dropdown.update(value=None)
+        idx = dataset_images.index(image_name)
+        if idx == len(dataset_images) - 1:
+            return gr.Dropdown.update(value=None)
+
+        return gr.Dropdown.update(value=dataset_images[idx + 1])
+
+    next_image.click(
+        fn=get_next_image, inputs=[dataset, image_name], outputs=image_name
+    )
+
+    def finish_annotation(dataset):
+        """Upload the dataset's prompt file and remove the local working copy.
+
+        Returns a dropdown update that clears the dataset selection, or None
+        when no dataset is selected.
+
+        Raises:
+            RuntimeError: if the upload fails; local data is kept in that
+                case so the annotations are not lost.
+        """
+        import shutil
+
+        if dataset is None:
+            return
+
+        # upload prompt and remove local data
+        dataset_path = str(shark_root) + "/dataset/" + dataset
+        dataset_gs_path = args.gs_url + "/" + dataset + "/"
+        metadata_path = dataset_path + "/metadata.jsonl"
+        # There is nothing to upload if no prompt file exists locally.
+        if os.path.isfile(metadata_path):
+            status = os.system(
+                f'gsutil cp "{dataset_path}/metadata.jsonl" "{dataset_gs_path}"'
+            )
+            if status != 0:
+                # Deleting the directory now would discard the only copy of
+                # the annotations.
+                raise RuntimeError(
+                    f"failed to upload {metadata_path} to {dataset_gs_path} "
+                    f"(exit status {status}); local data kept"
+                )
+        shutil.rmtree(dataset_path, ignore_errors=True)
+
+        return gr.Dropdown.update(value=None)
+
+    finish.click(fn=finish_annotation, inputs=dataset, outputs=dataset)
+
+
+# Start the web UI only when this file is run as a script.
+if __name__ == "__main__":
+    shark_web.launch(
+        # --share flag ("flag for generating a public URL")
+        share=args.share,
+        inbrowser=True,
+        # 0.0.0.0 listens on all network interfaces, not only localhost
+        server_name="0.0.0.0",
+        # --server_port flag, 8080 unless overridden
+        server_port=args.server_port,
+    )
--- a/dataset/args.py
+++ b/dataset/args.py
@@ -0,0 +1,34 @@
+"""Command-line flags for the SHARK dataset annotation tool."""
+
+import argparse
+
+# The module docstring above is what `description=__doc__` shows in --help.
+p = argparse.ArgumentParser(
+    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+
+##############################################################################
+### Dataset Annotator flags
+##############################################################################
+
+# Root location of the datasets; dataset and file names are appended to it.
+p.add_argument(
+    "--gs_url",
+    type=str,
+    required=True,
+    help="URL to datasets in GS bucket",
+)
+
+# BooleanOptionalAction provides both --share and --no-share.
+p.add_argument(
+    "--share",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for generating a public URL",
+)
+
+p.add_argument(
+    "--server_port",
+    type=int,
+    default=8080,
+    help="flag for setting server port",
+)
+
+##############################################################################
+
+# Parsed once, when this module is first imported.
+args = p.parse_args()
--- a/dataset/requirements.txt
+++ b/dataset/requirements.txt
@@ -0,0 +1,3 @@
+# SHARK Annotator
+gradio==3.15.0
+jsonlines
--- a/dataset/utils.py
+++ b/dataset/utils.py
@@ -0,0 +1,29 @@
+from google.cloud import storage
+
+
+def get_datasets(gs_url):
+    """List the datasets stored under `gs_url`.
+
+    The first path component below `gs_url` is taken as the dataset name.
+    Files in a sub-directory of a dataset are treated as its images; a
+    `metadata.jsonl` directly inside a dataset marks it as having prompts.
+
+    Args:
+        gs_url: bucket URL; the bucket name is taken from the third
+            "/"-separated field and the rest is the path inside the bucket.
+
+    Returns:
+        A tuple `(datasets, images, ds_w_prompts)`: the sorted dataset names,
+        a dict mapping each dataset name to its image paths (relative to the
+        dataset), and the names of the datasets that have a prompt file.
+    """
+    datasets = set()
+    images = dict()
+    ds_w_prompts = []
+
+    storage_client = storage.Client()
+    url_parts = gs_url.split("/")
+    bucket_name = url_parts[2]
+    source_blob_name = "/".join(url_parts[3:])
+    # Path components of the dataset root; a trailing "/" in the URL yields
+    # an empty component, which is dropped.
+    root_parts = [part for part in url_parts[3:] if part]
+    depth = len(root_parts)
+    blobs = storage_client.list_blobs(bucket_name, prefix=source_blob_name)
+
+    for blob in blobs:
+        name_parts = blob.name.split("/")
+        # Skip blobs outside the root (the listing prefix is matched as a
+        # plain string) and blobs that lie directly in the root: those are
+        # not inside any dataset.
+        if name_parts[:depth] != root_parts or len(name_parts) < depth + 2:
+            continue
+        dataset_name = name_parts[depth]
+        if dataset_name == "":
+            continue
+        datasets.add(dataset_name)
+        images.setdefault(dataset_name, [])
+
+        # check if image or jsonl
+        file_sub_path = "/".join(name_parts[depth + 1 :])
+        if "/" in file_sub_path:
+            images[dataset_name].append(file_sub_path)
+        elif file_sub_path == "metadata.jsonl":
+            if dataset_name not in ds_w_prompts:
+                ds_w_prompts.append(dataset_name)
+
+    # sorted so the order is the same on every run
+    return sorted(datasets), images, ds_w_prompts
--- a/generate_sharktank.py
+++ b/generate_sharktank.py
@@ -14,21 +14,16 @@ import csv
 import argparse
 from shark.shark_importer import SharkImporter
 from shark.parser import shark_args
-import tensorflow as tf
 import subprocess as sp
 import hashlib
 import numpy as np
 from pathlib import Path
-
-visible_default = tf.config.list_physical_devices("GPU")
-try:
-    tf.config.set_visible_devices([], "GPU")
-    visible_devices = tf.config.get_visible_devices()
-    for device in visible_devices:
-        assert device.device_type != "GPU"
-except:
-    # Invalid device or cannot modify virtual devices once initialized.
-    pass
+from shark.examples.shark_inference.stable_diffusion import (
+    model_wrappers as mw,
+)
+from shark.examples.shark_inference.stable_diffusion.stable_args import (
+    args,
+)


 def create_hash(file_name):
@@ -41,9 +36,12 @@ def create_hash(file_name):


 def save_torch_model(torch_model_list):
-    from tank.model_utils import get_hf_model
-    from tank.model_utils import get_vision_model
-    from tank.model_utils import get_hf_img_cls_model
+    from tank.model_utils import (
+        get_hf_model,
+        get_vision_model,
+        get_hf_img_cls_model,
+        get_fp16_model,
+    )

    with open(torch_model_list) as csvfile:
        torch_reader = csv.reader(csvfile, delimiter=",")
@@ -59,13 +57,39 @@ def save_torch_model(torch_model_list):

            model = None
            input = None
+            if model_type == "stable_diffusion":
+                args.use_tuned = False
+                args.import_mlir = True
+                args.use_tuned = False
+                args.local_tank_cache = WORKDIR
+
+                precision_values = ["fp16"]
+                seq_lengths = [64, 77]
+                for precision_value in precision_values:
+                    args.precision = precision_value
+                    for length in seq_lengths:
+                        model = mw.SharkifyStableDiffusionModel(
+                            model_id=torch_model_name,
+                            custom_weights="",
+                            precision=precision_value,
+                            max_len=length,
+                            width=512,
+                            height=512,
+                            use_base_vae=False,
+                            debug=True,
+                            sharktank_dir=WORKDIR,
+                            generate_vmfb=False,
+                        )
+                        model()
+                continue
            if model_type == "vision":
                model, input, _ = get_vision_model(torch_model_name)
            elif model_type == "hf":
                model, input, _ = get_hf_model(torch_model_name)
            elif model_type == "hf_img_cls":
                model, input, _ = get_hf_img_cls_model(torch_model_name)
-
+            elif model_type == "fp16":
+                model, input, _ = get_fp16_model(torch_model_name)
            torch_model_name = torch_model_name.replace("/", "_")
            torch_model_dir = os.path.join(
                WORKDIR, str(torch_model_name) + "_torch"
@@ -106,6 +130,17 @@ def save_tf_model(tf_model_list):
        get_keras_model,
        get_TFhf_model,
    )
+    import tensorflow as tf
+
+    visible_default = tf.config.list_physical_devices("GPU")
+    try:
+        tf.config.set_visible_devices([], "GPU")
+        visible_devices = tf.config.get_visible_devices()
+        for device in visible_devices:
+            assert device.device_type != "GPU"
+    except:
+        # Invalid device or cannot modify virtual devices once initialized.
+        pass

    with open(tf_model_list) as csvfile:
        tf_reader = csv.reader(csvfile, delimiter=",")
@@ -201,34 +236,35 @@ def is_valid_file(arg):


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--torch_model_csv",
-        type=lambda x: is_valid_file(x),
-        default="./tank/pytorch/torch_model_list.csv",
-        help="""Contains the file with torch_model name and args.
-             Please see: https://github.com/nod-ai/SHARK/blob/main/tank/pytorch/torch_model_list.csv""",
-    )
-    parser.add_argument(
-        "--tf_model_csv",
-        type=lambda x: is_valid_file(x),
-        default="./tank/tf/tf_model_list.csv",
-        help="Contains the file with tf model name and args.",
-    )
-    parser.add_argument(
-        "--tflite_model_csv",
-        type=lambda x: is_valid_file(x),
-        default="./tank/tflite/tflite_model_list.csv",
-        help="Contains the file with tf model name and args.",
-    )
-    parser.add_argument(
-        "--ci_tank_dir",
-        type=bool,
-        default=False,
-    )
-    parser.add_argument("--upload", type=bool, default=False)
+    # Note, all of these flags are overridden by the import of args from stable_args.py, flags are duplicated temporarily to preserve functionality
+    # parser = argparse.ArgumentParser()
+    # parser.add_argument(
+    #    "--torch_model_csv",
+    #    type=lambda x: is_valid_file(x),
+    #    default="./tank/torch_model_list.csv",
+    #    help="""Contains the file with torch_model name and args.
+    #         Please see: https://github.com/nod-ai/SHARK/blob/main/tank/torch_model_list.csv""",
+    # )
+    # parser.add_argument(
+    #    "--tf_model_csv",
+    #    type=lambda x: is_valid_file(x),
+    #    default="./tank/tf_model_list.csv",
+    #    help="Contains the file with tf model name and args.",
+    # )
+    # parser.add_argument(
+    #    "--tflite_model_csv",
+    #    type=lambda x: is_valid_file(x),
+    #    default="./tank/tflite/tflite_model_list.csv",
+    #    help="Contains the file with tf model name and args.",
+    # )
+    # parser.add_argument(
+    #    "--ci_tank_dir",
+    #    type=bool,
+    #    default=False,
+    # )
+    # parser.add_argument("--upload", type=bool, default=False)

-    args = parser.parse_args()
+    # old_args = parser.parse_args()

    home = str(Path.home())
    if args.ci_tank_dir == True:
@@ -244,8 +280,3 @@ if __name__ == "__main__":

    if args.tflite_model_csv:
        save_tflite_model(args.tflite_model_csv)
-
-    if args.upload:
-        git_hash = sp.getoutput("git log -1 --format='%h'") + "/"
-        print("uploading files to gs://shark_tank/" + git_hash)
-        os.system(f"gsutil cp -r {WORKDIR}* gs://shark_tank/" + git_hash)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,9 +4,9 @@ requires = [
    "wheel",
    "packaging",

-    "numpy==1.22.4",
-    "torch-mlir>=20220428.420",
-    "iree-compiler>=20220427.13",
-    "iree-runtime>=20220427.13",
+    "numpy>=1.22.4",
+    "torch-mlir>=20221021.633",
+    "iree-compiler>=20221022.190",
+    "iree-runtime>=20221022.190",
 ]
 build-backend = "setuptools.build_meta"
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,3 +1,3 @@
 [pytest]
 addopts = --verbose -p no:warnings
-norecursedirs = inference tank/tflite 
+norecursedirs = inference tank/tflite examples benchmarks shark 
--- a/requirements-importer-macos.txt
+++ b/requirements-importer-macos.txt
@@ -1,4 +1,4 @@
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
+-f https://download.pytorch.org/whl/nightly/cpu/
 --pre

 numpy
@@ -28,6 +28,7 @@ Pillow

 # web dependecies.
 gradio
+altair

 # Testing and support.
 #lit
--- a/requirements-importer.txt
+++ b/requirements-importer.txt
@@ -2,8 +2,9 @@
 --pre

 numpy==1.22.4
-torch
 torchvision
+pytorch-triton
+tabulate

 tqdm

@@ -14,7 +15,8 @@ iree-tools-tf

 # TensorFlow and JAX.
 gin-config
-tensorflow
+tensorflow==2.10.1
+keras==2.10
 #tf-models-nightly
 #tensorflow-text-nightly
 transformers
@@ -34,6 +36,7 @@ sacremoses

 # web dependecies.
 gradio
+altair
 scipy

 #ONNX and ORT for benchmarking
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,10 +5,25 @@ wheel
 tqdm

 # SHARK Downloader
-gsutil
+google-cloud-storage

 # Testing
 pytest
 pytest-xdist
+pytest-forked
 Pillow
 parameterized
+
+# Add transformers, diffusers and scipy since they are the most commonly used
+transformers
+diffusers
+scipy
+ftfy
+gradio
+altair
+omegaconf
+safetensors
+
+# Keep PyInstaller at the end. Sometimes Windows Defender flags it but most folks can continue even if it errors
+pefile
+pyinstaller
--- a/setup.py
+++ b/setup.py
@@ -10,8 +10,8 @@ PACKAGE_VERSION = os.environ.get("SHARK_PACKAGE_VERSION") or "0.0.4"
 backend_deps = []
 if "NO_BACKEND" in os.environ.keys():
    backend_deps = [
-        "iree-compiler>=20220427.13",
-        "iree-runtime>=20220427.13",
+        "iree-compiler>=20221022.190",
+        "iree-runtime>=20221022.190",
    ]

 setup(
@@ -33,11 +33,11 @@ setup(
        "Operating System :: OS Independent",
    ],
    packages=find_packages(exclude=("examples")),
-    python_requires=">=3.7",
+    python_requires=">=3.9",
    install_requires=[
        "numpy",
        "PyYAML",
-        "torch-mlir>=20220428.420",
+        "torch-mlir>=20221021.633",
    ]
    + backend_deps,
 )
--- a/setup_venv.ps1
+++ b/setup_venv.ps1
@@ -0,0 +1,45 @@
+# Creates the shark.venv virtual environment and installs SHARK into it.
+# Pass "--update-src" to pull the latest sources first.
+param([string]$arguments)
+
+if ($arguments -eq "--update-src"){
+	git pull
+}
+
+#Write-Host "Installing python"
+
+#Start-Process winget install Python.Python.3.10 '/quiet InstallAllUsers=1 PrependPath=1' -wait -NoNewWindow
+
+#Write-Host "python installation completed successfully"
+
+#Write-Host "Reload environment variables"
+#$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
+#Write-Host "Reloaded environment variables"
+
+
+# redirect stderr into stdout
+$p = &{python -V} 2>&1
+# check if an ErrorRecord was returned
+$version = if($p -is [System.Management.Automation.ErrorRecord])
+{
+    # grab the version string from the error message
+    $p.Exception.Message
+}
+else
+{
+    # otherwise return as is
+    $p
+}
+
+Write-Host "Python version found is"
+# print the extracted version string, not the raw command result
+Write-Host $version
+
+
+Write-Host "Installing Build Dependencies"
+python -m venv .\shark.venv\
+.\shark.venv\Scripts\activate
+pip install -r requirements.txt
+pip install --pre torch-mlir torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/
+pip install --upgrade -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html iree-compiler iree-runtime
+Write-Host "Building SHARK..."
+pip install -e . -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
+Write-Host "Build and installation completed successfully"
+Write-Host "Source your venv with ./shark.venv/Scripts/activate"
--- a/setup_venv.sh
+++ b/setup_venv.sh
@@ -76,11 +76,16 @@ fi
 $PYTHON -m pip install --upgrade pip || die "Could not upgrade pip"
 $PYTHON -m pip install --upgrade -r "$TD/requirements.txt"
 if [ "$torch_mlir_bin" = true ]; then
-  $PYTHON -m pip install --pre torch-mlir -f https://llvm.github.io/torch-mlir/package-index/
-  if [ $? -eq 0 ];then
-    echo "Successfully Installed torch-mlir"
+  if [[ $(uname -s) = 'Darwin' ]]; then
+    echo "MacOS detected. Installing torch-mlir from .whl, to avoid dependency problems with torch."
+    $PYTHON -m pip install --pre --no-cache-dir  torch-mlir -f https://llvm.github.io/torch-mlir/package-index/ -f https://download.pytorch.org/whl/nightly/torch/
  else
-    echo "Could not install torch-mlir" >&2
+    $PYTHON -m pip install --pre torch-mlir -f https://llvm.github.io/torch-mlir/package-index/
+    if [ $? -eq 0 ];then
+      echo "Successfully Installed torch-mlir"
+    else
+      echo "Could not install torch-mlir" >&2
+    fi
  fi
 else
  echo "${Red}No binaries found for Python $PYTHON_VERSION_X_Y on $(uname -s)"
@@ -89,37 +94,46 @@ else
  exit 1
 fi
 if [[ -z "${USE_IREE}" ]]; then
-  RUNTIME="nod-ai/SHARK-Runtime"
+  rm .use-iree
+  RUNTIME="https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html"
 else
-  RUNTIME="google/iree"
+  touch ./.use-iree
+  RUNTIME="https://iree-org.github.io/iree/pip-release-links.html"
 fi
 if [[ -z "${NO_BACKEND}" ]]; then
  echo "Installing ${RUNTIME}..."
-  $PYTHON -m pip install --find-links https://github.com/${RUNTIME}/releases iree-compiler iree-runtime
+  $PYTHON -m pip install --upgrade --find-links ${RUNTIME} iree-compiler iree-runtime
 else
  echo "Not installing a backend, please make sure to add your backend to PYTHONPATH"
 fi
+
 if [[ ! -z "${IMPORTER}" ]]; then
  echo "${Yellow}Installing importer tools.."
  if [[ $(uname -s) = 'Linux' ]]; then
    echo "${Yellow}Linux detected.. installing Linux importer tools"
-    $PYTHON -m pip install --upgrade -r "$TD/requirements-importer.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+    #Always get the importer tools from upstream IREE
+    $PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer.txt" -f https://iree-org.github.io/iree/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
  elif [[ $(uname -s) = 'Darwin' ]]; then
    echo "${Yellow}macOS detected.. installing macOS importer tools"
    #Conda seems to have some problems installing these packages and hope they get resolved upstream.
-    $PYTHON -m pip install --upgrade -r "$TD/requirements-importer-macos.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+    $PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer-macos.txt" -f ${RUNTIME} --extra-index-url https://download.pytorch.org/whl/nightly/cpu
  fi
 fi

-$PYTHON -m pip install -e . -f https://llvm.github.io/torch-mlir/package-index/ -f https://github.com/${RUNTIME}/releases
+$PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f https://download.pytorch.org/whl/nightly/torch/

 if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
+  T_VER=$($PYTHON -m pip show torch | grep Version)
+  TORCH_VERSION=${T_VER:9:17}
+  TV_VER=$($PYTHON -m pip show torchvision | grep Version)
+  TV_VERSION=${TV_VER:9:18}
  $PYTHON -m pip uninstall -y torch torchvision
-  $PYTHON -m pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116
+  $PYTHON -m pip install -U --pre --no-warn-conflicts triton
+  $PYTHON -m pip install --no-deps https://download.pytorch.org/whl/nightly/cu117/torch-${TORCH_VERSION}%2Bcu117-cp310-cp310-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu117/torchvision-${TV_VERSION}%2Bcu117-cp310-cp310-linux_x86_64.whl
  if [ $? -eq 0 ];then
-    echo "Successfully Installed torch + cu116."
+    echo "Successfully Installed torch + cu117."
  else
-    echo "Could not install torch + cu116." >&2
+    echo "Could not install torch + cu117." >&2
  fi
 fi

--- a/shark/examples/shark_eager/dynamo_demo.ipynb
+++ b/shark/examples/shark_eager/dynamo_demo.ipynb
@@ -36,7 +36,9 @@
    "    from torchdynamo.optimizations.backends import create_backend\n",
    "    from torchdynamo.optimizations.subgraph import SubGraph\n",
    "except ModuleNotFoundError:\n",
-    "    print(\"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo\")\n",
+    "    print(\n",
+    "        \"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo\"\n",
+    "    )\n",
    "    exit()\n",
    "\n",
    "# torch-mlir imports for compiling\n",
@@ -97,7 +99,9 @@
    "\n",
    "        for node in fx_g.graph.nodes:\n",
    "            if node.op == \"output\":\n",
-    "                assert len(node.args) == 1, \"Output node must have a single argument\"\n",
+    "                assert (\n",
+    "                    len(node.args) == 1\n",
+    "                ), \"Output node must have a single argument\"\n",
    "                node_arg = node.args[0]\n",
    "                if isinstance(node_arg, tuple) and len(node_arg) == 1:\n",
    "                    node.args = (node_arg[0],)\n",
@@ -116,8 +120,12 @@
    "    if len(args) == 1 and isinstance(args[0], list):\n",
    "        args = args[0]\n",
    "\n",
-    "    linalg_module = compile(ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS)\n",
-    "    callable, _ = get_iree_compiled_module(linalg_module, \"cuda\", func_name=\"forward\")\n",
+    "    linalg_module = compile(\n",
+    "        ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS\n",
+    "    )\n",
+    "    callable, _ = get_iree_compiled_module(\n",
+    "        linalg_module, \"cuda\", func_name=\"forward\"\n",
+    "    )\n",
    "\n",
    "    def forward(*inputs):\n",
    "        return callable(*inputs)\n",
@@ -212,6 +220,7 @@
    "    assert isinstance(subgraph, SubGraph), \"Model must be a dynamo SubGraph.\"\n",
    "    return __torch_mlir(subgraph.model, *list(subgraph.example_inputs))\n",
    "\n",
+    "\n",
    "@torchdynamo.optimize(\"torch_mlir\")\n",
    "def toy_example2(*args):\n",
    "    a, b = args\n",
--- a/shark/examples/shark_inference/CLIPModel_tf.py
+++ b/shark/examples/shark_inference/CLIPModel_tf.py
@@ -22,7 +22,7 @@ class CLIPModule(tf.Module):
            input_ids=x, attention_mask=y, pixel_values=z
        )

-    @tf.function(input_signature=clip_vit_inputs)
+    @tf.function(input_signature=clip_vit_inputs, jit_compile=True)
    def forward(self, input_ids, attention_mask, pixel_values):
        return self.m.predict(
            input_ids, attention_mask, pixel_values
--- a/shark/examples/shark_inference/ESRGAN/README.md
+++ b/shark/examples/shark_inference/ESRGAN/README.md
@@ -0,0 +1,15 @@
+## Running ESRGAN
+
+```
+1. pip install numpy opencv-python
+2. mkdir InputImages
+   (this is where all the input images reside)
+3. mkdir OutputImages
+   (this is where the model will generate all the images)
+4. mkdir models
+   (save the .pth checkpoint file here)
+5. python esrgan.py
+```
+
+- Download [RRDB_ESRGAN_x4.pth](https://drive.google.com/drive/u/0/folders/17VYV_SoZZesU6mbxz2dMAIccSSlqLecY) and place it in the `models` directory as mentioned above in step 4.
+- Credits : [ESRGAN](https://github.com/xinntao/ESRGAN)
--- a/shark/examples/shark_inference/ESRGAN/esrgan.py
+++ b/shark/examples/shark_inference/ESRGAN/esrgan.py
@@ -0,0 +1,239 @@
+from ast import arg
+import os.path as osp
+import glob
+import cv2
+import numpy as np
+import torch
+
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch._decomp import get_decompositions
+from shark.shark_inference import SharkInference
+import torch_mlir
+import tempfile
+import functools
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def make_layer(block, n_layers):
+    """Chain `n_layers` fresh instances produced by `block()` into an nn.Sequential."""
+    return nn.Sequential(*(block() for _ in range(n_layers)))
+
+
+class ResidualDenseBlock_5C(nn.Module):
+    """Five 3x3 convolutions where each one sees the input plus all earlier outputs.
+
+    Args:
+        nf: channel count of the block's input and output.
+        gc: growth channels, i.e. channels produced by each intermediate conv.
+        bias: whether the convolutions carry a bias term.
+    """
+
+    def __init__(self, nf=64, gc=32, bias=True):
+        super().__init__()
+
+        def conv3x3(c_in, c_out):
+            # kernel 3, stride 1, padding 1
+            return nn.Conv2d(c_in, c_out, 3, 1, 1, bias=bias)
+
+        # Input width grows by `gc` for every earlier conv that is concatenated in.
+        self.conv1 = conv3x3(nf, gc)
+        self.conv2 = conv3x3(nf + gc, gc)
+        self.conv3 = conv3x3(nf + 2 * gc, gc)
+        self.conv4 = conv3x3(nf + 3 * gc, gc)
+        self.conv5 = conv3x3(nf + 4 * gc, nf)
+        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
+
+        # initialization
+        # mutil.initialize_weights([self.conv1, self.conv2, self.conv3, self.conv4, self.conv5], 0.1)
+
+    def forward(self, x):
+        # `feats` accumulates x followed by every activated intermediate output.
+        feats = [x, self.lrelu(self.conv1(x))]
+        for conv in (self.conv2, self.conv3, self.conv4):
+            feats.append(self.lrelu(conv(torch.cat(feats, 1))))
+        fused = self.conv5(torch.cat(feats, 1))
+        # Scaled residual connection around the whole block.
+        return fused * 0.2 + x
+
+
+class RRDB(nn.Module):
+    """Residual in Residual Dense Block: three dense blocks inside a scaled skip."""
+
+    def __init__(self, nf, gc=32):
+        super().__init__()
+        self.RDB1 = ResidualDenseBlock_5C(nf, gc)
+        self.RDB2 = ResidualDenseBlock_5C(nf, gc)
+        self.RDB3 = ResidualDenseBlock_5C(nf, gc)
+
+    def forward(self, x):
+        out = x
+        for dense_block in (self.RDB1, self.RDB2, self.RDB3):
+            out = dense_block(out)
+        # Outer residual: scale the stacked result and add the block input back.
+        return out * 0.2 + x
+
+
+class RRDBNet(nn.Module):
+    """RRDB trunk followed by two nearest-neighbour 2x upsampling stages.
+
+    Args:
+        in_nc: channels of the input tensor.
+        out_nc: channels of the output tensor.
+        nf: feature channels used throughout the network.
+        nb: number of RRDB blocks in the trunk.
+        gc: growth channels forwarded to every RRDB.
+    """
+
+    def __init__(self, in_nc, out_nc, nf, nb, gc=32):
+        super().__init__()
+
+        def conv3x3(c_in, c_out):
+            # kernel 3, stride 1, padding 1, with bias
+            return nn.Conv2d(c_in, c_out, 3, 1, 1, bias=True)
+
+        self.conv_first = conv3x3(in_nc, nf)
+        self.RRDB_trunk = make_layer(functools.partial(RRDB, nf=nf, gc=gc), nb)
+        self.trunk_conv = conv3x3(nf, nf)
+        #### upsampling
+        self.upconv1 = conv3x3(nf, nf)
+        self.upconv2 = conv3x3(nf, nf)
+        self.HRconv = conv3x3(nf, nf)
+        self.conv_last = conv3x3(nf, out_nc)
+
+        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
+
+    def forward(self, x):
+        shallow = self.conv_first(x)
+        # Long skip connection around the whole RRDB trunk.
+        fea = shallow + self.trunk_conv(self.RRDB_trunk(shallow))
+
+        # Two identical stages: double the spatial size, then conv + LeakyReLU.
+        for upconv in (self.upconv1, self.upconv2):
+            enlarged = F.interpolate(fea, scale_factor=2, mode="nearest")
+            fea = self.lrelu(upconv(enlarged))
+
+        return self.conv_last(self.lrelu(self.HRconv(fea)))
+
+
+############### Parsing args #####################
+import argparse
+
+# Options are parsed at import time; the resulting `args` is read by
+# compile_through_fx further down.
+p = argparse.ArgumentParser(
+    description=__doc__,
+    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+)
+
+p.add_argument(
+    "--device",
+    type=str,
+    default="cpu",
+    help="the device to use",
+)
+p.add_argument(
+    "--mlir_loc", type=str, default=None, help="location of the model's mlir file"
+)
+args = p.parse_args()
+###################################################
+
+
+def inference(input_m):
+    """Run the module-level `model` on `input_m` and return its output."""
+    output = model(input_m)
+    return output
+
+
+def load_mlir(mlir_loc):
+    """Read a pre-generated MLIR module from disk.
+
+    Args:
+        mlir_loc: path of the MLIR file, or None when no file was supplied.
+
+    Returns:
+        The file's contents as a string, or None when `mlir_loc` is None.
+    """
+    # Identity check: `== None` would defer to a custom __eq__ on path-like objects.
+    if mlir_loc is None:
+        return None
+    print(f"Trying to load the model from {mlir_loc}.")
+    with open(mlir_loc) as f:
+        return f.read()
+
+
+def compile_through_fx(model, inputs, mlir_loc=None):
+    """Build a compiled SharkInference module for `model`.
+
+    When `mlir_loc` names a file, its contents are used as the MLIR module.
+    Otherwise `model` is traced with make_fx on `inputs`, scripted with
+    TorchScript and lowered by torch-mlir to linalg-on-tensors.
+
+    Args:
+        model: callable to trace; it is invoked as `model(inputs)`.
+        inputs: example input handed to both make_fx and torch_mlir.compile.
+        mlir_loc: optional path to a pre-generated MLIR file.
+
+    Returns:
+        The SharkInference module after `compile()`, targeting `args.device`.
+    """
+    module = load_mlir(mlir_loc)
+    if module is None:
+        fx_g = make_fx(
+            model,
+            decomposition_table=get_decompositions(
+                [
+                    torch.ops.aten.embedding_dense_backward,
+                    torch.ops.aten.native_layer_norm_backward,
+                    torch.ops.aten.slice_backward,
+                    torch.ops.aten.select_backward,
+                    torch.ops.aten.norm.ScalarOpt_dim,
+                    torch.ops.aten.native_group_norm,
+                    torch.ops.aten.upsample_bilinear2d.vec,
+                    torch.ops.aten.split.Tensor,
+                    torch.ops.aten.split_with_sizes,
+                ]
+            ),
+        )(inputs)
+
+        # Swap in the plain CodeGen and regenerate the module's Python code.
+        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
+        fx_g.recompile()
+
+        def strip_overloads(gm):
+            """
+            Modifies the target of graph nodes in :attr:`gm` to strip overloads.
+            Args:
+                gm(fx.GraphModule): The input Fx graph module to be modified
+            """
+            for node in gm.graph.nodes:
+                if isinstance(node.target, torch._ops.OpOverload):
+                    node.target = node.target.overloadpacket
+            gm.recompile()
+
+        strip_overloads(fx_g)
+
+        ts_g = torch.jit.script(fx_g)
+
+        print("Torchscript graph generated successfully")
+        module = torch_mlir.compile(
+            ts_g,
+            inputs,
+            torch_mlir.OutputType.LINALG_ON_TENSORS,
+            use_tracing=False,
+            verbose=False,
+        )
+
+    # Either branch leaves `module` holding MLIR; SharkInference takes it as text.
+    mlir_model = str(module)
+    func_name = "forward"
+    shark_module = SharkInference(
+        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
+    )
+    shark_module.compile()
+
+    return shark_module
+
+
+model_path = "models/RRDB_ESRGAN_x4.pth"  # models/RRDB_ESRGAN_x4.pth OR models/RRDB_PSNR_x4.pth
+# device = torch.device('cuda')  # if you want to run on CPU, change 'cuda' -> cpu
+device = torch.device("cpu")
+
+test_img_folder = "InputImages/*"
+
+model = RRDBNet(3, 3, 64, 23, gc=32)
+# map_location remaps the checkpoint's tensors onto `device`, so a checkpoint
+# that was saved from a GPU still loads on a CPU-only machine.
+model.load_state_dict(
+    torch.load(model_path, map_location=device), strict=True
+)
+model.eval()
+model = model.to(device)
+
+print("Model path {:s}. \nTesting...".format(model_path))
+
+if __name__ == "__main__":
+    idx = 0
+    for path in glob.glob(test_img_folder):
+        idx += 1
+        base = osp.splitext(osp.basename(path))[0]
+        print(idx, base)
+        # read images
+        img = cv2.imread(path, cv2.IMREAD_COLOR)
+        if img is None:
+            # cv2.imread returns None rather than raising for unreadable files.
+            print(f"Skipping {path}: could not be read as an image")
+            continue
+        # Scale to [0, 1], reverse the channel order and move channels first.
+        img = img * 1.0 / 255
+        img = torch.from_numpy(
+            np.transpose(img[:, :, [2, 1, 0]], (2, 0, 1))
+        ).float()
+        img_LR = img.unsqueeze(0)
+        img_LR = img_LR.to(device)
+
+        with torch.no_grad():
+            # Compiled per image because the example input is this image's tensor.
+            # --mlir_loc is forwarded so a pre-generated MLIR file is honoured.
+            shark_module = compile_through_fx(
+                inference, img_LR, args.mlir_loc
+            )
+            shark_output = shark_module.forward((img_LR,))
+            shark_output = torch.from_numpy(shark_output)
+            shark_output = (
+                shark_output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
+            )
+            # Reference result from the eager PyTorch model for comparison.
+            esrgan_output = (
+                model(img_LR).data.squeeze().float().cpu().clamp_(0, 1).numpy()
+            )
+        # SHARK OUTPUT
+        shark_output = np.transpose(shark_output[[2, 1, 0], :, :], (1, 2, 0))
+        shark_output = (shark_output * 255.0).round()
+        cv2.imwrite(
+            "OutputImages/{:s}_rlt_shark_output.png".format(base), shark_output
+        )
+        print("Generated SHARK's output")
+        # ESRGAN OUTPUT
+        esrgan_output = np.transpose(esrgan_output[[2, 1, 0], :, :], (1, 2, 0))
+        esrgan_output = (esrgan_output * 255.0).round()
+        cv2.imwrite(
+            "OutputImages/{:s}_rlt_esrgan_output.png".format(base),
+            esrgan_output,
+        )
+        print("Generated ESRGAN's output")
--- a/shark/examples/shark_inference/albert_maskfill_tf.py
+++ b/shark/examples/shark_inference/albert_maskfill_tf.py
@@ -28,7 +28,7 @@ class AlbertModule(tf.Module):
        self.m = TFAutoModelForMaskedLM.from_pretrained("albert-base-v2")
        self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)

-    @tf.function(input_signature=t5_inputs)
+    @tf.function(input_signature=t5_inputs, jit_compile=True)
    def forward(self, input_ids, attention_mask):
        return self.m.predict(input_ids, attention_mask)

--- a/shark/examples/shark_inference/bloom_tank.py
+++ b/shark/examples/shark_inference/bloom_tank.py
@@ -1,7 +1,9 @@
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_torch_model
+from shark.shark_downloader import download_model

-mlir_model, func_name, inputs, golden_out = download_torch_model("bloom")
+mlir_model, func_name, inputs, golden_out = download_model(
+    "bloom", frontend="torch"
+)

 shark_module = SharkInference(
    mlir_model, func_name, device="cpu", mlir_dialect="tm_tensor"
--- a/shark/examples/shark_inference/gpt2_tf.py
+++ b/shark/examples/shark_inference/gpt2_tf.py
@@ -19,7 +19,7 @@ class GPT2Module(tf.Module):

        self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)

-    @tf.function(input_signature=gpt2_inputs)
+    @tf.function(input_signature=gpt2_inputs, jit_compile=True)
    def forward(self, input_ids, attention_mask):
        return self.m.predict(input_ids, attention_mask)

--- a/shark/examples/shark_inference/minilm_benchmark_tf.py
+++ b/shark/examples/shark_inference/minilm_benchmark_tf.py
@@ -26,7 +26,7 @@ class BertModule(tf.Module):
            input_ids=x, attention_mask=y, token_type_ids=z, training=False
        )

-    @tf.function(input_signature=bert_input)
+    @tf.function(input_signature=bert_input, jit_compile=True)
    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.m.predict(input_ids, attention_mask, token_type_ids)

--- a/shark/examples/shark_inference/minilm_jit.py
+++ b/shark/examples/shark_inference/minilm_jit.py
@@ -1,9 +1,10 @@
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_torch_model
+from shark.shark_downloader import download_model


-mlir_model, func_name, inputs, golden_out = download_torch_model(
-    "microsoft/MiniLM-L12-H384-uncased"
+mlir_model, func_name, inputs, golden_out = download_model(
+    "microsoft/MiniLM-L12-H384-uncased",
+    frontend="torch",
 )


--- a/shark/examples/shark_inference/minilm_tf.py
+++ b/shark/examples/shark_inference/minilm_tf.py
@@ -26,7 +26,7 @@ class BertModule(tf.Module):
            input_ids=x, attention_mask=y, token_type_ids=z, training=False
        )

-    @tf.function(input_signature=bert_input)
+    @tf.function(input_signature=bert_input, jit_compile=True)
    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.m.predict(input_ids, attention_mask, token_type_ids)

--- a/shark/examples/shark_inference/resnet50_script.py
+++ b/shark/examples/shark_inference/resnet50_script.py
@@ -5,7 +5,7 @@ import torchvision.models as models
 from torchvision import transforms
 import sys
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_torch_model
+from shark.shark_downloader import download_model


 ################################## Preprocessing inputs and model ############
@@ -66,10 +66,12 @@ labels = load_labels()


 ## Can pass any img or input to the forward module.
-mlir_model, func_name, inputs, golden_out = download_torch_model("resnet50")
+mlir_model, func_name, inputs, golden_out = download_model(
+    "resnet50", frontend="torch"
+)

 shark_module = SharkInference(mlir_model, func_name, mlir_dialect="linalg")
-# shark_module.compile()
+shark_module.compile()
 path = shark_module.save_module()
 shark_module.load_module(path)
 result = shark_module.forward((img.detach().numpy(),))
--- a/shark/examples/shark_inference/simple_dlrm.py
+++ b/shark/examples/shark_inference/simple_dlrm.py
@@ -151,7 +151,6 @@ class DLRM_Net(nn.Module):
            and (ln_top is not None)
            and (arch_interaction_op is not None)
        ):
-
            # save arguments
            self.output_d = 0
            self.arch_interaction_op = arch_interaction_op
@@ -216,7 +215,6 @@ class DLRM_Net(nn.Module):
        return ly

    def interact_features(self, x, ly):
-
        if self.arch_interaction_op == "dot":
            # concatenate dense and sparse features
            (batch_size, d) = x.shape
--- a/shark/examples/shark_inference/sparse_arch.py
+++ b/shark/examples/shark_inference/sparse_arch.py
@@ -99,7 +99,6 @@ class SparseArchShark(nn.Module):
        )

    def forward(self, *batched_inputs):
-
        concatenated_list = []
        input_enum, embedding_enum = 0, 0

@@ -121,7 +120,6 @@ class SparseArchShark(nn.Module):


 def test_sparse_arch() -> None:
-
    D = 3
    eb1_config = EmbeddingBagConfig(
        name="t1",
@@ -211,7 +209,6 @@ class DLRMShark(nn.Module):
    def forward(
        self, dense_features: torch.Tensor, *sparse_features
    ) -> torch.Tensor:
-
        embedded_dense = self.dense_arch(dense_features)
        embedded_sparse = self.sparse_arch(*sparse_features)
        concatenated_dense = self.inter_arch(
--- a/shark/examples/shark_inference/stable_diff.py
+++ b/shark/examples/shark_inference/stable_diff.py
@@ -47,8 +47,7 @@ def load_mlir(mlir_loc):
    return mlir_module


-def compile_through_fx(model, inputs, mlir_loc=None):
-
+def compile_through_fx(model, inputs, mlir_loc=None, extra_args=[]):
    module = load_mlir(mlir_loc)
    if mlir_loc == None:
        fx_g = make_fx(
@@ -98,15 +97,17 @@ def compile_through_fx(model, inputs, mlir_loc=None):
    func_name = "forward"

    shark_module = SharkInference(
-        mlir_model, func_name, device=args.device, mlir_dialect="tm_tensor"
+        mlir_model,
+        func_name,
+        device=args.device,
+        mlir_dialect="tm_tensor",
    )
-    shark_module.compile()
+    shark_module.compile(extra_args)

    return shark_module


 if __name__ == "__main__":
-
    YOUR_TOKEN = "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"

    # 1. Load the autoencoder model which will be used to decode the latents into image space.
@@ -161,6 +162,7 @@ if __name__ == "__main__":
        unet,
        (latent_model_input, torch.tensor([1.0]), text_embeddings),
        args.mlir_loc,
+        ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
    )

    # torch.jit.script(unet)
@@ -220,7 +222,6 @@ if __name__ == "__main__":
    # print(latents, latents.shape)

    for i, t in tqdm(enumerate(scheduler.timesteps)):
-
        print(f"i = {i} t = {t}")
        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
        latent_model_input = torch.cat([latents] * 2)
--- a/shark/examples/shark_inference/stable_diff_f16.py
+++ b/shark/examples/shark_inference/stable_diff_f16.py
@@ -10,21 +10,59 @@ from torch._decomp import get_decompositions
 import torch_mlir
 import tempfile
 import numpy as np
-import os

-##############################################################################
+# pip install diffusers
+# pip install scipy
+
+############### Parsing args #####################
+import argparse
+
+# Command-line options; parsed at import time into the module-level `args`.
+p = argparse.ArgumentParser(
+    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+
+p.add_argument(
+    "--prompt",
+    type=str,
+    default="a photograph of an astronaut riding a horse",
+    help="the text prompt to use",
+)
+p.add_argument("--device", type=str, default="cpu", help="the device to use")
+# Each option carries its own help text instead of the copied --device one.
+p.add_argument("--steps", type=int, default=50, help="denoising step count")
+p.add_argument("--mlir_loc", type=str, default=None, help="mlir file location")
+p.add_argument("--vae_loc", type=str, default=None, help="vae mlir file path")
+args = p.parse_args()
+
+#####################################################
+
+
+def fp16_unet():
+    """Download the `stable_diff_f16_18_OCT` model and compile it with SHARK.
+
+    Returns:
+        The SharkInference module after `compile()`, targeting `args.device`.
+    """
+    from shark.shark_downloader import download_model
+
+    # The downloaded sample inputs and golden output are not needed here.
+    mlir_model, func_name, _inputs, _golden_out = download_model(
+        "stable_diff_f16_18_OCT",
+        tank_url="gs://shark_tank/prashant_nod",
+        frontend="torch",
+    )
+    compiled_unet = SharkInference(
+        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
+    )
+    compiled_unet.compile()
+    return compiled_unet


 def load_mlir(mlir_loc):
+    # Returns the text of the file at `mlir_loc`, or None when no location is given.
+    import os
+
     if mlir_loc == None:
         return None
+    print(f"Trying to load the model from {mlir_loc}.")
+    # os.path.join with a single argument leaves the path as given.
     with open(os.path.join(mlir_loc)) as f:
         mlir_module = f.read()
     return mlir_module


-def compile_through_fx(model, inputs, device, mlir_loc=None):
-
+def compile_through_fx(model, inputs, mlir_loc=None):
    module = load_mlir(mlir_loc)
    if mlir_loc == None:
        fx_g = make_fx(
@@ -74,106 +112,78 @@ def compile_through_fx(model, inputs, device, mlir_loc=None):
    func_name = "forward"

    shark_module = SharkInference(
-        mlir_model, func_name, device=device, mlir_dialect="tm_tensor"
+        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
    )
    shark_module.compile()

    return shark_module


-##############################################################################
+if __name__ == "__main__":
+    YOUR_TOKEN = "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"

-DEBUG = False
-compiled_module = {}
-
-
-def stable_diff_inf(prompt: str, steps, device: str):
-
-    args = {}
-    args["prompt"] = [prompt]
-    args["steps"] = steps
-    args["device"] = device
-    args["mlir_loc"] = "./stable_diffusion.mlir"
-    output_loc = (
-        f"stored_results/stable_diffusion/{prompt}_{int(steps)}_{device}.jpg"
+    # 1. Load the autoencoder model which will be used to decode the latents into image space.
+    vae = AutoencoderKL.from_pretrained(
+        "CompVis/stable-diffusion-v1-4",
+        subfolder="vae",
+        use_auth_token=YOUR_TOKEN,
    )

-    global DEBUG
-    global compiled_module
+    # 2. Load the tokenizer and text encoder to tokenize and encode the text.
+    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+    text_encoder = CLIPTextModel.from_pretrained(
+        "openai/clip-vit-large-patch14"
+    )

-    DEBUG = False
-    log_write = open(r"logs/stable_diffusion_log.txt", "w")
-    if log_write:
-        DEBUG = True
+    class VaeModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.vae = AutoencoderKL.from_pretrained(
+                "CompVis/stable-diffusion-v1-4",
+                subfolder="vae",
+                use_auth_token=YOUR_TOKEN,
+            )

-    if args["device"] not in compiled_module.keys():
-        YOUR_TOKEN = "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"
+        def forward(self, input):
+            return self.vae.decode(input, return_dict=False)[0]

-        # 1. Load the autoencoder model which will be used to decode the latents into image space.
-        compiled_module["vae"] = AutoencoderKL.from_pretrained(
-            "CompVis/stable-diffusion-v1-4",
-            subfolder="vae",
-            use_auth_token=YOUR_TOKEN,
-        )
+    vae = VaeModel()
+    vae_input = torch.rand(1, 4, 64, 64)
+    shark_vae = compile_through_fx(vae, (vae_input,), args.vae_loc)

-        # 2. Load the tokenizer and text encoder to tokenize and encode the text.
-        compiled_module["tokenizer"] = CLIPTokenizer.from_pretrained(
-            "openai/clip-vit-large-patch14"
-        )
-        compiled_module["text_encoder"] = CLIPTextModel.from_pretrained(
-            "openai/clip-vit-large-patch14"
-        )
-        if DEBUG:
-            log_write.write("Compiling the Unet module.\n")
+    # Wrap the unet model to return tuples.
+    class UnetModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.unet = UNet2DConditionModel.from_pretrained(
+                "CompVis/stable-diffusion-v1-4",
+                subfolder="unet",
+                use_auth_token=YOUR_TOKEN,
+            )
+            self.in_channels = self.unet.in_channels
+            self.train(False)

-        # Wrap the unet model to return tuples.
-        class UnetModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.unet = UNet2DConditionModel.from_pretrained(
-                    "CompVis/stable-diffusion-v1-4",
-                    subfolder="unet",
-                    use_auth_token=YOUR_TOKEN,
-                )
-                self.in_channels = self.unet.in_channels
-                self.train(False)
+        def forward(self, x, y, z):
+            # Indented to sit inside UnetModel; one level out it only defined a
+            # stray function and left the wrapper class without a forward().
+            # With return_dict=False the result is indexable; keep element 0.
+            return self.unet.forward(x, y, z, return_dict=False)[0]

-            def forward(self, x, y, z):
-                return self.unet.forward(x, y, z, return_dict=False)[0]
+    # # 3. The UNet model for generating the latents.
+    unet = UnetModel()

-        # 3. The UNet model for generating the latents.
-        unet = UnetModel()
-        latent_model_input = torch.rand([2, 4, 64, 64])
-        text_embeddings = torch.rand([2, 77, 768])
-        shark_unet = compile_through_fx(
-            unet,
-            (latent_model_input, torch.tensor([1.0]), text_embeddings),
-            args["device"],
-            args["mlir_loc"],
-        )
-        compiled_module[args["device"]] = shark_unet
-        if DEBUG:
-            log_write.write("Compilation successful.\n")
+    shark_unet = fp16_unet()

-        compiled_module["unet"] = unet
-        compiled_module["scheduler"] = LMSDiscreteScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            num_train_timesteps=1000,
-        )
+    scheduler = LMSDiscreteScheduler(
+        beta_start=0.00085,
+        beta_end=0.012,
+        beta_schedule="scaled_linear",
+        num_train_timesteps=1000,
+    )

-    shark_unet = compiled_module[args["device"]]
-    vae = compiled_module["vae"]
-    unet = compiled_module["unet"]
-    tokenizer = compiled_module["tokenizer"]
-    text_encoder = compiled_module["text_encoder"]
-    scheduler = compiled_module["scheduler"]
+    prompt = [args.prompt]

    height = 512  # default height of Stable Diffusion
    width = 512  # default width of Stable Diffusion

-    num_inference_steps = int(args["steps"])  # Number of denoising steps
+    num_inference_steps = args.steps  # Number of denoising steps

    guidance_scale = 7.5  # Scale for classifier-free guidance

@@ -181,10 +191,10 @@ def stable_diff_inf(prompt: str, steps, device: str):
        42
    )  # Seed generator to create the inital latent noise

-    batch_size = len(args["prompt"])
+    batch_size = len(prompt)

    text_input = tokenizer(
-        args["prompt"],
+        prompt,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        truncation=True,
@@ -208,30 +218,40 @@ def stable_diff_inf(prompt: str, steps, device: str):
        (batch_size, unet.in_channels, height // 8, width // 8),
        generator=generator,
    )
+    # latents = latents.to(torch_device)
+
    scheduler.set_timesteps(num_inference_steps)
+
    latents = latents * scheduler.sigmas[0]
+    # print(latents, latents.shape)

    for i, t in tqdm(enumerate(scheduler.timesteps)):
-
-        if DEBUG:
-            log_write.write(f"i = {i} t = {t}\n")
+        print(f"i = {i} t = {t}")
        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
        latent_model_input = torch.cat([latents] * 2)
        sigma = scheduler.sigmas[i]
        latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)

        # predict the noise residual
-        latent_model_input_numpy = latent_model_input.detach().numpy()
-        text_embeddings_numpy = text_embeddings.detach().numpy()
+
+        # with torch.no_grad():
+        # noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)
+
+        latent_model_input_numpy = (
+            latent_model_input.detach().numpy().astype(np.half)
+        )
+        text_embeddings_numpy = (
+            text_embeddings.detach().numpy().astype(np.half)
+        )

        noise_pred = shark_unet.forward(
            (
                latent_model_input_numpy,
-                np.array([t]).astype(np.float32),
+                np.array([t]).astype(np.half),
                text_embeddings_numpy,
            )
        )
-        noise_pred = torch.from_numpy(noise_pred)
+        noise_pred = torch.from_numpy(noise_pred).to(torch.float32)

        # perform guidance
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
@@ -242,21 +262,16 @@ def stable_diff_inf(prompt: str, steps, device: str):
        # compute the previous noisy sample x_t -> x_t-1
        latents = scheduler.step(noise_pred, i, latents)["prev_sample"]

+    # print("Latents shape : ", latents.shape)
+
    # scale and decode the image latents with vae
    latents = 1 / 0.18215 * latents
-    image = vae.decode(latents).sample
+    latents_numpy = latents.detach().numpy()
+    image = shark_vae.forward((latents_numpy,))
+    image = torch.from_numpy(image)

    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
    images = (image * 255).round().astype("uint8")
    pil_images = [Image.fromarray(image) for image in images]
-    output = pil_images[0]
-    # save the output image with the prompt name.
-    output.save(os.path.join(output_loc))
-    log_write.close()
-
-    std_output = ""
-    with open(r"logs/stable_diffusion_log.txt", "r") as log_read:
-        std_output = log_read.read()
-
-    return output, std_output
+    pil_images[0].save("astro.jpg")
--- a/shark/examples/shark_inference/stable_diff_tf.py
+++ b/shark/examples/shark_inference/stable_diff_tf.py
@@ -17,7 +17,7 @@ from keras_cv.models.generative.stable_diffusion.text_encoder import (
 )

 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_tf_model
+from shark.shark_downloader import download_model
 from PIL import Image

 # pip install "git+https://github.com/keras-team/keras-cv.git"
@@ -75,8 +75,8 @@ class SharkStableDiffusion:
        # Create models
        self.text_encoder = TextEncoder(MAX_PROMPT_LENGTH)

-        mlir_model, func_name, inputs, golden_out = download_tf_model(
-            "stable_diff", tank_url="gs://shark_tank/quinn"
+        mlir_model, func_name, inputs, golden_out = download_model(
+            "stable_diff", tank_url="gs://shark_tank/quinn", frontend="tf"
        )
        shark_module = SharkInference(
            mlir_model, func_name, device=device, mlir_dialect="mhlo"
--- a/shark/examples/shark_inference/stable_diffusion/.gitignore
+++ b/shark/examples/shark_inference/stable_diffusion/.gitignore
@@ -0,0 +1,2 @@
+*.vmfb
+*.jpg
--- a/shark/examples/shark_inference/stable_diffusion/README.md
+++ b/shark/examples/shark_inference/stable_diffusion/README.md
@@ -0,0 +1,106 @@
+# STABLE DIFFUSION
+
+## Installation
+
+Follow setup instructions in the main [README.md](https://github.com/nod-ai/SHARK#readme) for regular usage. 
+
+ 
+## Using other supported Stable Diffusion variants with SHARK:
+
+Currently we support fine-tuned versions of Stable Diffusion such as:
+- [AnythingV3](https://huggingface.co/Linaqruf/anything-v3.0)
+- [Analog Diffusion](https://huggingface.co/wavymulder/Analog-Diffusion)
+
+Use the `--hf_model_id=` flag to specify the repo-id of the model to be used.
+
+```shell
+python .\shark\examples\shark_inference\stable_diffusion\main.py --hf_model_id="Linaqruf/anything-v3.0" --max_length=77 --prompt="1girl, brown hair, green eyes, colorful, autumn, cumulonimbus clouds, lighting, blue sky, falling leaves, garden" --no-use_tuned
+```
+
+## Run a custom model using a `.ckpt` / `.safetensors` checkpoint file:
+* Ensure you don't have any `.yaml` file at the root directory of SHARK - best would be to ensure you're on the latest `main` branch and use `--clear_all` the first time you're running the command for inference.
+* Install `pytorch_lightning` by running :-
+```shell
+pip install pytorch_lightning
+```
+NOTE: This is needed to process [ckpt file of runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.ckpt).
+* Download a [.ckpt](https://huggingface.co/andite/anything-v4.0/resolve/main/anything-v4.0-pruned-fp32.ckpt) file in case you don't have a locally generated `.ckpt` file for StableDiffusion.
+
+* Now pass the above `.ckpt` file to `ckpt_loc` command-line argument using the following :-
+```shell
+python3.10 main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd" --max_length=64 --import_mlir --ckpt_loc="/path/to/.ckpt/file" --no-use_tuned
+```
+* We use a combination of 2 flags to make this feature work : `import_mlir` and `ckpt_loc`.
+* In case `ckpt_loc` is NOT specified then a [default](https://huggingface.co/stabilityai/stable-diffusion-2-1-base) HuggingFace repo-id is run via `hf_model_id`. So, two ways to use `import_mlir` :-
+- With `hf_model_id` to run HuggingFace's StableDiffusion variants.
+- With `ckpt_loc` to run a StableDiffusion variant with a `.ckpt` or `.safetensors` checkpoint file
+
+* Use custom model `.ckpt` files from [HuggingFace-StableDiffusion](https://huggingface.co/models?other=stable-diffusion) to generate images.
+* You may also try out [.safetensors file of Protogen x3.4 of civitai.com](https://civitai.com/models/3666/protogen-x34-photorealism-official-release) and provide the `.safetensors` path to `ckpt_loc` flag.
+* NOTE: Ensure that the `.ckpt` or `.safetensors` file name is part of the path passed to the `ckpt_loc` flag, e.g. `--ckpt_loc="/path/to/checkpoint/file/name_of_checkpoint.ckpt"` OR `--ckpt_loc="/path/to/checkpoint/file/name_of_checkpoint.safetensors"`. Also ensure that you're using the `--no-use_tuned` flag in your run command.
+
+
+## Running the model for a `batch_size` and for a set of `runs`:
+We currently support batch size in the range `[1, 3]`.
+You can specify batch size using `batch_size` flag (defaults to `1`) and the number of times you want to run the model using `runs` flag (defaults to `1`).
+In total, you'll be able to generate `batch_size * runs` number of images.
+- Usage 1: Using the same prompt -
+```shell
+python3.10 main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd" --max_length=64 --import_mlir --hf_model_id="runwayml/stable-diffusion-v1-5" --batch_size=3 --no-use_tuned
+```
+The example above generates `3` different images in total with the same prompt `tajmahal, oil on canvas, sunflowers, 4k, uhd`.
+- Usage 2: Using different prompts -
+```shell
+python3.10 main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd" --max_length=64 --import_mlir --hf_model_id="runwayml/stable-diffusion-v1-5" --batch_size=3 -p="batman riding a horse, oil on canvas, 4k, uhd" -p="superman riding a horse, oil on canvas, 4k, uhd" --no-use_tuned
+```
+The example above generates `1` image for each different prompt, thus generating `3` images in total.
+- Usage 3: Using `runs` -
+```shell
+python3.10 main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd" --max_length=64 --import_mlir --hf_model_id="runwayml/stable-diffusion-v1-5" --batch_size=2 --runs=3 --no-use_tuned
+```
+The example above generates `6` different images in total, `2` images for each of the `3` runs.
+
+
+<details>
+  <summary>Debug Commands</summary>
+
+## Debug commands and other advanced usage follow.
+
+```shell
+python main.py --precision="fp32"|"fp16" --device="cpu"|"cuda"|"vulkan" --import_mlir|--no-import_mlir --prompt "enter the text" 
+
+```
+
+## dump all dispatch .spv and isa using amdllpc
+
+```shell
+python main.py --precision="fp16" --device="vulkan" --iree-vulkan-target-triple=rdna3-unknown-linux --no-load_vmfb --dispatch_benchmarks="all" --dispatch_benchmarks_dir="SD_dispatches" --dump_isa
+```
+
+## Compile and save the .vmfb (using vulkan fp16 as an example):
+
+```shell
+python shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb
+```
+
+## Capture an RGP trace
+
+```shell
+python shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb --enable_rgp
+```
+
+## Run the vae module with iree-benchmark-module (NCHW, fp16, vulkan, for example):
+
+```shell
+iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --device=vulkan --function_input=1x4x64x64xf16  
+```
+
+## Run the unet module with iree-benchmark-module (same config as above):
+```shell
+##if you want to use .npz inputs:
+unzip ~/.local/shark_tank/<your unet>/inputs.npz
+
+iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --function_input=@arr_0.npy --function_input=1xf16 --function_input=@arr_2.npy --function_input=@arr_3.npy --function_input=@arr_4.npy  
+```
+
+</details>
--- a/shark/examples/shark_inference/stable_diffusion/download_hf_models.py
+++ b/shark/examples/shark_inference/stable_diffusion/download_hf_models.py
@@ -0,0 +1,25 @@
+from PIL import Image
+import requests
+
+from transformers import CLIPProcessor, CLIPModel
+
+# Load the CLIP model and its processor from the Hugging Face Hub id
+# "openai/clip-vit-large-patch14".
+model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+
+# Fetch a sample image to run through the model.
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+# Bound the wait and fail loudly on an HTTP error, rather than hanging
+# forever or handing an error page to PIL.
+response = requests.get(url, stream=True, timeout=30)
+response.raise_for_status()
+image = Image.open(response.raw)
+
+inputs = processor(
+    text=["a photo of a cat", "a photo of a dog"],
+    images=image,
+    return_tensors="pt",
+    padding=True,
+)
+
+outputs = model(**inputs)
+# this is the image-text similarity score
+logits_per_image = outputs.logits_per_image
+# we can take the softmax to get the label probabilities
+probs = logits_per_image.softmax(dim=1)
--- a/shark/examples/shark_inference/stable_diffusion/main.py
+++ b/shark/examples/shark_inference/stable_diffusion/main.py
@@ -0,0 +1,339 @@
+import os
+import sys
+
+# Give AMD_ENABLE_LLPC a value of "1" unless the caller already set it.
+os.environ.setdefault("AMD_ENABLE_LLPC", "1")
+
+# On macOS, always point DYLD_LIBRARY_PATH at /usr/local/lib.
+if sys.platform == "darwin":
+    os.environ["DYLD_LIBRARY_PATH"] = "/usr/local/lib"
+
+from transformers import CLIPTextModel, CLIPTokenizer
+import torch
+from PIL import Image
+from diffusers import (
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+)
+from tqdm.auto import tqdm
+import numpy as np
+from random import randint
+from stable_args import args
+from datetime import datetime as dt
+import json
+import re
+from pathlib import Path
+from model_wrappers import SharkifyStableDiffusionModel
+
+# This has to come before importing cache objects
+if args.clear_all:
+    print("CLEARING ALL, EXPECT SEVERAL MINUTES TO RECOMPILE")
+    from glob import glob
+    import shutil
+
+    # Remove every compiled module (*.vmfb) from the current working directory.
+    vmfbs = glob(os.path.join(os.getcwd(), "*.vmfb"))
+    for vmfb in vmfbs:
+        if os.path.exists(vmfb):
+            os.remove(vmfb)
+    # Temporary workaround of deleting yaml files to incorporate diffusers' pipeline.
+    # TODO: Remove this once we have better weight updation logic.
+    inference_yaml = ["v2-inference-v.yaml", "v1-inference.yaml"]
+    for yaml_file in inference_yaml:
+        if os.path.exists(yaml_file):
+            os.remove(yaml_file)
+    home = os.path.expanduser("~")
+    if os.name == "nt":  # Windows
+        appdata = os.getenv("LOCALAPPDATA")
+        # LOCALAPPDATA may be unset; skip that cache instead of crashing
+        # inside os.path.join(None, ...).
+        if appdata:
+            shutil.rmtree(
+                os.path.join(appdata, "AMD/VkCache"), ignore_errors=True
+            )
+        shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
+    elif os.name == "posix":  # Linux / macOS (os.name is never "unix")
+        # ignore_errors: either cache directory may not exist yet.
+        shutil.rmtree(
+            os.path.join(home, ".cache/AMD/VkCache"), ignore_errors=True
+        )
+        shutil.rmtree(
+            os.path.join(home, ".local/shark_tank"), ignore_errors=True
+        )
+
+
+from utils import set_init_device_flags, disk_space_check, preprocessCKPT
+
+from schedulers import (
+    SharkEulerDiscreteScheduler,
+)
+import time
+from shark.iree_utils.compile_utils import dump_isas
+
+
+# Helper function to profile the vulkan device.
+def start_profiling(file_path="foo.rdc", profiling_mode="queue"):
+    """Begin a profiling capture on the selected device, if enabled.
+
+    Profiling only starts when `args.vulkan_debug_utils` is truthy and the
+    `args.device` string contains "vulkan".
+
+    Args:
+        file_path: Forwarded to the device's `begin_profiling` as `file_path`.
+        profiling_mode: Forwarded to `begin_profiling` as `mode`.
+
+    Returns:
+        The device on which profiling was started, or None when profiling
+        is not enabled.
+    """
+    profiling_enabled = args.vulkan_debug_utils and "vulkan" in args.device
+    if not profiling_enabled:
+        return None
+
+    import iree
+
+    print(f"Profiling and saving to {file_path}.")
+    device = iree.runtime.get_device(args.device)
+    device.begin_profiling(mode=profiling_mode, file_path=file_path)
+    return device
+
+
+def end_profiling(device):
+    """Stop profiling on `device`.
+
+    Args:
+        device: The object returned by `start_profiling`; may be None (or
+            otherwise falsy) when profiling was never started.
+
+    Returns:
+        Whatever `device.end_profiling()` returns, or None for a falsy device.
+    """
+    if not device:
+        return None
+    return device.end_profiling()
+
+
+if __name__ == "__main__":
+    # Script entry point: for each of `args.runs` runs, encode the prompts
+    # with clip, denoise random latents with unet, decode them with vae, and
+    # save every image together with a JSON file describing its settings.
+    dtype = torch.float32 if args.precision == "fp32" else torch.half
+
+    # Make it as default prompt
+    if len(args.prompts) == 0:
+        args.prompts = ["cyberpunk forest by Salvador Dali"]
+
+    prompt = args.prompts
+    # NOTE: neg_prompt is the same list object as args.negative_prompts, so
+    # the padding appended below is also visible through
+    # args.negative_prompts when the JSON metadata is written.
+    neg_prompt = args.negative_prompts
+    height = args.height
+    width = args.width
+    num_inference_steps = args.steps  # Number of denoising steps
+
+    # Scale for classifier-free guidance
+    guidance_scale = torch.tensor(args.guidance_scale).to(torch.float32)
+
+    batch_size = args.batch_size
+    # A single prompt is repeated to fill the batch; otherwise the number of
+    # prompts must already equal the batch size.
+    prompt = prompt * batch_size if len(prompt) == 1 else prompt
+    len_of_prompt = len(prompt)
+    assert (
+        len_of_prompt == batch_size
+    ), f"no. of prompts ({len_of_prompt}) is not equal to batch_size ({batch_size})"
+    print("Running StableDiffusion with the following config :-")
+    print(f"Batch size : {batch_size}")
+    print(f"Prompts : {prompt}")
+    print(f"Runs : {args.runs}")
+
+    # Try to make neg_prompt equal to batch_size by appending blank strings.
+    for i in range(batch_size - len(neg_prompt)):
+        neg_prompt.append("")
+
+    set_init_device_flags()
+    disk_space_check(Path.cwd())
+
+    if not args.import_mlir:
+        # Use the pre-built models described by the models database.
+        from opt_params import get_unet, get_vae, get_clip
+
+        clip = get_clip()
+        unet = get_unet()
+        vae = get_vae()
+    else:
+        # Import and compile the models locally, optionally from a custom
+        # checkpoint file.
+        if args.ckpt_loc != "":
+            assert args.ckpt_loc.lower().endswith(
+                (".ckpt", ".safetensors")
+            ), "checkpoint files supported can be any of [.ckpt, .safetensors] type"
+            preprocessCKPT()
+        mlir_import = SharkifyStableDiffusionModel(
+            args.hf_model_id,
+            args.ckpt_loc,
+            args.precision,
+            max_len=args.max_length,
+            batch_size=batch_size,
+            height=height,
+            width=width,
+            use_base_vae=args.use_base_vae,
+            use_tuned=args.use_tuned,
+        )
+        clip, unet, vae = mlir_import()
+
+    if args.dump_isa:
+        dump_isas(args.dispatch_benchmarks_dir)
+
+    # Default tokenizer and scheduler; overridden below for the
+    # stable-diffusion-2-1 model ids.
+    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+    scheduler = DPMSolverMultistepScheduler.from_pretrained(
+        "CompVis/stable-diffusion-v1-4",
+        subfolder="scheduler",
+    )
+    # True: the scheduler works on torch tensors, so unet/vae inputs and
+    # outputs are converted to and from numpy around each call. Only the
+    # compiled SharkEulerDiscreteScheduler path below turns this off.
+    cpu_scheduling = True
+    if args.hf_model_id == "stabilityai/stable-diffusion-2-1":
+        tokenizer = CLIPTokenizer.from_pretrained(
+            "stabilityai/stable-diffusion-2-1", subfolder="tokenizer"
+        )
+
+        scheduler = DPMSolverMultistepScheduler.from_pretrained(
+            "stabilityai/stable-diffusion-2-1",
+            subfolder="scheduler",
+        )
+
+    if args.hf_model_id == "stabilityai/stable-diffusion-2-1-base":
+        tokenizer = CLIPTokenizer.from_pretrained(
+            "stabilityai/stable-diffusion-2-1-base", subfolder="tokenizer"
+        )
+
+        if args.use_compiled_scheduler:
+            scheduler = SharkEulerDiscreteScheduler.from_pretrained(
+                "stabilityai/stable-diffusion-2-1-base",
+                subfolder="scheduler",
+            )
+            scheduler.compile()
+            cpu_scheduling = False
+        else:
+            scheduler = EulerDiscreteScheduler.from_pretrained(
+                "stabilityai/stable-diffusion-2-1-base",
+                subfolder="scheduler",
+            )
+    for run in range(args.runs):
+        # Handle out of range seeds.
+        # The user-supplied seed is only honoured on the first run; every
+        # later run (or an out-of-range seed) draws a fresh random seed.
+        uint32_info = np.iinfo(np.uint32)
+        uint32_min, uint32_max = uint32_info.min, uint32_info.max
+        seed = args.seed
+        if run >= 1 or seed < uint32_min or seed >= uint32_max:
+            seed = randint(uint32_min, uint32_max)
+        generator = torch.manual_seed(
+            seed
+        )  # Seed generator to create the initial latent noise
+
+        # create a random initial latent.
+        latents = torch.randn(
+            (batch_size, 4, height // 8, width // 8),
+            generator=generator,
+            dtype=torch.float32,
+        ).to(dtype)
+        if run == 0:
+            # Warmup phase to improve performance.
+            if args.warmup_count >= 1:
+                vae_warmup_input = torch.clone(latents).detach().numpy()
+                clip_warmup_input = torch.randint(1, 2, (2, args.max_length))
+            for i in range(args.warmup_count):
+                vae("forward", (vae_warmup_input,))
+                clip("forward", (clip_warmup_input,))
+
+        start = time.time()
+        if run == 0:
+            # Tokenization and clip inference happen only on the first run;
+            # later runs reuse text_embeddings_numpy and the clip timing
+            # measured here.
+            text_input = tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=args.max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            max_length = text_input.input_ids.shape[-1]
+            uncond_input = tokenizer(
+                neg_prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            # Negative-prompt tokens come first, followed by the prompt tokens.
+            text_input = torch.cat(
+                [uncond_input.input_ids, text_input.input_ids]
+            )
+
+            clip_inf_start = time.time()
+            text_embeddings = clip("forward", (text_input,))
+            clip_inf_end = time.time()
+            text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
+            text_embeddings_numpy = text_embeddings.detach().numpy()
+
+            scheduler.set_timesteps(num_inference_steps)
+            scheduler.is_scale_input_called = True
+
+        latents = latents * scheduler.init_noise_sigma
+
+        # Accumulates per-step wall time in seconds; converted to an average
+        # in milliseconds after the loop.
+        avg_ms = 0
+        for i, t in tqdm(
+            enumerate(scheduler.timesteps), disable=args.hide_steps
+        ):
+            step_start = time.time()
+            if not args.hide_steps:
+                print(f"i = {i} t = {t}", end="")
+            timestep = torch.tensor([t]).to(dtype).detach().numpy()
+            latent_model_input = scheduler.scale_model_input(latents, t)
+            if cpu_scheduling:
+                latent_model_input = latent_model_input.detach().numpy()
+
+            profile_device = start_profiling(file_path="unet.rdc")
+
+            noise_pred = unet(
+                "forward",
+                (
+                    latent_model_input,
+                    timestep,
+                    text_embeddings_numpy,
+                    guidance_scale,
+                ),
+                send_to_host=False,
+            )
+
+            end_profiling(profile_device)
+
+            if cpu_scheduling:
+                noise_pred = torch.from_numpy(noise_pred.to_host())
+                latents = scheduler.step(noise_pred, t, latents).prev_sample
+            else:
+                latents = scheduler.step(noise_pred, t, latents)
+            step_time = time.time() - step_start
+            avg_ms += step_time
+            step_ms = int((step_time) * 1000)
+            if not args.hide_steps:
+                print(f" ({step_ms}ms)")
+
+        # scale and decode the image latents with vae
+        if args.use_base_vae:
+            latents = 1 / 0.18215 * latents
+        latents_numpy = latents
+        if cpu_scheduling:
+            latents_numpy = latents.detach().numpy()
+        profile_device = start_profiling(file_path="vae.rdc")
+        vae_start = time.time()
+        images = vae("forward", (latents_numpy,))
+        vae_end = time.time()
+        end_profiling(profile_device)
+        if args.use_base_vae:
+            # With the base vae, the scaling to 0-255 and the rounding are
+            # done here on the host.
+            image = torch.from_numpy(images)
+            image = (image.detach().cpu() * 255.0).numpy()
+            images = image.round()
+        end_time = time.time()
+
+        avg_ms = 1000 * avg_ms / args.steps
+        clip_inf_time = (clip_inf_end - clip_inf_start) * 1000
+        vae_inf_time = (vae_end - vae_start) * 1000
+        total_time = end_time - start
+
+        print(f"\nStats for run {run}:")
+        print(f"Average step time: {avg_ms}ms/it")
+        print(f"Clip Inference time (ms) = {clip_inf_time:.3f}")
+        print(f"VAE Inference time (ms): {vae_inf_time:.3f}")
+        print(f"\nTotal image generation time: {total_time}sec")
+
+        # Move the channel axis last (0, 2, 3, 1) before building PIL images.
+        images = torch.from_numpy(images).to(torch.uint8).permute(0, 2, 3, 1)
+        pil_images = [Image.fromarray(image) for image in images.numpy()]
+
+        if args.output_dir is not None:
+            output_path = Path(args.output_dir)
+            output_path.mkdir(parents=True, exist_ok=True)
+        else:
+            output_path = Path.cwd()
+        disk_space_check(output_path, lim=5)
+        for i in range(batch_size):
+            # Settings stored next to each image so a result can be traced
+            # back to its prompt and seed.
+            json_store = {
+                "prompt": prompt[i],
+                "negative prompt": args.negative_prompts[i],
+                "seed": seed,
+                "hf_model_id": args.hf_model_id,
+                "precision": args.precision,
+                "steps": args.steps,
+                "guidance_scale": args.guidance_scale,
+                "scheduler": args.scheduler,
+            }
+            # File name: sanitized first 15 prompt characters, seed, run
+            # index, batch index and a timestamp.
+            prompt_slice = re.sub("[^a-zA-Z0-9]", "_", prompt[i][:15])
+            img_name = f"{prompt_slice}_{seed}_{run}_{i}_{dt.now().strftime('%y%m%d_%H%M%S')}"
+            if args.output_img_format == "jpg":
+                pil_images[i].save(
+                    output_path / f"{img_name}.jpg",
+                    quality=95,
+                    subsampling=0,
+                    optimize=True,
+                    progressive=True,
+                )
+            else:
+                # Any format other than jpg is saved as png; unknown formats
+                # additionally get a warning.
+                pil_images[i].save(output_path / f"{img_name}.png", "PNG")
+                if args.output_img_format not in ["png", "jpg"]:
+                    print(
+                        f"[ERROR] Format {args.output_img_format} is not supported yet. "
+                        "saving image as png. Supported formats png / jpg"
+                    )
+            with open(output_path / f"{img_name}.json", "w") as f:
+                f.write(json.dumps(json_store, indent=4))
--- a/shark/examples/shark_inference/stable_diffusion/model_wrappers.py
+++ b/shark/examples/shark_inference/stable_diffusion/model_wrappers.py
@@ -0,0 +1,284 @@
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+from diffusers import AutoencoderKL, UNet2DConditionModel
+from transformers import CLIPTextModel
+from utils import compile_through_fx, get_opt_flags
+from resources import base_models
+from collections import defaultdict
+import torch
+
+
+# These shapes are parameter dependent.
+def replace_shape_str(shape, max_len, width, height, batch_size):
+    """Resolve the symbolic entries of a shape specification to integers.
+
+    Args:
+        shape: Sequence whose entries are either concrete values (kept
+            as-is) or one of the placeholders "max_len", "height", "width",
+            or a batch expression such as "2*batch_size" / "batch_size".
+        max_len: Value substituted for "max_len".
+        width: Value substituted for "width".
+        height: Value substituted for "height".
+        batch_size: Value that batch expressions are multiplied with.
+
+    Returns:
+        A new list with every placeholder replaced.
+
+    Raises:
+        ValueError: If a string entry is not a recognised placeholder.
+            Dropping it silently would produce a shape of the wrong rank.
+    """
+    named_dims = {"max_len": max_len, "height": height, "width": width}
+    new_shape = []
+    for dim in shape:
+        if not isinstance(dim, str):
+            new_shape.append(dim)
+        elif dim in named_dims:
+            new_shape.append(named_dims[dim])
+        elif "batch_size" in dim:
+            # Accept "N*batch_size", "batch_size*N" and plain "batch_size".
+            factors = [part.strip() for part in dim.split("*")]
+            if factors.count("batch_size") != 1:
+                raise ValueError(f"invalid batch_size expression: {dim!r}")
+            mul_val = 1
+            for part in factors:
+                if part != "batch_size":
+                    mul_val *= int(part)
+            new_shape.append(batch_size * mul_val)
+        else:
+            raise ValueError(f"unknown shape placeholder: {dim!r}")
+    return new_shape
+
+
+# Get the input info for various models i.e. "unet", "clip", "vae".
+def get_input_info(model_info, max_len, width, height, batch_size):
+    """Build one sample input tensor for every declared model input.
+
+    Args:
+        model_info: Mapping of model key -> {input name -> spec}, where each
+            spec has a "shape" (a list, possibly containing placeholders, or
+            a single int) and a "dtype" ("f32" or "i64").
+        max_len, width, height, batch_size: Values substituted for the
+            placeholders in list shapes.
+
+    Returns:
+        A defaultdict(list) mapping each model key to its input tensors, in
+        declaration order. List shapes yield random tensors; an int "shape"
+        yields a scalar tensor holding that value.
+    """
+    dtype_config = {"f32": torch.float32, "i64": torch.int64}
+    input_map = defaultdict(list)
+    for model_key, input_specs in model_info.items():
+        for spec in input_specs.values():
+            shape = spec["shape"]
+            dtype = dtype_config[spec["dtype"]]
+            if isinstance(shape, list):
+                dims = replace_shape_str(
+                    shape, max_len, width, height, batch_size
+                )
+                if dtype == torch.int64:
+                    tensor = torch.randint(1, 3, tuple(dims))
+                else:
+                    tensor = torch.randn(*dims).to(dtype)
+            elif isinstance(shape, int):
+                tensor = torch.tensor(shape).to(dtype)
+            else:
+                sys.exit("shape isn't specified correctly.")
+            input_map[model_key].append(tensor)
+    return input_map
+
+
+class SharkifyStableDiffusionModel:
+    """Compiles the clip, unet and vae parts of a Stable Diffusion model.
+
+    Calling an instance tries every input configuration listed in
+    `base_models` in turn and returns the compiled `(clip, unet, vae)`
+    modules of the first configuration that compiles.
+    """
+
+    def __init__(
+        self,
+        model_id: str,
+        custom_weights: str,
+        precision: str,
+        max_len: int = 64,
+        width: int = 512,
+        height: int = 512,
+        batch_size: int = 1,
+        use_base_vae: bool = False,
+        use_tuned: bool = False,
+        debug: bool = False,
+        sharktank_dir: str = "",
+        generate_vmfb: bool = True,
+    ):
+        """Validate the parameters and derive the compiled-module name.
+
+        Args:
+            model_id: Model identifier passed to `from_pretrained`.
+            custom_weights: When non-empty, used instead of `model_id`.
+            precision: "fp16" enables f16 compilation for unet and vae.
+            max_len: Token length; must lie in [32, 77].
+            width: Image width in pixels; multiple of 8 and at least 384.
+            height: Image height in pixels; multiple of 8 and at least 384.
+            batch_size: Batch size used for the sample inputs.
+            use_base_vae: Selects the "base_vae" variant of the vae wrapper.
+            use_tuned: Forwarded to `compile_through_fx`.
+            debug: Forwarded to `compile_through_fx`; also creates a
+                per-model directory under `sharktank_dir`.
+            sharktank_dir: Parent directory for the debug directories.
+            generate_vmfb: Forwarded to `compile_through_fx`.
+        """
+        self.check_params(max_len, width, height)
+        self.max_len = max_len
+        # Stored at 1/8 of the image size (pixels // 8).
+        self.height = height // 8
+        self.width = width // 8
+        self.batch_size = batch_size
+        self.model_id = model_id if custom_weights == "" else custom_weights
+        self.precision = precision
+        self.base_vae = use_base_vae
+        # The name suffix uses the original pixel sizes, not the // 8 values.
+        self.model_name = (
+            f"_{batch_size}_{max_len}_{height}_{width}_{precision}"
+        )
+        self.use_tuned = use_tuned
+        self.debug = debug
+        self.sharktank_dir = sharktank_dir
+        self.generate_vmfb = generate_vmfb
+        # We need a better naming convention for the .vmfbs because despite
+        # using the custom model variant the .vmfb names remain the same and
+        # it'll always pick up the compiled .vmfb instead of compiling the
+        # custom model.
+        # So, currently, we add `self.model_id` in the `self.model_name` of
+        # .vmfb file.
+        # TODO: Have a better way of naming the vmfbs using self.model_name.
+        import re
+
+        model_name = re.sub(r"\W+", "_", self.model_id)
+        # startswith() instead of model_name[0]: an empty id must not raise
+        # IndexError here.
+        if model_name.startswith("_"):
+            model_name = model_name[1:]
+        self.model_name = self.model_name + "_" + model_name
+
+    def check_params(self, max_len, width, height):
+        """Exit the program unless the size parameters are supported."""
+        if not (max_len >= 32 and max_len <= 77):
+            sys.exit("please specify max_len in the range [32, 77].")
+        if not (width % 8 == 0 and width >= 384):
+            sys.exit("width should be at least 384 and a multiple of 8")
+        if not (height % 8 == 0 and height >= 384):
+            sys.exit("height should be at least 384 and a multiple of 8")
+
+    def get_vae(self):
+        """Wrap the vae decoder in a torch module and compile it.
+
+        Reads `self.inputs["vae"]`, which `__call__` sets before invoking
+        this method.
+        """
+
+        class VaeModel(torch.nn.Module):
+            def __init__(self, model_id=self.model_id, base_vae=self.base_vae):
+                super().__init__()
+                self.vae = AutoencoderKL.from_pretrained(
+                    model_id,
+                    subfolder="vae",
+                )
+                self.base_vae = base_vae
+
+            def forward(self, input):
+                # The non-base variant applies the latent rescaling and the
+                # 0-255 scaling/rounding inside the compiled module; the
+                # base variant returns values clamped to [0, 1].
+                if not self.base_vae:
+                    input = 1 / 0.18215 * input
+                x = self.vae.decode(input, return_dict=False)[0]
+                x = (x / 2 + 0.5).clamp(0, 1)
+                if self.base_vae:
+                    return x
+                x = x * 255.0
+                return x.round()
+
+        vae = VaeModel()
+        inputs = tuple(self.inputs["vae"])
+        is_f16 = self.precision == "fp16"
+        vae_name = "base_vae" if self.base_vae else "vae"
+        vae_model_name = vae_name + self.model_name
+        if self.debug:
+            os.makedirs(
+                os.path.join(self.sharktank_dir, vae_model_name), exist_ok=True
+            )
+        shark_vae = compile_through_fx(
+            vae,
+            inputs,
+            is_f16=is_f16,
+            use_tuned=self.use_tuned,
+            model_name=vae_model_name,
+            extra_args=get_opt_flags("vae", precision=self.precision),
+            debug=self.debug,
+            generate_vmfb=self.generate_vmfb,
+        )
+        return shark_vae
+
+    def get_unet(self):
+        """Wrap the unet (with guidance folded in) and compile it.
+
+        Reads `self.inputs["unet"]`, which `__call__` sets before invoking
+        this method.
+        """
+
+        class UnetModel(torch.nn.Module):
+            def __init__(self, model_id=self.model_id):
+                super().__init__()
+                self.unet = UNet2DConditionModel.from_pretrained(
+                    model_id,
+                    subfolder="unet",
+                )
+                self.in_channels = self.unet.in_channels
+                self.train(False)
+
+            def forward(
+                self, latent, timestep, text_embedding, guidance_scale
+            ):
+                # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+                latents = torch.cat([latent] * 2)
+                unet_out = self.unet.forward(
+                    latents, timestep, text_embedding, return_dict=False
+                )[0]
+                noise_pred_uncond, noise_pred_text = unet_out.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (
+                    noise_pred_text - noise_pred_uncond
+                )
+                return noise_pred
+
+        unet = UnetModel()
+        is_f16 = self.precision == "fp16"
+        inputs = tuple(self.inputs["unet"])
+        # One flag per forward() input; the last one (guidance_scale) is
+        # marked False.
+        input_mask = [True, True, True, False]
+        unet_model_name = "unet" + self.model_name
+        if self.debug:
+            os.makedirs(
+                os.path.join(self.sharktank_dir, unet_model_name),
+                exist_ok=True,
+            )
+        shark_unet = compile_through_fx(
+            unet,
+            inputs,
+            model_name=unet_model_name,
+            is_f16=is_f16,
+            f16_input_mask=input_mask,
+            use_tuned=self.use_tuned,
+            extra_args=get_opt_flags("unet", precision=self.precision),
+            debug=self.debug,
+            generate_vmfb=self.generate_vmfb,
+        )
+        return shark_unet
+
+    def get_clip(self):
+        """Wrap the text encoder and compile it (always with fp32 flags).
+
+        Reads `self.inputs["clip"]`, which `__call__` sets before invoking
+        this method.
+        """
+
+        class CLIPText(torch.nn.Module):
+            def __init__(self, model_id=self.model_id):
+                super().__init__()
+                self.text_encoder = CLIPTextModel.from_pretrained(
+                    model_id,
+                    subfolder="text_encoder",
+                )
+
+            def forward(self, input):
+                return self.text_encoder(input)[0]
+
+        clip_model = CLIPText()
+        clip_model_name = "clip" + self.model_name
+        if self.debug:
+            os.makedirs(
+                os.path.join(self.sharktank_dir, clip_model_name),
+                exist_ok=True,
+            )
+
+        shark_clip = compile_through_fx(
+            clip_model,
+            tuple(self.inputs["clip"]),
+            model_name=clip_model_name,
+            extra_args=get_opt_flags("clip", precision="fp32"),
+            debug=self.debug,
+            generate_vmfb=self.generate_vmfb,
+        )
+        return shark_clip
+
+    def __call__(self):
+        """Compile clip, unet and vae, retrying across base configurations.
+
+        Returns:
+            The tuple `(compiled_clip, compiled_unet, compiled_vae)`.
+            Exits the program when no configuration in `base_models`
+            compiles.
+        """
+        from utils import get_vmfb_path_name
+        from stable_args import args
+        import traceback
+
+        model_name = ["clip", "base_vae" if self.base_vae else "vae", "unet"]
+        vmfb_path = [
+            get_vmfb_path_name(model + self.model_name)[0]
+            for model in model_name
+        ]
+        for model_id in base_models:
+            self.inputs = get_input_info(
+                base_models[model_id],
+                self.max_len,
+                self.width,
+                self.height,
+                self.batch_size,
+            )
+            try:
+                compiled_unet = self.get_unet()
+                compiled_vae = self.get_vae()
+                compiled_clip = self.get_clip()
+            except Exception:
+                if args.enable_stack_trace:
+                    traceback.print_exc()
+                vmfb_present = [os.path.isfile(vmfb) for vmfb in vmfb_path]
+                # We need to delete vmfbs only if some of the models were compiled.
+                if not all(vmfb_present):
+                    for path, present in zip(vmfb_path, vmfb_present):
+                        if present:
+                            os.remove(path)
+                            print("Deleted: ", path)
+                print("Retrying with a different base model configuration")
+                continue
+            # This is done just because in main.py we are basing the choice of tokenizer and scheduler
+            # on `args.hf_model_id`. Since now, we don't maintain 1:1 mapping of variants and the base
+            # model and rely on retrying method to find the input configuration, we should also update
+            # the knowledge of base model id accordingly into `args.hf_model_id`.
+            if args.ckpt_loc != "":
+                args.hf_model_id = model_id
+            return compiled_clip, compiled_unet, compiled_vae
+        sys.exit(
+            "Cannot compile the model. Please use `enable_stack_trace` and create an issue at https://github.com/nod-ai/SHARK/issues"
+        )
--- a/shark/examples/shark_inference/stable_diffusion/opt_params.py
+++ b/shark/examples/shark_inference/stable_diffusion/opt_params.py
@@ -0,0 +1,112 @@
+import sys
+import resources
+from stable_args import args
+from utils import get_shark_model
+
+# Pick the beta database when --beta_models is set, else the release one.
+models_db = (
+    resources.beta_models_db if args.beta_models else resources.models_db
+)
+# Number of prompts on the command line; anything other than exactly one
+# is rejected.
+BATCH_SIZE = len(args.prompts)
+if BATCH_SIZE != 1:
+    sys.exit("Only batch size 1 is supported.")
+
+# Hugging Face model id -> [variant, version], used to build the bucket and
+# model keys looked up in the models database.
+hf_model_variant_map = {
+    "Linaqruf/anything-v3.0": ["anythingv3", "v2_1base"],
+    "dreamlike-art/dreamlike-diffusion-1.0": ["dreamlike", "v2_1base"],
+    "prompthero/openjourney": ["openjourney", "v2_1base"],
+    "wavymulder/Analog-Diffusion": ["analogdiffusion", "v2_1base"],
+    "stabilityai/stable-diffusion-2-1": ["stablediffusion", "v2_1"],
+    "stabilityai/stable-diffusion-2-1-base": ["stablediffusion", "v2_1base"],
+    "CompVis/stable-diffusion-v1-4": ["stablediffusion", "v1_4"],
+}
+
+# Exit with an actionable message instead of a bare KeyError when the
+# requested model has no entry in the map.
+if args.hf_model_id not in hf_model_variant_map:
+    sys.exit(
+        f"{args.hf_model_id} is not one of the supported models: "
+        + ", ".join(hf_model_variant_map)
+        + ". Use --import_mlir to compile other models."
+    )
+
+variant, version = hf_model_variant_map[args.hf_model_id]
+
+
+def get_params(bucket_key, model_key, model, is_tuned, precision):
+    """Look up the bucket, model name and compile flags for one model.
+
+    Args:
+        bucket_key: Key into `models_db[0]`.
+        model_key: Key into `models_db[1]`.
+        model: Model kind used to index `models_db[2]` (callers in this
+            file pass "unet", "vae" or "clip").
+        is_tuned: "tuned" or "untuned".
+        precision: Precision key, e.g. "fp16" or "fp32".
+
+    Returns:
+        Tuple `(bucket, model_name, iree_flags)`.
+
+    Raises:
+        Exception: If any of the database lookups fails with a KeyError.
+    """
+    flags = []
+    triple = args.iree_vulkan_target_triple
+    if len(triple) > 0:
+        flags.append(f"-iree-vulkan-target-triple={triple}")
+
+    # Disable bindings fusion to work with moltenVK.
+    if sys.platform == "darwin":
+        flags.append("-iree-stream-fuse-binding=false")
+
+    try:
+        bucket = models_db[0][bucket_key]
+        model_name = models_db[1][model_key]
+        flag_config = models_db[2][model][is_tuned][precision]
+        flags += flag_config["default_compilation_flags"]
+    except KeyError:
+        raise Exception(
+            f"{bucket_key}/{model_key} is not present in the models database"
+        )
+
+    if "specified_compilation_flags" in flag_config:
+        per_device_flags = flag_config["specified_compilation_flags"]
+        # Strip a "://..." suffix from the device string, if present.
+        device = args.device.split("://")[0]
+        if device not in per_device_flags:
+            device = "default_device"
+        flags += per_device_flags[device]
+
+    return bucket, model_name, flags
+
+
+def get_unet():
+    """Fetch the unet model that matches the command-line arguments.
+
+    The device name is appended to the lookup keys only for tuned models on
+    a non-vulkan device.
+    """
+    # Tuned model is present only for `fp16` precision.
+    tuning = "tuned" if args.use_tuned else "untuned"
+    needs_device = "vulkan" not in args.device and args.use_tuned
+    device_suffix = f"/{args.device}" if needs_device else ""
+
+    bucket_key = f"{variant}/{tuning}{device_suffix}"
+    model_key = (
+        f"{variant}/{version}/unet/{args.precision}"
+        f"/length_{args.max_length}/{tuning}{device_suffix}"
+    )
+
+    bucket, model_name, iree_flags = get_params(
+        bucket_key, model_key, "unet", tuning, args.precision
+    )
+    return get_shark_model(bucket, model_name, iree_flags)
+
+
+def get_vae():
+    """Fetch the vae model that matches the command-line arguments.
+
+    The model key always uses `length_77`, gains a "/base" segment when
+    `args.use_base_vae` is set, and gains the device name only for tuned
+    models on a non-vulkan device.
+    """
+    # Tuned model is present only for `fp16` precision.
+    tuning = "tuned" if args.use_tuned else "untuned"
+    base_suffix = "/base" if args.use_base_vae else ""
+    needs_device = "vulkan" not in args.device and args.use_tuned
+    device_suffix = f"/{args.device}" if needs_device else ""
+
+    bucket_key = f"{variant}/{tuning}{device_suffix}"
+    model_key = (
+        f"{variant}/{version}/vae/{args.precision}"
+        f"/length_77/{tuning}{base_suffix}{device_suffix}"
+    )
+
+    bucket, model_name, iree_flags = get_params(
+        bucket_key, model_key, "vae", tuning, args.precision
+    )
+    return get_shark_model(bucket, model_name, iree_flags)
+
+
+def get_clip():
+    """Fetch the clip model; it is always looked up as untuned fp32."""
+    tuning = "untuned"
+    precision = "fp32"
+    bucket_key = f"{variant}/{tuning}"
+    model_key = (
+        f"{variant}/{version}/clip/{precision}"
+        f"/length_{args.max_length}/{tuning}"
+    )
+    bucket, model_name, iree_flags = get_params(
+        bucket_key, model_key, "clip", tuning, precision
+    )
+    return get_shark_model(bucket, model_name, iree_flags)
--- a/shark/examples/shark_inference/stable_diffusion/profiling_with_iree.md
+++ b/shark/examples/shark_inference/stable_diffusion/profiling_with_iree.md
@@ -0,0 +1,44 @@
+Compile / Run Instructions:
+
+To compile .vmfb for SD (vae, unet, CLIP), run the following commands with the .mlir in your local shark_tank cache (default location for Linux users is `~/.local/shark_tank`). These will be available once the script from [this README](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md) is run once.
+Running the script mentioned above with the `--save_vmfb` flag will also save the .vmfb in your SHARK base directory if you want to skip straight to benchmarks.
+
+Compile Commands FP32/FP16: 
+
+```shell
+Vulkan AMD: 
+iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
+
+#  add --mlir-print-debuginfo --mlir-print-op-on-diagnostic=true for debug
+#  use --iree-input-type=mhlo for tf models
+
+CUDA NVIDIA:
+iree-compile --iree-input-type=none --iree-hal-target-backends=cuda --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
+
+CPU:
+iree-compile --iree-input-type=none --iree-hal-target-backends=llvm-cpu  --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
+```
+
+
+
+Run / Benchmark Command (FP32 - NCHW):
+(You NEED to use BS=2 since we do two forward passes through the unet as a result of classifier-free guidance.)
+
+```shell
+## Vulkan AMD:
+iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --device=vulkan --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
+
+## CUDA:
+iree-benchmark-module --module_file=/path/to/vmfb --entry_function=forward --device=cuda  --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
+
+## CPU:
+iree-benchmark-module --module_file=/path/to/vmfb --entry_function=forward --device=local-task  --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
+
+```
+
+Run via vulkan_gui for RGP Profiling:
+
+To build the vulkan app for profiling UNet follow the instructions [here](https://github.com/nod-ai/SHARK/tree/main/cpp) and then run the following command from the cpp directory with your compiled stable_diff.vmfb
+```shell
+./build/vulkan_gui/iree-vulkan-gui --module_file=/path/to/unet.vmfb --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
+```
--- a/shark/examples/shark_inference/stable_diffusion/resources.py
+++ b/shark/examples/shark_inference/stable_diffusion/resources.py
@@ -0,0 +1,38 @@
+import os
+import json
+import sys
+
+
+def resource_path(relative_path):
+    """Get absolute path to resource, works for dev and for PyInstaller"""
+    base_path = getattr(
+        sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
+    )
+    return os.path.join(base_path, relative_path)
+
+
+def get_json_file(path):
+    json_var = []
+    loc_json = resource_path(path)
+    if os.path.exists(loc_json):
+        with open(loc_json, encoding="utf-8") as fopen:
+            json_var = json.load(fopen)
+
+    if not json_var:
+        print(f"Unable to fetch {path}")
+
+    return json_var
+
+
+# TODO: These loads run as a side effect of importing this module; they
+# should be moved behind an explicit call instead of module-level globals.
+prompts_examples = get_json_file("resources/prompts.json")
+models_db = get_json_file("resources/model_db.json")
+beta_models_db = get_json_file("resources/beta_model_db.json")
+
+# base_model.json holds the input configuration (shape and dtype of each
+# input) for the different base models; it is also used to provide
+# information for the variants.
+base_models = get_json_file("resources/base_model.json")
+
+# Optimization (compilation) flags for the different models.
+opt_flags = get_json_file("resources/opt_flags.json")
--- a/shark/examples/shark_inference/stable_diffusion/resources/base_model.json
+++ b/shark/examples/shark_inference/stable_diffusion/resources/base_model.json
@@ -0,0 +1,98 @@
+{
+    "stabilityai/stable-diffusion-2-1": {
+        "unet": {
+            "latents": {
+                "shape": [
+                    "1*batch_size",
+                    4,
+                    "height",
+                    "width"
+                ],
+                "dtype": "f32"
+            },
+            "timesteps": {
+                "shape": [
+                    1
+                ],
+                "dtype": "f32"
+            },
+            "embedding": {
+                "shape": [
+                    "2*batch_size",
+                    "max_len",
+                    1024
+                ],
+                "dtype": "f32"
+            },
+            "guidance_scale": {
+                "shape": 2,
+                "dtype": "f32"
+            }
+        },
+        "vae": {
+            "latents" : {
+                "shape" : [
+                    "1*batch_size",4,"height","width"
+                ],
+                "dtype":"f32"
+            }
+        },
+        "clip": {
+            "token" : {
+                "shape" : [
+                    "2*batch_size",
+                    "max_len"
+                ],
+                "dtype":"i64"
+            }
+        }
+    },
+    "CompVis/stable-diffusion-v1-4": {
+        "unet": {
+            "latents": {
+                "shape": [
+                    "1*batch_size",
+                    4,
+                    "height",
+                    "width"
+                ],
+                "dtype": "f32"
+            },
+            "timesteps": {
+                "shape": [
+                    1
+                ],
+                "dtype": "f32"
+            },
+            "embedding": {
+                "shape": [
+                    "2*batch_size",
+                    "max_len",
+                    768
+                ],
+                "dtype": "f32"
+            },
+            "guidance_scale": {
+                "shape": 2,
+                "dtype": "f32"
+            }
+        },
+        "vae": {
+            "latents" : {
+                "shape" : [
+                    "1*batch_size",4,"height","width"
+                ],
+                "dtype":"f32"
+            }
+        },
+        "clip": {
+            "token" : {
+                "shape" : [
+                    "2*batch_size",
+                    "max_len"
+                ],
+                "dtype":"i64"
+            }
+        }
+    }
+}
--- a/shark/examples/shark_inference/stable_diffusion/resources/beta_model_db.json
+++ b/shark/examples/shark_inference/stable_diffusion/resources/beta_model_db.json
@@ -0,0 +1,177 @@
+[
+  {
+    "stablediffusion/untuned":"gs://shark_tank/latest",
+    "stablediffusion/tuned":"gs://shark_tank/sd_tuned",
+    "stablediffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
+    "anythingv3/untuned":"gs://shark_tank/sd_anythingv3",
+    "anythingv3/tuned":"gs://shark_tank/sd_tuned",
+    "anythingv3/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
+    "analogdiffusion/untuned":"gs://shark_tank/sd_analog_diffusion",
+    "analogdiffusion/tuned":"gs://shark_tank/sd_tuned",
+    "analogdiffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
+    "openjourney/untuned":"gs://shark_tank/sd_openjourney",
+    "openjourney/tuned":"gs://shark_tank/sd_tuned",
+    "dreamlike/untuned":"gs://shark_tank/sd_dreamlike_diffusion"
+  },
+  {
+    "stablediffusion/v1_4/unet/fp16/length_77/untuned":"unet_8dec_fp16",
+    "stablediffusion/v1_4/unet/fp16/length_77/tuned":"unet_8dec_fp16_tuned",
+    "stablediffusion/v1_4/unet/fp16/length_77/tuned/cuda":"unet_8dec_fp16_cuda_tuned",
+    "stablediffusion/v1_4/unet/fp32/length_77/untuned":"unet_1dec_fp32",
+    "stablediffusion/v1_4/vae/fp16/length_77/untuned":"vae_19dec_fp16",
+    "stablediffusion/v1_4/vae/fp16/length_77/tuned":"vae_19dec_fp16_tuned",
+    "stablediffusion/v1_4/vae/fp16/length_77/tuned/cuda":"vae_19dec_fp16_cuda_tuned",
+    "stablediffusion/v1_4/vae/fp16/length_77/untuned/base":"vae_8dec_fp16",
+    "stablediffusion/v1_4/vae/fp32/length_77/untuned":"vae_1dec_fp32",
+    "stablediffusion/v1_4/clip/fp32/length_77/untuned":"clip_18dec_fp32",
+    "stablediffusion/v2_1base/unet/fp16/length_77/untuned":"unet77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
+    "stablediffusion/v2_1base/unet/fp16/length_77/tuned":"unet2base_8dec_fp16_tuned_v2",
+    "stablediffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"unet2base_8dec_fp16_cuda_tuned",
+    "stablediffusion/v2_1base/unet/fp16/length_64/untuned":"unet64_512_512_fp16_stabilityai_stable_diffusion_2_1_basec",
+    "stablediffusion/v2_1base/unet/fp16/length_64/tuned":"unet_19dec_v2p1base_fp16_64_tuned",
+    "stablediffusion/v2_1base/unet/fp16/length_64/tuned/cuda":"unet_19dec_v2p1base_fp16_64_cuda_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/untuned":"vae77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned":"vae2base_19dec_fp16_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"vae2base_19dec_fp16_cuda_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/untuned/base":"vae77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base":"vae2base_8dec_fp16_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base/cuda":"vae2base_8dec_fp16_cuda_tuned",
+    "stablediffusion/v2_1base/clip/fp32/length_77/untuned":"clip77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
+    "stablediffusion/v2_1base/clip/fp32/length_64/untuned":"clip64_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
+    "stablediffusion/v2_1/unet/fp16/length_77/untuned":"unet77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
+    "stablediffusion/v2_1/vae/fp16/length_77/untuned":"vae77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
+    "stablediffusion/v2_1/vae/fp16/length_77/untuned/base":"77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
+    "stablediffusion/v2_1/clip/fp32/length_77/untuned":"clip77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
+    "anythingv3/v2_1base/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
+    "anythingv3/v2_1base/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
+    "anythingv3/v2_1base/unet/fp16/length_77/tuned/cuda":"av3_unet_19dec_fp16_cuda_tuned",
+    "anythingv3/v2_1base/unet/fp32/length_77/untuned":"av3_unet_19dec_fp32",
+    "anythingv3/v2_1base/vae/fp16/length_77/untuned":"av3_vae_19dec_fp16",
+    "anythingv3/v2_1base/vae/fp16/length_77/tuned":"av3_vae_19dec_fp16_tuned",
+    "anythingv3/v2_1base/vae/fp16/length_77/tuned/cuda":"av3_vae_19dec_fp16_cuda_tuned",
+    "anythingv3/v2_1base/vae/fp16/length_77/untuned/base":"av3_vaebase_22dec_fp16",
+    "anythingv3/v2_1base/vae/fp32/length_77/untuned":"av3_vae_19dec_fp32",
+    "anythingv3/v2_1base/vae/fp32/length_77/untuned/base":"av3_vaebase_22dec_fp32",
+    "anythingv3/v2_1base/clip/fp32/length_77/untuned":"av3_clip_19dec_fp32",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/untuned":"ad_unet_19dec_fp16",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/tuned":"ad_unet_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"ad_unet_19dec_fp16_cuda_tuned",
+    "analogdiffusion/v2_1base/unet/fp32/length_77/untuned":"ad_unet_19dec_fp32",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned":"ad_vae_19dec_fp16",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/tuned":"ad_vae_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"ad_vae_19dec_fp16_cuda_tuned",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned/base":"ad_vaebase_22dec_fp16",
+    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned":"ad_vae_19dec_fp32",
+    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned/base":"ad_vaebase_22dec_fp32",
+    "analogdiffusion/v2_1base/clip/fp32/length_77/untuned":"ad_clip_19dec_fp32",
+    "openjourney/v2_1base/unet/fp16/length_64/untuned":"oj_unet_22dec_fp16_64",
+    "openjourney/v2_1base/unet/fp32/length_64/untuned":"oj_unet_22dec_fp32_64",
+    "openjourney/v2_1base/vae/fp16/length_77/untuned":"oj_vae_22dec_fp16",
+    "openjourney/v2_1base/vae/fp16/length_77/untuned/base":"oj_vaebase_22dec_fp16",
+    "openjourney/v2_1base/vae/fp32/length_77/untuned":"oj_vae_22dec_fp32",
+    "openjourney/v2_1base/vae/fp32/length_77/untuned/base":"oj_vaebase_22dec_fp32",
+    "openjourney/v2_1base/clip/fp32/length_64/untuned":"oj_clip_22dec_fp32_64",
+    "dreamlike/v2_1base/unet/fp16/length_77/untuned":"dl_unet_23dec_fp16_77",
+    "dreamlike/v2_1base/unet/fp32/length_77/untuned":"dl_unet_23dec_fp32_77",
+    "dreamlike/v2_1base/vae/fp16/length_77/untuned":"dl_vae_23dec_fp16",
+    "dreamlike/v2_1base/vae/fp16/length_77/untuned/base":"dl_vaebase_23dec_fp16",
+    "dreamlike/v2_1base/vae/fp32/length_77/untuned":"dl_vae_23dec_fp32",
+    "dreamlike/v2_1base/vae/fp32/length_77/untuned/base":"dl_vaebase_23dec_fp32",
+    "dreamlike/v2_1base/clip/fp32/length_77/untuned":"dl_clip_23dec_fp32_77"
+  },
+  {
+    "unet": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": []
+        },
+        "fp32": {
+          "default_compilation_flags": []
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32"
+          ],
+          "specified_compilation_flags": {
+            "cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
+            "default_device": ["--iree-flow-enable-conv-img2col-transform"]
+          }
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "vae": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "clip": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      }
+    }
+  }
+]
--- a/shark/examples/shark_inference/stable_diffusion/resources/model_config.json
+++ b/shark/examples/shark_inference/stable_diffusion/resources/model_config.json
@@ -0,0 +1,21 @@
+[
+  {
+    "stablediffusion/v1_4":"CompVis/stable-diffusion-v1-4",
+    "stablediffusion/v2_1base":"stabilityai/stable-diffusion-2-1-base",
+    "stablediffusion/v2_1":"stabilityai/stable-diffusion-2-1",
+    "anythingv3/v1_4":"Linaqruf/anything-v3.0",
+    "analogdiffusion/v1_4":"wavymulder/Analog-Diffusion",
+    "openjourney/v1_4":"prompthero/openjourney",
+    "dreamlike/v1_4":"dreamlike-art/dreamlike-diffusion-1.0"
+  },
+  {
+    "stablediffusion/fp16":"fp16",
+    "stablediffusion/fp32":"main",
+    "anythingv3/fp16":"diffusers",
+    "anythingv3/fp32":"diffusers",
+    "analogdiffusion/fp16":"main",
+    "analogdiffusion/fp32":"main",
+    "openjourney/fp16":"main",
+    "openjourney/fp32":"main"
+  }
+]
--- a/shark/examples/shark_inference/stable_diffusion/resources/model_db.json
+++ b/shark/examples/shark_inference/stable_diffusion/resources/model_db.json
@@ -0,0 +1,177 @@
+[
+  {
+    "stablediffusion/untuned":"gs://shark_tank/stable_diffusion",
+    "stablediffusion/tuned":"gs://shark_tank/sd_tuned",
+    "stablediffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
+    "anythingv3/untuned":"gs://shark_tank/sd_anythingv3",
+    "anythingv3/tuned":"gs://shark_tank/sd_tuned",
+    "anythingv3/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
+    "analogdiffusion/untuned":"gs://shark_tank/sd_analog_diffusion",
+    "analogdiffusion/tuned":"gs://shark_tank/sd_tuned",
+    "analogdiffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
+    "openjourney/untuned":"gs://shark_tank/sd_openjourney",
+    "openjourney/tuned":"gs://shark_tank/sd_tuned",
+    "dreamlike/untuned":"gs://shark_tank/sd_dreamlike_diffusion"
+  },
+  {
+    "stablediffusion/v1_4/unet/fp16/length_77/untuned":"unet_8dec_fp16",
+    "stablediffusion/v1_4/unet/fp16/length_77/tuned":"unet_8dec_fp16_tuned",
+    "stablediffusion/v1_4/unet/fp16/length_77/tuned/cuda":"unet_8dec_fp16_cuda_tuned",
+    "stablediffusion/v1_4/unet/fp32/length_77/untuned":"unet_1dec_fp32",
+    "stablediffusion/v1_4/vae/fp16/length_77/untuned":"vae_19dec_fp16",
+    "stablediffusion/v1_4/vae/fp16/length_77/tuned":"vae_19dec_fp16_tuned",
+    "stablediffusion/v1_4/vae/fp16/length_77/tuned/cuda":"vae_19dec_fp16_cuda_tuned",
+    "stablediffusion/v1_4/vae/fp16/length_77/untuned/base":"vae_8dec_fp16",
+    "stablediffusion/v1_4/vae/fp32/length_77/untuned":"vae_1dec_fp32",
+    "stablediffusion/v1_4/clip/fp32/length_77/untuned":"clip_18dec_fp32",
+    "stablediffusion/v2_1base/unet/fp16/length_77/untuned":"unet2base_8dec_fp16",
+    "stablediffusion/v2_1base/unet/fp16/length_77/tuned":"unet2base_8dec_fp16_tuned_v2",
+    "stablediffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"unet2base_8dec_fp16_cuda_tuned",
+    "stablediffusion/v2_1base/unet/fp16/length_64/untuned":"unet_19dec_v2p1base_fp16_64",
+    "stablediffusion/v2_1base/unet/fp16/length_64/tuned":"unet_19dec_v2p1base_fp16_64_tuned",
+    "stablediffusion/v2_1base/unet/fp16/length_64/tuned/cuda":"unet_19dec_v2p1base_fp16_64_cuda_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/untuned":"vae2base_19dec_fp16",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned":"vae2base_19dec_fp16_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"vae2base_19dec_fp16_cuda_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/untuned/base":"vae2base_8dec_fp16",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base":"vae2base_8dec_fp16_tuned",
+    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base/cuda":"vae2base_8dec_fp16_cuda_tuned",
+    "stablediffusion/v2_1base/clip/fp32/length_77/untuned":"clip2base_18dec_fp32",
+    "stablediffusion/v2_1base/clip/fp32/length_64/untuned":"clip_19dec_v2p1base_fp32_64",
+    "stablediffusion/v2_1/unet/fp16/length_77/untuned":"unet2_14dec_fp16",
+    "stablediffusion/v2_1/vae/fp16/length_77/untuned":"vae2_19dec_fp16",
+    "stablediffusion/v2_1/vae/fp16/length_77/untuned/base":"vae2_8dec_fp16",
+    "stablediffusion/v2_1/clip/fp32/length_77/untuned":"clip2_18dec_fp32",
+    "anythingv3/v2_1base/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
+    "anythingv3/v2_1base/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
+    "anythingv3/v2_1base/unet/fp16/length_77/tuned/cuda":"av3_unet_19dec_fp16_cuda_tuned",
+    "anythingv3/v2_1base/unet/fp32/length_77/untuned":"av3_unet_19dec_fp32",
+    "anythingv3/v2_1base/vae/fp16/length_77/untuned":"av3_vae_19dec_fp16",
+    "anythingv3/v2_1base/vae/fp16/length_77/tuned":"av3_vae_19dec_fp16_tuned",
+    "anythingv3/v2_1base/vae/fp16/length_77/tuned/cuda":"av3_vae_19dec_fp16_cuda_tuned",
+    "anythingv3/v2_1base/vae/fp16/length_77/untuned/base":"av3_vaebase_22dec_fp16",
+    "anythingv3/v2_1base/vae/fp32/length_77/untuned":"av3_vae_19dec_fp32",
+    "anythingv3/v2_1base/vae/fp32/length_77/untuned/base":"av3_vaebase_22dec_fp32",
+    "anythingv3/v2_1base/clip/fp32/length_77/untuned":"av3_clip_19dec_fp32",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/untuned":"ad_unet_19dec_fp16",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/tuned":"ad_unet_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"ad_unet_19dec_fp16_cuda_tuned",
+    "analogdiffusion/v2_1base/unet/fp32/length_77/untuned":"ad_unet_19dec_fp32",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned":"ad_vae_19dec_fp16",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/tuned":"ad_vae_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"ad_vae_19dec_fp16_cuda_tuned",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned/base":"ad_vaebase_22dec_fp16",
+    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned":"ad_vae_19dec_fp32",
+    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned/base":"ad_vaebase_22dec_fp32",
+    "analogdiffusion/v2_1base/clip/fp32/length_77/untuned":"ad_clip_19dec_fp32",
+    "openjourney/v2_1base/unet/fp16/length_64/untuned":"oj_unet_22dec_fp16_64",
+    "openjourney/v2_1base/unet/fp32/length_64/untuned":"oj_unet_22dec_fp32_64",
+    "openjourney/v2_1base/vae/fp16/length_77/untuned":"oj_vae_22dec_fp16",
+    "openjourney/v2_1base/vae/fp16/length_77/untuned/base":"oj_vaebase_22dec_fp16",
+    "openjourney/v2_1base/vae/fp32/length_77/untuned":"oj_vae_22dec_fp32",
+    "openjourney/v2_1base/vae/fp32/length_77/untuned/base":"oj_vaebase_22dec_fp32",
+    "openjourney/v2_1base/clip/fp32/length_64/untuned":"oj_clip_22dec_fp32_64",
+    "dreamlike/v2_1base/unet/fp16/length_77/untuned":"dl_unet_23dec_fp16_77",
+    "dreamlike/v2_1base/unet/fp32/length_77/untuned":"dl_unet_23dec_fp32_77",
+    "dreamlike/v2_1base/vae/fp16/length_77/untuned":"dl_vae_23dec_fp16",
+    "dreamlike/v2_1base/vae/fp16/length_77/untuned/base":"dl_vaebase_23dec_fp16",
+    "dreamlike/v2_1base/vae/fp32/length_77/untuned":"dl_vae_23dec_fp32",
+    "dreamlike/v2_1base/vae/fp32/length_77/untuned/base":"dl_vaebase_23dec_fp32",
+    "dreamlike/v2_1base/clip/fp32/length_77/untuned":"dl_clip_23dec_fp32_77"
+  },
+  {
+    "unet": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": []
+        },
+        "fp32": {
+          "default_compilation_flags": []
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32"
+          ],
+          "specified_compilation_flags": {
+            "cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
+            "default_device": ["--iree-flow-enable-conv-img2col-transform"]
+          }
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "vae": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "clip": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      }
+    }
+  }
+]
--- a/shark/examples/shark_inference/stable_diffusion/resources/opt_flags.json
+++ b/shark/examples/shark_inference/stable_diffusion/resources/opt_flags.json
@@ -0,0 +1,101 @@
+  {
+    "unet": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": []
+        },
+        "fp32": {
+          "default_compilation_flags": []
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32"
+          ],
+          "specified_compilation_flags": {
+            "cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
+            "default_device": ["--iree-flow-enable-conv-img2col-transform"]
+          }
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "vae": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [],
+          "specified_compilation_flags": {
+            "cuda": [],
+            "default_device": ["--iree-flow-enable-padding-linalg-ops",
+                               "--iree-flow-linalg-ops-padding-size=32",
+                               "--iree-flow-enable-conv-img2col-transform"]
+          }
+        },
+        "fp32": {
+          "default_compilation_flags": [],
+          "specified_compilation_flags": {
+            "cuda": [],
+            "default_device": [
+              "--iree-flow-enable-padding-linalg-ops",
+              "--iree-flow-linalg-ops-padding-size=32",
+              "--iree-flow-enable-conv-img2col-transform"
+            ]
+          }
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=32",
+            "--iree-flow-enable-conv-img2col-transform"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
+            "--iree-flow-enable-padding-linalg-ops",
+            "--iree-flow-linalg-ops-padding-size=16"
+          ]
+        }
+      }
+    },
+    "clip": {
+      "tuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      },
+      "untuned": {
+        "fp16": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        },
+        "fp32": {
+          "default_compilation_flags": [
+            "--iree-flow-linalg-ops-padding-size=16",
+            "--iree-flow-enable-padding-linalg-ops"
+          ]
+        }
+      }
+    }
+  }
--- a/shark/examples/shark_inference/stable_diffusion/resources/prompts.json
+++ b/shark/examples/shark_inference/stable_diffusion/resources/prompts.json
@@ -0,0 +1,8 @@
+[["A high tech solarpunk utopia in the Amazon rainforest"],
+["A pikachu fine dining with a view to the Eiffel Tower"],
+["A mecha robot in a favela in expressionist style"],
+["an insect robot preparing a delicious meal"],
+["A digital Illustration of the Babel tower, 4k, detailed, trending in artstation, fantasy vivid colors"],
+["Cluttered house in the woods, anime, oil painting, high resolution, cottagecore, ghibli inspired, 4k"],
+["A beautiful mansion beside a waterfall in the woods, by josef thoma, matte painting, trending on artstation HQ"],
+["portrait photo of a asia old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes"]]
--- a/shark/examples/shark_inference/stable_diffusion/schedulers.py
+++ b/shark/examples/shark_inference/stable_diffusion/schedulers.py
@@ -0,0 +1,144 @@
+import sys
+import numpy as np
+from typing import List, Optional, Tuple, Union
+from diffusers import (
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+)
+from diffusers.configuration_utils import register_to_config
+from utils import compile_through_fx, get_shark_model
+from stable_args import args
+import torch
+
+SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"
+
+
+BATCH_SIZE = len(args.prompts)
+if len(args.prompts) == 0:
+    BATCH_SIZE = 1
+
+model_input = {
+    "euler": {
+        "latent": torch.randn(
+            BATCH_SIZE, 4, args.height // 8, args.width // 8
+        ),
+        "output": torch.randn(
+            BATCH_SIZE, 4, args.height // 8, args.width // 8
+        ),
+        "sigma": torch.tensor(1).to(torch.float32),
+        "dt": torch.tensor(1).to(torch.float32),
+    },
+}
+
+
+class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
+    """EulerDiscreteScheduler that delegates input scaling and the step
+    update to separately compiled or downloaded models.
+
+    ``scale_model_input`` and ``step`` call ``self.scaling_model`` and
+    ``self.step_model``; within this class those attributes are only
+    assigned by ``compile``, so ``compile`` has to run first.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        prediction_type: str = "epsilon",
+    ):
+        """Forward every argument, positionally, to the parent constructor."""
+        super().__init__(
+            num_train_timesteps,
+            beta_start,
+            beta_end,
+            beta_schedule,
+            trained_betas,
+            prediction_type,
+        )
+
+    def compile(self):
+        """Create ``self.scaling_model`` and ``self.step_model``.
+
+        When ``args.import_mlir`` is truthy, the two nested torch modules
+        are compiled with ``compile_through_fx`` using the example tensors
+        from ``model_input["euler"]``. Otherwise both models are obtained
+        with ``get_shark_model`` from ``SCHEDULER_BUCKET``.
+        """
+        example_latent = model_input["euler"]["latent"]
+        example_output = model_input["euler"]["output"]
+        # Only the latent/output examples are cast to half precision; the
+        # sigma and dt examples are used as defined in model_input.
+        if args.precision == "fp16":
+            example_latent = example_latent.half()
+            example_output = example_output.half()
+        example_sigma = model_input["euler"]["sigma"]
+        example_dt = model_input["euler"]["dt"]
+
+        class ScalingModel(torch.nn.Module):
+            """Computes latent / sqrt(sigma**2 + 1)."""
+
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, latent, sigma):
+                return latent / ((sigma**2 + 1) ** 0.5)
+
+        class SchedulerStepModel(torch.nn.Module):
+            """Computes one update: latent + derivative * dt."""
+
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, noise_pred, sigma, latent, dt):
+                pred_original_sample = latent - sigma * noise_pred
+                # Algebraically, this derivative reduces to noise_pred.
+                derivative = (latent - pred_original_sample) / sigma
+                return latent + derivative * dt
+
+        # Extra flags passed to compile_through_fx / get_shark_model below.
+        iree_flags = []
+        if len(args.iree_vulkan_target_triple) > 0:
+            iree_flags.append(
+                f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
+            )
+        # Disable bindings fusion to work with moltenVK.
+        if sys.platform == "darwin":
+            iree_flags.append("-iree-stream-fuse-binding=false")
+
+        if args.import_mlir:
+            scaling_model = ScalingModel()
+            # The model name encodes batch size, height and width, followed
+            # directly by the precision string.
+            self.scaling_model = compile_through_fx(
+                scaling_model,
+                (example_latent, example_sigma),
+                model_name=f"euler_scale_model_input_{BATCH_SIZE}_{args.height}_{args.width}"
+                + args.precision,
+                extra_args=iree_flags,
+            )
+
+            step_model = SchedulerStepModel()
+            self.step_model = compile_through_fx(
+                step_model,
+                (example_output, example_sigma, example_latent, example_dt),
+                model_name=f"euler_step_{BATCH_SIZE}_{args.height}_{args.width}"
+                + args.precision,
+                extra_args=iree_flags,
+            )
+        else:
+            # The names requested here depend only on the precision, not on
+            # batch size, height or width.
+            self.scaling_model = get_shark_model(
+                SCHEDULER_BUCKET,
+                "euler_scale_model_input_" + args.precision,
+                iree_flags,
+            )
+            self.step_model = get_shark_model(
+                SCHEDULER_BUCKET, "euler_step_" + args.precision, iree_flags
+            )
+
+    def scale_model_input(self, sample, timestep):
+        """Run ``self.scaling_model`` on ``sample`` with the sigma that
+        corresponds to ``timestep``.
+
+        The sigma is looked up in ``self.sigmas`` at the index where
+        ``self.timesteps`` equals ``timestep``.
+        """
+        # NOTE(review): assumes timestep occurs exactly once in
+        # self.timesteps so that .item() gets a single index - confirm.
+        step_index = (self.timesteps == timestep).nonzero().item()
+        sigma = self.sigmas[step_index]
+        # NOTE(review): send_to_host=False presumably leaves the result on
+        # the device - confirm against the compiled module's call API.
+        return self.scaling_model(
+            "forward",
+            (
+                sample,
+                sigma,
+            ),
+            send_to_host=False,
+        )
+
+    def step(self, noise_pred, timestep, latent):
+        """Run ``self.step_model`` for the step that corresponds to
+        ``timestep``.
+
+        ``dt`` is the difference between the next sigma and the current
+        sigma in ``self.sigmas``.
+        """
+        step_index = (self.timesteps == timestep).nonzero().item()
+        sigma = self.sigmas[step_index]
+        # Reads the entry after step_index, so self.sigmas must contain one
+        # more element than the last valid step index.
+        dt = self.sigmas[step_index + 1] - sigma
+        return self.step_model(
+            "forward",
+            (
+                noise_pred,
+                sigma,
+                latent,
+                dt,
+            ),
+            send_to_host=False,
+        )
--- a/shark/examples/shark_inference/stable_diffusion/sd_annotation.py
+++ b/shark/examples/shark_inference/stable_diffusion/sd_annotation.py
@@ -0,0 +1,191 @@
+import os
+from shark.model_annotation import model_annotation, create_context
+from shark.iree_utils._common import iree_target_map, run_cmd
+from shark.shark_downloader import (
+    download_model,
+    download_public_file,
+    WORKDIR,
+)
+from shark.parser import shark_args
+from stable_args import args
+
+
+# Backend part of ``args.device``: everything before the first "://".
+# A value without "://" is returned unchanged by the split.
+device = args.device.split("://")[0]
+
+
+# Download the model (Unet or VAE fp16) from shark_tank
+def load_model_from_tank():
+    """Download the untuned model selected by ``args.annotation_model``.
+
+    Copies ``args.local_tank_cache`` onto ``shark_args``, builds the lookup
+    keys for the chosen model, resolves them through ``get_params`` and
+    fetches the model with ``download_model`` (``frontend="torch"``).
+
+    Returns:
+        ``(mlir_model, model_name)`` as returned by ``download_model`` and
+        ``get_params`` respectively.
+
+    Raises:
+        ValueError: if ``args.annotation_model`` is neither "unet" nor
+            "vae".
+    """
+    from opt_params import get_params, version, variant
+
+    shark_args.local_tank_cache = args.local_tank_cache
+    bucket_key = f"{variant}/untuned"
+    if args.annotation_model == "unet":
+        model_key = f"{variant}/{version}/unet/{args.precision}/length_{args.max_length}/untuned"
+    elif args.annotation_model == "vae":
+        is_base = "/base" if args.use_base_vae else ""
+        model_key = f"{variant}/{version}/vae/{args.precision}/length_77/untuned{is_base}"
+    else:
+        # Without this branch ``model_key`` stayed unbound and the call
+        # below failed with an unrelated-looking UnboundLocalError.
+        raise ValueError(
+            "annotation_model must be 'unet' or 'vae', got "
+            f"{args.annotation_model!r}"
+        )
+
+    # The compile flags returned by get_params are not needed here.
+    bucket, model_name, _iree_flags = get_params(
+        bucket_key, model_key, args.annotation_model, "untuned", args.precision
+    )
+    # Only the MLIR module is used; the remaining values are ignored.
+    mlir_model, _func_name, _inputs, _golden_out = download_model(
+        model_name,
+        tank_url=bucket,
+        frontend="torch",
+    )
+    return mlir_model, model_name
+
+
+# Download the tuned config files from shark_tank
+def load_winograd_configs():
+    """Fetch the Winograd config JSON for the current model and device.
+
+    Returns:
+        The local path handed to ``download_public_file``. Despite the
+        ``_dir`` naming used by callers, this is a file path.
+    """
+    file_name = f"{args.annotation_model}_winograd_{device}.json"
+    remote_url = "gs://shark_tank/sd_tuned/configs/" + file_name
+    local_path = f"{WORKDIR}configs/" + file_name
+    print("Loading Winograd config file from ", local_path)
+    download_public_file(remote_url, local_path, True)
+    return local_path
+
+
+def load_lower_configs():
+    """Fetch the lowering config JSON for the current model and device.
+
+    Side effect: ``args.max_length`` is overwritten with 77 when the variant
+    is "anythingv3" or "analogdiffusion", or when annotating the VAE. Those
+    two variants also use the "v1_4" config version instead of ``version``.
+
+    Returns:
+        The local path handed to ``download_public_file``.
+    """
+    from opt_params import version, variant
+
+    uses_v1_4_configs = variant in ["anythingv3", "analogdiffusion"]
+    config_version = "v1_4" if uses_v1_4_configs else version
+    if uses_v1_4_configs or args.annotation_model == "vae":
+        args.max_length = 77
+    file_name = f"{args.annotation_model}_{config_version}_{args.precision}_len{args.max_length}_{device}.json"
+    remote_url = "gs://shark_tank/sd_tuned/configs/" + file_name
+    local_path = f"{WORKDIR}configs/" + file_name
+    print("Loading lowering config file from ", local_path)
+    download_public_file(remote_url, local_path, True)
+    return local_path
+
+
+# Annotate the model with Winograd attribute on selected conv ops
+def annotate_with_winograd(input_mlir, winograd_config_dir, model_name):
+    """Annotate conv ops of ``input_mlir`` and write the result to disk.
+
+    Args:
+        input_mlir: model contents passed to ``model_annotation`` as
+            ``input_contents``.
+        winograd_config_dir: path passed to ``model_annotation`` as
+            ``config_path``.
+        model_name: base name for the output file; "_tuned" is appended
+            unless the last "_"-separated component is already "tuned".
+
+    Returns:
+        ``(winograd_model, out_file_path)``, where the file lives under
+        ``args.annotation_output``.
+    """
+    # Avoid doubling the suffix for names that already end in "tuned".
+    suffix = "" if model_name.split("_")[-1] == "tuned" else "_tuned"
+    out_file_path = (
+        f"{args.annotation_output}/{model_name}{suffix}_torch.mlir"
+    )
+
+    with create_context() as ctx:
+        winograd_model = model_annotation(
+            ctx,
+            input_contents=input_mlir,
+            config_path=winograd_config_dir,
+            search_op="conv",
+            winograd=True,
+        )
+        # Serialised while the context is still open, as before; the
+        # ``with`` block closes the file, so no explicit close() is needed.
+        with open(out_file_path, "w") as f:
+            f.write(str(winograd_model))
+    return winograd_model, out_file_path
+
+
+# For Unet annotate the model with tuned lowering configs
+def annotate_with_lower_configs(
+    input_mlir, lowering_config_dir, model_name, use_winograd
+):
+    """Annotate a model with the lowering configs from a config file.
+
+    Runs ``iree-compile`` on ``input_mlir`` with IR printing enabled after
+    the selected pass and stderr redirected to an intermediate file under
+    ``args.annotation_output``. That dump is annotated with
+    ``model_annotation`` (``search_op="all"``), the intermediate file is
+    removed, and the annotated model is written out.
+
+    Args:
+        input_mlir: path of the MLIR file handed to ``iree-compile``.
+        lowering_config_dir: path passed to ``model_annotation`` as
+            ``config_path``.
+        model_name: base name for the output file; "_tuned" is appended
+            unless the last "_"-separated component is already "tuned".
+        use_winograd: selects which pass the IR is dumped after.
+
+    Returns:
+        ``(tuned_model, out_file_path)``.
+    """
+    if use_winograd:
+        dump_after = "iree-linalg-ext-convert-conv2d-to-winograd"
+    else:
+        dump_after = "iree-flow-pad-linalg-ops"
+
+    # The intermediate file keeps this name regardless of ``use_winograd``.
+    dump_path = f"{args.annotation_output}/dump_after_winograd.mlir"
+
+    # Dump IR after padding/img2col/winograd passes
+    device_spec_args = ""
+    if device == "cuda":
+        from shark.iree_utils.gpu_utils import get_iree_gpu_args
+
+        gpu_flags = get_iree_gpu_args()
+        for flag in gpu_flags:
+            device_spec_args += flag + " "
+    elif device == "vulkan":
+        device_spec_args = (
+            f"--iree-vulkan-target-triple={args.iree_vulkan_target_triple} "
+        )
+    print("Applying tuned configs on", model_name)
+
+    run_cmd(
+        f"iree-compile {input_mlir} "
+        "--iree-input-type=tm_tensor "
+        f"--iree-hal-target-backends={iree_target_map(device)} "
+        f"{device_spec_args}"
+        "--iree-stream-resource-index-bits=64 "
+        "--iree-vm-target-index-bits=64 "
+        "--iree-flow-enable-padding-linalg-ops "
+        "--iree-flow-linalg-ops-padding-size=32 "
+        "--iree-flow-enable-conv-img2col-transform "
+        f"--mlir-print-ir-after={dump_after} "
+        "--compile-to=flow "
+        f"2>{dump_path} "
+    )
+
+    # Annotate the model with lowering configs in the config file
+    with create_context() as ctx:
+        tuned_model = model_annotation(
+            ctx,
+            input_contents=dump_path,
+            config_path=lowering_config_dir,
+            search_op="all",
+        )
+
+    # Remove the intermediate mlir and save the final annotated model
+    os.remove(dump_path)
+    # Avoid doubling the suffix for names that already end in "tuned".
+    suffix = "" if model_name.split("_")[-1] == "tuned" else "_tuned"
+    out_file_path = (
+        f"{args.annotation_output}/{model_name}{suffix}_torch.mlir"
+    )
+    # The ``with`` block closes the file; no explicit close() is needed.
+    with open(out_file_path, "w") as f:
+        f.write(str(tuned_model))
+    return tuned_model, out_file_path
+
+
+def sd_model_annotation(mlir_model, model_name, model_from_tank=False):
+    """Annotate ``mlir_model`` according to the model kind and device.
+
+    * unet on vulkan: Winograd annotation, then lowering configs.
+    * vae on vulkan: Winograd annotation only.
+    * any other combination: lowering configs only.
+
+    ``model_from_tank`` is only consulted in the last case, where it selects
+    the MLIR file under ``WORKDIR`` instead of serialising ``mlir_model``.
+
+    Returns:
+        ``(tuned_model, output_path)``.
+    """
+    winograd_targets = ("unet", "vae")
+    if device == "vulkan" and args.annotation_model in winograd_targets:
+        winograd_cfg = load_winograd_configs()
+        tuned_model, output_path = annotate_with_winograd(
+            mlir_model, winograd_cfg, model_name
+        )
+        if args.annotation_model == "unet":
+            # Unet additionally gets the lowering configs applied to the
+            # Winograd-annotated file written just above.
+            lowering_cfg = load_lower_configs()
+            tuned_model, output_path = annotate_with_lower_configs(
+                output_path, lowering_cfg, model_name, True
+            )
+    else:
+        if model_from_tank:
+            mlir_path = f"{WORKDIR}{model_name}_torch/{model_name}_torch.mlir"
+        else:
+            # Just use this function to convert bytecode to string
+            _, mlir_path = annotate_with_winograd(mlir_model, "", model_name)
+        lowering_cfg = load_lower_configs()
+        tuned_model, output_path = annotate_with_lower_configs(
+            mlir_path, lowering_cfg, model_name, False
+        )
+    print(f"Saved the annotated mlir in {output_path}.")
+    return tuned_model, output_path
+
+
+# Script entry point: download the untuned model selected on the command
+# line and annotate it, reading the MLIR from the local tank download.
+if __name__ == "__main__":
+    mlir_model, model_name = load_model_from_tank()
+    sd_model_annotation(mlir_model, model_name, model_from_tank=True)
--- a/shark/examples/shark_inference/stable_diffusion/shark_sd_cli.spec
+++ b/shark/examples/shark_inference/stable_diffusion/shark_sd_cli.spec
@@ -0,0 +1,76 @@
+# -*- mode: python ; coding: utf-8 -*-
+from PyInstaller.utils.hooks import collect_data_files
+from PyInstaller.utils.hooks import copy_metadata
+
+# Raise the interpreter recursion limit to five times its current value.
+# NOTE(review): presumably needed so PyInstaller's analysis of the large
+# dependency tree does not hit RecursionError -- confirm.
+import sys ; sys.setrecursionlimit(sys.getrecursionlimit() * 5)
+
+# Data files and package metadata to bundle, accumulated into ``datas``
+# and handed to Analysis below.
+datas = []
+datas += collect_data_files('torch')
+datas += copy_metadata('torch')
+datas += copy_metadata('tqdm')
+datas += copy_metadata('regex')
+datas += copy_metadata('requests')
+datas += copy_metadata('packaging')
+datas += copy_metadata('filelock')
+datas += copy_metadata('numpy')
+datas += copy_metadata('tokenizers')
+datas += copy_metadata('importlib_metadata')
+datas += copy_metadata('torchvision')
+datas += copy_metadata('torch-mlir')
+datas += copy_metadata('diffusers')
+datas += copy_metadata('transformers')
+datas += copy_metadata('omegaconf')
+datas += copy_metadata('safetensors')
+datas += collect_data_files('iree')
+# NOTE(review): 'google-cloud-storage' is a distribution name, not an
+# importable package name -- confirm collect_data_files picks anything up.
+datas += collect_data_files('google-cloud-storage')
+datas += collect_data_files('shark')
+# Project resource files, each copied into a 'resources' folder in the bundle.
+datas += [
+         ( 'resources/prompts.json', 'resources'),
+         ( 'resources/model_db.json', 'resources'),
+         ( 'resources/base_model.json', 'resources'),
+         ( 'resources/opt_flags.json', 'resources'),
+         ]
+
+binaries = []
+
+block_cipher = None
+
+
+# Analyse main.py (search path '.') together with the data files collected
+# above and the explicitly listed hidden imports.
+# NOTE(review): 'shark.*' looks like a glob; confirm hiddenimports accepts
+# wildcards rather than plain module names.
+a = Analysis(
+    ['main.py'],
+    pathex=['.'],
+    binaries=binaries,
+    datas=datas,
+    hiddenimports=['shark', 'shark.*', 'shark.shark_inference', 'shark_inference', 'iree.tools.core' ],
+    hookspath=[],
+    hooksconfig={},
+    runtime_hooks=[],
+    excludes=[],
+    win_no_prefer_redirects=False,
+    win_private_assemblies=False,
+    cipher=block_cipher,
+    noarchive=False,
+)
+# Archive of the pure-Python modules found by the analysis.
+pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
+
+# Console executable named 'shark_sd_cli', with UPX compression enabled.
+# Binaries, zipfiles and datas are passed straight to EXE.
+# NOTE(review): with no COLLECT step this looks like a one-file build --
+# confirm.
+exe = EXE(
+    pyz,
+    a.scripts,
+    a.binaries,
+    a.zipfiles,
+    a.datas,
+    [],
+    name='shark_sd_cli',
+    debug=False,
+    bootloader_ignore_signals=False,
+    strip=False,
+    upx=True,
+    upx_exclude=[],
+    runtime_tmpdir=None,
+    console=True,
+    disable_windowed_traceback=False,
+    argv_emulation=False,
+    target_arch=None,
+    codesign_identity=None,
+    entitlements_file=None,
+)
--- a/shark/examples/shark_inference/stable_diffusion/stable_args.py
+++ b/shark/examples/shark_inference/stable_diffusion/stable_args.py
@@ -0,0 +1,372 @@
+import os
+import argparse
+from pathlib import Path
+
+
+def path_expand(s):
+    """Return ``s`` as a ``Path`` with ``~`` expanded, then resolved."""
+    expanded = Path(s).expanduser()
+    return expanded.resolve()
+
+
+def is_valid_file(arg):
+    """Return ``arg`` unchanged if that path exists, otherwise ``None``."""
+    return arg if os.path.exists(arg) else None
+
+
+p = argparse.ArgumentParser(
+    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+
+##############################################################################
+### Stable Diffusion Params
+##############################################################################
+
+p.add_argument(
+    "-p",
+    "--prompts",
+    action="append",
+    default=[],
+    help="text of which images to be generated.",
+)
+
+p.add_argument(
+    "--negative_prompts",
+    nargs="+",
+    default=[""],
+    help="text you don't want to see in the generated image.",
+)
+
+p.add_argument(
+    "--steps",
+    type=int,
+    default=50,
+    help="the no. of steps to do the sampling.",
+)
+
+p.add_argument(
+    "--seed",
+    type=int,
+    default=42,
+    help="the seed to use.",
+)
+
+p.add_argument(
+    "--batch_size",
+    type=int,
+    default=1,
+    choices=range(1, 4),
+    help="the number of inferences to be made in a single `run`.",
+)
+
+p.add_argument(
+    "--height",
+    type=int,
+    default=512,
+    help="the height of the output image.",
+)
+
+p.add_argument(
+    "--width",
+    type=int,
+    default=512,
+    help="the width of the output image.",
+)
+
+p.add_argument(
+    "--guidance_scale",
+    type=float,
+    default=7.5,
+    help="the value to be used for guidance scaling.",
+)
+
+p.add_argument(
+    "--max_length",
+    type=int,
+    default=64,
+    help="max length of the tokenizer output, options are 64 and 77.",
+)
+
+##############################################################################
+### Model Config and Usage Params
+##############################################################################
+
+p.add_argument(
+    "--device", type=str, default="vulkan", help="device to run the model."
+)
+
+p.add_argument(
+    "--precision", type=str, default="fp16", help="precision to run the model."
+)
+
+p.add_argument(
+    "--import_mlir",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="imports the model from torch module to shark_module otherwise downloads the model from shark_tank.",
+)
+
+p.add_argument(
+    "--load_vmfb",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="attempts to load the model from a precompiled flatbuffer and compiles + saves it if not found.",
+)
+
+p.add_argument(
+    "--save_vmfb",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="saves the compiled flatbuffer to the local directory",
+)
+
+p.add_argument(
+    "--use_tuned",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="Download and use the tuned version of the model if available",
+)
+
+p.add_argument(
+    "--use_base_vae",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Do conversion from the VAE output to pixel space on cpu.",
+)
+
+p.add_argument(
+    "--scheduler",
+    type=str,
+    default="SharkEulerDiscrete",
+    help="other supported schedulers are [PNDM, DDIM, LMSDiscrete, EulerDiscrete, DPMSolverMultistep]",
+)
+
+p.add_argument(
+    "--output_img_format",
+    type=str,
+    default="png",
+    help="specify the format in which output image is save. Supported options: jpg / png",
+)
+
+p.add_argument(
+    "--output_dir",
+    type=str,
+    default=None,
+    help="Directory path to save the output images and json",
+)
+
+p.add_argument(
+    "--runs",
+    type=int,
+    default=1,
+    help="number of images to be generated with random seeds in single execution",
+)
+
+p.add_argument(
+    "--ckpt_loc",
+    type=str,
+    default="",
+    help="Path to SD's .ckpt file.",
+)
+
+p.add_argument(
+    "--hf_model_id",
+    type=str,
+    default="stabilityai/stable-diffusion-2-1-base",
+    help="The repo-id of hugging face.",
+)
+
+p.add_argument(
+    "--enable_stack_trace",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Enable showing the stack trace when retrying the base model configuration",
+)
+p.add_argument(
+    "--beta_models",
+    default=False,
+    # ``type=bool`` turned every non-empty string (including "False") into
+    # True. Explicit falsy spellings now map to False; any other value is
+    # still True, as before.
+    type=lambda x: str(x).strip().lower()
+    not in ("", "0", "false", "f", "no", "n", "off"),
+    help="(False/True), use beta model files",
+)
+##############################################################################
+### IREE - Vulkan supported flags
+##############################################################################
+
+p.add_argument(
+    "--iree-vulkan-target-triple",
+    type=str,
+    default="",
+    help="Specify target triple for vulkan",
+)
+
+p.add_argument(
+    "--vulkan_debug_utils",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Profiles vulkan device and collects the .rdc info",
+)
+
+p.add_argument(
+    "--vulkan_large_heap_block_size",
+    default="4147483648",
+    help="flag for setting VMA preferredLargeHeapBlockSize for vulkan device, default is 4G",
+)
+
+p.add_argument(
+    "--vulkan_validation_layers",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for disabling vulkan validation layers when benchmarking",
+)
+
+##############################################################################
+### Misc. Debug and Optimization flags
+##############################################################################
+
+p.add_argument(
+    "--use_compiled_scheduler",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="use the default scheduler precompiled into the model if available",
+)
+
+p.add_argument(
+    "--local_tank_cache",
+    default="",
+    help="Specify where to save downloaded shark_tank artifacts. If this is not set, the default is ~/.local/shark_tank/.",
+)
+
+p.add_argument(
+    "--dump_isa",
+    default=False,
+    action="store_true",
+    help="When enabled call amdllpc to get ISA dumps. use with dispatch benchmarks.",
+)
+
+p.add_argument(
+    "--dispatch_benchmarks",
+    default=None,
+    help='dispatches to return benchamrk data on.  use "All" for all, and None for none.',
+)
+
+p.add_argument(
+    "--dispatch_benchmarks_dir",
+    default="temp_dispatch_benchmarks",
+    help='directory where you want to store dispatch data generated with "--dispatch_benchmarks"',
+)
+
+p.add_argument(
+    "--enable_rgp",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for inserting debug frames between iterations for use with rgp.",
+)
+
+p.add_argument(
+    "--hide_steps",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="flag for hiding the details of iteration/sec for each step.",
+)
+
+p.add_argument(
+    "--warmup_count",
+    type=int,
+    default=0,
+    help="flag setting warmup count for clip and vae [>= 0].",
+)
+
+p.add_argument(
+    "--clear_all",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag to clear all mlir and vmfb from common locations. Recompiling will take several minutes",
+)
+
+##############################################################################
+### Web UI flags
+##############################################################################
+
+p.add_argument(
+    "--progress_bar",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="flag for removing the pregress bar animation during image generation",
+)
+
+p.add_argument(
+    "--share",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for generating a public URL",
+)
+
+p.add_argument(
+    "--server_port",
+    type=int,
+    default=8080,
+    help="flag for setting server port",
+)
+
+##############################################################################
+### SD model auto-annotation flags
+##############################################################################
+
+p.add_argument(
+    "--annotation_output",
+    type=path_expand,
+    default="./",
+    help="Directory to save the annotated mlir file",
+)
+
+p.add_argument(
+    "--annotation_model",
+    type=str,
+    default="unet",
+    help="Options are unet and vae.",
+)
+
+p.add_argument(
+    "--use_winograd",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Apply Winograd on selected conv ops.",
+)
+##############################################################################
+### CI generation tags
+##############################################################################
+
+# TODO: remove from here once argparse is not required by half of sd, none of these are relevant to main.py
+
+# NOTE: ``type=bool`` turned every non-empty string (including "False") into
+# True. The converters below map explicit falsy spellings to False and keep
+# every other value True, as before.
+p.add_argument(
+    "--ci_tank_dir",
+    default=True,
+    type=lambda x: str(x).strip().lower()
+    not in ("", "0", "false", "f", "no", "n", "off"),
+    help="used for CI generation purposes only.",
+)
+p.add_argument(
+    "--upload",
+    default=False,
+    type=lambda x: str(x).strip().lower()
+    not in ("", "0", "false", "f", "no", "n", "off"),
+    help="upload generated models to shark tank (builder only), irrelevant to main.py",
+)
+p.add_argument(
+    "--torch_model_csv",
+    type=lambda x: is_valid_file(x),
+    default="./tank/torch_model_list.csv",
+    help="""Contains the file with torch_model name and args.
+		 Please see: https://github.com/nod-ai/SHARK/blob/main/tank/torch_model_list.csv""",
+)
+p.add_argument(
+    "--tf_model_csv",
+    type=lambda x: is_valid_file(x),
+    default="./tank/tf_model_list.csv",
+    help="Contains the file with tf model name and args.",
+)
+p.add_argument(
+    "--tflite_model_csv",
+    type=lambda x: is_valid_file(x),
+    default="./tank/tflite/tflite_model_list.csv",
+    help="Contains the file with tf model name and args.",
+)
+args = p.parse_args()
--- a/shark/examples/shark_inference/stable_diffusion/stable_diffusion_amd.md
+++ b/shark/examples/shark_inference/stable_diffusion/stable_diffusion_amd.md
@@ -0,0 +1,154 @@
+# Stable Diffusion optimized for AMD RDNA2/RDNA3 GPUs
+
+Before you start, please be aware that this is beta software that relies on a special AMD driver. Like all StableDiffusion GUIs published so far, you need some technical expertise to set it up. We apologize in advance if you bump into issues. If that happens, please don't hesitate to ask our Discord community for help! If you still can't get it to work, we're sorry, and please be assured that we (Nod and AMD) are working hard to improve the user experience in coming months.
+If it works well for you, please "star" the following GitHub projects... this is one of the best ways to help and spread the word!
+
+* https://github.com/nod-ai/SHARK
+* https://github.com/iree-org/iree
+
+## Install these specific AMD drivers (the latest official AMD release may not have all the fixes).
+
+### AMD KB Drivers for RDNA2 and RDNA3:
+
+*AMD Software: Adrenalin Edition 22.11.1 for MLIR/IREE Driver Version 22.20.29.09 for Windows® 10 and Windows® 11 (Windows Driver Store Version 31.0.12029.9003)*
+
+First, for RDNA2 users, download this special driver in a folder of your choice. We recommend you keep the installation files around, since you may need to re-install it later, if Windows Update decides to overwrite it:
+https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mlir-iree
+
+For RDNA3, the latest driver 23.1.2 supports MLIR/IREE as well: https://www.amd.com/en/support/kb/release-notes/rn-rad-win-23-1-2-kb
+
+KNOWN ISSUES with this special AMD driver:
+* `Windows Update` may (depending how it's configured) automatically install a new official AMD driver that overwrites this IREE-specific driver. If Stable Diffusion used to work, then a few days later, it slows down a lot or produces incorrect results (e.g. black images), this may be the cause. To fix this problem, please check the installed driver version, and re-install the special driver if needed. (TODO: document how to prevent this `Windows Update` behavior!)
+* Some people using this special driver experience mouse pointer accuracy issues, especially if using a larger-than-default mouse pointer. The clicked point isn't centered properly. One possible work-around is to reset the pointer size to "1" in "Change pointer size and color".
+
+## Installation
+
+Download the latest Windows SHARK SD binary [469 here](https://github.com/nod-ai/SHARK/releases/download/20230124.469/shark_sd_20230124_469.exe) into a folder of your choice. If you want nightly builds, you can find them on the GitHub releases page.
+
+Notes:
+* We recommend that you download this EXE into a new folder whenever you download a new EXE version. If you download it into the same folder as a previous install, you must delete the old `*.vmfb` files. Those contain Vulkan dispatches compiled from MLIR, which can be outdated if you run a new EXE from the same folder. You can use the `--clear_all` flag once to clean all the old files.
+* If you recently updated the driver or this binary (EXE file), we recommend you:
+  * clear all the local artifacts with `--clear_all` OR 
+  * clear the Vulkan shader cache: For Windows users this can be done by clearing the contents of `C:\Users\%username%\AppData\Local\AMD\VkCache\`. On Linux the same cache is typically located at `~/.cache/AMD/VkCache/`.
+  * clear the `huggingface` cache. In Windows, this is `C:\Users\%username%\.cache\huggingface`.
+
+## Running
+
+* Open a Command Prompt or Powershell terminal, change folder (`cd`) to the .exe folder. Then run the EXE from the command prompt. That way, if an error occurs, you'll be able to cut-and-paste it to ask for help. (if it always works for you without error, you may simply double-click the EXE to start the web browser)
+* The first run may take about 10-15 minutes when the models are downloaded and compiled. Your patience is appreciated. The download could be about 5GB.
+* If successful, you will likely see a Windows Defender message asking you to give permission to open a web server port. Accept it.
+* Open a browser to access the Stable Diffusion web server. By default, the port is 8080, so you can go to http://localhost:8080/?__theme=dark.
+
+## Stopping
+
+* Select the command prompt that's running the EXE. Press CTRL-C and wait a moment. The application should stop. 
+* Please make sure to do the above step before you attempt to update the EXE to a new version.
+
+# Results
+
+<img width="1607" alt="webui" src="https://user-images.githubusercontent.com/74956/204939260-b8308bc2-8dc4-47f6-9ac0-f60b66edab99.png">
+
+
+Here are some samples generated:
+
+![tajmahal, snow, sunflowers, oil on canvas_0](https://user-images.githubusercontent.com/74956/204934186-141f7e43-6eb2-4e89-a99c-4704d20444b3.jpg)
+
+![a photo of a crab playing a trumpet](https://user-images.githubusercontent.com/74956/204933258-252e7240-8548-45f7-8253-97647d38313d.jpg)
+
+
+<details>
+  <summary>Advanced Installation </summary>
+
+
+## Setup your Python Virtual Environment and Dependencies
+<details>
+ <summary> Windows 10/11 Users </summary>
+
+* Install the latest Python 3.10.x version from [here](https://www.python.org/downloads/windows/)
+
+* Install Git for Windows from [here](https://git-scm.com/download/win)
+
+#### Allow the install script to run in Powershell
+```powershell
+set-executionpolicy remotesigned 
+```
+
+#### Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...)
+```powershell
+git clone https://github.com/nod-ai/SHARK.git
+cd SHARK
+./setup_venv.ps1 #You can re-run this script to get the latest version
+```
+</details> 
+
+ <details>
+  <summary>Linux</summary>
+
+```shell
+git clone https://github.com/nod-ai/SHARK.git
+cd SHARK
+./setup_venv.sh
+source shark.venv/bin/activate
+```
+ </details>
+
+### Run Stable Diffusion on your device - WebUI
+
+<details>
+ <summary>Windows 10/11 Users</summary>
+ 
+```powershell
+(shark.venv) PS C:\Users\nod\SHARK> cd web
+(shark.venv) PS C:\Users\nod\SHARK\web> python index.py
+```
+ 
+ </details>
+ 
+<details>
+ <summary>Linux Users</summary>
+ 
+```shell
+(shark.venv) > cd web
+(shark.venv) > python index.py
+```
+ 
+</details>
+
+### Run Stable Diffusion on your device - Commandline
+
+<details>
+ <summary>Windows 10/11 Users</summary>
+ 
+```powershell
+(shark.venv) PS C:\g\shark> python .\shark\examples\shark_inference\stable_diffusion\main.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
+```
+ 
+  </details>
+
+<details>
+ <summary>Linux</summary>
+ 
+```shell
+python3.10 shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
+```
+ 
+  </details>
+
+The output on a 7900XTX would look like:
+
+```shell 
+Stats for run 0:
+Average step time: 47.19188690185547ms/it
+Clip Inference time (ms) = 109.531
+VAE Inference time (ms): 78.590
+
+Total image generation time: 2.5788655281066895sec
+```
+
+For more options to the Stable Diffusion model read [this](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md)
+ 
+</details>
+  <details>
+  <summary>Discord link</summary>
+Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware. 
+</details>
--- a/shark/examples/shark_inference/stable_diffusion/stable_diffusion_telegram_bot.md
+++ b/shark/examples/shark_inference/stable_diffusion/stable_diffusion_telegram_bot.md
@@ -0,0 +1,15 @@
+You need to pre-create your bot (https://core.telegram.org/bots#how-do-i-create-a-bot)
+Then create a file named .env in the web directory.
+Put the following line in it:
+TG_TOKEN="your_token"
+specifying your bot's token from previous step.
+Then run telegram_bot.py with the same parameters that you use when running index.py, for example:
+python telegram_bot.py --max_length=77 --vulkan_large_heap_block_size=0 --use_base_vae --local_tank_cache h:\shark\TEMP
+
+Bot commands:
+/select_model
+/select_scheduler
+/set_steps "integer number of steps"
+/set_guidance_scale "integer number"
+/set_negative_prompt "negative text"
+Any other text triggers the creation of an image based on it.
--- a/shark/examples/shark_inference/stable_diffusion/utils.py
+++ b/shark/examples/shark_inference/stable_diffusion/utils.py
@@ -0,0 +1,416 @@
+import os
+import gc
+import tempfile
+import torch
+from shark.shark_inference import SharkInference
+from shark.examples.shark_inference.stable_diffusion.stable_args import args
+from shark.shark_importer import import_with_fx
+from shark.iree_utils.vulkan_utils import (
+    set_iree_vulkan_runtime_flags,
+    get_vulkan_target_triple,
+)
+from shark.iree_utils.gpu_utils import get_cuda_sm_cc
+from resources import opt_flags
+from sd_annotation import sd_model_annotation
+import sys
+from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
+    load_pipeline_from_original_stable_diffusion_ckpt,
+)
+
+
+def get_vmfb_path_name(model_name):
+    device = (
+        args.device
+        if "://" not in args.device
+        else "-".join(args.device.split("://"))
+    )
+    extended_name = "{}_{}".format(model_name, device)
+    vmfb_path = os.path.join(os.getcwd(), extended_name + ".vmfb")
+    return [vmfb_path, extended_name]
+
+
+def _compile_module(shark_module, model_name, extra_args=[]):
+    if args.load_vmfb or args.save_vmfb:
+        [vmfb_path, extended_name] = get_vmfb_path_name(model_name)
+        if args.load_vmfb and os.path.isfile(vmfb_path) and not args.save_vmfb:
+            print(f"loading existing vmfb from: {vmfb_path}")
+            shark_module.load_module(vmfb_path, extra_args=extra_args)
+        else:
+            if args.save_vmfb:
+                print("Saving to {}".format(vmfb_path))
+            else:
+                print(
+                    "No vmfb found. Compiling and saving to {}".format(
+                        vmfb_path
+                    )
+                )
+            path = shark_module.save_module(
+                os.getcwd(), extended_name, extra_args
+            )
+            shark_module.load_module(path, extra_args=extra_args)
+    else:
+        shark_module.compile(extra_args)
+    return shark_module
+
+
+# Downloads the model from shark_tank and returns the shark_module.
+def get_shark_model(tank_url, model_name, extra_args=[]):
+    from shark.shark_downloader import download_model
+    from shark.parser import shark_args
+
+    # Set local shark_tank cache directory.
+    shark_args.local_tank_cache = args.local_tank_cache
+    if "cuda" in args.device:
+        shark_args.enable_tf32 = True
+
+    mlir_model, func_name, inputs, golden_out = download_model(
+        model_name,
+        tank_url=tank_url,
+        frontend="torch",
+    )
+    shark_module = SharkInference(
+        mlir_model, device=args.device, mlir_dialect="linalg"
+    )
+    return _compile_module(shark_module, model_name, extra_args)
+
+
+# Converts the torch-module into a shark_module.
+def compile_through_fx(
+    model,
+    inputs,
+    model_name,
+    is_f16=False,
+    f16_input_mask=None,
+    use_tuned=False,
+    extra_args=[],
+    save_dir=tempfile.gettempdir(),
+    debug=False,
+    generate_vmfb=True,
+):
+    """Import a torch model through FX and optionally compile it.
+
+    Args:
+        model: torch module handed to `import_with_fx`.
+        inputs: example inputs handed to `import_with_fx`.
+        model_name: base name; "_tuned" is appended when `use_tuned` is set.
+        is_f16: forwarded to `import_with_fx`.
+        f16_input_mask: forwarded to `import_with_fx`.
+        use_tuned: when true, runs the annotation path below.
+        extra_args: forwarded to `_compile_module`.
+        save_dir: NOTE(review): this argument is overwritten inside the body
+            before it is ever read, so the caller's value has no effect.
+        debug: forwarded to the second `import_with_fx` call.
+        generate_vmfb: when false, nothing is compiled and the function
+            falls off the end, returning None.
+
+    Returns:
+        The result of `_compile_module` when `generate_vmfb` is true,
+        otherwise None.
+    """
+    from shark.parser import shark_args
+
+    if "cuda" in args.device:
+        shark_args.enable_tf32 = True
+
+    # First import. Its result is only consumed by the tuned branch below.
+    mlir_module, func_name = import_with_fx(
+        model, inputs, is_f16, f16_input_mask
+    )
+
+    if use_tuned:
+        model_name = model_name + "_tuned"
+        tuned_model_path = f"{args.annotation_output}/{model_name}_torch.mlir"
+        # Annotate only when no tuned MLIR file exists at the expected path.
+        if not os.path.exists(tuned_model_path):
+            # The check looks at the text before the first underscore.
+            if "vae" in model_name.split("_")[0]:
+                args.annotation_model = "vae"
+
+            tuned_model, tuned_model_path = sd_model_annotation(
+                mlir_module, model_name
+            )
+            # Drop the in-memory copies before re-reading the file from disk.
+            del mlir_module, tuned_model
+            gc.collect()
+
+        with open(tuned_model_path, "rb") as f:
+            mlir_module = f.read()
+            # Redundant: the `with` block already closes the file.
+            f.close()
+
+    # Overwrites the `save_dir` parameter unconditionally.
+    save_dir = os.path.join(args.local_tank_cache, model_name)
+
+    # NOTE(review): this second import rebinds `mlir_module` and `func_name`,
+    # so both the first import's result and the tuned module read from disk
+    # above are discarded before compilation. This looks like a merge
+    # artifact -- confirm which of the two import calls is intended.
+    (
+        mlir_module,
+        func_name,
+    ) = import_with_fx(
+        model=model,
+        inputs=inputs,
+        is_f16=is_f16,
+        f16_input_mask=f16_input_mask,
+        debug=debug,
+        model_name=model_name,
+        save_dir=save_dir,
+    )
+    if generate_vmfb:
+        shark_module = SharkInference(
+            mlir_module,
+            device=args.device,
+            mlir_dialect="linalg",
+        )
+
+        return _compile_module(shark_module, model_name, extra_args)
+
+
+def set_iree_runtime_flags():
+    vulkan_runtime_flags = [
+        f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}",
+        f"--vulkan_validation_layers={'true' if args.vulkan_validation_layers else 'false'}",
+    ]
+    if args.enable_rgp:
+        vulkan_runtime_flags += [
+            f"--enable_rgp=true",
+            f"--vulkan_debug_utils=true",
+        ]
+    set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)
+
+
+def get_all_devices(driver_name):
+    """
+    Inputs: driver_name
+    Returns a list of all the available devices for a given driver sorted by
+    the iree path names of the device as in --list_devices option in iree.
+    """
+    from iree.runtime import get_driver
+
+    driver = get_driver(driver_name)
+    device_list_src = driver.query_available_devices()
+    device_list_src.sort(key=lambda d: d["path"])
+    return device_list_src
+
+
+def get_device_mapping(driver, key_combination=3):
+    """This method ensures consistent device ordering when choosing
+    specific devices for execution
+    Args:
+        driver (str): execution driver (vulkan, cuda, rocm, etc)
+        key_combination (int, optional): choice for mapping value for device name.
+        1 : path
+        2 : name
+        3 : (name, path)
+        Defaults to 3.
+    Returns:
+        dict: map to possible device names user can input mapped to desired combination of name/path.
+    """
+    from shark.iree_utils._common import iree_device_map
+
+    driver = iree_device_map(driver)
+    device_list = get_all_devices(driver)
+    device_map = dict()
+
+    def get_output_value(dev_dict):
+        if key_combination == 1:
+            return f"{driver}://{dev_dict['path']}"
+        if key_combination == 2:
+            return dev_dict["name"]
+        if key_combination == 3:
+            return (dev_dict["name"], f"{driver}://{dev_dict['path']}")
+
+    # mapping driver name to default device (driver://0)
+    device_map[f"{driver}"] = get_output_value(device_list[0])
+    for i, device in enumerate(device_list):
+        # mapping with index
+        device_map[f"{driver}://{i}"] = get_output_value(device)
+        # mapping with full path
+        device_map[f"{driver}://{device['path']}"] = get_output_value(device)
+    return device_map
+
+
+def map_device_to_name_path(device, key_combination=3):
+    """Gives the appropriate device data (supported name/path) for user selected execution device
+    Args:
+        device (str): user
+        key_combination (int, optional): choice for mapping value for device name.
+        1 : path
+        2 : name
+        3 : (name, path)
+        Defaults to 3.
+    Raises:
+        ValueError:
+    Returns:
+        str / tuple: returns the mapping str or tuple of mapping str for the device depending on key_combination value
+    """
+    driver = device.split("://")[0]
+    device_map = get_device_mapping(driver, key_combination)
+    try:
+        device_mapping = device_map[device]
+    except KeyError:
+        raise ValueError(f"Device '{device}' is not a valid device.")
+    return device_mapping
+
+
+def set_init_device_flags():
+    """Normalize `args` for the selected device and decide on tuned models.
+
+    Mutates the shared `args` object in place: `args.device`, possibly
+    `args.iree_vulkan_target_triple`, `args.max_length` for a few known model
+    ids, and `args.use_tuned`. Prints which device and tuning choice is used.
+    Returns nothing.
+    """
+    if "vulkan" in args.device:
+        # set runtime flags for vulkan.
+        set_iree_runtime_flags()
+
+        # set triple flag to avoid multiple calls to get_vulkan_triple_flag
+        # The default key_combination yields a (name, path) pair, so
+        # args.device is replaced by the resolved "<driver>://<path>" string.
+        device_name, args.device = map_device_to_name_path(args.device)
+        # Only fill in the triple when the user did not supply one.
+        if not args.iree_vulkan_target_triple:
+            triple = get_vulkan_target_triple(device_name)
+            if triple is not None:
+                args.iree_vulkan_target_triple = triple
+        print(
+            f"Found device {device_name}. Using target triple {args.iree_vulkan_target_triple}."
+        )
+    elif "cuda" in args.device:
+        # Any device string containing "cuda" collapses to plain "cuda".
+        args.device = "cuda"
+    elif "cpu" in args.device:
+        args.device = "cpu"
+
+    # set max_length based on availability.
+    if args.hf_model_id in [
+        "Linaqruf/anything-v3.0",
+        "wavymulder/Analog-Diffusion",
+        "dreamlike-art/dreamlike-diffusion-1.0",
+    ]:
+        args.max_length = 77
+    elif args.hf_model_id == "prompthero/openjourney":
+        args.max_length = 64
+
+    # Use tuned models in the case of fp16, vulkan rdna3 or cuda sm devices.
+    # Each branch below only ever turns tuning off; when none of them
+    # matches, args.use_tuned keeps the value it had on entry.
+    if (
+        args.hf_model_id
+        in ["prompthero/openjourney", "dreamlike-art/dreamlike-diffusion-1.0"]
+        or args.precision != "fp16"
+        or args.height != 512
+        or args.width != 512
+        or args.batch_size != 1
+        or ("vulkan" not in args.device and "cuda" not in args.device)
+    ):
+        args.use_tuned = False
+
+    elif (
+        "vulkan" in args.device
+        and "rdna3" not in args.iree_vulkan_target_triple
+    ):
+        args.use_tuned = False
+
+    elif "cuda" in args.device and get_cuda_sm_cc() not in [
+        "sm_80",
+        "sm_84",
+        "sm_86",
+        "sm_89",
+    ]:
+        args.use_tuned = False
+
+    elif args.use_base_vae and args.hf_model_id not in [
+        "stabilityai/stable-diffusion-2-1-base",
+        "CompVis/stable-diffusion-v1-4",
+    ]:
+        args.use_tuned = False
+
+    # Use tuned model in the case of stablediffusion/fp16 and cuda device sm_80
+    # NOTE(review): the else branch below resets args.use_tuned to False for
+    # every configuration that is not one of these three models on CUDA
+    # sm_80/sm_89. That overrides the chain above, so Vulkan (including
+    # rdna3) and CUDA sm_86 can never end up with tuned models enabled.
+    # Confirm whether this second block is intended or a merge leftover.
+    if (
+        args.hf_model_id
+        in [
+            "stabilityai/stable-diffusion-2-1-base",
+            "Linaqruf/anything-v3.0",
+            "wavymulder/Analog-Diffusion",
+        ]
+        and args.precision == "fp16"
+        and "cuda" in args.device
+        and get_cuda_sm_cc() in ["sm_80", "sm_89"]
+        and args.use_tuned  # required to avoid always forcing true on these cards
+    ):
+        args.use_tuned = True
+    else:
+        args.use_tuned = False
+
+    if args.use_tuned:
+        print(f"Using {args.device} tuned models for stablediffusion/fp16.")
+    else:
+        print("Tuned models are currently not supported for this setting.")
+
+
+# Utility to get list of devices available.
+def get_available_devices():
+    def get_devices_by_name(driver_name):
+        from shark.iree_utils._common import iree_device_map
+
+        device_list = []
+        try:
+            driver_name = iree_device_map(driver_name)
+            device_list_dict = get_all_devices(driver_name)
+            print(f"{driver_name} devices are available.")
+        except:
+            print(f"{driver_name} devices are not available.")
+        else:
+            for i, device in enumerate(device_list_dict):
+                device_list.append(f"{device['name']} => {driver_name}://{i}")
+        return device_list
+
+    set_iree_runtime_flags()
+
+    available_devices = []
+    vulkan_devices = get_devices_by_name("vulkan")
+    available_devices.extend(vulkan_devices)
+    cuda_devices = get_devices_by_name("cuda")
+    available_devices.extend(cuda_devices)
+    available_devices.append("cpu")
+    return available_devices
+
+
+def disk_space_check(path, lim=20):
+    from shutil import disk_usage
+
+    du = disk_usage(path)
+    free = du.free / (1024 * 1024 * 1024)
+    if free <= lim:
+        print(f"[WARNING] Only {free:.2f}GB space available in {path}.")
+
+
+def get_opt_flags(model, precision="fp16"):
+    iree_flags = []
+    is_tuned = "tuned" if args.use_tuned else "untuned"
+    if len(args.iree_vulkan_target_triple) > 0:
+        iree_flags.append(
+            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
+        )
+
+    # Disable bindings fusion to work with moltenVK.
+    if sys.platform == "darwin":
+        iree_flags.append("-iree-stream-fuse-binding=false")
+
+    if "default_compilation_flags" in opt_flags[model][is_tuned][precision]:
+        iree_flags += opt_flags[model][is_tuned][precision][
+            "default_compilation_flags"
+        ]
+
+    if "specified_compilation_flags" in opt_flags[model][is_tuned][precision]:
+        device = (
+            args.device
+            if "://" not in args.device
+            else args.device.split("://")[0]
+        )
+        if (
+            device
+            not in opt_flags[model][is_tuned][precision][
+                "specified_compilation_flags"
+            ]
+        ):
+            device = "default_device"
+        iree_flags += opt_flags[model][is_tuned][precision][
+            "specified_compilation_flags"
+        ][device]
+    return iree_flags
+
+
+def preprocessCKPT():
+    from pathlib import Path
+
+    path = Path(args.ckpt_loc)
+    diffusers_path = path.parent.absolute()
+    diffusers_directory_name = path.stem
+    complete_path_to_diffusers = diffusers_path / diffusers_directory_name
+    complete_path_to_diffusers.mkdir(parents=True, exist_ok=True)
+    print(
+        "Created directory : ",
+        diffusers_directory_name,
+        " at -> ",
+        diffusers_path,
+    )
+    path_to_diffusers = complete_path_to_diffusers.as_posix()
+    from_safetensors = (
+        True if args.ckpt_loc.lower().endswith(".safetensors") else False
+    )
+    # EMA weights usually yield higher quality images for inference but non-EMA weights have
+    # been yielding better results in our case.
+    # TODO: Add an option `--ema` (`--no-ema`) for users to specify if they want to go for EMA
+    #       weight extraction or not.
+    extract_ema = False
+    print("Loading pipeline from original stable diffusion checkpoint")
+    pipe = load_pipeline_from_original_stable_diffusion_ckpt(
+        checkpoint_path=args.ckpt_loc,
+        extract_ema=extract_ema,
+        from_safetensors=from_safetensors,
+    )
+    pipe.save_pretrained(path_to_diffusers)
+    print("Loading complete")
+    args.ckpt_loc = path_to_diffusers
+    print("Custom model path is : ", args.ckpt_loc)
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`from .pipeline_shark_stable_diffusion_txt2img import Text2ImagePipeline`