Mirror of https://github.com/nod-ai/SHARK-Studio.git, synced 2026-01-12 15:28:10 -05:00

Compare commits: github-pages...20230201.4 (412 commits)
| SHA1 |
|---|
| 81e3d1c2c6 |
| ab0cbb4475 |
| 1c64e40722 |
| 8cafe56eb4 |
| 3eceeb7b23 |
| 1a37675435 |
| 198ebede8d |
| a504903dd5 |
| 842adef29c |
| 7edcaf5a06 |
| c124b76328 |
| e9c744ee5d |
| 83302930d8 |
| a4634632ba |
| d17e8dc5ad |
| 9fe63de4d4 |
| 8111f8bf35 |
| fcd62513cf |
| c3c701e654 |
| 6bf991edf6 |
| 9644e78545 |
| c911189ef0 |
| 1118b4b651 |
| 4be75d4418 |
| fb6beae27c |
| fee73b0b63 |
| 9bbffa519e |
| c3a641f0ab |
| aafe7c4701 |
| 9a0b082cf8 |
| 8265e34a29 |
| 8ef8ae097f |
| c3d14293c0 |
| d55d8be504 |
| 03543030d3 |
| fc6b474b92 |
| a5db785dd7 |
| 1c1c5cd611 |
| 6ed02f70ec |
| cb78cd8ac0 |
| 0c4590b45a |
| d2e2ee6efa |
| 6a380a0b48 |
| e5d5acbf1f |
| 00e38abbf0 |
| e3e4ea5443 |
| a3e4ea3228 |
| 56f16d6baf |
| 7a55ab900e |
| 137643fe72 |
| d6e59c6241 |
| 458eb5d34c |
| 8259f08864 |
| b3ab0a1843 |
| f09f217478 |
| e842c8c19b |
| f6c3112d44 |
| 7059610632 |
| 2d272930d9 |
| 6c470d8131 |
| 30b29ce8cd |
| 1a9933002f |
| c4a9365aa1 |
| 9d3af37104 |
| 7b3d57cff7 |
| a802270da9 |
| dd194a8758 |
| 6de02de221 |
| 85259750bf |
| 1249f0007d |
| db0514d3fa |
| dce42a7fad |
| ec0b380194 |
| 7f27b61c98 |
| f0b3557b02 |
| 2a1d1c1001 |
| df7eb80e5b |
| b9d947ce6f |
| e6589d2454 |
| 0f5ac6afcf |
| bc1bb1d188 |
| 3af2dd10ce |
| dd22c65855 |
| 48137ced19 |
| 6eb47c12d1 |
| 5a1fc6675a |
| 6f80825814 |
| f0dd48ed2a |
| 15e2df0db0 |
| 4ad0109769 |
| ee0009d4b8 |
| 9d851c3346 |
| 5d117af8ae |
| bb41c2d15e |
| eba138ee4a |
| 3b2bbb74f8 |
| dbc0f81211 |
| d0b613d22e |
| 72f29b67d5 |
| 9570045cc3 |
| e4efdb5cbb |
| 187f0fa70c |
| 472185c3e4 |
| f94a571773 |
| 183e447d35 |
| 12f844d93a |
| 47a119a37f |
| ee56559b9a |
| 00e594deea |
| 6ad9b213b9 |
| e4375e8195 |
| 487bf8e29b |
| fea1694e74 |
| 4102c124a9 |
| 135bad3280 |
| b604f36881 |
| 782b449c71 |
| 017dcab685 |
| e60b4568c6 |
| 4ee3d95a5a |
| f18725bacc |
| f6064a2b84 |
| 2e90cb7b95 |
| 2c09d63cd9 |
| cc6fbdb0c3 |
| ecfdec12f3 |
| 45af40fd14 |
| d11cf42501 |
| c3c1e3b055 |
| 7c5e3b1d99 |
| ed6cec71e7 |
| d6bcdd069c |
| a26347826d |
| 5d1c099b31 |
| 220bee1365 |
| 1261074d95 |
| 136021424c |
| fee4ba3746 |
| a5b70335d4 |
| 5cf4976054 |
| 1aa3255061 |
| b01f29f10d |
| 2673abca88 |
| 7eeb7f0715 |
| 37262a2479 |
| de6e304959 |
| 234475bbc7 |
| abbd9f7cfc |
| dfd6ba67b3 |
| 1595254eab |
| 6964c5eeba |
| 2befe771b3 |
| b133a035a4 |
| 726c062327 |
| 9083672de3 |
| cdbaf880af |
| 9434981cdc |
| 8b3706f557 |
| 0d5173833d |
| bf1178eb79 |
| abcd3fa94a |
| 62aa1614b6 |
| 7027356126 |
| 5ebe13a13d |
| c3bed9a2b7 |
| f865222882 |
| e2fe2e4095 |
| 0532a95f08 |
| ff536f6015 |
| 097d0f27bb |
| 2257f87edf |
| a17800da00 |
| 059c1b3a19 |
| 9a36816d27 |
| 7986b9b20b |
| b2b3a0a62b |
| 3173b7d1d9 |
| 9d716d70d6 |
| e1901a8608 |
| 7d0cbd8d90 |
| 59358361f9 |
| 7fea2d3b68 |
| b6d3ff26bd |
| 523e63f5c1 |
| 10630ab597 |
| 2bc6de650d |
| ffef1681e3 |
| d935006a4a |
| 660cb5946e |
| 10160a066a |
| 72976a2ece |
| 831f206cd0 |
| 72648aa9f2 |
| 35e623deaf |
| 6263636738 |
| 535d012ded |
| c73eed2e51 |
| 30fdc99f37 |
| acb905f0cc |
| bba06d0142 |
| a14a47af12 |
| 73457336bc |
| a14c53ad31 |
| e7e763551a |
| 2928179331 |
| 24a16a4cfe |
| 6aed4423b2 |
| 6508e3fcc9 |
| a15cb140ae |
| 898bc9e009 |
| e67ea31ee2 |
| 986c126a5c |
| 0eee7616b9 |
| 5ddce749b8 |
| d946cffabc |
| fe618811ee |
| 09c45bfb80 |
| e9e9ccd379 |
| a9b27c78a3 |
| bc17c29b2e |
| aaf60bdee6 |
| d913453e57 |
| 08e373aef4 |
| 4cb50a3d06 |
| b03038222d |
| 5f5e0766dd |
| 48ec11c514 |
| 8ae76d18b5 |
| e5be1790e5 |
| e64aa40b17 |
| eb8114ece8 |
| 616ee9b824 |
| 57c94f8f80 |
| 2a59c4f670 |
| 192ff487c4 |
| b62ee3fcb9 |
| 0225292a44 |
| 589a7ed02f |
| b3a42cd0b1 |
| e3e1ca7cc6 |
| 57e417d174 |
| 1699db79b5 |
| dab9403b8f |
| 9a14298146 |
| 40eea21863 |
| d2475ec169 |
| b3bcf4bf44 |
| 6049f86bc4 |
| ff649b52ef |
| e9e138c757 |
| 1096936a15 |
| 29cc478525 |
| 05e9eb40b5 |
| c4444ff695 |
| 27b34f3929 |
| 2b8d784660 |
| 18f447d8d8 |
| d7e1078d68 |
| 6be592653f |
| 8859853b41 |
| 3c46021102 |
| bba8646669 |
| b0dc19a910 |
| df79ebd0f2 |
| e19a97f316 |
| 482ffd6275 |
| 5117e50602 |
| 83b138208d |
| 1870cb4557 |
| 42ad5b9c5c |
| 333975eb8f |
| aa0195e4ef |
| 56109fe09b |
| e74046478b |
| aa5a60812f |
| ebb60019aa |
| 6393dc5d14 |
| 8c158f2452 |
| 8c3eabdcee |
| 8aa0ce6a24 |
| a27ee141b3 |
| 1106456651 |
| 8856878cbd |
| a9bac0287d |
| efbd3dc778 |
| a0d0eaa408 |
| e2bf734b67 |
| a333a90441 |
| 6dc0057d3d |
| 0f9e69d48c |
| e6a7c019ab |
| 1d32eabd14 |
| 53d03f06a6 |
| a2d8c40455 |
| 4f7d950c8d |
| cac54b8c26 |
| cd0e881d7d |
| fee406e220 |
| 128342f47f |
| 024487c5fe |
| 879ba27ccb |
| 6d6d9627e7 |
| af4bc82543 |
| 439a18bcc3 |
| e12a1e0444 |
| 4400b0d3c3 |
| 5dff28ff99 |
| d5ac841a1a |
| 232ce12e9b |
| 9a8638a6d0 |
| a5445866b8 |
| e8ded71a7b |
| a14c615def |
| 3903b6ff0c |
| 41bf262482 |
| 645b658da0 |
| 6ee8f61fbe |
| 3c4c4231ce |
| d0eef19eba |
| 6ca2eb3ad7 |
| 74aeb55733 |
| 3eb7965ca0 |
| 04f20070d1 |
| 88937fcb2f |
| f80b85f10c |
| 32a2ec432d |
| f4821d0d39 |
| fdf2aa54ef |
| 275c032264 |
| d88979fe19 |
| e67bcffea7 |
| 005ded3c6f |
| d624940e12 |
| 7763403b0e |
| 88c58244b9 |
| 0754c6ea20 |
| 7b1f04d121 |
| d8a9bee244 |
| ac0ea6bd3c |
| 45677c1e23 |
| d9f4a9954a |
| ec461a4456 |
| 559928e93b |
| a526f7d5b8 |
| 749a2c2dec |
| 29a317dbb6 |
| 2f36de319a |
| 2005bce419 |
| 8a02d7729d |
| 1cdf301c14 |
| 9a86e5c476 |
| 32d3f4bd5f |
| 18689afc1a |
| 64d6da75c7 |
| 1e95e4b502 |
| c63009a6db |
| 88f8718635 |
| a081733a42 |
| 06ccfb0533 |
| b18d75e3f7 |
| 3e7efaa048 |
| a3fdfc81db |
| f4c91df1df |
| 32e1ba8c0d |
| 1939376d72 |
| 25931d48a3 |
| 024c5e153a |
| 83f34b645d |
| 3f9f450e0d |
| fd89b06641 |
| f8dc996004 |
| e6a964088b |
| e3e767c7eb |
| 239c19eb12 |
| 7f37599a60 |
| 77c9a2c5ea |
| fd7baae548 |
| 01fdf5ee16 |
| e52f533c16 |
| fbd77dc936 |
| cdc6dd19e3 |
| fd578a48a9 |
| 9956099516 |
| f97b8fffed |
| 7b9e309724 |
| 1d33913d48 |
| a48eaaed20 |
| 2741b8be53 |
| 4f906a265c |
| 0dff8d7af0 |
| 4f0d0d8167 |
| d513060b21 |
| d1a25ce4f3 |
| 51c98695b2 |
| b448770ec2 |
| 5fe22a7980 |
| 38ae6b5af4 |
| 0bfe30d75d |
| 7be1d7d0be |
| 0d74c873f0 |
| 139aff2938 |
| a3f733490c |
| 8a11f138d1 |
| 3405607917 |
| 7c99a6bd33 |
| 3fba8ce0e6 |
| f3bde3c7fc |
| 21fee8ef33 |
| 0e217d6180 |
| 00a8ce75d1 |
| 8f3f00cd99 |
| 13bae2538a |
.github/workflows/gh-pages-releases.yml (2 changes)

```diff
@@ -23,7 +23,7 @@ jobs:
       - run: git fetch --all
       - run: git switch github-pages
       - run: git config --global user.email "none@none.com"
-      - run: git config --global user.name "nod-team"
+      - run: git config --global user.name "nod-ai"
       - run: mv /tmp/index.html package-index/index.html
       - run: git add package-index/index.html
```
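The step above publishes a regenerated `package-index/index.html` to the `github-pages` branch, which pip can then consume as a find-links page. A minimal, hypothetical sketch of how such an index page could be generated; the wheel URL below is a placeholder, not the project's real artifact list:

```python
# Hypothetical generator for a pip find-links index.html like the one this
# workflow commits; pip only needs <a href="..."> links to wheel files.
from html import escape

def render_index(wheel_urls: list[str]) -> str:
    links = "\n".join(
        f'<a href="{escape(url)}">{escape(url.rsplit("/", 1)[-1])}</a><br>'
        for url in wheel_urls
    )
    return f"<!DOCTYPE html>\n<html><body>\n{links}\n</body></html>\n"

with open("/tmp/index.html", "w") as f:
    f.write(render_index([
        # Placeholder artifact URL for illustration only.
        "https://github.com/nod-ai/SHARK/releases/download/20230201.4/nodai_shark-20230201.4-py3-none-any.whl",
    ]))
```

Clients would then resolve wheels from the published page via pip's `-f`/`--find-links` option.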
.github/workflows/nightly.yml (144 changes)

```diff
@@ -9,7 +9,84 @@ on:
   workflow_dispatch:

 jobs:
-  build:
+  windows-build:
+    runs-on: 7950X
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.10"]
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Compute version
+      shell: powershell
+      run: |
+        $package_version = $(Get-Date -UFormat "%Y%m%d")+"."+${{ github.run_number }}
+        $package_version_ = $(Get-Date -UFormat "%Y%m%d")+"_"+${{ github.run_number }}
+        $tag_name=$package_version
+        echo "package_version=$package_version" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
+        echo "package_version_=$package_version_" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
+        echo "tag_name=$tag_name" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
+
+    - name: Create Release
+      id: create_release
+      uses: actions/create-release@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
+      with:
+        tag_name: ${{ env.tag_name }}
+        release_name: nod.ai SHARK ${{ env.tag_name }}
+        body: |
+          Automatic snapshot release of nod.ai SHARK.
+        draft: true
+        prerelease: false
+
+    - name: Build Package
+      shell: powershell
+      run: |
+        ./setup_venv.ps1
+        pyinstaller .\apps\stable_diffusion\shark_sd.spec
+        mv ./dist/shark_sd.exe ./dist/shark_sd_${{ env.package_version_ }}.exe
+        signtool sign /f C:\shark_2023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/shark_sd_${{ env.package_version_ }}.exe
+        pyinstaller .\apps\stable_diffusion\shark_sd_cli.spec
+        mv ./dist/shark_sd_cli.exe ./dist/shark_sd_cli_${{ env.package_version_ }}.exe
+        signtool sign /f C:\shark_2023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/shark_sd_cli_${{ env.package_version_ }}.exe
+
+    # GHA windows VM OOMs so disable for now
+    #- name: Build and validate the SHARK Runtime package
+    #  shell: powershell
+    #  run: |
+    #    $env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
+    #    pip wheel -v -w dist . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
+
+    - uses: actions/upload-artifact@v2
+      with:
+        path: dist/*
+
+    - name: Upload Release Assets
+      id: upload-release-assets
+      uses: dwenegar/upload-release-assets@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
+      with:
+        release_id: ${{ steps.create_release.outputs.id }}
+        assets_path: ./dist/*
+
+    - name: Publish Release
+      id: publish_release
+      uses: eregon/publish-release@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
+      with:
+        release_id: ${{ steps.create_release.outputs.id }}
+
+  linux-build:
+
     runs-on: a100
     strategy:
```
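Both the PowerShell step above and the Linux job below derive the nightly version as the current date plus the workflow run number. A minimal sketch of the same scheme in Python, where the run number is a stand-in for `${{ github.run_number }}`:

```python
# Nightly version scheme: <YYYYMMDD>.<run number> for the release tag and
# wheel version, plus an underscore variant used in the .exe file names.
from datetime import date

run_number = 4  # stand-in for ${{ github.run_number }}
stamp = date.today().strftime("%Y%m%d")
package_version = f"{stamp}.{run_number}"   # e.g. 20230201.4
package_version_ = f"{stamp}_{run_number}"  # e.g. shark_sd_20230201_4.exe
print(package_version, package_version_)
```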
```diff
@@ -32,40 +109,13 @@ jobs:
         key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
         restore-keys: |
           ${{ runner.os }}-pip-

-    - name: Compute version
-      run: |
-        package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
-        tag_name="${package_version}"
-        echo "package_version=${package_version}" >> $GITHUB_ENV
-        echo "tag_name=${tag_name}" >> $GITHUB_ENV
-    - name: Set Environment Variables
-      run: |
-        echo "SHORT_SHA=`git rev-parse --short=4 HEAD`" >> $GITHUB_ENV
-        echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
-    - name: Create Release
-      id: create_release
-      uses: actions/create-release@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      with:
-        tag_name: ${{ env.tag_name }}
-        release_name: nod.ai SHARK ${{ env.tag_name }}
-        body: |
-          Automatic snapshot release of nod.ai SHARK.
-        draft: true
-        prerelease: false
-    - name: Find Torch-MLIR Release
-      run: |
-        TM_HTML_URL="$(python3 -c "import urllib.request, json, sys; u=json.loads(urllib.request.urlopen('https://api.github.com/repos/llvm/torch-mlir/releases/latest').read().decode()).get('html_url', False); print(u) if u else sys.exit(1);")"
-        TM_RELEASE_DIR=${TM_HTML_URL/"tag"/"expanded_assets"}
-        echo "TM_RELEASE_DIR=${TM_RELEASE_DIR}" >> $GITHUB_ENV

     - name: Install dependencies
       run: |
-        echo "Torch-MLIR Release DIR is ${{ env.TM_RELEASE_DIR }}"
+        echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
         python -m pip install --upgrade pip
         python -m pip install flake8 pytest toml
-        if [ -f requirements.txt ]; then pip install -r requirements.txt -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/nod-ai/SHARK-Runtime/releases; fi
+        if [ -f requirements.txt ]; then pip install -r requirements.txt -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html; fi
     - name: Lint with flake8
       run: |
         # stop the build if there are Python syntax errors or undefined names
```
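The removed `Find Torch-MLIR Release` step packed its logic into a shell one-liner. An expanded, hedged rewrite of what that one-liner does: query the GitHub API for torch-mlir's latest release, then rewrite the `tag` URL into the `expanded_assets` page that pip can scan with `-f`:

```python
# Expanded equivalent of the removed one-liner above.
import json
import sys
import urllib.request

with urllib.request.urlopen(
    "https://api.github.com/repos/llvm/torch-mlir/releases/latest"
) as resp:
    html_url = json.loads(resp.read().decode()).get("html_url")

if not html_url:
    sys.exit(1)

# .../releases/tag/<name> -> .../releases/expanded_assets/<name>
# (bash's ${VAR/"tag"/"expanded_assets"} also replaces the first match only)
tm_release_dir = html_url.replace("tag", "expanded_assets", 1)
print(tm_release_dir)
```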
```diff
@@ -74,25 +124,26 @@ jobs:
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude shark.venv,lit.cfg.py
     - name: Build and validate the IREE package
       if: ${{ matrix.backend == 'IREE' }}
+      continue-on-error: true
       run: |
         cd $GITHUB_WORKSPACE
         USE_IREE=1 VENV_DIR=iree.venv ./setup_venv.sh
         source iree.venv/bin/activate
         package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
         SHARK_PACKAGE_VERSION=${package_version} \
-        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/iree-org/iree/releases
+        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://iree-org.github.io/iree/pip-release-links.html
         # Install the built wheel
         pip install ./wheelhouse/nodai*
         # Validate the Models
+        /bin/bash "$GITHUB_WORKSPACE/build_tools/populate_sharktank_ci.sh"
-        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" tank/test_models.py |
+        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" -k "not metal" |
         tail -n 1 |
         tee -a pytest_results.txt
         if !(grep -Fxq " failed" pytest_results.txt)
         then
           export SHA=$(git log -1 --format='%h')
-          gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/$SHA
-          gsutil -m cp -r gs://shark_tank/$SHA/* gs://shark_tank/latest/
+          gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/${DATE}_$SHA
+          gsutil -m cp -r gs://shark_tank/${DATE}_$SHA/* gs://shark_tank/latest/
         fi
         rm -rf ./wheelhouse/nodai*
```
```diff
@@ -104,29 +155,10 @@ jobs:
         source shark.venv/bin/activate
         package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
         SHARK_PACKAGE_VERSION=${package_version} \
-        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/nod-ai/SHARK-Runtime/releases
+        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
         # Install the built wheel
         pip install ./wheelhouse/nodai*
         # Validate the Models
-        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" tank/test_models.py |
+        pytest --ci --ci_sha=${SHORT_SHA} -k "not metal" |
         tail -n 1 |
         tee -a pytest_results.txt

-    - name: Upload Release Assets
-      if: ${{ matrix.backend == 'SHARK' }}
-      id: upload-release-assets
-      uses: dwenegar/upload-release-assets@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      with:
-        release_id: ${{ steps.create_release.outputs.id }}
-        assets_path: ${GITHUB_WORKSPACE}/wheelhouse/nodai_*.whl
-
-    - name: Publish Release
-      if: ${{ matrix.backend == 'SHARK' }}
-      id: publish_release
-      uses: eregon/publish-release@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      with:
-        release_id: ${{ steps.create_release.outputs.id }}
```
.github/workflows/test-models.yml (44 changes)

```diff
@@ -6,10 +6,24 @@ name: Validate Models on Shark Runtime
 on:
   push:
     branches: [ main ]
+    paths-ignore:
+      - '**.md'
+      - 'shark/examples/**'
   pull_request:
     branches: [ main ]
+    paths-ignore:
+      - '**.md'
+      - 'shark/examples/**'
   workflow_dispatch:

+# Ensure that only a single job or workflow using the same
+# concurrency group will run at a time. This would cancel
+# any in-progress jobs in the same github workflow and github
+# ref (e.g. refs/heads/main or refs/pull/<pr_number>/merge).
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true

 jobs:
   build-validate:
     strategy:
@@ -32,8 +46,6 @@ jobs:
           suite: cuda
         - os: MacStudio
           suite: cpu
-        - os: MacStudio
-          suite: vulkan
         - os: icelake
           suite: vulkan
         - os: icelake
@@ -88,9 +100,9 @@ jobs:
       if: matrix.suite == 'cpu'
       run: |
         cd $GITHUB_WORKSPACE
-        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
+        PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
         source shark.venv/bin/activate
-        pytest --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k cpu
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="${GITHUB_WORKSPACE}/shark_tmp/shark_cache" -k cpu
         gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
         gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv

@@ -100,14 +112,28 @@ jobs:
         cd $GITHUB_WORKSPACE
         PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
         source shark.venv/bin/activate
-        pytest --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k cuda
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="${GITHUB_WORKSPACE}/shark_tmp/shark_cache" -k cuda
         gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
         gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
+        # Disabled due to black image bug
+        # python build_tools/stable_diffusion_testing.py --device=cuda

-    - name: Validate Vulkan Models
-      if: matrix.suite == 'vulkan'
+    - name: Validate Vulkan Models (MacOS)
+      if: matrix.suite == 'vulkan' && matrix.os == 'MacStudio'
       run: |
         cd $GITHUB_WORKSPACE
-        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
+        PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
         source shark.venv/bin/activate
-        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k vulkan
+        export DYLD_LIBRARY_PATH=/usr/local/lib/
+        echo $PATH
+        pip list | grep -E "torch|iree"
+        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" -k vulkan --update_tank

+    - name: Validate Vulkan Models (a100)
+      if: matrix.suite == 'vulkan' && matrix.os != 'MacStudio'
+      run: |
+        cd $GITHUB_WORKSPACE
+        PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
+        source shark.venv/bin/activate
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="${GITHUB_WORKSPACE}/shark_tmp/shark_cache" -k vulkan
+        python build_tools/stable_diffusion_testing.py --device=vulkan
```
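For reference, a rough local equivalent of the CI's CPU invocation, assuming the SHARK repo's `conftest.py` defines the custom `--benchmark`, `--ci`, `--ci_sha`, and `--local_tank_cache` options (they are project-specific, not stock pytest flags) and that the `pytest-forked` plugin is installed:

```python
# Hypothetical local run of the CPU suite; the custom options below come
# from SHARK's own conftest.py and will fail on a stock pytest install.
import pytest

exit_code = pytest.main([
    "--forked",                                        # pytest-forked plugin
    "--benchmark",
    "--local_tank_cache", "./shark_tmp/shark_cache",
    "-k", "cpu",
    "tank/test_models.py",
])
print("pytest exit code:", exit_code)
```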
.gitignore (8 changes)

```diff
@@ -31,7 +31,6 @@ MANIFEST
 # Usually these files are written by a python script from a template
 # before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
-*.spec

 # Installer logs
 pip-log.txt
@@ -163,7 +162,14 @@ cython_debug/
 # Shark related artefacts
 *venv/
 shark_tmp/
 *.vmfb
 .use-iree
 tank/dict_configs.py
+
+# ORT related artefacts
+cache_models/
+onnx_models/
+
+#web logging
+web/logs/
+web/stored_results/stable_diffusion/
```
.gitmodules (new file, 4 lines)

```ini
[submodule "inference/thirdparty/shark-runtime"]
	path = inference/thirdparty/shark-runtime
	url = https://github.com/nod-ai/SHARK-Runtime.git
	branch = shark-06032022
```
.style.yapf (new file, 3 lines)

```ini
[style]
based_on_style = google
column_limit = 80
```
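With this file checked in, yapf picks the style up automatically; it can also be applied programmatically through yapf's `FormatCode` API, as in this small sketch:

```python
# Apply the repo's checked-in yapf style to a code string.
from yapf.yapflib.yapf_api import FormatCode

messy = "def f( a,b ):\n  return a+ b\n"
formatted, changed = FormatCode(messy, style_config=".style.yapf")
print(formatted)
print("changed:", changed)
```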
LICENSE (new file, 218 lines)

```text
                              Apache License
                        Version 2.0, January 2004
                     http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

   "License" shall mean the terms and conditions for use, reproduction,
   and distribution as defined by Sections 1 through 9 of this document.

   "Licensor" shall mean the copyright owner or entity authorized by
   the copyright owner that is granting the License.

   "Legal Entity" shall mean the union of the acting entity and all
   other entities that control, are controlled by, or are under common
   control with that entity. For the purposes of this definition,
   "control" means (i) the power, direct or indirect, to cause the
   direction or management of such entity, whether by contract or
   otherwise, or (ii) ownership of fifty percent (50%) or more of the
   outstanding shares, or (iii) beneficial ownership of such entity.

   "You" (or "Your") shall mean an individual or Legal Entity
   exercising permissions granted by this License.

   "Source" form shall mean the preferred form for making modifications,
   including but not limited to software source code, documentation
   source, and configuration files.

   "Object" form shall mean any form resulting from mechanical
   transformation or translation of a Source form, including but
   not limited to compiled object code, generated documentation,
   and conversions to other media types.

   "Work" shall mean the work of authorship, whether in Source or
   Object form, made available under the License, as indicated by a
   copyright notice that is included in or attached to the work
   (an example is provided in the Appendix below).

   "Derivative Works" shall mean any work, whether in Source or Object
   form, that is based on (or derived from) the Work and for which the
   editorial revisions, annotations, elaborations, or other modifications
   represent, as a whole, an original work of authorship. For the purposes
   of this License, Derivative Works shall not include works that remain
   separable from, or merely link (or bind by name) to the interfaces of,
   the Work and Derivative Works thereof.

   "Contribution" shall mean any work of authorship, including
   the original version of the Work and any modifications or additions
   to that Work or Derivative Works thereof, that is intentionally
   submitted to Licensor for inclusion in the Work by the copyright owner
   or by an individual or Legal Entity authorized to submit on behalf of
   the copyright owner. For the purposes of this definition, "submitted"
   means any form of electronic, verbal, or written communication sent
   to the Licensor or its representatives, including but not limited to
   communication on electronic mailing lists, source code control systems,
   and issue tracking systems that are managed by, or on behalf of, the
   Licensor for the purpose of discussing and improving the Work, but
   excluding communication that is conspicuously marked or otherwise
   designated in writing by the copyright owner as "Not a Contribution."

   "Contributor" shall mean Licensor and any individual or Legal Entity
   on behalf of whom a Contribution has been received by Licensor and
   subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
   this License, each Contributor hereby grants to You a perpetual,
   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
   copyright license to reproduce, prepare Derivative Works of,
   publicly display, publicly perform, sublicense, and distribute the
   Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
   this License, each Contributor hereby grants to You a perpetual,
   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
   (except as stated in this section) patent license to make, have made,
   use, offer to sell, sell, import, and otherwise transfer the Work,
   where such license applies only to those patent claims licensable
   by such Contributor that are necessarily infringed by their
   Contribution(s) alone or by combination of their Contribution(s)
   with the Work to which such Contribution(s) was submitted. If You
   institute patent litigation against any entity (including a
   cross-claim or counterclaim in a lawsuit) alleging that the Work
   or a Contribution incorporated within the Work constitutes direct
   or contributory patent infringement, then any patent licenses
   granted to You under this License for that Work shall terminate
   as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
   Work or Derivative Works thereof in any medium, with or without
   modifications, and in Source or Object form, provided that You
   meet the following conditions:

   (a) You must give any other recipients of the Work or
       Derivative Works a copy of this License; and

   (b) You must cause any modified files to carry prominent notices
       stating that You changed the files; and

   (c) You must retain, in the Source form of any Derivative Works
       that You distribute, all copyright, patent, trademark, and
       attribution notices from the Source form of the Work,
       excluding those notices that do not pertain to any part of
       the Derivative Works; and

   (d) If the Work includes a "NOTICE" text file as part of its
       distribution, then any Derivative Works that You distribute must
       include a readable copy of the attribution notices contained
       within such NOTICE file, excluding those notices that do not
       pertain to any part of the Derivative Works, in at least one
       of the following places: within a NOTICE text file distributed
       as part of the Derivative Works; within the Source form or
       documentation, if provided along with the Derivative Works; or,
       within a display generated by the Derivative Works, if and
       wherever such third-party notices normally appear. The contents
       of the NOTICE file are for informational purposes only and
       do not modify the License. You may add Your own attribution
       notices within Derivative Works that You distribute, alongside
       or as an addendum to the NOTICE text from the Work, provided
       that such additional attribution notices cannot be construed
       as modifying the License.

   You may add Your own copyright statement to Your modifications and
   may provide additional or different license terms and conditions
   for use, reproduction, or distribution of Your modifications, or
   for any such Derivative Works as a whole, provided Your use,
   reproduction, and distribution of the Work otherwise complies with
   the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
   any Contribution intentionally submitted for inclusion in the Work
   by You to the Licensor shall be under the terms and conditions of
   this License, without any additional terms or conditions.
   Notwithstanding the above, nothing herein shall supersede or modify
   the terms of any separate license agreement you may have executed
   with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
   names, trademarks, service marks, or product names of the Licensor,
   except as required for reasonable and customary use in describing the
   origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
   agreed to in writing, Licensor provides the Work (and each
   Contributor provides its Contributions) on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
   implied, including, without limitation, any warranties or conditions
   of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
   PARTICULAR PURPOSE. You are solely responsible for determining the
   appropriateness of using or redistributing the Work and assume any
   risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
   whether in tort (including negligence), contract, or otherwise,
   unless required by applicable law (such as deliberate and grossly
   negligent acts) or agreed to in writing, shall any Contributor be
   liable to You for damages, including any direct, indirect, special,
   incidental, or consequential damages of any character arising as a
   result of this License or out of the use or inability to use the
   Work (including but not limited to damages for loss of goodwill,
   work stoppage, computer failure or malfunction, or any and all
   other commercial damages or losses), even if such Contributor
   has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
   the Work or Derivative Works thereof, You may choose to offer,
   and charge a fee for, acceptance of support, warranty, indemnity,
   or other liability obligations and/or rights consistent with this
   License. However, in accepting such obligations, You may act only
   on Your own behalf and on Your sole responsibility, not on behalf
   of any other Contributor, and only if You agree to indemnify,
   defend, and hold each Contributor harmless for any liability
   incurred by, or claims asserted against, such Contributor by reason
   of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

   To apply the Apache License to your work, attach the following
   boilerplate notice, with the fields enclosed by brackets "[]"
   replaced with your own identifying information. (Don't include
   the brackets!) The text should be enclosed in the appropriate
   comment syntax for the file format. We also recommend that a
   file or class name and description of purpose be included on the
   same "printed page" as the copyright notice for easier
   identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


---- LLVM Exceptions to the Apache 2.0 License ----

As an exception, if, as a result of your compiling your source code, portions
of this Software are embedded into an Object form of such source code, you
may redistribute such embedded portions in such Object form without complying
with the conditions of Sections 4(a), 4(b) and 4(d) of the License.

In addition, if you combine or link compiled forms of this Software with
software that is licensed under the GPLv2 ("Combined Software") and if a
court of competent jurisdiction determines that the patent provision (Section
3), the indemnity provision (Section 9) or other Section of the License
conflicts with the conditions of the GPLv2, you may retroactively and
prospectively choose to deem waived or otherwise exclude such Section(s) of
the License, but only in their entirety and only with respect to the Combined
Software.
```
README.md (new file, 332 lines)

# SHARK

High Performance Machine Learning and Data Analytics for CPUs, GPUs, Accelerators and Heterogeneous Clusters

[Nightly Release](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml)
[Validate torch-models on Shark Runtime](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml)

## Installation (Windows, Linux and macOS)

## Check out the code

```shell
git clone https://github.com/nod-ai/SHARK.git
cd SHARK
```

## Set up your Python virtual environment and dependencies

### Windows 10/11 Users

* Install the latest Python 3.10.x version from [here](https://www.python.org/downloads/windows/)

* Install Git for Windows from [here](https://git-scm.com/download/win)

#### Allow the install script to run in Powershell
```powershell
set-executionpolicy remotesigned
```

#### Set up the venv and install necessary packages (torch-mlir, nodLabs/Shark, ...)
```powershell
./setup_venv.ps1 # You can re-run this script to get the latest version
```

### Linux / macOS Users

```shell
./setup_venv.sh
source shark.venv/bin/activate
```

### Run Stable Diffusion on your device - WebUI

#### Windows 10/11 Users
```powershell
(shark.venv) PS C:\Users\nod\SHARK> cd web
(shark.venv) PS C:\Users\nod\SHARK\web> python index.py
```
#### Linux Users
```shell
(shark.venv) > cd web
(shark.venv) > python index.py
```

#### Access Stable Diffusion on http://localhost:8080/?__theme=dark

<img width="1607" alt="webui" src="https://user-images.githubusercontent.com/74956/204939260-b8308bc2-8dc4-47f6-9ac0-f60b66edab99.png">

### Run Stable Diffusion on your device - Commandline

#### Install your hardware drivers
* [AMD RDNA Users] Download the latest driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mril-iree)
* [macOS Users] Download and install the 1.3.216 Vulkan SDK from [here](https://sdk.lunarg.com/sdk/download/1.3.216.0/mac/vulkansdk-macos-1.3.216.0.dmg). Newer versions of the SDK will not work.
* [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)

All other users: ensure you have the latest vendor drivers and the Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home), and if you are using Vulkan, check that `vulkaninfo` works in a terminal window.

#### Windows 10/11 Users
```powershell
(shark.venv) PS C:\g\shark> python .\shark\examples\shark_inference\stable_diffusion\main.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
```

#### Linux / macOS Users
```shell
python3.10 shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
```

You can replace `vulkan` with `cpu` to run on your CPU, or with `cuda` to run on CUDA devices. If you have multiple Vulkan devices, you can address them with `--device=vulkan://1`, etc.

The output on a 7900XTX would look like:

```shell
Stats for run 0:
Average step time: 47.19188690185547ms/it
Clip Inference time (ms) = 109.531
VAE Inference time (ms): 78.590

Total image generation time: 2.5788655281066895sec
```

Here are some samples generated:





For more options for the Stable Diffusion model, read [this](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md).

Find us on the [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble running it on your hardware.


<details>
<summary>Binary Installation</summary>

### Set up a new pip virtual environment

This step sets up a new virtual environment for Python.

```shell
python --version # Check you have 3.10 on Linux, macOS or Windows Powershell
python -m venv shark_venv
source shark_venv/bin/activate # Use shark_venv/Scripts/activate on Windows

# If you are using conda, create and activate a new conda env instead

# Some older pip installs may not be able to handle the recent PyTorch deps
python -m pip install --upgrade pip
```

*macOS Metal* users, please install https://sdk.lunarg.com/sdk/download/latest/mac/vulkan-sdk.dmg and enable "System wide install".

### Install SHARK

This step pip-installs SHARK and related packages on Linux (Python 3.7, 3.8, 3.9 or 3.10) and macOS (Python 3.10).

```shell
pip install nodai-shark -f https://nod-ai.github.io/SHARK/package-index/ -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
```

### Run SHARK tank model tests
```shell
pytest tank/test_models.py
```
See tank/README.md for a more detailed walkthrough of our pytest suite and CLI.

### Download and run the ResNet50 sample

```shell
curl -O https://raw.githubusercontent.com/nod-ai/SHARK/main/shark/examples/shark_inference/resnet50_script.py
# Install deps for the test script
pip install --pre torch torchvision torchaudio tqdm pillow gsutil --extra-index-url https://download.pytorch.org/whl/nightly/cpu
python ./resnet50_script.py --device="cpu" # use cuda, vulkan or metal
```

### Download and run the BERT (MiniLM) sample
```shell
curl -O https://raw.githubusercontent.com/nod-ai/SHARK/main/shark/examples/shark_inference/minilm_jit.py
# Install deps for the test script
pip install transformers torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu
python ./minilm_jit.py --device="cpu" # use cuda, vulkan or metal
```
</details>


<details>
<summary>Development, Testing and Benchmarks</summary>

If you want to use Python 3.10 with the TF import tools, you can set environment variables like the following.
Set `USE_IREE=1` to use upstream IREE:
```
# PYTHON=python3.10 VENV_DIR=0617_venv IMPORTER=1 ./setup_venv.sh
```

### Run any of the hundreds of SHARK tank models via the test framework
```shell
python -m shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
# Or a pytest
pytest tank/test_models.py -k "MiniLM"
```

If you are a *Torch-MLIR developer or an IREE developer* and want to test local changes, you can uninstall
the provided packages with `pip uninstall torch-mlir` and / or `pip uninstall iree-compiler iree-runtime`, build locally
with Python bindings, and set your PYTHONPATH as mentioned [here](https://github.com/iree-org/iree/tree/main/docs/api_docs/python#install-iree-binaries)
for IREE and [here](https://github.com/llvm/torch-mlir/blob/main/development.md#setup-python-environment-to-export-the-built-python-packages)
for Torch-MLIR.

### How to use your locally built Torch-MLIR with SHARK

1. Run `./setup_venv.sh` in SHARK and activate the `shark.venv` virtual environment.
2. Run `pip uninstall torch-mlir`.
3. Go to your local Torch-MLIR directory.
4. Activate the `mlir_venv` virtual environment.
5. Run `pip uninstall -r requirements.txt`.
6. Run `pip install -r requirements.txt`.
7. Build Torch-MLIR.
8. Activate the `shark.venv` virtual environment from the Torch-MLIR directory.
9. Run ``export PYTHONPATH=`pwd`/build/tools/torch-mlir/python_packages/torch_mlir:`pwd`/examples`` in the Torch-MLIR directory.
10. Go to the SHARK directory.

Now SHARK will use your locally built Torch-MLIR repo.
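A quick way to confirm that the locally built Torch-MLIR is the one being imported, rather than a leftover pip install:

```python
# If PYTHONPATH is set correctly, this prints a path under your Torch-MLIR
# build tree (…/build/tools/torch-mlir/python_packages/torch_mlir/…),
# not a site-packages path.
import torch_mlir

print(torch_mlir.__file__)
```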

## Benchmarking Dispatches

To produce benchmarks of individual dispatches, you can add `--dispatch_benchmarks=All --dispatch_benchmarks_dir=<output_dir>` to your command-line arguments.
If you only want to compile specific dispatches, you can specify them with a space-separated string instead of `"All"`, e.g. `--dispatch_benchmarks="0 1 2 10"`.

If you want to incorporate this into a Python script instead, you can pass the `dispatch_benchmarks` and `dispatch_benchmarks_dir` arguments when initializing `SharkInference`, and the benchmarks will be generated when the module is compiled, e.g.:

```python
shark_module = SharkInference(
    mlir_model,
    func_name,
    device=args.device,
    mlir_dialect="tm_tensor",
    dispatch_benchmarks="all",
    dispatch_benchmarks_dir="results"
)
```

Output will include:
- An ordered list, `ordered-dispatches.txt`, of all the dispatches with their runtimes
- Inside the specified directory, a directory for each dispatch (there will be .mlir files for all dispatches, but compiled binaries and benchmark data only for the specified dispatches), containing:
  - An .mlir file containing the dispatch benchmark
  - A compiled .vmfb file containing the dispatch benchmark
  - An .mlir file containing just the HAL executable
  - A compiled .vmfb file of the HAL executable
  - A .txt file containing benchmark output

See tank/README.md for instructions on how to run model tests and benchmarks from the SHARK tank.

</details>

<details>
<summary>API Reference</summary>

### Shark Inference API

```python
from shark.shark_importer import SharkImporter

# SharkImporter imports an MLIR file from a torch, tensorflow or tf-lite module.
mlir_importer = SharkImporter(
    torch_module,
    (input),
    frontend="torch",  # or "tf", "tf-lite"
)
torch_mlir, func_name = mlir_importer.import_mlir(tracing_required=True)

# SharkInference accepts MLIR in the linalg, mhlo, and tosa dialects.
from shark.shark_inference import SharkInference
shark_module = SharkInference(torch_mlir, func_name, device="cpu", mlir_dialect="linalg")
shark_module.compile()
result = shark_module.forward((input))
```

### Example demonstrating running MHLO IR

```python
from shark.shark_inference import SharkInference
import numpy as np

mhlo_ir = r"""builtin.module {
      func.func @forward(%arg0: tensor<1x4xf32>, %arg1: tensor<4x1xf32>) -> tensor<4x4xf32> {
        %0 = chlo.broadcast_add %arg0, %arg1 : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor<4x4xf32>
        %1 = "mhlo.abs"(%0) : (tensor<4x4xf32>) -> tensor<4x4xf32>
        return %1 : tensor<4x4xf32>
      }
}"""

arg0 = np.ones((1, 4)).astype(np.float32)
arg1 = np.ones((4, 1)).astype(np.float32)
shark_module = SharkInference(mhlo_ir, func_name="forward", device="cpu", mlir_dialect="mhlo")
shark_module.compile()
result = shark_module.forward((arg0, arg1))
```
</details>

## Supported and Validated Models

SHARK is maintained to support the latest innovations in ML models:

| TF HuggingFace Models | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------|----------|-------------|
| BERT | :green_heart: | :green_heart: | :green_heart: |
| DistilBERT | :green_heart: | :green_heart: | :green_heart: |
| GPT2 | :green_heart: | :green_heart: | :green_heart: |
| BLOOM | :green_heart: | :green_heart: | :green_heart: |
| Stable Diffusion | :green_heart: | :green_heart: | :green_heart: |
| Vision Transformer | :green_heart: | :green_heart: | :green_heart: |
| ResNet50 | :green_heart: | :green_heart: | :green_heart: |

For a complete list of the models supported in SHARK, please refer to [tank/README.md](https://github.com/nod-ai/SHARK/blob/main/tank/README.md).

## Communication Channels

* [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real-time discussions with the SHARK team and other users
* [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bugs, etc.

## Related Projects

<details>
<summary>IREE Project Channels</summary>

* [Upstream IREE issues](https://github.com/google/iree/issues): Feature requests,
  bugs, and other work tracking
* [Upstream IREE Discord server](https://discord.gg/26P4xW4): Daily development
  discussions with the core team and collaborators
* [iree-discuss email list](https://groups.google.com/forum/#!forum/iree-discuss):
  Announcements, general and low-priority discussion
</details>

<details>
<summary>MLIR and Torch-MLIR Project Channels</summary>

* `#torch-mlir` channel on the LLVM [Discord](https://discord.gg/xS7Z362) - this is the most active communication channel
* Torch-MLIR GitHub issues [here](https://github.com/llvm/torch-mlir/issues)
* [`torch-mlir` section](https://llvm.discourse.group/c/projects-that-want-to-become-official-llvm-projects/torch-mlir/41) of LLVM Discourse
* Weekly meetings on Mondays, 9AM PST. See [here](https://discourse.llvm.org/t/community-meeting-developer-hour-refactoring-recurring-meetings/62575) for more information.
* [MLIR topic within LLVM Discourse](https://llvm.discourse.group/c/llvm-project/mlir/31). SHARK and IREE are enabled by, and rely heavily on, [MLIR](https://mlir.llvm.org).
</details>

## License

nod.ai SHARK is licensed under the terms of the Apache 2.0 License with LLVM Exceptions.
See [LICENSE](LICENSE) for more information.
apps/__init__.py (new file, empty)

apps/stable_diffusion/__init__.py (new file, empty)

apps/stable_diffusion/scripts/__init__.py (new file, 1 line)

```python
from apps.stable_diffusion.scripts.txt2img import txt2img_inf
```

apps/stable_diffusion/scripts/img2img.py (new file, empty)
apps/stable_diffusion/scripts/txt2img.py (new file, 274 lines)

```python
import os

os.environ["AMD_ENABLE_LLPC"] = "1"

import json
import torch
import re
import time
from pathlib import Path
from PIL import PngImagePlugin
from datetime import datetime as dt
from dataclasses import dataclass
from csv import DictWriter
from apps.stable_diffusion.src import (
    args,
    Text2ImagePipeline,
    get_schedulers,
    set_init_device_flags,
)


@dataclass
class Config:
    model_id: str
    ckpt_loc: str
    precision: str
    batch_size: int
    max_length: int
    height: int
    width: int
    device: str


# This has to come before importing cache objects
if args.clear_all:
    print("CLEARING ALL, EXPECT SEVERAL MINUTES TO RECOMPILE")
    from glob import glob
    import shutil

    vmfbs = glob(os.path.join(os.getcwd(), "*.vmfb"))
    for vmfb in vmfbs:
        if os.path.exists(vmfb):
            os.remove(vmfb)
    home = os.path.expanduser("~")
    if os.name == "nt":  # Windows
        appdata = os.getenv("LOCALAPPDATA")
        shutil.rmtree(os.path.join(appdata, "AMD/VkCache"), ignore_errors=True)
        shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
    elif os.name == "unix":
        shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"))
        shutil.rmtree(os.path.join(home, ".local/shark_tank"))
```
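One caveat in the cleanup branch above: `os.name` is `"nt"` on Windows and `"posix"` on Linux and macOS; it is never `"unix"`, so the `elif` branch is dead code as written. A sketch of the likely intent:

```python
# os.name is "nt" on Windows and "posix" on Linux/macOS -- never "unix" --
# so checking for "posix" makes the non-Windows cleanup actually run.
import os
import shutil

home = os.path.expanduser("~")
if os.name == "nt":
    appdata = os.getenv("LOCALAPPDATA")
    shutil.rmtree(os.path.join(appdata, "AMD/VkCache"), ignore_errors=True)
    shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
elif os.name == "posix":
    shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"), ignore_errors=True)
    shutil.rmtree(os.path.join(home, ".local/shark_tank"), ignore_errors=True)
```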
```python
# Save output images and the inputs corresponding to them.
def save_output_img(output_img):
    output_path = args.output_dir if args.output_dir else Path.cwd()
    generated_imgs_path = Path(output_path, "generated_imgs")
    generated_imgs_path.mkdir(parents=True, exist_ok=True)
    csv_path = Path(generated_imgs_path, "imgs_details.csv")

    prompt_slice = re.sub("[^a-zA-Z0-9]", "_", args.prompts[0][:15])
    out_img_name = (
        f"{prompt_slice}_{args.seed}_{dt.now().strftime('%y%m%d_%H%M%S')}"
    )
    out_img_path = Path(generated_imgs_path, f"{out_img_name}.jpg")

    if args.output_img_format == "jpg":
        out_img_path = Path(generated_imgs_path, f"{out_img_name}.jpg")
        output_img.save(out_img_path, quality=95, subsampling=0)
    else:
        out_img_path = Path(generated_imgs_path, f"{out_img_name}.png")
        pngInfo = PngImagePlugin.PngInfo()

        if args.write_metadata_to_png:
            pngInfo.add_text(
                "parameters",
                f"{args.prompts[0]}\nNegative prompt: {args.negative_prompts[0]}\nSteps:{args.steps}, Sampler: {args.scheduler}, CFG scale: {args.guidance_scale}, Seed: {args.seed}, Size: {args.width}x{args.height}, Model: {args.hf_model_id}",
            )

        output_img.save(
            output_path / f"{out_img_name}.png", "PNG", pnginfo=pngInfo
        )

        if args.output_img_format not in ["png", "jpg"]:
            print(
                f"[ERROR] Format {args.output_img_format} is not supported yet. "
                "Image saved as png instead. Supported formats: png / jpg"
            )

    new_entry = {
        "VARIANT": args.hf_model_id,
        "SCHEDULER": args.scheduler,
        "PROMPT": args.prompts[0],
        "NEG_PROMPT": args.negative_prompts[0],
        "SEED": args.seed,
        "CFG_SCALE": args.guidance_scale,
        "PRECISION": args.precision,
        "STEPS": args.steps,
        "HEIGHT": args.height,
        "WIDTH": args.width,
        "MAX_LENGTH": args.max_length,
        "OUTPUT": out_img_path,
    }

    with open(csv_path, "a") as csv_obj:
        dictwriter_obj = DictWriter(csv_obj, fieldnames=list(new_entry.keys()))
        dictwriter_obj.writerow(new_entry)
        csv_obj.close()

    if args.save_metadata_to_json:
        del new_entry["OUTPUT"]
        with open(f"{output_path}/{out_img_name}.json", "w") as f:
            json.dump(new_entry, f, indent=4)
```
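`save_output_img()` appends rows with `DictWriter` but never emits a header, so `imgs_details.csv` ends up without column names. A hedged sketch of one way to write the header only when the file is first created; `append_row` is an illustrative helper, not part of the repo:

```python
# Illustrative helper: emit the CSV header once, then append rows.
import os
from csv import DictWriter

def append_row(csv_path, row: dict):
    write_header = (
        not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    )
    with open(csv_path, "a", newline="") as f:
        writer = DictWriter(f, fieldnames=list(row.keys()))
        if write_header:
            writer.writeheader()
        writer.writerow(row)
```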
|
||||
txt2img_obj = None
|
||||
config_obj = None
|
||||
schedulers = None
|
||||
|
||||
|
||||
# Exposed to UI.
|
||||
def txt2img_inf(
|
||||
prompt: str,
|
||||
negative_prompt: str,
|
||||
height: int,
|
||||
width: int,
|
||||
steps: int,
|
||||
guidance_scale: float,
|
||||
seed: int,
|
||||
batch_size: int,
|
||||
scheduler: str,
|
||||
model_id: str,
|
||||
custom_model_id: str,
|
||||
ckpt_file_obj,
|
||||
precision: str,
|
||||
device: str,
|
||||
max_length: int,
|
||||
save_metadata_to_json: bool,
|
||||
save_metadata_to_png: bool,
|
||||
):
|
||||
global txt2img_obj
|
||||
global config_obj
|
||||
global schedulers
|
||||
|
||||
args.prompts = [prompt]
|
||||
args.negative_prompts = [negative_prompt]
|
||||
args.guidance_scale = guidance_scale
|
||||
args.seed = seed
|
||||
args.steps = steps
|
||||
args.scheduler = scheduler
|
||||
args.hf_model_id = custom_model_id if custom_model_id else model_id
|
||||
args.ckpt_loc = ckpt_file_obj.name if ckpt_file_obj else ""
|
||||
args.save_metadata_to_json = save_metadata_to_json
|
||||
args.write_metadata_to_png = save_metadata_to_png
|
||||
dtype = torch.float32 if precision == "fp32" else torch.half
|
||||
cpu_scheduling = not scheduler.startswith("Shark")
|
||||
new_config_obj = Config(
|
||||
args.hf_model_id,
|
||||
args.ckpt_loc,
|
||||
precision,
|
||||
batch_size,
|
||||
max_length,
|
||||
height,
|
||||
width,
|
||||
device,
|
||||
)
|
||||
if config_obj != new_config_obj:
|
||||
config_obj = new_config_obj
|
||||
args.precision = precision
|
||||
args.batch_size = batch_size
|
||||
args.max_length = max_length
|
||||
args.height = height
|
||||
args.width = width
|
||||
args.device = device.split("=>", 1)[1].strip()
|
||||
args.use_tuned = True
|
||||
args.import_mlir = False
|
||||
set_init_device_flags()
|
||||
schedulers = get_schedulers(model_id)
|
||||
scheduler_obj = schedulers[scheduler]
|
||||
txt2img_obj = Text2ImagePipeline.from_pretrained(
|
||||
scheduler_obj,
|
||||
args.import_mlir,
|
||||
args.hf_model_id,
|
||||
args.ckpt_loc,
|
||||
args.precision,
|
||||
args.max_length,
|
||||
args.batch_size,
|
||||
args.height,
|
||||
args.width,
|
||||
args.use_base_vae,
|
||||
)
|
||||
txt2img_obj.scheduler = schedulers[scheduler]
|
||||
|
||||
start_time = time.time()
|
||||
txt2img_obj.log = ""
|
||||
generated_imgs = txt2img_obj.generate_images(
|
||||
prompt,
|
||||
negative_prompt,
|
||||
batch_size,
|
||||
height,
|
||||
width,
|
||||
steps,
|
||||
guidance_scale,
|
||||
seed,
|
||||
args.max_length,
|
||||
dtype,
|
||||
args.use_base_vae,
|
||||
cpu_scheduling,
|
||||
)
|
||||
total_time = time.time() - start_time
|
||||
save_output_img(generated_imgs[0])
|
||||
text_output = f"prompt={args.prompts}"
|
||||
text_output += f"\nnegative prompt={args.negative_prompts}"
|
||||
text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
|
||||
text_output += f"\nscheduler={args.scheduler}, device={device}"
|
||||
text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={args.seed}, size={args.height}x{args.width}"
|
||||
text_output += (
|
||||
f", batch size={args.batch_size}, max_length={args.max_length}"
|
||||
)
|
||||
text_output += txt2img_obj.log
|
||||
text_output += f"\nTotal image generation time: {total_time:.4f}sec"
|
||||
|
||||
return generated_imgs, text_output
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
dtype = torch.float32 if args.precision == "fp32" else torch.half
|
||||
cpu_scheduling = not args.scheduler.startswith("Shark")
|
||||
set_init_device_flags()
|
||||
schedulers = get_schedulers(args.hf_model_id)
|
||||
scheduler_obj = schedulers[args.scheduler]
|
||||
|
||||
txt2img_obj = Text2ImagePipeline.from_pretrained(
|
||||
scheduler_obj,
|
||||
args.import_mlir,
|
||||
args.hf_model_id,
|
||||
args.ckpt_loc,
|
||||
args.precision,
|
||||
args.max_length,
|
||||
args.batch_size,
|
||||
args.height,
|
||||
args.width,
|
||||
args.use_base_vae,
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
generated_imgs = txt2img_obj.generate_images(
|
||||
args.prompts,
|
||||
args.negative_prompts,
|
||||
args.batch_size,
|
||||
args.height,
|
||||
args.width,
|
||||
args.steps,
|
||||
args.guidance_scale,
|
||||
args.seed,
|
||||
args.max_length,
|
||||
dtype,
|
||||
args.use_base_vae,
|
||||
cpu_scheduling,
|
||||
)
|
||||
total_time = time.time() - start_time
|
||||
text_output = f"prompt={args.prompts}"
|
||||
text_output += f"\nnegative prompt={args.negative_prompts}"
|
||||
text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
|
||||
text_output += f"\nscheduler={args.scheduler}, device={args.device}"
|
||||
text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={args.seed}, size={args.height}x{args.width}"
|
||||
text_output += (
|
||||
f", batch size={args.batch_size}, max_length={args.max_length}"
|
||||
)
|
||||
text_output += txt2img_obj.log
|
||||
text_output += f"\nTotal image generation time: {total_time:.4f}sec"
|
||||
|
||||
save_output_img(generated_imgs[0])
|
||||
print(text_output)
|
||||
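Together with the argument definitions in src/utils/stable_args.py later in this diff, the `__main__` block above makes the script a self-contained CLI. A minimal invocation sketch, presumably run from apps/stable_diffusion (the script path scripts/txt2img.py is inferred from the Analysis entry in shark_sd_cli.spec below; all flag values are illustrative):

python scripts/txt2img.py \
    -p "a photo of an astronaut riding a horse" \
    --steps 50 --seed 42 --precision fp16 \
    --device vulkan --scheduler SharkEulerDiscrete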
apps/stable_diffusion/shark_sd.spec (new file, 78 lines)
@@ -0,0 +1,78 @@
# -*- mode: python ; coding: utf-8 -*-
from PyInstaller.utils.hooks import collect_data_files
from PyInstaller.utils.hooks import copy_metadata

import sys ; sys.setrecursionlimit(sys.getrecursionlimit() * 5)

datas = []
datas += collect_data_files('torch')
datas += copy_metadata('torch')
datas += copy_metadata('tqdm')
datas += copy_metadata('regex')
datas += copy_metadata('requests')
datas += copy_metadata('packaging')
datas += copy_metadata('filelock')
datas += copy_metadata('numpy')
datas += copy_metadata('tokenizers')
datas += copy_metadata('importlib_metadata')
datas += copy_metadata('torchvision')
datas += copy_metadata('torch-mlir')
datas += copy_metadata('diffusers')
datas += copy_metadata('transformers')
datas += copy_metadata('omegaconf')
datas += copy_metadata('safetensors')
datas += collect_data_files('gradio')
datas += collect_data_files('iree')
datas += collect_data_files('google-cloud-storage')
datas += collect_data_files('shark')
datas += [
    ( 'src/utils/resources/prompts.json', 'resources' ),
    ( 'src/utils/resources/model_db.json', 'resources' ),
    ( 'src/utils/resources/opt_flags.json', 'resources' ),
    ( 'src/utils/resources/base_model.json', 'resources' ),
    ( 'web/logos/*', 'logos' )
]

binaries = []

block_cipher = None


a = Analysis(
    ['web/index.py'],
    pathex=['.'],
    binaries=binaries,
    datas=datas,
    hiddenimports=['shark', 'shark.*', 'shark.shark_inference', 'shark_inference', 'iree.tools.core', 'gradio', 'apps'],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    win_no_prefer_redirects=False,
    win_private_assemblies=False,
    cipher=block_cipher,
    noarchive=False,
)
pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)

exe = EXE(
    pyz,
    a.scripts,
    a.binaries,
    a.zipfiles,
    a.datas,
    [],
    name='shark_sd',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    upx_exclude=[],
    runtime_tmpdir=None,
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)
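A spec like this is built with a plain PyInstaller invocation, e.g. `pyinstaller shark_sd.spec`; since pathex is '.' and the datas entries use paths such as src/utils/resources/... and web/logos/*, it is presumably meant to be run from within apps/stable_diffusion.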
apps/stable_diffusion/shark_sd_cli.spec (new file, 77 lines)
@@ -0,0 +1,77 @@
# -*- mode: python ; coding: utf-8 -*-
from PyInstaller.utils.hooks import collect_data_files
from PyInstaller.utils.hooks import copy_metadata

import sys ; sys.setrecursionlimit(sys.getrecursionlimit() * 5)

datas = []
datas += collect_data_files('torch')
datas += copy_metadata('torch')
datas += copy_metadata('tqdm')
datas += copy_metadata('regex')
datas += copy_metadata('requests')
datas += copy_metadata('packaging')
datas += copy_metadata('filelock')
datas += copy_metadata('numpy')
datas += copy_metadata('tokenizers')
datas += copy_metadata('importlib_metadata')
datas += copy_metadata('torchvision')
datas += copy_metadata('torch-mlir')
datas += copy_metadata('diffusers')
datas += copy_metadata('transformers')
datas += copy_metadata('omegaconf')
datas += copy_metadata('safetensors')
datas += collect_data_files('gradio')
datas += collect_data_files('iree')
datas += collect_data_files('google-cloud-storage')
datas += collect_data_files('shark')
datas += [
    ( 'src/utils/resources/prompts.json', 'resources' ),
    ( 'src/utils/resources/model_db.json', 'resources' ),
    ( 'src/utils/resources/opt_flags.json', 'resources' ),
    ( 'src/utils/resources/base_model.json', 'resources' ),
]

binaries = []

block_cipher = None


a = Analysis(
    ['scripts/txt2img.py'],
    pathex=['.'],
    binaries=binaries,
    datas=datas,
    hiddenimports=['shark', 'shark.*', 'shark.shark_inference', 'shark_inference', 'iree.tools.core', 'gradio', 'apps'],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    win_no_prefer_redirects=False,
    win_private_assemblies=False,
    cipher=block_cipher,
    noarchive=False,
)
pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)

exe = EXE(
    pyz,
    a.scripts,
    a.binaries,
    a.zipfiles,
    a.datas,
    [],
    name='shark_sd_cli',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    upx_exclude=[],
    runtime_tmpdir=None,
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)
apps/stable_diffusion/src/__init__.py (new file, 8 lines)
@@ -0,0 +1,8 @@
from apps.stable_diffusion.src.utils import (
    args,
    set_init_device_flags,
    prompt_examples,
    get_available_devices,
)
from apps.stable_diffusion.src.pipelines import Text2ImagePipeline
from apps.stable_diffusion.src.schedulers import get_schedulers
apps/stable_diffusion/src/models/__init__.py (new file, 9 lines)
@@ -0,0 +1,9 @@
from apps.stable_diffusion.src.models.model_wrappers import (
    SharkifyStableDiffusionModel,
)
from apps.stable_diffusion.src.models.opt_params import (
    get_vae,
    get_unet,
    get_clip,
    get_tokenizer,
)
apps/stable_diffusion/src/models/model_wrappers.py (new file, 233 lines)
@@ -0,0 +1,233 @@
from diffusers import AutoencoderKL, UNet2DConditionModel
from transformers import CLIPTextModel
from collections import defaultdict
import torch
import sys
import traceback
import re
from apps.stable_diffusion.src.utils import (
    compile_through_fx,
    get_opt_flags,
    base_models,
    args,
)


# These shapes are parameter dependent.
def replace_shape_str(shape, max_len, width, height, batch_size):
    new_shape = []
    for i in range(len(shape)):
        if shape[i] == "max_len":
            new_shape.append(max_len)
        elif shape[i] == "height":
            new_shape.append(height)
        elif shape[i] == "width":
            new_shape.append(width)
        elif isinstance(shape[i], str):
            if "batch_size" in shape[i]:
                mul_val = int(shape[i].split("*")[0])
                new_shape.append(batch_size * mul_val)
        else:
            new_shape.append(shape[i])
    return new_shape
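A small worked example of the substitution performed by replace_shape_str (hypothetical call; the shape list mirrors the unet "latents" entry in resources/base_model.json shown later, and the 64s are latent-space sizes, i.e. image size // 8):

# Hypothetical usage: positional args are (shape, max_len, width, height, batch_size).
replace_shape_str(["1*batch_size", 4, "height", "width"], 64, 64, 64, 1)
# -> [1, 4, 64, 64]

Note the ordering of the branches: "1*batch_size" hits the string branch and multiplies, the bare int 4 falls through to the final else, and "height"/"width" are substituted directly.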
# Get the input info for various models i.e. "unet", "clip", "vae".
def get_input_info(model_info, max_len, width, height, batch_size):
    dtype_config = {"f32": torch.float32, "i64": torch.int64}
    input_map = defaultdict(list)
    for k in model_info:
        for inp in model_info[k]:
            shape = model_info[k][inp]["shape"]
            dtype = dtype_config[model_info[k][inp]["dtype"]]
            tensor = None
            if isinstance(shape, list):
                clean_shape = replace_shape_str(
                    shape, max_len, width, height, batch_size
                )
                if dtype == torch.int64:
                    tensor = torch.randint(1, 3, tuple(clean_shape))
                else:
                    tensor = torch.randn(*clean_shape).to(dtype)
            elif isinstance(shape, int):
                tensor = torch.tensor(shape).to(dtype)
            else:
                sys.exit("shape isn't specified correctly.")
            input_map[k].append(tensor)
    return input_map
class SharkifyStableDiffusionModel:
    def __init__(
        self,
        model_id: str,
        custom_weights: str,
        precision: str,
        max_len: int = 64,
        width: int = 512,
        height: int = 512,
        batch_size: int = 1,
        use_base_vae: bool = False,
    ):
        self.check_params(max_len, width, height)
        self.max_len = max_len
        self.height = height // 8
        self.width = width // 8
        self.batch_size = batch_size
        self.model_id = model_id if custom_weights == "" else custom_weights
        self.precision = precision
        self.base_vae = use_base_vae
        self.model_name = (
            str(batch_size)
            + "_"
            + str(max_len)
            + "_"
            + str(height)
            + "_"
            + str(width)
            + "_"
            + precision
        )
        # We need a better naming convention for the .vmfbs: despite using a
        # custom model variant, the .vmfb names remain the same, so the cached
        # compiled .vmfb always gets picked up instead of the custom model
        # being compiled. For now we append `self.model_id` to
        # `self.model_name` of the .vmfb file.
        # TODO: Have a better way of naming the vmfbs using self.model_name.

        model_name = re.sub(r"\W+", "_", self.model_id)
        if model_name[0] == "_":
            model_name = model_name[1:]
        self.model_name = self.model_name + "_" + model_name

    def check_params(self, max_len, width, height):
        if not (max_len >= 32 and max_len <= 77):
            sys.exit("please specify max_len in the range [32, 77].")
        if not (width % 8 == 0 and width >= 384):
            sys.exit("width should be at least 384 and a multiple of 8.")
        if not (height % 8 == 0 and height >= 384):
            sys.exit("height should be at least 384 and a multiple of 8.")

    def get_vae(self):
        class VaeModel(torch.nn.Module):
            def __init__(self, model_id=self.model_id, base_vae=self.base_vae):
                super().__init__()
                self.vae = AutoencoderKL.from_pretrained(
                    model_id,
                    subfolder="vae",
                )
                self.base_vae = base_vae

            def forward(self, input):
                if not self.base_vae:
                    input = 1 / 0.18215 * input
                x = self.vae.decode(input, return_dict=False)[0]
                x = (x / 2 + 0.5).clamp(0, 1)
                if self.base_vae:
                    return x
                x = x * 255.0
                return x.round()

        vae = VaeModel()
        inputs = tuple(self.inputs["vae"])
        is_f16 = True if self.precision == "fp16" else False
        vae_name = "base_vae" if self.base_vae else "vae"
        shark_vae = compile_through_fx(
            vae,
            inputs,
            is_f16=is_f16,
            model_name=vae_name + self.model_name,
            extra_args=get_opt_flags("vae", precision=self.precision),
        )
        return shark_vae

    def get_unet(self):
        class UnetModel(torch.nn.Module):
            def __init__(self, model_id=self.model_id):
                super().__init__()
                self.unet = UNet2DConditionModel.from_pretrained(
                    model_id,
                    subfolder="unet",
                )
                self.in_channels = self.unet.in_channels
                self.train(False)

            def forward(
                self, latent, timestep, text_embedding, guidance_scale
            ):
                # Expand the latents if we are doing classifier-free
                # guidance, to avoid doing two forward passes.
                latents = torch.cat([latent] * 2)
                unet_out = self.unet.forward(
                    latents, timestep, text_embedding, return_dict=False
                )[0]
                noise_pred_uncond, noise_pred_text = unet_out.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (
                    noise_pred_text - noise_pred_uncond
                )
                return noise_pred

        unet = UnetModel()
        is_f16 = True if self.precision == "fp16" else False
        inputs = tuple(self.inputs["unet"])
        input_mask = [True, True, True, False]
        shark_unet = compile_through_fx(
            unet,
            inputs,
            model_name="unet" + self.model_name,
            is_f16=is_f16,
            f16_input_mask=input_mask,
            extra_args=get_opt_flags("unet", precision=self.precision),
        )
        return shark_unet

    def get_clip(self):
        class CLIPText(torch.nn.Module):
            def __init__(self, model_id=self.model_id):
                super().__init__()
                self.text_encoder = CLIPTextModel.from_pretrained(
                    model_id,
                    subfolder="text_encoder",
                )

            def forward(self, input):
                return self.text_encoder(input)[0]

        clip_model = CLIPText()

        shark_clip = compile_through_fx(
            clip_model,
            tuple(self.inputs["clip"]),
            model_name="clip" + self.model_name,
            extra_args=get_opt_flags("clip", precision="fp32"),
        )
        return shark_clip

    def __call__(self):
        for model_id in base_models:
            self.inputs = get_input_info(
                base_models[model_id],
                self.max_len,
                self.width,
                self.height,
                self.batch_size,
            )
            try:
                compiled_clip = self.get_clip()
                compiled_unet = self.get_unet()
                compiled_vae = self.get_vae()
            except Exception as e:
                if args.enable_stack_trace:
                    traceback.print_exc()
                print("Retrying with a different base model configuration")
                continue
            # This is done only because main.py bases the choice of tokenizer
            # and scheduler on `args.hf_model_id`. Since we no longer maintain
            # a 1:1 mapping of variants to base models and instead rely on
            # retrying to find a working input configuration, we also need to
            # propagate the discovered base model id into `args.hf_model_id`.
            if args.ckpt_loc != "":
                args.hf_model_id = model_id
            return compiled_clip, compiled_unet, compiled_vae
        sys.exit(
            "Cannot compile the model. Please use `enable_stack_trace` and create an issue at https://github.com/nod-ai/SHARK/issues"
        )
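One detail worth calling out: UnetModel.forward above folds classifier-free guidance into the compiled module itself. The UNet runs once over the duplicated latents, the output is split into unconditional and text-conditioned halves, and the two are combined as eps = eps_uncond + guidance_scale * (eps_text - eps_uncond); with guidance_scale = 1 this reduces to the plain text-conditioned prediction.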
apps/stable_diffusion/src/models/opt_params.py (new file, 113 lines)
@@ -0,0 +1,113 @@
import sys
from transformers import CLIPTokenizer
from apps.stable_diffusion.src.utils import models_db, args, get_shark_model


hf_model_variant_map = {
    "Linaqruf/anything-v3.0": ["anythingv3", "v2_1base"],
    "dreamlike-art/dreamlike-diffusion-1.0": ["dreamlike", "v2_1base"],
    "prompthero/openjourney": ["openjourney", "v2_1base"],
    "wavymulder/Analog-Diffusion": ["analogdiffusion", "v2_1base"],
    "stabilityai/stable-diffusion-2-1": ["stablediffusion", "v2_1"],
    "stabilityai/stable-diffusion-2-1-base": ["stablediffusion", "v2_1base"],
    "CompVis/stable-diffusion-v1-4": ["stablediffusion", "v1_4"],
}


def get_params(bucket_key, model_key, model, is_tuned, precision):
    iree_flags = []
    if len(args.iree_vulkan_target_triple) > 0:
        iree_flags.append(
            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
        )

    # Disable bindings fusion to work with moltenVK.
    if sys.platform == "darwin":
        iree_flags.append("-iree-stream-fuse-binding=false")

    try:
        bucket = models_db[0][bucket_key]
        model_name = models_db[1][model_key]
        iree_flags += models_db[2][model][is_tuned][precision][
            "default_compilation_flags"
        ]
    except KeyError:
        raise Exception(
            f"{bucket_key}/{model_key} is not present in the models database"
        )

    if (
        "specified_compilation_flags"
        in models_db[2][model][is_tuned][precision]
    ):
        device = (
            args.device
            if "://" not in args.device
            else args.device.split("://")[0]
        )
        if (
            device
            not in models_db[2][model][is_tuned][precision][
                "specified_compilation_flags"
            ]
        ):
            device = "default_device"
        iree_flags += models_db[2][model][is_tuned][precision][
            "specified_compilation_flags"
        ][device]

    return bucket, model_name, iree_flags


def get_unet():
    variant, version = hf_model_variant_map[args.hf_model_id]
    # Tuned model is present only for `fp16` precision.
    is_tuned = "tuned" if args.use_tuned else "untuned"
    if "vulkan" not in args.device and args.use_tuned:
        bucket_key = f"{variant}/{is_tuned}/{args.device}"
        model_key = f"{variant}/{version}/unet/{args.precision}/length_{args.max_length}/{is_tuned}/{args.device}"
    else:
        bucket_key = f"{variant}/{is_tuned}"
        model_key = f"{variant}/{version}/unet/{args.precision}/length_{args.max_length}/{is_tuned}"

    bucket, model_name, iree_flags = get_params(
        bucket_key, model_key, "unet", is_tuned, args.precision
    )
    return get_shark_model(bucket, model_name, iree_flags)


def get_vae():
    variant, version = hf_model_variant_map[args.hf_model_id]
    # Tuned model is present only for `fp16` precision.
    is_tuned = "tuned" if args.use_tuned else "untuned"
    is_base = "/base" if args.use_base_vae else ""
    if "vulkan" not in args.device and args.use_tuned:
        bucket_key = f"{variant}/{is_tuned}/{args.device}"
        model_key = f"{variant}/{version}/vae/{args.precision}/length_77/{is_tuned}{is_base}/{args.device}"
    else:
        bucket_key = f"{variant}/{is_tuned}"
        model_key = f"{variant}/{version}/vae/{args.precision}/length_77/{is_tuned}{is_base}"

    bucket, model_name, iree_flags = get_params(
        bucket_key, model_key, "vae", is_tuned, args.precision
    )
    return get_shark_model(bucket, model_name, iree_flags)


def get_clip():
    variant, version = hf_model_variant_map[args.hf_model_id]
    bucket_key = f"{variant}/untuned"
    model_key = (
        f"{variant}/{version}/clip/fp32/length_{args.max_length}/untuned"
    )
    bucket, model_name, iree_flags = get_params(
        bucket_key, model_key, "clip", "untuned", "fp32"
    )
    return get_shark_model(bucket, model_name, iree_flags)


def get_tokenizer():
    tokenizer = CLIPTokenizer.from_pretrained(
        args.hf_model_id, subfolder="tokenizer"
    )
    return tokenizer
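As a concrete illustration of the key construction (values are only an example): with args.hf_model_id = "stabilityai/stable-diffusion-2-1-base", precision fp16, max_length 64, use_tuned enabled and a vulkan device, get_unet() takes the else branch and builds bucket_key "stablediffusion/tuned" and model_key "stablediffusion/v2_1base/unet/fp16/length_64/tuned", which model_db.json (further down in this diff) resolves to the gs://shark_tank/sd_tuned bucket and the "unet_19dec_v2p1base_fp16_64_tuned" artifact.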
apps/stable_diffusion/src/pipelines/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
@@ -0,0 +1,3 @@
|
||||
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_txt2img import (
|
||||
Text2ImagePipeline,
|
||||
)
|
||||
@@ -0,0 +1,134 @@
import torch
from tqdm.auto import tqdm
import numpy as np
from random import randint
from transformers import CLIPTokenizer
from typing import Union
from shark.shark_inference import SharkInference
from diffusers import (
    DDIMScheduler,
    PNDMScheduler,
    LMSDiscreteScheduler,
    EulerDiscreteScheduler,
    EulerAncestralDiscreteScheduler,
    DPMSolverMultistepScheduler,
)
from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
    StableDiffusionPipeline,
)


class Text2ImagePipeline(StableDiffusionPipeline):
    def __init__(
        self,
        vae: SharkInference,
        text_encoder: SharkInference,
        tokenizer: CLIPTokenizer,
        unet: SharkInference,
        scheduler: Union[
            DDIMScheduler,
            PNDMScheduler,
            LMSDiscreteScheduler,
            EulerDiscreteScheduler,
            EulerAncestralDiscreteScheduler,
            DPMSolverMultistepScheduler,
            SharkEulerDiscreteScheduler,
        ],
    ):
        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)

    def prepare_latents(
        self,
        batch_size,
        height,
        width,
        generator,
        num_inference_steps,
        dtype,
    ):
        latents = torch.randn(
            (
                batch_size,
                4,
                height // 8,
                width // 8,
            ),
            generator=generator,
            dtype=torch.float32,
        ).to(dtype)

        self.scheduler.set_timesteps(num_inference_steps)
        self.scheduler.is_scale_input_called = True
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    def generate_images(
        self,
        prompts,
        neg_prompts,
        batch_size,
        height,
        width,
        num_inference_steps,
        guidance_scale,
        seed,
        max_length,
        dtype,
        use_base_vae,
        cpu_scheduling,
    ):
        # Prompts and negative prompts must be a list.
        if isinstance(prompts, str):
            prompts = [prompts]

        if isinstance(neg_prompts, str):
            neg_prompts = [neg_prompts]

        prompts = prompts * batch_size
        neg_prompts = neg_prompts * batch_size

        # Seed the generator to create the initial latent noise. Also handle
        # out-of-range seeds.
        uint32_info = np.iinfo(np.uint32)
        uint32_min, uint32_max = uint32_info.min, uint32_info.max
        if seed < uint32_min or seed >= uint32_max:
            seed = randint(uint32_min, uint32_max)
        generator = torch.manual_seed(seed)

        # Get initial latents.
        init_latents = self.prepare_latents(
            batch_size=batch_size,
            height=height,
            width=width,
            generator=generator,
            num_inference_steps=num_inference_steps,
            dtype=dtype,
        )

        # Get text embeddings from prompts.
        text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)

        # Guidance scale as a float32 tensor.
        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)

        # Get image latents.
        latents = self.produce_img_latents(
            latents=init_latents,
            text_embeddings=text_embeddings,
            guidance_scale=guidance_scale,
            total_timesteps=self.scheduler.timesteps,
            dtype=dtype,
            cpu_scheduling=cpu_scheduling,
        )

        # Image latents -> PIL images.
        all_imgs = []
        for i in tqdm(range(0, latents.shape[0], batch_size)):
            imgs = self.decode_latents(
                latents=latents[i : i + batch_size],
                use_base_vae=use_base_vae,
                cpu_scheduling=cpu_scheduling,
            )
            all_imgs.extend(imgs)

        return all_imgs
apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py (new file, 209 lines)
@@ -0,0 +1,209 @@
import torch
from transformers import CLIPTokenizer
import torchvision.transforms as T
from tqdm.auto import tqdm
import time
from typing import Union
from diffusers import (
    DDIMScheduler,
    PNDMScheduler,
    LMSDiscreteScheduler,
    EulerDiscreteScheduler,
    EulerAncestralDiscreteScheduler,
    DPMSolverMultistepScheduler,
)
from shark.shark_inference import SharkInference
from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
from apps.stable_diffusion.src.models import (
    SharkifyStableDiffusionModel,
    get_vae,
    get_clip,
    get_unet,
    get_tokenizer,
)
from apps.stable_diffusion.src.utils import (
    start_profiling,
    end_profiling,
    preprocessCKPT,
)


class StableDiffusionPipeline:
    def __init__(
        self,
        vae: SharkInference,
        text_encoder: SharkInference,
        tokenizer: CLIPTokenizer,
        unet: SharkInference,
        scheduler: Union[
            DDIMScheduler,
            PNDMScheduler,
            LMSDiscreteScheduler,
            EulerDiscreteScheduler,
            EulerAncestralDiscreteScheduler,
            DPMSolverMultistepScheduler,
            SharkEulerDiscreteScheduler,
        ],
    ):
        self.vae = vae
        self.text_encoder = text_encoder
        self.tokenizer = tokenizer
        self.unet = unet
        self.scheduler = scheduler
        # TODO: Implement using the standard Python logging utility.
        self.log = ""

    def encode_prompts(self, prompts, neg_prompts, max_length):
        # Tokenize text and get embeddings.
        text_input = self.tokenizer(
            prompts,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )

        # Get unconditional embeddings as well.
        uncond_input = self.tokenizer(
            neg_prompts,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )

        text_input = torch.cat([uncond_input.input_ids, text_input.input_ids])

        clip_inf_start = time.time()
        text_embeddings = self.text_encoder("forward", (text_input,))
        clip_inf_time = (time.time() - clip_inf_start) * 1000
        self.log += f"\nClip Inference time (ms) = {clip_inf_time:.3f}"

        return text_embeddings

    def decode_latents(self, latents, use_base_vae, cpu_scheduling):
        if use_base_vae:
            latents = 1 / 0.18215 * latents

        latents_numpy = latents
        if cpu_scheduling:
            latents_numpy = latents.detach().numpy()

        profile_device = start_profiling(file_path="vae.rdc")
        vae_start = time.time()
        images = self.vae("forward", (latents_numpy,))
        vae_inf_time = (time.time() - vae_start) * 1000
        end_profiling(profile_device)
        self.log += f"\nVAE Inference time (ms): {vae_inf_time:.3f}"

        if use_base_vae:
            images = torch.from_numpy(images)
            images = (images.detach().cpu() * 255.0).numpy()
            images = images.round()

        transform = T.ToPILImage()
        pil_images = [
            transform(image)
            for image in torch.from_numpy(images).to(torch.uint8)
        ]
        return pil_images

    def produce_img_latents(
        self,
        latents,
        text_embeddings,
        guidance_scale,
        total_timesteps,
        dtype,
        cpu_scheduling,
        return_all_latents=False,
    ):
        step_time_sum = 0
        latent_history = [latents]
        text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
        text_embeddings_numpy = text_embeddings.detach().numpy()
        for i, t in tqdm(enumerate(total_timesteps)):
            step_start_time = time.time()
            timestep = torch.tensor([t]).to(dtype).detach().numpy()
            latent_model_input = self.scheduler.scale_model_input(latents, t)
            if cpu_scheduling:
                latent_model_input = latent_model_input.detach().numpy()

            # Profiling Unet.
            profile_device = start_profiling(file_path="unet.rdc")
            noise_pred = self.unet(
                "forward",
                (
                    latent_model_input,
                    timestep,
                    text_embeddings_numpy,
                    guidance_scale,
                ),
                send_to_host=False,
            )
            end_profiling(profile_device)

            if cpu_scheduling:
                noise_pred = torch.from_numpy(noise_pred.to_host())
                latents = self.scheduler.step(
                    noise_pred, t, latents
                ).prev_sample
            else:
                latents = self.scheduler.step(noise_pred, t, latents)

            latent_history.append(latents)
            step_time = (time.time() - step_start_time) * 1000
            # self.log += (
            #     f"\nstep = {i} | timestep = {t} | time = {step_time:.2f}ms"
            # )
            step_time_sum += step_time

        avg_step_time = step_time_sum / len(total_timesteps)
        self.log += f"\nAverage step time: {avg_step_time}ms/it"

        if not return_all_latents:
            return latents
        all_latents = torch.cat(latent_history, dim=0)
        return all_latents

    @classmethod
    def from_pretrained(
        cls,
        scheduler: Union[
            DDIMScheduler,
            PNDMScheduler,
            LMSDiscreteScheduler,
            EulerDiscreteScheduler,
            EulerAncestralDiscreteScheduler,
            DPMSolverMultistepScheduler,
            SharkEulerDiscreteScheduler,
        ],
        import_mlir: bool,
        model_id: str,
        ckpt_loc: str,
        precision: str,
        max_length: int,
        batch_size: int,
        height: int,
        width: int,
        use_base_vae: bool,
    ):
        init_kwargs = None
        if import_mlir:
            if ckpt_loc:
                preprocessCKPT()
            mlir_import = SharkifyStableDiffusionModel(
                model_id,
                ckpt_loc,
                precision,
                max_len=max_length,
                batch_size=batch_size,
                height=height,
                width=width,
                use_base_vae=use_base_vae,
            )
            clip, unet, vae = mlir_import()
            return cls(vae, clip, get_tokenizer(), unet, scheduler)
        return cls(
            get_vae(), get_clip(), get_tokenizer(), get_unet(), scheduler
        )
apps/stable_diffusion/src/schedulers/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from apps.stable_diffusion.src.schedulers.sd_schedulers import get_schedulers
from apps.stable_diffusion.src.schedulers.shark_eulerdiscrete import (
    SharkEulerDiscreteScheduler,
)
apps/stable_diffusion/src/schedulers/sd_schedulers.py (new file, 51 lines)
@@ -0,0 +1,51 @@
from diffusers import (
    LMSDiscreteScheduler,
    PNDMScheduler,
    DDIMScheduler,
    DPMSolverMultistepScheduler,
    EulerDiscreteScheduler,
    EulerAncestralDiscreteScheduler,
)
from apps.stable_diffusion.src.schedulers.shark_eulerdiscrete import (
    SharkEulerDiscreteScheduler,
)


def get_schedulers(model_id):
    schedulers = dict()
    schedulers["PNDM"] = PNDMScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
    )
    schedulers["LMSDiscrete"] = LMSDiscreteScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
    )
    schedulers["DDIM"] = DDIMScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
    )
    schedulers[
        "DPMSolverMultistep"
    ] = DPMSolverMultistepScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
    )
    schedulers["EulerDiscrete"] = EulerDiscreteScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
    )
    schedulers[
        "EulerAncestralDiscrete"
    ] = EulerAncestralDiscreteScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
    )
    schedulers[
        "SharkEulerDiscrete"
    ] = SharkEulerDiscreteScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
    )
    schedulers["SharkEulerDiscrete"].compile()
    return schedulers
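A minimal usage sketch (the model id is one of the entries from hf_model_variant_map in opt_params.py above; illustrative only):

# Builds every supported scheduler from the model's scheduler subfolder,
# then compiles the Shark variant.
schedulers = get_schedulers("stabilityai/stable-diffusion-2-1-base")
scheduler_obj = schedulers["SharkEulerDiscrete"]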
apps/stable_diffusion/src/schedulers/shark_eulerdiscrete.py (new file, 143 lines)
@@ -0,0 +1,143 @@
import sys
import numpy as np
from typing import List, Optional, Tuple, Union
from diffusers import (
    LMSDiscreteScheduler,
    PNDMScheduler,
    DDIMScheduler,
    DPMSolverMultistepScheduler,
    EulerDiscreteScheduler,
)
from diffusers.configuration_utils import register_to_config
from apps.stable_diffusion.src.utils import (
    compile_through_fx,
    get_shark_model,
    args,
)
import torch


class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
    @register_to_config
    def __init__(
        self,
        num_train_timesteps: int = 1000,
        beta_start: float = 0.0001,
        beta_end: float = 0.02,
        beta_schedule: str = "linear",
        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
        prediction_type: str = "epsilon",
    ):
        super().__init__(
            num_train_timesteps,
            beta_start,
            beta_end,
            beta_schedule,
            trained_betas,
            prediction_type,
        )

    def compile(self):
        SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"
        BATCH_SIZE = args.batch_size

        model_input = {
            "euler": {
                "latent": torch.randn(
                    BATCH_SIZE, 4, args.height // 8, args.width // 8
                ),
                "output": torch.randn(
                    BATCH_SIZE, 4, args.height // 8, args.width // 8
                ),
                "sigma": torch.tensor(1).to(torch.float32),
                "dt": torch.tensor(1).to(torch.float32),
            },
        }

        example_latent = model_input["euler"]["latent"]
        example_output = model_input["euler"]["output"]
        if args.precision == "fp16":
            example_latent = example_latent.half()
            example_output = example_output.half()
        example_sigma = model_input["euler"]["sigma"]
        example_dt = model_input["euler"]["dt"]

        class ScalingModel(torch.nn.Module):
            def __init__(self):
                super().__init__()

            def forward(self, latent, sigma):
                return latent / ((sigma**2 + 1) ** 0.5)

        class SchedulerStepModel(torch.nn.Module):
            def __init__(self):
                super().__init__()

            def forward(self, noise_pred, sigma, latent, dt):
                pred_original_sample = latent - sigma * noise_pred
                derivative = (latent - pred_original_sample) / sigma
                return latent + derivative * dt

        iree_flags = []
        if len(args.iree_vulkan_target_triple) > 0:
            iree_flags.append(
                f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
            )
        # Disable bindings fusion to work with moltenVK.
        if sys.platform == "darwin":
            iree_flags.append("-iree-stream-fuse-binding=false")

        if args.import_mlir:
            scaling_model = ScalingModel()
            self.scaling_model = compile_through_fx(
                scaling_model,
                (example_latent, example_sigma),
                model_name=f"euler_scale_model_input_{BATCH_SIZE}_{args.height}_{args.width}"
                + args.precision,
                extra_args=iree_flags,
            )

            step_model = SchedulerStepModel()
            self.step_model = compile_through_fx(
                step_model,
                (example_output, example_sigma, example_latent, example_dt),
                model_name=f"euler_step_{BATCH_SIZE}_{args.height}_{args.width}"
                + args.precision,
                extra_args=iree_flags,
            )
        else:
            self.scaling_model = get_shark_model(
                SCHEDULER_BUCKET,
                "euler_scale_model_input_" + args.precision,
                iree_flags,
            )
            self.step_model = get_shark_model(
                SCHEDULER_BUCKET, "euler_step_" + args.precision, iree_flags
            )

    def scale_model_input(self, sample, timestep):
        step_index = (self.timesteps == timestep).nonzero().item()
        sigma = self.sigmas[step_index]
        return self.scaling_model(
            "forward",
            (
                sample,
                sigma,
            ),
            send_to_host=False,
        )

    def step(self, noise_pred, timestep, latent):
        step_index = (self.timesteps == timestep).nonzero().item()
        sigma = self.sigmas[step_index]
        dt = self.sigmas[step_index + 1] - sigma
        return self.step_model(
            "forward",
            (
                noise_pred,
                sigma,
                latent,
                dt,
            ),
            send_to_host=False,
        )
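For clarity, SchedulerStepModel is the explicit Euler update of the sigma-parameterized sampling ODE written out term by term: pred_original_sample = x - sigma * eps recovers the denoised estimate, derivative = (x - pred_original_sample) / sigma simplifies algebraically back to eps, and the update is x + derivative * dt with dt = sigma[i+1] - sigma[i]. A quick standalone check of that arithmetic (illustrative values, independent of SHARK/IREE):

import torch

x = torch.randn(1, 4, 64, 64)    # current latent
eps = torch.randn(1, 4, 64, 64)  # unet noise prediction
sigma, sigma_next = 14.6, 11.9   # example values from a sigma schedule
dt = sigma_next - sigma

pred_original = x - sigma * eps            # denoised estimate
derivative = (x - pred_original) / sigma   # algebraically equal to eps
x_next = x + derivative * dt               # explicit Euler step
assert torch.allclose(derivative, eps)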
apps/stable_diffusion/src/utils/__init__.py (new file, 22 lines)
@@ -0,0 +1,22 @@
from apps.stable_diffusion.src.utils.profiler import (
    start_profiling,
    end_profiling,
)
from apps.stable_diffusion.src.utils.resources import (
    prompt_examples,
    models_db,
    base_models,
    opt_flags,
    resource_path,
)
from apps.stable_diffusion.src.utils.stable_args import args
from apps.stable_diffusion.src.utils.utils import (
    get_shark_model,
    compile_through_fx,
    set_iree_runtime_flags,
    map_device_to_name_path,
    set_init_device_flags,
    get_available_devices,
    get_opt_flags,
    preprocessCKPT,
)
apps/stable_diffusion/src/utils/profiler.py (new file, 18 lines)
@@ -0,0 +1,18 @@
from apps.stable_diffusion.src.utils.stable_args import args


# Helper function to profile the vulkan device.
def start_profiling(file_path="foo.rdc", profiling_mode="queue"):
    if args.vulkan_debug_utils and "vulkan" in args.device:
        import iree

        print(f"Profiling and saving to {file_path}.")
        vulkan_device = iree.runtime.get_device(args.device)
        vulkan_device.begin_profiling(mode=profiling_mode, file_path=file_path)
        return vulkan_device
    return None


def end_profiling(device):
    if device:
        return device.end_profiling()
apps/stable_diffusion/src/utils/resources.py (new file, 37 lines)
@@ -0,0 +1,37 @@
import os
import json
import sys


def resource_path(relative_path):
    """Get the absolute path to a resource; works for dev and for PyInstaller."""
    base_path = getattr(
        sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
    )
    return os.path.join(base_path, relative_path)


def get_json_file(path):
    json_var = []
    loc_json = resource_path(path)
    if os.path.exists(loc_json):
        with open(loc_json, encoding="utf-8") as fopen:
            json_var = json.load(fopen)

    if not json_var:
        print(f"Unable to fetch {path}")

    return json_var


# TODO: These shouldn't be loaded from here; every import of this file
# re-evaluates all of these globals.
prompt_examples = get_json_file("resources/prompts.json")
models_db = get_json_file("resources/model_db.json")

# base_model.json contains the input configuration for the different
# models and also provides information for the variants.
base_models = get_json_file("resources/base_model.json")

# Contains optimization flags for different models.
opt_flags = get_json_file("resources/opt_flags.json")
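resource_path follows the usual PyInstaller pattern: in a frozen build sys._MEIPASS points at the unpacked bundle, which is where the ('src/utils/resources/*.json', 'resources') entries from the .spec files above land; in a plain checkout it falls back to this file's own directory, e.g. (illustrative path):

# Hypothetical: in a dev checkout this resolves next to resources.py.
resource_path("resources/prompts.json")
# -> ".../apps/stable_diffusion/src/utils/resources/prompts.json"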
apps/stable_diffusion/src/utils/resources/base_model.json (new file, 98 lines)
@@ -0,0 +1,98 @@
{
    "stabilityai/stable-diffusion-2-1": {
        "unet": {
            "latents": {
                "shape": ["1*batch_size", 4, "height", "width"],
                "dtype": "f32"
            },
            "timesteps": {
                "shape": [1],
                "dtype": "f32"
            },
            "embedding": {
                "shape": ["2*batch_size", "max_len", 1024],
                "dtype": "f32"
            },
            "guidance_scale": {
                "shape": 2,
                "dtype": "f32"
            }
        },
        "vae": {
            "latents": {
                "shape": ["1*batch_size", 4, "height", "width"],
                "dtype": "f32"
            }
        },
        "clip": {
            "token": {
                "shape": ["2*batch_size", "max_len"],
                "dtype": "i64"
            }
        }
    },
    "CompVis/stable-diffusion-v1-4": {
        "unet": {
            "latents": {
                "shape": ["1*batch_size", 4, "height", "width"],
                "dtype": "f32"
            },
            "timesteps": {
                "shape": [1],
                "dtype": "f32"
            },
            "embedding": {
                "shape": ["2*batch_size", "max_len", 768],
                "dtype": "f32"
            },
            "guidance_scale": {
                "shape": 2,
                "dtype": "f32"
            }
        },
        "vae": {
            "latents": {
                "shape": ["1*batch_size", 4, "height", "width"],
                "dtype": "f32"
            }
        },
        "clip": {
            "token": {
                "shape": ["2*batch_size", "max_len"],
                "dtype": "i64"
            }
        }
    }
}
apps/stable_diffusion/src/utils/resources/model_db.json (new file, 177 lines)
@@ -0,0 +1,177 @@
[
    {
        "stablediffusion/untuned": "gs://shark_tank/stable_diffusion",
        "stablediffusion/tuned": "gs://shark_tank/sd_tuned",
        "stablediffusion/tuned/cuda": "gs://shark_tank/sd_tuned/cuda",
        "anythingv3/untuned": "gs://shark_tank/sd_anythingv3",
        "anythingv3/tuned": "gs://shark_tank/sd_tuned",
        "anythingv3/tuned/cuda": "gs://shark_tank/sd_tuned/cuda",
        "analogdiffusion/untuned": "gs://shark_tank/sd_analog_diffusion",
        "analogdiffusion/tuned": "gs://shark_tank/sd_tuned",
        "analogdiffusion/tuned/cuda": "gs://shark_tank/sd_tuned/cuda",
        "openjourney/untuned": "gs://shark_tank/sd_openjourney",
        "openjourney/tuned": "gs://shark_tank/sd_tuned",
        "dreamlike/untuned": "gs://shark_tank/sd_dreamlike_diffusion"
    },
    {
        "stablediffusion/v1_4/unet/fp16/length_77/untuned": "unet_8dec_fp16",
        "stablediffusion/v1_4/unet/fp16/length_77/tuned": "unet_8dec_fp16_tuned",
        "stablediffusion/v1_4/unet/fp16/length_77/tuned/cuda": "unet_8dec_fp16_cuda_tuned",
        "stablediffusion/v1_4/unet/fp32/length_77/untuned": "unet_1dec_fp32",
        "stablediffusion/v1_4/vae/fp16/length_77/untuned": "vae_19dec_fp16",
        "stablediffusion/v1_4/vae/fp16/length_77/tuned": "vae_19dec_fp16_tuned",
        "stablediffusion/v1_4/vae/fp16/length_77/tuned/cuda": "vae_19dec_fp16_cuda_tuned",
        "stablediffusion/v1_4/vae/fp16/length_77/untuned/base": "vae_8dec_fp16",
        "stablediffusion/v1_4/vae/fp32/length_77/untuned": "vae_1dec_fp32",
        "stablediffusion/v1_4/clip/fp32/length_77/untuned": "clip_18dec_fp32",
        "stablediffusion/v2_1base/unet/fp16/length_77/untuned": "unet2base_8dec_fp16",
        "stablediffusion/v2_1base/unet/fp16/length_77/tuned": "unet2base_8dec_fp16_tuned_v2",
        "stablediffusion/v2_1base/unet/fp16/length_77/tuned/cuda": "unet2base_8dec_fp16_cuda_tuned",
        "stablediffusion/v2_1base/unet/fp16/length_64/untuned": "unet_19dec_v2p1base_fp16_64",
        "stablediffusion/v2_1base/unet/fp16/length_64/tuned": "unet_19dec_v2p1base_fp16_64_tuned",
        "stablediffusion/v2_1base/unet/fp16/length_64/tuned/cuda": "unet_19dec_v2p1base_fp16_64_cuda_tuned",
        "stablediffusion/v2_1base/vae/fp16/length_77/untuned": "vae2base_19dec_fp16",
        "stablediffusion/v2_1base/vae/fp16/length_77/tuned": "vae2base_19dec_fp16_tuned",
        "stablediffusion/v2_1base/vae/fp16/length_77/tuned/cuda": "vae2base_19dec_fp16_cuda_tuned",
        "stablediffusion/v2_1base/vae/fp16/length_77/untuned/base": "vae2base_8dec_fp16",
        "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base": "vae2base_8dec_fp16_tuned",
        "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base/cuda": "vae2base_8dec_fp16_cuda_tuned",
        "stablediffusion/v2_1base/clip/fp32/length_77/untuned": "clip2base_18dec_fp32",
        "stablediffusion/v2_1base/clip/fp32/length_64/untuned": "clip_19dec_v2p1base_fp32_64",
        "stablediffusion/v2_1/unet/fp16/length_77/untuned": "unet2_14dec_fp16",
        "stablediffusion/v2_1/vae/fp16/length_77/untuned": "vae2_19dec_fp16",
        "stablediffusion/v2_1/vae/fp16/length_77/untuned/base": "vae2_8dec_fp16",
        "stablediffusion/v2_1/clip/fp32/length_77/untuned": "clip2_18dec_fp32",
        "anythingv3/v2_1base/unet/fp16/length_77/untuned": "av3_unet_19dec_fp16",
        "anythingv3/v2_1base/unet/fp16/length_77/tuned": "av3_unet_19dec_fp16_tuned",
        "anythingv3/v2_1base/unet/fp16/length_77/tuned/cuda": "av3_unet_19dec_fp16_cuda_tuned",
        "anythingv3/v2_1base/unet/fp32/length_77/untuned": "av3_unet_19dec_fp32",
        "anythingv3/v2_1base/vae/fp16/length_77/untuned": "av3_vae_19dec_fp16",
        "anythingv3/v2_1base/vae/fp16/length_77/tuned": "av3_vae_19dec_fp16_tuned",
        "anythingv3/v2_1base/vae/fp16/length_77/tuned/cuda": "av3_vae_19dec_fp16_cuda_tuned",
        "anythingv3/v2_1base/vae/fp16/length_77/untuned/base": "av3_vaebase_22dec_fp16",
        "anythingv3/v2_1base/vae/fp32/length_77/untuned": "av3_vae_19dec_fp32",
        "anythingv3/v2_1base/vae/fp32/length_77/untuned/base": "av3_vaebase_22dec_fp32",
        "anythingv3/v2_1base/clip/fp32/length_77/untuned": "av3_clip_19dec_fp32",
        "analogdiffusion/v2_1base/unet/fp16/length_77/untuned": "ad_unet_19dec_fp16",
        "analogdiffusion/v2_1base/unet/fp16/length_77/tuned": "ad_unet_19dec_fp16_tuned",
        "analogdiffusion/v2_1base/unet/fp16/length_77/tuned/cuda": "ad_unet_19dec_fp16_cuda_tuned",
        "analogdiffusion/v2_1base/unet/fp32/length_77/untuned": "ad_unet_19dec_fp32",
        "analogdiffusion/v2_1base/vae/fp16/length_77/untuned": "ad_vae_19dec_fp16",
        "analogdiffusion/v2_1base/vae/fp16/length_77/tuned": "ad_vae_19dec_fp16_tuned",
        "analogdiffusion/v2_1base/vae/fp16/length_77/tuned/cuda": "ad_vae_19dec_fp16_cuda_tuned",
        "analogdiffusion/v2_1base/vae/fp16/length_77/untuned/base": "ad_vaebase_22dec_fp16",
        "analogdiffusion/v2_1base/vae/fp32/length_77/untuned": "ad_vae_19dec_fp32",
        "analogdiffusion/v2_1base/vae/fp32/length_77/untuned/base": "ad_vaebase_22dec_fp32",
        "analogdiffusion/v2_1base/clip/fp32/length_77/untuned": "ad_clip_19dec_fp32",
        "openjourney/v2_1base/unet/fp16/length_64/untuned": "oj_unet_22dec_fp16_64",
        "openjourney/v2_1base/unet/fp32/length_64/untuned": "oj_unet_22dec_fp32_64",
        "openjourney/v2_1base/vae/fp16/length_77/untuned": "oj_vae_22dec_fp16",
        "openjourney/v2_1base/vae/fp16/length_77/untuned/base": "oj_vaebase_22dec_fp16",
        "openjourney/v2_1base/vae/fp32/length_77/untuned": "oj_vae_22dec_fp32",
        "openjourney/v2_1base/vae/fp32/length_77/untuned/base": "oj_vaebase_22dec_fp32",
        "openjourney/v2_1base/clip/fp32/length_64/untuned": "oj_clip_22dec_fp32_64",
        "dreamlike/v2_1base/unet/fp16/length_77/untuned": "dl_unet_23dec_fp16_77",
        "dreamlike/v2_1base/unet/fp32/length_77/untuned": "dl_unet_23dec_fp32_77",
        "dreamlike/v2_1base/vae/fp16/length_77/untuned": "dl_vae_23dec_fp16",
        "dreamlike/v2_1base/vae/fp16/length_77/untuned/base": "dl_vaebase_23dec_fp16",
        "dreamlike/v2_1base/vae/fp32/length_77/untuned": "dl_vae_23dec_fp32",
        "dreamlike/v2_1base/vae/fp32/length_77/untuned/base": "dl_vaebase_23dec_fp32",
        "dreamlike/v2_1base/clip/fp32/length_77/untuned": "dl_clip_23dec_fp32_77"
    },
    {
        "unet": {
            "tuned": {
                "fp16": {
                    "default_compilation_flags": []
                },
                "fp32": {
                    "default_compilation_flags": []
                }
            },
            "untuned": {
                "fp16": {
                    "default_compilation_flags": [
                        "--iree-flow-enable-padding-linalg-ops",
                        "--iree-flow-linalg-ops-padding-size=32"
                    ],
                    "specified_compilation_flags": {
                        "cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
                        "default_device": ["--iree-flow-enable-conv-img2col-transform"]
                    }
                },
                "fp32": {
                    "default_compilation_flags": [
                        "--iree-flow-enable-conv-nchw-to-nhwc-transform",
                        "--iree-flow-enable-padding-linalg-ops",
                        "--iree-flow-linalg-ops-padding-size=16"
                    ]
                }
            }
        },
        "vae": {
            "tuned": {
                "fp16": {
                    "default_compilation_flags": [
                        "--iree-flow-enable-padding-linalg-ops",
                        "--iree-flow-linalg-ops-padding-size=32",
                        "--iree-flow-enable-conv-img2col-transform"
                    ]
                },
                "fp32": {
                    "default_compilation_flags": [
                        "--iree-flow-enable-padding-linalg-ops",
                        "--iree-flow-linalg-ops-padding-size=32",
                        "--iree-flow-enable-conv-img2col-transform"
                    ]
                }
            },
            "untuned": {
                "fp16": {
                    "default_compilation_flags": [
                        "--iree-flow-enable-padding-linalg-ops",
                        "--iree-flow-linalg-ops-padding-size=32",
                        "--iree-flow-enable-conv-img2col-transform"
                    ]
                },
                "fp32": {
                    "default_compilation_flags": [
                        "--iree-flow-enable-conv-nchw-to-nhwc-transform",
                        "--iree-flow-enable-padding-linalg-ops",
                        "--iree-flow-linalg-ops-padding-size=16"
                    ]
                }
            }
        },
        "clip": {
            "tuned": {
                "fp16": {
                    "default_compilation_flags": [
                        "--iree-flow-linalg-ops-padding-size=16",
                        "--iree-flow-enable-padding-linalg-ops"
                    ]
                },
                "fp32": {
                    "default_compilation_flags": [
                        "--iree-flow-linalg-ops-padding-size=16",
                        "--iree-flow-enable-padding-linalg-ops"
                    ]
                }
            },
            "untuned": {
                "fp16": {
                    "default_compilation_flags": [
                        "--iree-flow-linalg-ops-padding-size=16",
                        "--iree-flow-enable-padding-linalg-ops"
                    ]
                },
                "fp32": {
                    "default_compilation_flags": [
                        "--iree-flow-linalg-ops-padding-size=16",
                        "--iree-flow-enable-padding-linalg-ops"
                    ]
                }
            }
        }
    }
]
apps/stable_diffusion/src/utils/resources/opt_flags.json (new file, 95 lines)
@@ -0,0 +1,95 @@
{
    "unet": {
        "tuned": {
            "fp16": {
                "default_compilation_flags": []
            },
            "fp32": {
                "default_compilation_flags": []
            }
        },
        "untuned": {
            "fp16": {
                "default_compilation_flags": [
                    "--iree-flow-enable-padding-linalg-ops",
                    "--iree-flow-linalg-ops-padding-size=32"
                ],
                "specified_compilation_flags": {
                    "cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
                    "default_device": ["--iree-flow-enable-conv-img2col-transform"]
                }
            },
            "fp32": {
                "default_compilation_flags": [
                    "--iree-flow-enable-conv-nchw-to-nhwc-transform",
                    "--iree-flow-enable-padding-linalg-ops",
                    "--iree-flow-linalg-ops-padding-size=16"
                ]
            }
        }
    },
    "vae": {
        "tuned": {
            "fp16": {
                "default_compilation_flags": [
                    "--iree-flow-enable-padding-linalg-ops",
                    "--iree-flow-linalg-ops-padding-size=32",
                    "--iree-flow-enable-conv-img2col-transform"
                ]
            },
            "fp32": {
                "default_compilation_flags": [
                    "--iree-flow-enable-padding-linalg-ops",
                    "--iree-flow-linalg-ops-padding-size=32",
                    "--iree-flow-enable-conv-img2col-transform"
                ]
            }
        },
        "untuned": {
            "fp16": {
                "default_compilation_flags": [
                    "--iree-flow-enable-padding-linalg-ops",
                    "--iree-flow-linalg-ops-padding-size=32",
                    "--iree-flow-enable-conv-img2col-transform"
                ]
            },
            "fp32": {
                "default_compilation_flags": [
                    "--iree-flow-enable-conv-nchw-to-nhwc-transform",
                    "--iree-flow-enable-padding-linalg-ops",
                    "--iree-flow-linalg-ops-padding-size=16"
                ]
            }
        }
    },
    "clip": {
        "tuned": {
            "fp16": {
                "default_compilation_flags": [
                    "--iree-flow-linalg-ops-padding-size=16",
                    "--iree-flow-enable-padding-linalg-ops"
                ]
            },
            "fp32": {
                "default_compilation_flags": [
                    "--iree-flow-linalg-ops-padding-size=16",
                    "--iree-flow-enable-padding-linalg-ops"
                ]
            }
        },
        "untuned": {
            "fp16": {
                "default_compilation_flags": [
                    "--iree-flow-linalg-ops-padding-size=16",
                    "--iree-flow-enable-padding-linalg-ops"
                ]
            },
            "fp32": {
                "default_compilation_flags": [
                    "--iree-flow-linalg-ops-padding-size=16",
                    "--iree-flow-enable-padding-linalg-ops"
                ]
            }
        }
    }
}
apps/stable_diffusion/src/utils/resources/prompts.json (new file, 8 lines)
@@ -0,0 +1,8 @@
[["A high tech solarpunk utopia in the Amazon rainforest"],
["A pikachu fine dining with a view to the Eiffel Tower"],
["A mecha robot in a favela in expressionist style"],
["an insect robot preparing a delicious meal"],
["A digital Illustration of the Babel tower, 4k, detailed, trending in artstation, fantasy vivid colors"],
["Cluttered house in the woods, anime, oil painting, high resolution, cottagecore, ghibli inspired, 4k"],
["A beautiful mansion beside a waterfall in the woods, by josef thoma, matte painting, trending on artstation HQ"],
["portrait photo of a asia old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes"]]
apps/stable_diffusion/src/utils/stable_args.py (new file, 337 lines)
@@ -0,0 +1,337 @@
|
||||
import argparse
from pathlib import Path


def path_expand(s):
    return Path(s).expanduser().resolve()


p = argparse.ArgumentParser(
    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

##############################################################################
### Stable Diffusion Params
##############################################################################

p.add_argument(
    "-p",
    "--prompts",
    action="append",
    default=[],
    help="text prompt(s) from which to generate images.",
)

p.add_argument(
    "--negative-prompts",
    nargs="+",
    default=[""],
    help="text you don't want to see in the generated image.",
)

p.add_argument(
    "--steps",
    type=int,
    default=50,
    help="the number of steps for sampling.",
)

p.add_argument(
    "--seed",
    type=int,
    default=42,
    help="the seed to use.",
)

p.add_argument(
    "--batch_size",
    type=int,
    default=1,
    choices=range(1, 4),
    help="the number of inferences to be made in a single `run`.",
)

p.add_argument(
    "--height",
    type=int,
    default=512,
    help="the height of the output image.",
)

p.add_argument(
    "--width",
    type=int,
    default=512,
    help="the width of the output image.",
)

p.add_argument(
    "--guidance_scale",
    type=float,
    default=7.5,
    help="the value to be used for guidance scaling.",
)

p.add_argument(
    "--max_length",
    type=int,
    default=64,
    help="max length of the tokenizer output, options are 64 and 77.",
)

##############################################################################
### Model Config and Usage Params
##############################################################################

p.add_argument(
    "--device", type=str, default="vulkan", help="device to run the model."
)

p.add_argument(
    "--precision", type=str, default="fp16", help="precision to run the model."
)

p.add_argument(
    "--import_mlir",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="imports the model from the torch module into a shark_module; otherwise downloads the model from shark_tank.",
)

p.add_argument(
    "--load_vmfb",
    default=True,
    action=argparse.BooleanOptionalAction,
    help="attempts to load the model from a precompiled flatbuffer and compiles + saves it if not found.",
)

p.add_argument(
    "--save_vmfb",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="saves the compiled flatbuffer to the local directory.",
)

p.add_argument(
    "--use_tuned",
    default=True,
    action=argparse.BooleanOptionalAction,
    help="Download and use the tuned version of the model if available.",
)

p.add_argument(
    "--use_base_vae",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="Do conversion from the VAE output to pixel space on cpu.",
)

p.add_argument(
    "--scheduler",
    type=str,
    default="SharkEulerDiscrete",
    help="other supported schedulers are [PNDM, DDIM, LMSDiscrete, EulerDiscrete, DPMSolverMultistep]",
)

p.add_argument(
    "--output_img_format",
    type=str,
    default="png",
    help="specify the format in which the output image is saved. Supported options: jpg / png",
)

p.add_argument(
    "--output_dir",
    type=str,
    default=None,
    help="Directory path to save the output images and JSON.",
)

p.add_argument(
    "--runs",
    type=int,
    default=1,
    help="number of images to be generated with random seeds in a single execution",
)

p.add_argument(
    "--ckpt_loc",
    type=str,
    default="",
    help="Path to SD's .ckpt file.",
)

p.add_argument(
    "--hf_model_id",
    type=str,
    default="stabilityai/stable-diffusion-2-1-base",
    help="The Hugging Face repo id.",
)

p.add_argument(
    "--enable_stack_trace",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="Enable showing the stack trace when retrying the base model configuration.",
)

##############################################################################
### IREE - Vulkan supported flags
##############################################################################

p.add_argument(
    "--iree-vulkan-target-triple",
    type=str,
    default="",
    help="Specify the target triple for Vulkan.",
)

p.add_argument(
    "--vulkan_debug_utils",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="Profiles the Vulkan device and collects the .rdc info.",
)

p.add_argument(
    "--vulkan_large_heap_block_size",
    default="4147483648",
    help="flag for setting VMA preferredLargeHeapBlockSize for vulkan device, default is 4G",
)

p.add_argument(
    "--vulkan_validation_layers",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="flag for enabling vulkan validation layers when benchmarking",
)

##############################################################################
### Misc. Debug and Optimization flags
##############################################################################

p.add_argument(
    "--use_compiled_scheduler",
    default=True,
    action=argparse.BooleanOptionalAction,
    help="use the default scheduler precompiled into the model if available.",
)

p.add_argument(
    "--local_tank_cache",
    default="",
    help="Specify where to save downloaded shark_tank artifacts. If this is not set, the default is ~/.local/shark_tank/.",
)

p.add_argument(
    "--dump_isa",
    default=False,
    action="store_true",
    help="When enabled, calls amdllpc to get ISA dumps; use with dispatch benchmarks.",
)

p.add_argument(
    "--dispatch_benchmarks",
    default=None,
    help='dispatches to return benchmark data on. Use "All" for all, and None for none.',
)

p.add_argument(
    "--dispatch_benchmarks_dir",
    default="temp_dispatch_benchmarks",
    help='directory where you want to store dispatch data generated with "--dispatch_benchmarks"',
)

p.add_argument(
    "--enable_rgp",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="flag for inserting debug frames between iterations for use with rgp.",
)

p.add_argument(
    "--hide_steps",
    default=True,
    action=argparse.BooleanOptionalAction,
    help="flag for hiding the details of iteration/sec for each step.",
)

p.add_argument(
    "--warmup_count",
    type=int,
    default=0,
    help="flag setting warmup count for clip and vae [>= 0].",
)

p.add_argument(
    "--clear_all",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="flag to clear all mlir and vmfb from common locations. Recompiling will take several minutes.",
)

p.add_argument(
    "--save_metadata_to_json",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="flag for whether or not to save a generation information json file with the image.",
)

p.add_argument(
    "--write_metadata_to_png",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="flag for whether or not to save generation information in PNG chunk text to generated images.",
)

##############################################################################
### Web UI flags
##############################################################################

p.add_argument(
    "--progress_bar",
    default=True,
    action=argparse.BooleanOptionalAction,
    help="flag for removing the progress bar animation during image generation",
)

p.add_argument(
    "--share",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="flag for generating a public URL",
)

p.add_argument(
    "--server_port",
    type=int,
    default=8080,
    help="flag for setting server port",
)

##############################################################################
### SD model auto-annotation flags
##############################################################################

p.add_argument(
    "--annotation_output",
    type=path_expand,
    default="./",
    help="Directory to save the annotated mlir file",
)

p.add_argument(
    "--annotation_model",
    type=str,
    default="unet",
    help="Options are unet and vae.",
)

p.add_argument(
    "--use_winograd",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="Apply Winograd on selected conv ops.",
)

args, unknown = p.parse_known_args()
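All of the modules below consume these flags through the shared `args` object; a minimal sketch of how a downstream script reads them (the prints are illustrative, not part of this diff):

```python
# A minimal sketch, assuming the repo root is on PYTHONPATH.
from apps.stable_diffusion.src.utils.stable_args import args

print(args.device)     # "vulkan" unless overridden on the command line
print(args.prompts)    # [] until -p/--prompts is passed (action="append")
print(args.use_tuned)  # BooleanOptionalAction: --use_tuned / --no-use_tuned
```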
351
apps/stable_diffusion/src/utils/utils.py
Normal file
@@ -0,0 +1,351 @@
import os
import torch
from shark.shark_inference import SharkInference
from shark.shark_importer import import_with_fx
from shark.iree_utils.vulkan_utils import (
    set_iree_vulkan_runtime_flags,
    get_vulkan_target_triple,
)
from shark.iree_utils.gpu_utils import get_cuda_sm_cc
from apps.stable_diffusion.src.utils.stable_args import args
from apps.stable_diffusion.src.utils.resources import opt_flags
import sys
from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
    load_pipeline_from_original_stable_diffusion_ckpt,
)


def _compile_module(shark_module, model_name, extra_args=[]):
    if args.load_vmfb or args.save_vmfb:
        device = (
            args.device
            if "://" not in args.device
            else "-".join(args.device.split("://"))
        )
        extended_name = "{}_{}".format(model_name, device)
        vmfb_path = os.path.join(os.getcwd(), extended_name + ".vmfb")
        if args.load_vmfb and os.path.isfile(vmfb_path) and not args.save_vmfb:
            print(f"loading existing vmfb from: {vmfb_path}")
            shark_module.load_module(vmfb_path, extra_args=extra_args)
        else:
            if args.save_vmfb:
                print("Saving to {}".format(vmfb_path))
            else:
                print(
                    "No vmfb found. Compiling and saving to {}".format(
                        vmfb_path
                    )
                )
            path = shark_module.save_module(
                os.getcwd(), extended_name, extra_args
            )
            shark_module.load_module(path, extra_args=extra_args)
    else:
        shark_module.compile(extra_args)
    return shark_module


# Downloads the model from shark_tank and returns the shark_module.
def get_shark_model(tank_url, model_name, extra_args=[]):
    from shark.shark_downloader import download_model
    from shark.parser import shark_args

    # Set local shark_tank cache directory.
    shark_args.local_tank_cache = args.local_tank_cache
    if "cuda" in args.device:
        shark_args.enable_tf32 = True

    mlir_model, func_name, inputs, golden_out = download_model(
        model_name,
        tank_url=tank_url,
        frontend="torch",
    )
    shark_module = SharkInference(
        mlir_model, device=args.device, mlir_dialect="linalg"
    )
    return _compile_module(shark_module, model_name, extra_args)


# Converts the torch-module into a shark_module.
def compile_through_fx(
    model,
    inputs,
    model_name,
    is_f16=False,
    f16_input_mask=None,
    extra_args=[],
):
    mlir_module, func_name = import_with_fx(
        model, inputs, is_f16, f16_input_mask
    )
    shark_module = SharkInference(
        mlir_module,
        device=args.device,
        mlir_dialect="linalg",
    )

    return _compile_module(shark_module, model_name, extra_args)


def set_iree_runtime_flags():
    vulkan_runtime_flags = [
        f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}",
        f"--vulkan_validation_layers={'true' if args.vulkan_validation_layers else 'false'}",
    ]
    if args.enable_rgp:
        vulkan_runtime_flags += [
            f"--enable_rgp=true",
            f"--vulkan_debug_utils=true",
        ]
    set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)


def get_all_devices(driver_name):
    """
    Inputs: driver_name
    Returns a list of all the available devices for a given driver sorted by
    the iree path names of the device as in --list_devices option in iree.
    """
    from iree.runtime import get_driver

    driver = get_driver(driver_name)
    device_list_src = driver.query_available_devices()
    device_list_src.sort(key=lambda d: d["path"])
    return device_list_src


def get_device_mapping(driver, key_combination=3):
    """This method ensures consistent device ordering when choosing
    specific devices for execution
    Args:
        driver (str): execution driver (vulkan, cuda, rocm, etc)
        key_combination (int, optional): choice for mapping value for device name.
            1 : path
            2 : name
            3 : (name, path)
            Defaults to 3.
    Returns:
        dict: map to possible device names user can input mapped to desired combination of name/path.
    """
    from shark.iree_utils._common import iree_device_map

    driver = iree_device_map(driver)
    device_list = get_all_devices(driver)
    device_map = dict()

    def get_output_value(dev_dict):
        if key_combination == 1:
            return f"{driver}://{dev_dict['path']}"
        if key_combination == 2:
            return dev_dict["name"]
        if key_combination == 3:
            return (dev_dict["name"], f"{driver}://{dev_dict['path']}")

    # mapping driver name to default device (driver://0)
    device_map[f"{driver}"] = get_output_value(device_list[0])
    for i, device in enumerate(device_list):
        # mapping with index
        device_map[f"{driver}://{i}"] = get_output_value(device)
        # mapping with full path
        device_map[f"{driver}://{device['path']}"] = get_output_value(device)
    return device_map


def map_device_to_name_path(device, key_combination=3):
    """Gives the appropriate device data (supported name/path) for the user-selected execution device
    Args:
        device (str): user-selected device
        key_combination (int, optional): choice for mapping value for device name.
            1 : path
            2 : name
            3 : (name, path)
            Defaults to 3.
    Raises:
        ValueError: if the device is not a valid device.
    Returns:
        str / tuple: returns the mapping str or tuple of mapping str for the device depending on key_combination value
    """
    driver = device.split("://")[0]
    device_map = get_device_mapping(driver, key_combination)
    try:
        device_mapping = device_map[device]
    except KeyError:
        raise ValueError(f"Device '{device}' is not a valid device.")
    return device_mapping


def set_init_device_flags():
    if "vulkan" in args.device:
        # set runtime flags for vulkan.
        set_iree_runtime_flags()

        # set triple flag to avoid multiple calls to get_vulkan_triple_flag
        device_name, args.device = map_device_to_name_path(args.device)
        if not args.iree_vulkan_target_triple:
            triple = get_vulkan_target_triple(device_name)
            if triple is not None:
                args.iree_vulkan_target_triple = triple
        print(
            f"Found device {device_name}. Using target triple {args.iree_vulkan_target_triple}."
        )
    elif "cuda" in args.device:
        args.device = "cuda"
    elif "cpu" in args.device:
        args.device = "cpu"

    # set max_length based on availability.
    if args.hf_model_id in [
        "Linaqruf/anything-v3.0",
        "wavymulder/Analog-Diffusion",
        "dreamlike-art/dreamlike-diffusion-1.0",
    ]:
        args.max_length = 77
    elif args.hf_model_id == "prompthero/openjourney":
        args.max_length = 64

    # Use tuned models in the case of a specific setting.
    if (
        args.hf_model_id
        in ["prompthero/openjourney", "dreamlike-art/dreamlike-diffusion-1.0"]
        or args.precision != "fp16"
    ):
        args.use_tuned = False

    elif (
        "vulkan" in args.device
        and "rdna3" not in args.iree_vulkan_target_triple
    ):
        args.use_tuned = False

    elif "cuda" in args.device and get_cuda_sm_cc() not in ["sm_80", "sm_89"]:
        args.use_tuned = False

    elif args.use_base_vae and args.hf_model_id not in [
        "stabilityai/stable-diffusion-2-1-base",
        "CompVis/stable-diffusion-v1-4",
    ]:
        args.use_tuned = False

    if args.use_tuned:
        print(f"Using tuned models for {args.hf_model_id}/fp16/{args.device}.")
    else:
        print("Tuned models are currently not supported for this setting.")

    # set import_mlir to True for unuploaded models.
    if args.hf_model_id not in [
        "Linaqruf/anything-v3.0",
        "dreamlike-art/dreamlike-diffusion-1.0",
        "prompthero/openjourney",
        "wavymulder/Analog-Diffusion",
        "stabilityai/stable-diffusion-2-1",
        "stabilityai/stable-diffusion-2-1-base",
        "CompVis/stable-diffusion-v1-4",
    ]:
        args.import_mlir = True

    if args.height != 512 or args.width != 512 or args.batch_size != 1:
        args.import_mlir = True


# Utility to get list of devices available.
def get_available_devices():
    def get_devices_by_name(driver_name):
        from shark.iree_utils._common import iree_device_map

        device_list = []
        try:
            driver_name = iree_device_map(driver_name)
            device_list_dict = get_all_devices(driver_name)
            print(f"{driver_name} devices are available.")
        except:
            print(f"{driver_name} devices are not available.")
        else:
            for i, device in enumerate(device_list_dict):
                device_list.append(f"{device['name']} => {driver_name}://{i}")
        return device_list

    set_iree_runtime_flags()

    available_devices = []
    vulkan_devices = get_devices_by_name("vulkan")
    available_devices.extend(vulkan_devices)
    cuda_devices = get_devices_by_name("cuda")
    available_devices.extend(cuda_devices)
    available_devices.append("cpu")
    return available_devices


def disk_space_check(path, lim=20):
    from shutil import disk_usage

    du = disk_usage(path)
    free = du.free / (1024 * 1024 * 1024)
    if free <= lim:
        print(f"[WARNING] Only {free:.2f}GB space available in {path}.")


def get_opt_flags(model, precision="fp16"):
    iree_flags = []
    is_tuned = "tuned" if args.use_tuned else "untuned"
    if len(args.iree_vulkan_target_triple) > 0:
        iree_flags.append(
            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
        )

    # Disable bindings fusion to work with moltenVK.
    if sys.platform == "darwin":
        iree_flags.append("-iree-stream-fuse-binding=false")

    if "specified_compilation_flags" in opt_flags[model][is_tuned][precision]:
        device = (
            args.device
            if "://" not in args.device
            else args.device.split("://")[0]
        )
        if (
            device
            not in opt_flags[model][is_tuned][precision][
                "specified_compilation_flags"
            ]
        ):
            device = "default_device"
        iree_flags += opt_flags[model][is_tuned][precision][
            "specified_compilation_flags"
        ][device]

    return iree_flags


def preprocessCKPT():
    from pathlib import Path

    path = Path(args.ckpt_loc)
    diffusers_path = path.parent.absolute()
    diffusers_directory_name = path.stem
    complete_path_to_diffusers = diffusers_path / diffusers_directory_name
    complete_path_to_diffusers.mkdir(parents=True, exist_ok=True)
    print(
        "Created directory : ",
        diffusers_directory_name,
        " at -> ",
        diffusers_path,
    )
    path_to_diffusers = complete_path_to_diffusers.as_posix()
    from_safetensors = (
        True if args.ckpt_loc.lower().endswith(".safetensors") else False
    )
    # EMA weights usually yield higher quality images for inference but non-EMA weights have
    # been yielding better results in our case.
    # TODO: Add an option `--ema` (`--no-ema`) for users to specify if they want to go for EMA
    # weight extraction or not.
    extract_ema = False
    print("Loading pipeline from original stable diffusion checkpoint")
    pipe = load_pipeline_from_original_stable_diffusion_ckpt(
        checkpoint_path=args.ckpt_loc,
        extract_ema=extract_ema,
        from_safetensors=from_safetensors,
    )
    pipe.save_pretrained(path_to_diffusers)
    print("Loading complete")
    args.ckpt_loc = path_to_diffusers
    print("Custom model path is : ", args.ckpt_loc)
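A minimal sketch of how the helpers above fit together when fetching a model from shark_tank; the artifact name here is hypothetical, and the tank URL matches the default used in conftest.py below:

```python
from apps.stable_diffusion.src.utils.utils import get_opt_flags, get_shark_model

# Hypothetical artifact name; flags are resolved from the opt_flags tables above.
flags = get_opt_flags("clip", precision="fp16")
clip = get_shark_model("gs://shark_tank/latest", "clip_fp16", extra_args=flags)
```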
67
apps/stable_diffusion/web/css/sd_dark_theme.css
Normal file
@@ -0,0 +1,67 @@
.gradio-container {
    background-color: black;
}

.container {
    background-color: black !important;
    padding-top: 20px !important;
}

#ui_title {
    padding: 10px !important;
}

#top_logo {
    background-color: transparent;
    border-radius: 0 !important;
    border: 0;
}

#demo_title {
    background-color: black;
    border-radius: 0 !important;
    border: 0;
    padding-top: 50px;
    padding-bottom: 0px;
    width: 460px !important;
}

#demo_title_outer {
    border-radius: 0;
}

#prompt_box_outer div:first-child {
    border-radius: 0 !important;
}

#prompt_box textarea {
    background-color: #1d1d1d !important;
}

#prompt_examples {
    margin: 0 !important;
}

#prompt_examples svg {
    display: none !important;
}

.gr-sample-textbox {
    border-radius: 1rem !important;
    border-color: rgb(31, 41, 55) !important;
    border-width: 2px !important;
}

#ui_body {
    background-color: #111111 !important;
    padding: 10px !important;
    border-radius: 0.5em !important;
}

#img_result+div {
    display: none !important;
}

footer {
    display: none !important;
}
0
apps/stable_diffusion/web/gradio/img2img_ui.py
Normal file
0
apps/stable_diffusion/web/gradio/txt2img_ui.py
Normal file
262
apps/stable_diffusion/web/index.py
Normal file
@@ -0,0 +1,262 @@
import os
import sys
from pathlib import Path

if "AMD_ENABLE_LLPC" not in os.environ:
    os.environ["AMD_ENABLE_LLPC"] = "1"

if sys.platform == "darwin":
    os.environ["DYLD_LIBRARY_PATH"] = "/usr/local/lib"


def resource_path(relative_path):
    """Get absolute path to a resource; works for dev and for PyInstaller."""
    base_path = getattr(
        sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
    )
    return os.path.join(base_path, relative_path)


import gradio as gr
from PIL import Image
from apps.stable_diffusion.src import (
    prompt_examples,
    args,
    get_available_devices,
)
from apps.stable_diffusion.scripts import txt2img_inf

nodlogo_loc = resource_path("logos/nod-logo.png")
sdlogo_loc = resource_path("logos/sd-demo-logo.png")


demo_css = resource_path("css/sd_dark_theme.css")


with gr.Blocks(title="Stable Diffusion", css=demo_css) as shark_web:
    with gr.Row(elem_id="ui_title"):
        nod_logo = Image.open(nodlogo_loc)
        logo2 = Image.open(sdlogo_loc)
        with gr.Row():
            with gr.Column(scale=1, elem_id="demo_title_outer"):
                gr.Image(
                    value=nod_logo,
                    show_label=False,
                    interactive=False,
                    elem_id="top_logo",
                ).style(width=150, height=100)
            with gr.Column(scale=5, elem_id="demo_title_outer"):
                gr.Image(
                    value=logo2,
                    show_label=False,
                    interactive=False,
                    elem_id="demo_title",
                ).style(width=150, height=100)

    with gr.Row(elem_id="ui_body"):
        with gr.Row():
            with gr.Column(scale=1, min_width=600):
                with gr.Row():
                    with gr.Group():
                        model_id = gr.Dropdown(
                            label="Model ID",
                            value="stabilityai/stable-diffusion-2-1-base",
                            choices=[
                                "Linaqruf/anything-v3.0",
                                "prompthero/openjourney",
                                "wavymulder/Analog-Diffusion",
                                "stabilityai/stable-diffusion-2-1",
                                "stabilityai/stable-diffusion-2-1-base",
                                "CompVis/stable-diffusion-v1-4",
                            ],
                        )
                        custom_model_id = gr.Textbox(
                            placeholder="check here: https://huggingface.co/models eg. runwayml/stable-diffusion-v1-5",
                            value="",
                            label="HuggingFace Model ID",
                        )
                    with gr.Group():
                        ckpt_loc = gr.File(
                            label="Upload checkpoint",
                            file_types=[".ckpt", ".safetensors"],
                        )

                with gr.Group(elem_id="prompt_box_outer"):
                    prompt = gr.Textbox(
                        label="Prompt",
                        value="cyberpunk forest by Salvador Dali",
                        lines=1,
                        elem_id="prompt_box",
                    )
                    negative_prompt = gr.Textbox(
                        label="Negative Prompt",
                        value="trees, green",
                        lines=1,
                        elem_id="prompt_box",
                    )
                with gr.Accordion(label="Advanced Options", open=False):
                    with gr.Row():
                        scheduler = gr.Dropdown(
                            label="Scheduler",
                            value="SharkEulerDiscrete",
                            choices=[
                                "DDIM",
                                "PNDM",
                                "LMSDiscrete",
                                "DPMSolverMultistep",
                                "EulerDiscrete",
                                "EulerAncestralDiscrete",
                                "SharkEulerDiscrete",
                            ],
                        )
                        batch_size = gr.Slider(
                            1, 4, value=1, step=1, label="Number of Images"
                        )
                    with gr.Row():
                        height = gr.Slider(
                            384, 786, value=512, step=8, label="Height"
                        )
                        width = gr.Slider(
                            384, 786, value=512, step=8, label="Width"
                        )
                        precision = gr.Radio(
                            label="Precision",
                            value="fp16",
                            choices=[
                                "fp16",
                                "fp32",
                            ],
                            visible=False,
                        )
                        max_length = gr.Radio(
                            label="Max Length",
                            value=64,
                            choices=[
                                64,
                                77,
                            ],
                            visible=False,
                        )
                    with gr.Row():
                        steps = gr.Slider(
                            1, 100, value=50, step=1, label="Steps"
                        )
                        guidance_scale = gr.Slider(
                            0,
                            50,
                            value=7.5,
                            step=0.1,
                            label="CFG Scale",
                        )
                    with gr.Row():
                        save_metadata_to_png = gr.Checkbox(
                            label="Save prompt information to PNG",
                            value=False,
                            interactive=True,
                        )
                        save_metadata_to_json = gr.Checkbox(
                            label="Save prompt information to JSON file",
                            value=False,
                            interactive=True,
                        )
                    with gr.Row():
                        seed = gr.Number(value=-1, precision=0, label="Seed")
                        available_devices = get_available_devices()
                        device = gr.Dropdown(
                            label="Device",
                            value=available_devices[0],
                            choices=available_devices,
                        )
                with gr.Row():
                    random_seed = gr.Button("Randomize Seed")
                    random_seed.click(
                        None,
                        inputs=[],
                        outputs=[seed],
                        _js="() => Math.floor(Math.random() * 4294967295)",
                    )
                    stable_diffusion = gr.Button("Generate Image")
                with gr.Accordion(label="Prompt Examples!", open=False):
                    ex = gr.Examples(
                        examples=prompt_examples,
                        inputs=prompt,
                        cache_examples=False,
                        elem_id="prompt_examples",
                    )

            with gr.Column(scale=1, min_width=600):
                with gr.Group():
                    gallery = gr.Gallery(
                        label="Generated images",
                        show_label=False,
                        elem_id="gallery",
                    ).style(grid=[2], height="auto")
                    std_output = gr.Textbox(
                        value="Nothing to show.",
                        lines=4,
                        show_label=False,
                    )
                output_dir = args.output_dir if args.output_dir else Path.cwd()
                output_dir = Path(output_dir, "generated_imgs")
                output_loc = gr.Textbox(
                    label="Saving Images at",
                    value=output_dir,
                    interactive=False,
                )

    prompt.submit(
        txt2img_inf,
        inputs=[
            prompt,
            negative_prompt,
            height,
            width,
            steps,
            guidance_scale,
            seed,
            batch_size,
            scheduler,
            model_id,
            custom_model_id,
            ckpt_loc,
            precision,
            device,
            max_length,
            save_metadata_to_json,
            save_metadata_to_png,
        ],
        outputs=[gallery, std_output],
        show_progress=args.progress_bar,
    )
    stable_diffusion.click(
        txt2img_inf,
        inputs=[
            prompt,
            negative_prompt,
            height,
            width,
            steps,
            guidance_scale,
            seed,
            batch_size,
            scheduler,
            model_id,
            custom_model_id,
            ckpt_loc,
            precision,
            device,
            max_length,
            save_metadata_to_json,
            save_metadata_to_png,
        ],
        outputs=[gallery, std_output],
        show_progress=args.progress_bar,
    )

shark_web.queue()
shark_web.launch(
    share=args.share,
    inbrowser=True,
    server_name="0.0.0.0",
    server_port=args.server_port,
)
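To launch the UI from a checkout, an invocation along these lines should work (the flags are optional examples from stable_args.py; the exact entry point may vary by setup):

```bash
python apps/stable_diffusion/web/index.py --device=vulkan --server_port=8080
```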
BIN
apps/stable_diffusion/web/logos/Nod_logo.png
Normal file
Binary file not shown. (After: 33 KiB)
BIN
apps/stable_diffusion/web/logos/nod-logo.png
Normal file
Binary file not shown. (After: 10 KiB)
BIN
apps/stable_diffusion/web/logos/sd-demo-logo.png
Normal file
Binary file not shown. (After: 5.0 KiB)
0
benchmarks/__init__.py
Normal file
22
benchmarks/hf_model_benchmark.py
Normal file
@@ -0,0 +1,22 @@
import torch
from shark.parser import parser
from benchmarks.hf_transformer import SharkHFBenchmarkRunner

parser.add_argument(
    "--model_name",
    type=str,
    required=True,
    help='Specifies name of HF model to benchmark. (For example "microsoft/MiniLM-L12-H384-uncased")',
)
load_args, unknown = parser.parse_known_args()

if __name__ == "__main__":
    model_name = load_args.model_name
    test_input = torch.randint(2, (1, 128))
    shark_module = SharkHFBenchmarkRunner(
        model_name, (test_input,), jit_trace=True
    )
    shark_module.benchmark_c()
    shark_module.benchmark_python((test_input,))
    shark_module.benchmark_torch(test_input)
    shark_module.benchmark_onnx(test_input)
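An illustrative invocation, using the example model from the help text above:

```bash
python benchmarks/hf_model_benchmark.py --model_name="microsoft/MiniLM-L12-H384-uncased"
```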
181
benchmarks/hf_transformer.py
Normal file
@@ -0,0 +1,181 @@
import torch
from shark.shark_benchmark_runner import SharkBenchmarkRunner
from shark.parser import shark_args
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from onnxruntime.transformers.benchmark import (
    run_pytorch,
    run_tensorflow,
    run_onnxruntime,
)
from onnxruntime.transformers.huggingface_models import MODELS
from onnxruntime.transformers.benchmark_helper import ConfigModifier, Precision
import os
import psutil


class OnnxFusionOptions(object):
    def __init__(self):
        self.disable_gelu = False
        self.disable_layer_norm = False
        self.disable_attention = False
        self.disable_skip_layer_norm = False
        self.disable_embed_layer_norm = False
        self.disable_bias_skip_layer_norm = False
        self.disable_bias_gelu = False
        self.enable_gelu_approximation = False
        self.use_mask_index = False
        self.no_attention_mask = False


class HuggingFaceLanguage(torch.nn.Module):
    def __init__(self, hf_model_name):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            hf_model_name,  # The pretrained model.
            num_labels=2,  # The number of output labels--2 for binary classification.
            output_attentions=False,  # Whether the model returns attentions weights.
            output_hidden_states=False,  # Whether the model returns all hidden-states.
            torchscript=True,
        )

    def forward(self, tokens):
        return self.model.forward(tokens)[0]


class SharkHFBenchmarkRunner(SharkBenchmarkRunner):
    # SharkRunner derived class with Benchmarking capabilities.
    def __init__(
        self,
        model_name: str,
        input: tuple,
        dynamic: bool = False,
        device: str = None,
        jit_trace: bool = False,
        from_aot: bool = False,
        frontend: str = "torch",
    ):
        self.device = device if device is not None else shark_args.device
        if self.device == "gpu":
            raise ValueError(
                "Currently GPU Benchmarking is not supported due to OOM from ORT."
            )
        self.model_name = model_name
        model = HuggingFaceLanguage(model_name)
        SharkBenchmarkRunner.__init__(
            self,
            model,
            input,
            dynamic,
            self.device,
            jit_trace,
            from_aot,
            frontend,
        )

    def benchmark_torch(self, inputs):
        use_gpu = self.device == "gpu"
        # Set the model's layer number to automatic.
        config_modifier = ConfigModifier(None)
        num_threads = psutil.cpu_count(logical=False)
        batch_sizes = [inputs.shape[0]]
        sequence_lengths = [inputs.shape[-1]]
        cache_dir = os.path.join(".", "cache_models")
        verbose = False
        result = run_pytorch(
            use_gpu,
            [self.model_name],
            None,
            config_modifier,
            Precision.FLOAT32,
            num_threads,
            batch_sizes,
            sequence_lengths,
            shark_args.num_iterations,
            False,
            cache_dir,
            verbose,
        )
        print(
            f"ONNX Pytorch-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
        )

    # TODO: Currently non-functional due to a TF runtime error. There might be some issue with initializing TF.
    def benchmark_tf(self, inputs):
        use_gpu = self.device == "gpu"
        # Set the model's layer number to automatic.
        config_modifier = ConfigModifier(None)
        num_threads = psutil.cpu_count(logical=False)
        batch_sizes = [inputs.shape[0]]
        sequence_lengths = [inputs.shape[-1]]
        cache_dir = os.path.join(".", "cache_models")
        verbose = False
        result = run_tensorflow(
            use_gpu,
            [self.model_name],
            None,
            config_modifier,
            Precision.FLOAT32,
            num_threads,
            batch_sizes,
            sequence_lengths,
            shark_args.num_iterations,
            cache_dir,
            verbose,
        )
        print(
            f"ONNX TF-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
        )

    def benchmark_onnx(self, inputs):
        if self.model_name not in MODELS:
            print(
                f"{self.model_name} is currently not supported in ORT's HF. Check \
https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/huggingface_models.py \
for currently supported models. Exiting benchmark ONNX."
            )
            return
        use_gpu = self.device == "gpu"
        num_threads = psutil.cpu_count(logical=False)
        batch_sizes = [inputs.shape[0]]
        sequence_lengths = [inputs.shape[-1]]
        cache_dir = os.path.join(".", "cache_models")
        onnx_dir = os.path.join(".", "onnx_models")
        verbose = False
        input_counts = [1]
        optimize_onnx = True
        validate_onnx = False
        disable_ort_io_binding = False
        use_raw_attention_mask = True
        model_fusion_statistics = {}
        overwrite = False
        model_source = "pt"  # Either "pt" or "tf"
        provider = None
        config_modifier = ConfigModifier(None)
        onnx_args = OnnxFusionOptions()
        result = run_onnxruntime(
            use_gpu,
            provider,
            [self.model_name],
            None,
            config_modifier,
            Precision.FLOAT32,
            num_threads,
            batch_sizes,
            sequence_lengths,
            shark_args.num_iterations,
            input_counts,
            optimize_onnx,
            validate_onnx,
            cache_dir,
            onnx_dir,
            verbose,
            overwrite,
            disable_ort_io_binding,
            use_raw_attention_mask,
            model_fusion_statistics,
            model_source,
            onnx_args,
        )
        print(
            f"ONNX ORT-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
        )
231
benchmarks/tests/test_benchmark.py
Normal file
@@ -0,0 +1,231 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers

import torch
import tensorflow as tf
import numpy as np
import torchvision.models as models
from transformers import (
    AutoModelForSequenceClassification,
    BertTokenizer,
    TFBertModel,
)
import importlib
import pytest
import unittest

torch.manual_seed(0)
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

##################### Tensorflow Hugging Face LM Models ###################################
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1

# Create a set of 2-dimensional inputs
tf_bert_input = [
    tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
    tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
    tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
]


class TFHuggingFaceLanguage(tf.Module):
    def __init__(self, hf_model_name):
        super(TFHuggingFaceLanguage, self).__init__()
        # Create a BERT trainer with the created network.
        self.m = TFBertModel.from_pretrained(hf_model_name, from_pt=True)

        # Invoke the trainer model on the inputs. This causes the layer to be built.
        self.m.predict = lambda x, y, z: self.m.call(
            input_ids=x, attention_mask=y, token_type_ids=z, training=False
        )

    @tf.function(input_signature=tf_bert_input, jit_compile=True)
    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.m.predict(input_ids, attention_mask, token_type_ids)


def get_TFhf_model(name):
    model = TFHuggingFaceLanguage(name)
    tokenizer = BertTokenizer.from_pretrained(name)
    text = "Replace me by any text you'd like."
    encoded_input = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
    )
    for key in encoded_input:
        encoded_input[key] = tf.expand_dims(
            tf.convert_to_tensor(encoded_input[key]), 0
        )
    test_input = (
        encoded_input["input_ids"],
        encoded_input["attention_mask"],
        encoded_input["token_type_ids"],
    )
    actual_out = model.forward(*test_input)
    return model, test_input, actual_out


##################### Hugging Face LM Models ###################################


class HuggingFaceLanguage(torch.nn.Module):
    def __init__(self, hf_model_name):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            hf_model_name,  # The pretrained model.
            num_labels=2,  # The number of output labels--2 for binary classification.
            output_attentions=False,  # Whether the model returns attentions weights.
            output_hidden_states=False,  # Whether the model returns all hidden-states.
            torchscript=True,
        )

    def forward(self, tokens):
        return self.model.forward(tokens)[0]


def get_hf_model(name):
    model = HuggingFaceLanguage(name)
    # TODO: Currently the test input is set to (1,128)
    test_input = torch.randint(2, (1, 128))
    actual_out = model(test_input)
    return model, test_input, actual_out


################################################################################

##################### Torch Vision Models ###################################


class VisionModule(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.train(False)

    def forward(self, input):
        return self.model.forward(input)


def get_vision_model(torch_model):
    model = VisionModule(torch_model)
    # TODO: Currently the test input is set to (1, 3, 224, 224)
    test_input = torch.randn(1, 3, 224, 224)
    actual_out = model(test_input)
    return model, test_input, actual_out


############################# Benchmark Tests ####################################

pytest_benchmark_param = pytest.mark.parametrize(
    ("dynamic", "device"),
    [
        pytest.param(False, "cpu"),
        # TODO: Language models are failing for the dynamic case.
        pytest.param(True, "cpu", marks=pytest.mark.skip),
        pytest.param(
            False,
            "gpu",
            marks=pytest.mark.skipif(
                check_device_drivers("gpu"), reason="nvidia-smi not found"
            ),
        ),
        pytest.param(True, "gpu", marks=pytest.mark.skip),
        pytest.param(
            False,
            "vulkan",
            marks=pytest.mark.skipif(
                check_device_drivers("vulkan"),
                reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases",
            ),
        ),
        pytest.param(
            True,
            "vulkan",
            marks=pytest.mark.skipif(
                check_device_drivers("vulkan"),
                reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases",
            ),
        ),
    ],
)


@pytest.mark.skipif(
    importlib.util.find_spec("iree.tools") is None,
    reason="Cannot find tools to import TF",
)
@pytest_benchmark_param
def test_bench_minilm_torch(dynamic, device):
    model, test_input, act_out = get_hf_model(
        "microsoft/MiniLM-L12-H384-uncased"
    )
    shark_module = SharkInference(
        model,
        (test_input,),
        device=device,
        dynamic=dynamic,
        jit_trace=True,
        benchmark_mode=True,
    )
    try:
        # If benchmarking is successful, assert success/True.
        shark_module.compile()
        shark_module.benchmark_all((test_input,))
        assert True
    except Exception as e:
        # If anything goes wrong during benchmarking, assert False/failure.
        assert False


@pytest.mark.skipif(
    importlib.util.find_spec("iree.tools") is None,
    reason="Cannot find tools to import TF",
)
@pytest_benchmark_param
def test_bench_distilbert(dynamic, device):
    model, test_input, act_out = get_TFhf_model("distilbert-base-uncased")
    shark_module = SharkInference(
        model,
        test_input,
        device=device,
        dynamic=dynamic,
        jit_trace=True,
        benchmark_mode=True,
    )
    try:
        # If benchmarking is successful, assert success/True.
        shark_module.set_frontend("tensorflow")
        shark_module.compile()
        shark_module.benchmark_all(test_input)
        assert True
    except Exception as e:
        # If anything goes wrong during benchmarking, assert False/failure.
        assert False


@pytest.mark.skip(reason="XLM Roberta too large to test.")
@pytest_benchmark_param
def test_bench_xlm_roberta(dynamic, device):
    model, test_input, act_out = get_TFhf_model("xlm-roberta-base")
    shark_module = SharkInference(
        model,
        test_input,
        device=device,
        dynamic=dynamic,
        jit_trace=True,
        benchmark_mode=True,
    )
    try:
        # If benchmarking is successful, assert success/True.
        shark_module.set_frontend("tensorflow")
        shark_module.compile()
        shark_module.benchmark_all(test_input)
        assert True
    except Exception as e:
        # If anything goes wrong during benchmarking, assert False/failure.
        assert False
45
benchmarks/tests/test_hf_benchmark.py
Normal file
@@ -0,0 +1,45 @@
import torch
from benchmarks.hf_transformer import SharkHFBenchmarkRunner
import importlib
import pytest

torch.manual_seed(0)

############################# HF Benchmark Tests ####################################

# Test running benchmark module without failing.
pytest_benchmark_param = pytest.mark.parametrize(
    ("dynamic", "device"),
    [
        pytest.param(False, "cpu"),
        # TODO: Language models are failing for the dynamic case.
        pytest.param(True, "cpu", marks=pytest.mark.skip),
    ],
)


@pytest.mark.skipif(
    importlib.util.find_spec("onnxruntime") is None,
    reason="Cannot find ONNXRUNTIME.",
)
@pytest_benchmark_param
def test_HFbench_minilm_torch(dynamic, device):
    model_name = "bert-base-uncased"
    test_input = torch.randint(2, (1, 128))
    try:
        shark_module = SharkHFBenchmarkRunner(
            model_name,
            (test_input,),
            jit_trace=True,
            dynamic=dynamic,
            device=device,
        )
        shark_module.benchmark_c()
        shark_module.benchmark_python((test_input,))
        shark_module.benchmark_torch(test_input)
        shark_module.benchmark_onnx(test_input)
        # If benchmarking is successful, assert success/True.
        assert True
    except Exception as e:
        # If anything goes wrong during benchmarking, assert False/failure.
        assert False
45
build_tools/image_comparison.py
Normal file
@@ -0,0 +1,45 @@
import argparse
from PIL import Image
import numpy as np

import requests
import shutil
import os
import subprocess

parser = argparse.ArgumentParser()

parser.add_argument("-n", "--newfile")
parser.add_argument(
    "-g",
    "--golden_url",
    default="https://storage.googleapis.com/shark_tank/testdata/cyberpunk_fores_42_0_230119_021148.png",
)


def get_image(url, local_filename):
    res = requests.get(url, stream=True)
    if res.status_code == 200:
        with open(local_filename, "wb") as f:
            shutil.copyfileobj(res.raw, f)


def compare_images(new_filename, golden_filename):
    new = np.array(Image.open(new_filename)) / 255.0
    golden = np.array(Image.open(golden_filename)) / 255.0
    diff = np.abs(new - golden)
    mean = np.mean(diff)
    if mean > 0.01:
        subprocess.run(
            ["gsutil", "cp", new_filename, "gs://shark_tank/testdata/builder/"]
        )
        raise SystemExit("new and golden not close")
    else:
        print("SUCCESS")


if __name__ == "__main__":
    args = parser.parse_args()
    tempfile_name = os.path.join(os.getcwd(), "golden.png")
    get_image(args.golden_url, tempfile_name)
    compare_images(args.newfile, tempfile_name)
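A typical invocation, matching how stable_diff_main_test.sh (below) drives it; the script downloads the golden image and exits non-zero when the mean per-pixel difference exceeds 0.01:

```bash
python build_tools/image_comparison.py -n ./test_images/*.png
```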
5
build_tools/populate_sharktank_ci.sh
Normal file
@@ -0,0 +1,5 @@
#!/bin/bash

IMPORTER=1 BENCHMARK=1 ./setup_venv.sh
source $GITHUB_WORKSPACE/shark.venv/bin/activate
python generate_sharktank.py --upload=False --ci_tank_dir=True
37
build_tools/scrape_releases.py
Normal file
@@ -0,0 +1,37 @@
"""Scrapes the github releases API to generate a static pip-install-able releases page.
|
||||
|
||||
See https://github.com/llvm/torch-mlir/issues/1374
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
|
||||
import requests
|
||||
|
||||
# Parse arguments
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("owner", type=str)
|
||||
parser.add_argument("repo", type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Get releases
|
||||
response = requests.get(
|
||||
f"https://api.github.com/repos/{args.owner}/{args.repo}/releases"
|
||||
)
|
||||
body = json.loads(response.content)
|
||||
|
||||
# Parse releases
|
||||
releases = []
|
||||
for row in body:
|
||||
for asset in row["assets"]:
|
||||
releases.append((asset["name"], asset["browser_download_url"]))
|
||||
|
||||
# Output HTML
|
||||
html = """<!DOCTYPE html>
|
||||
<html>
|
||||
<body>
|
||||
"""
|
||||
for name, url in releases:
|
||||
html += f" <a href='{url}'>{name}</a><br />\n"
|
||||
html += """ </body>
|
||||
</html>"""
|
||||
print(html)
|
||||
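An illustrative invocation (the owner/repo values are examples, taken from the issue referenced in the docstring); the generated HTML goes to stdout:

```bash
python build_tools/scrape_releases.py llvm torch-mlir > releases.html
```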
7
build_tools/stable_diff_main_test.sh
Normal file
@@ -0,0 +1,7 @@
rm -rf ./test_images
mkdir test_images
python shark/examples/shark_inference/stable_diffusion/main.py --device=vulkan --output_dir=./test_images --no-load_vmfb --no-use_tuned
python shark/examples/shark_inference/stable_diffusion/main.py --device=vulkan --output_dir=./test_images --no-load_vmfb --no-use_tuned --beta_models=True

python build_tools/image_comparison.py -n ./test_images/*.png
exit $?
77
build_tools/stable_diffusion_testing.py
Normal file
@@ -0,0 +1,77 @@
import os
import subprocess
from shark.examples.shark_inference.stable_diffusion.resources import (
    get_json_file,
)
from shark.shark_downloader import download_public_file
from image_comparison import compare_images
import argparse
from glob import glob
import shutil

model_config_dicts = get_json_file(
    os.path.join(
        os.getcwd(),
        "shark/examples/shark_inference/stable_diffusion/resources/model_config.json",
    )
)


def test_loop(device="vulkan", beta=False, extra_flags=[]):
    # Get golden values from tank
    shutil.rmtree("./test_images", ignore_errors=True)
    os.mkdir("./test_images")
    os.mkdir("./test_images/golden")
    hf_model_names = model_config_dicts[0].values()
    tuned_options = ["--no-use_tuned"]  # 'use_tuned'
    devices = ["vulkan"]
    if beta:
        extra_flags.append("--beta_models=True")
    for model_name in hf_model_names:
        for use_tune in tuned_options:
            command = [
                "python",
                "shark/examples/shark_inference/stable_diffusion/main.py",
                "--device=" + device,
                "--output_dir=./test_images/" + model_name,
                "--hf_model_id=" + model_name,
                use_tune,
            ]
            command += extra_flags
            generated_image = not subprocess.call(
                command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
            )
            if generated_image:
                os.makedirs(
                    "./test_images/golden/" + model_name, exist_ok=True
                )
                download_public_file(
                    "gs://shark_tank/testdata/golden/" + model_name,
                    "./test_images/golden/" + model_name,
                )
                comparison = [
                    "python",
                    "build_tools/image_comparison.py",
                    "--golden_url=gs://shark_tank/testdata/golden/"
                    + model_name
                    + "/*.png",
                    "--newfile=./test_images/" + model_name + "/*.png",
                ]
                test_file = glob("./test_images/" + model_name + "/*.png")[0]
                golden_path = "./test_images/golden/" + model_name + "/*.png"
                golden_file = glob(golden_path)[0]
                compare_images(test_file, golden_file)


parser = argparse.ArgumentParser()

parser.add_argument("-d", "--device", default="vulkan")
parser.add_argument(
    "-b", "--beta", action=argparse.BooleanOptionalAction, default=False
)


if __name__ == "__main__":
    args = parser.parse_args()
    print(args)
    test_loop(args.device, args.beta, [])
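An illustrative run of the regression loop, using the flags registered above:

```bash
python build_tools/stable_diffusion_testing.py --device=vulkan --beta
```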
62
conftest.py
Normal file
@@ -0,0 +1,62 @@
def pytest_addoption(parser):
    # Attaches SHARK command-line arguments to the pytest machinery.
    parser.addoption(
        "--benchmark",
        action="store_true",
        default="False",
        help="Pass option to benchmark and write results.csv",
    )
    parser.addoption(
        "--onnx_bench",
        action="store_true",
        default="False",
        help="Add ONNX benchmark results to pytest benchmarks.",
    )
    parser.addoption(
        "--tf32",
        action="store_true",
        default="False",
        help="Use TensorFloat-32 calculations.",
    )
    parser.addoption(
        "--save_repro",
        action="store_true",
        default="False",
        help="Pass option to save reproduction artifacts to SHARK/shark_tmp/test_case/",
    )
    parser.addoption(
        "--save_fails",
        action="store_true",
        default="False",
        help="Save reproduction artifacts for a test case only if it fails. Default is False.",
    )
    parser.addoption(
        "--ci",
        action="store_true",
        default="False",
        help="Enables uploading of reproduction artifacts upon test case failure during iree-compile or validation. Must be passed with the --ci_sha option.",
    )
    parser.addoption(
        "--update_tank",
        action="store_true",
        default="False",
        help="Update local shark tank with latest artifacts.",
    )
    parser.addoption(
        "--ci_sha",
        action="store",
        default="None",
        help="Passes the github SHA of the CI workflow to include in the google storage directory for reproduction artifacts.",
    )
    parser.addoption(
        "--local_tank_cache",
        action="store",
        default="",
        help="Specify the directory in which all downloaded shark_tank artifacts will be cached.",
    )
    parser.addoption(
        "--tank_url",
        type=str,
        default="gs://shark_tank/latest",
        help="URL to bucket from which to download SHARK tank artifacts. Default is gs://shark_tank/latest",
    )
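An illustrative pytest invocation wiring these options together; the test path is an assumption, not part of this diff:

```bash
pytest tank/test_models.py --benchmark --update_tank --tank_url="gs://shark_tank/latest"
```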
3
cpp/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
*.mlir
*.vmfb
*.ini
52
cpp/CMakeLists.txt
Normal file
@@ -0,0 +1,52 @@
# Copyright 2022 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

cmake_minimum_required(VERSION 3.21...3.23)

#-------------------------------------------------------------------------------
# Project configuration
#-------------------------------------------------------------------------------

project(iree-samples C CXX)
set(CMAKE_C_STANDARD 11)
set(CMAKE_CXX_STANDARD 17)
set_property(GLOBAL PROPERTY USE_FOLDERS ON)

#-------------------------------------------------------------------------------
# Core project dependency
#-------------------------------------------------------------------------------

message(STATUS "Fetching core IREE repo (this may take a few minutes)...")
# Note: for log output, set -DFETCHCONTENT_QUIET=OFF,
# see https://gitlab.kitware.com/cmake/cmake/-/issues/18238#note_440475

include(FetchContent)

FetchContent_Declare(
  iree
  GIT_REPOSITORY https://github.com/nod-ai/shark-runtime.git
  GIT_TAG shark
  GIT_SUBMODULES_RECURSE OFF
  GIT_SHALLOW OFF
  GIT_PROGRESS ON
  USES_TERMINAL_DOWNLOAD ON
)

# Extend module path to find MLIR CMake modules.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_BINARY_DIR}/lib/cmake/mlir")

# Disable core project features not needed for these out of tree samples.
set(IREE_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(IREE_BUILD_SAMPLES OFF CACHE BOOL "" FORCE)

FetchContent_MakeAvailable(iree)
FetchContent_GetProperties(iree SOURCE_DIR IREE_SOURCE_DIR)

#-------------------------------------------------------------------------------
# Individual samples
#-------------------------------------------------------------------------------

add_subdirectory(vulkan_gui)
82
cpp/README.md
Normal file
@@ -0,0 +1,82 @@
# SHARK C/C++ Samples

These C/C++ samples can be built using CMake. The samples depend on the main
SHARK-Runtime project's C/C++ sources, including both the runtime and the compiler.

Individual samples may require additional dependencies. Watch CMake's output
for information about which ones you are missing for individual samples.

On Windows we recommend using https://github.com/microsoft/vcpkg to download packages for
your system. The general setup flow looks like:

*Install and activate SHARK*

```bash
source shark.venv/bin/activate # follow the main repo instructions to set up your venv
```

*Install Dependencies*

```bash
vcpkg install [library] --triplet [your platform]
vcpkg integrate install

# Then pass `-DCMAKE_TOOLCHAIN_FILE=[check logs for path]` when configuring CMake
```

On Ubuntu Linux you can install the SDL2 dependency with

```bash
sudo apt install libsdl2-dev
```

*Build*
```bash
cd cpp
cmake -GNinja -B build/
cmake --build build/
```

*Prepare the model*
```bash
wget https://storage.googleapis.com/shark_tank/latest/resnet50_tf/resnet50_tf.mlir
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvm-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 resnet50_tf.mlir -o resnet50_tf.vmfb
```
*Prepare the input*

```bash
python save_img.py
```
Note that this requires TensorFlow, e.g.
```bash
python -m pip install tensorflow
```

*Run the vulkan_gui*
```bash
./build/vulkan_gui/iree-samples-resnet-vulkan-gui
```

## Other models
A tool for benchmarking other models is also built; it can be invoked with a command like the following
```bash
./build/vulkan_gui/iree-vulkan-gui --module-file=path/to/.vmfb --function_input=...
```
See `./build/vulkan_gui/iree-vulkan-gui --help` for an explanation of the function inputs. For example, the stable diffusion UNet can be tested with the following commands:
```bash
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/stable_diff_tf.mlir
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 stable_diff_tf.mlir -o stable_diff_tf.vmfb
./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=2x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32
```
The VAE and CLIP autoencoder are also available
```bash
# VAE
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/vae_tf/vae.mlir
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 vae.mlir -o vae.vmfb
./build/vulkan_gui/iree-vulkan-gui --module-file=vae.vmfb --function_input=1x4x64x64xf32

# CLIP Autoencoder
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/clip_tf/clip_autoencoder.mlir
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 clip_autoencoder.mlir -o clip_autoencoder.vmfb
./build/vulkan_gui/iree-vulkan-gui --module-file=clip_autoencoder.vmfb --function_input=1x77xi32 --function_input=1x77xi32
```
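Raw binary inputs for the `--function_input=[shape]xtype=@file.bin` form can be produced from Python. A minimal sketch using numpy; the file name and shape here are illustrative only:

```python
import numpy as np

# Illustrative: a 2x4x64x64 f32 latent matching --function_input=2x4x64x64xf32.
latents = np.random.randn(2, 4, 64, 64).astype(np.float32)
# tofile() writes raw bytes with no header, which is what @file.bin expects.
latents.tofile("latents.bin")  # pass as --function_input=2x4x64x64xf32=@latents.bin
```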
BIN
cpp/dog_imagenet.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 26 KiB
18
cpp/save_img.py
Normal file
@@ -0,0 +1,18 @@
import tensorflow as tf


def load_and_preprocess_image(fname: str):
    image = tf.io.read_file(fname)
    image = tf.image.decode_image(image, channels=3)
    image = tf.image.resize(image, (224, 224))
    image = image[tf.newaxis, :]
    # Apply the ResNet50 preprocessing pipeline.
    input_tensor = tf.keras.applications.resnet50.preprocess_input(image)
    return input_tensor


data = load_and_preprocess_image("dog_imagenet.jpg").numpy()

data.tofile("dog.bin")
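The resulting dog.bin is a raw float32 dump with no header or shape metadata. A quick way to sanity-check its contents (a sketch, assuming numpy is installed):

```python
import numpy as np

# dog.bin holds 1*224*224*3 float32 values written by data.tofile() above.
data = np.fromfile("dog.bin", dtype=np.float32).reshape(1, 224, 224, 3)
print(data.shape, data.dtype)  # (1, 224, 224, 3) float32
```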
84
cpp/vision_inference/CMakeLists.txt
Normal file
@@ -0,0 +1,84 @@
# Copyright 2022 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

if(NOT IREE_TARGET_BACKEND_LLVM_CPU OR
   NOT IREE_HAL_EXECUTABLE_LOADER_EMBEDDED_ELF)
  message(STATUS "Missing LLVM backend and/or embedded ELF loader, skipping vision_inference sample")
  return()
endif()

# vcpkg install stb
# tested with version 2021-09-10
find_package(Stb)
if(NOT Stb_FOUND)
  message(STATUS "Could not find Stb, skipping vision_inference sample")
  return()
endif()

# Compile mnist.mlir to mnist.vmfb.
set(_COMPILE_TOOL_EXECUTABLE $<TARGET_FILE:iree-compile>)
set(_COMPILE_ARGS)
list(APPEND _COMPILE_ARGS "--iree-input-type=mhlo")
list(APPEND _COMPILE_ARGS "--iree-hal-target-backends=llvm-cpu")
list(APPEND _COMPILE_ARGS "${IREE_SOURCE_DIR}/samples/models/mnist.mlir")
list(APPEND _COMPILE_ARGS "-o")
list(APPEND _COMPILE_ARGS "mnist.vmfb")
add_custom_command(
  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/mnist.vmfb
  COMMAND ${_COMPILE_TOOL_EXECUTABLE} ${_COMPILE_ARGS}
  DEPENDS ${_COMPILE_TOOL_EXECUTABLE} "${IREE_SOURCE_DIR}/samples/models/mnist.mlir"
)
# Embed mnist.vmfb into a C file as mnist_bytecode_module_c.[h/c].
set(_EMBED_DATA_EXECUTABLE $<TARGET_FILE:generate_embed_data>)
set(_EMBED_ARGS)
list(APPEND _EMBED_ARGS "--output_header=mnist_bytecode_module_c.h")
list(APPEND _EMBED_ARGS "--output_impl=mnist_bytecode_module_c.c")
list(APPEND _EMBED_ARGS "--identifier=iree_samples_vision_inference_mnist_bytecode_module")
list(APPEND _EMBED_ARGS "--flatten")
list(APPEND _EMBED_ARGS "${CMAKE_CURRENT_BINARY_DIR}/mnist.vmfb")
add_custom_command(
  OUTPUT "mnist_bytecode_module_c.h" "mnist_bytecode_module_c.c"
  COMMAND ${_EMBED_DATA_EXECUTABLE} ${_EMBED_ARGS}
  DEPENDS ${_EMBED_DATA_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/mnist.vmfb
)
# Define a library target for mnist_bytecode_module_c.
add_library(iree_samples_vision_inference_mnist_bytecode_module_c OBJECT)
target_sources(iree_samples_vision_inference_mnist_bytecode_module_c
  PRIVATE
    mnist_bytecode_module_c.h
    mnist_bytecode_module_c.c
)

# Define the sample executable.
set(_NAME "iree-run-mnist-module")
add_executable(${_NAME} "")
target_sources(${_NAME}
  PRIVATE
    "image_util.h"
    "image_util.c"
    "iree-run-mnist-module.c"
)
set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "iree-run-mnist-module")
target_include_directories(${_NAME} PUBLIC
  $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
)
target_include_directories(${_NAME} PRIVATE
  ${Stb_INCLUDE_DIR}
)
target_link_libraries(${_NAME}
  iree_base_base
  iree_base_tracing
  iree_hal_hal
  iree_runtime_runtime
  iree_samples_vision_inference_mnist_bytecode_module_c
)

# Define a target that copies the test image into the build directory.
add_custom_target(iree_samples_vision_inference_test_image
  COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/mnist_test.png" "${CMAKE_CURRENT_BINARY_DIR}/mnist_test.png")
add_dependencies(${_NAME} iree_samples_vision_inference_test_image)

message(STATUS "Configured vision_inference sample successfully")
8
cpp/vision_inference/README.md
Normal file
@@ -0,0 +1,8 @@
# Vision Inference Sample (C code)

This sample demonstrates how to run an MNIST handwritten digit detection vision
model on an image using IREE's C API.

A similar sample is implemented using a Python script and IREE's command line
tools over in the primary IREE repository at
https://github.com/iree-org/iree/tree/main/samples/vision_inference
224
cpp/vision_inference/image_util.c
Normal file
@@ -0,0 +1,224 @@
// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "image_util.h"

#include <math.h>

#include "iree/base/internal/flags.h"
#include "iree/base/tracing.h"

#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"

iree_status_t iree_tools_utils_pixel_rescaled_to_buffer(
    const uint8_t* pixel_data, iree_host_size_t buffer_length,
    const float* input_range, iree_host_size_t range_length,
    float* out_buffer) {
  IREE_TRACE_ZONE_BEGIN(z0);
  if (range_length != 2) {
    IREE_TRACE_ZONE_END(z0);
    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                            "range must be a 2-element [min, max] array");
  }
  float input_scale = fabsf(input_range[1] - input_range[0]) / 2.0f;
  float input_offset = (input_range[0] + input_range[1]) / 2.0f;
  const float kUint8Mean = 127.5f;
  for (int i = 0; i < buffer_length; ++i) {
    out_buffer[i] =
        (((float)(pixel_data[i])) - kUint8Mean) / kUint8Mean * input_scale +
        input_offset;
  }
  IREE_TRACE_ZONE_END(z0);
  return iree_ok_status();
}

iree_status_t iree_tools_utils_load_pixel_data_impl(
    const iree_string_view_t filename, const iree_hal_dim_t* shape,
    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
    uint8_t** out_pixel_data, iree_host_size_t* out_buffer_length) {
  int img_dims[3];
  if (stbi_info(filename.data, img_dims, &(img_dims[1]), &(img_dims[2])) == 0) {
    return iree_make_status(IREE_STATUS_NOT_FOUND, "can't load image %.*s",
                            (int)filename.size, filename.data);
  }
  if (!(element_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32 ||
        element_type == IREE_HAL_ELEMENT_TYPE_SINT_8 ||
        element_type == IREE_HAL_ELEMENT_TYPE_UINT_8)) {
    char element_type_str[16];
    IREE_RETURN_IF_ERROR(iree_hal_format_element_type(
        element_type, sizeof(element_type_str), element_type_str, NULL));
    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                            "element type %s not supported", element_type_str);
  }
  switch (shape_rank) {
    case 2: {  // Assume tensor <height x width>.
      if (img_dims[2] != 1 || (shape[0] != img_dims[1]) ||
          (shape[1] != img_dims[0])) {
        return iree_make_status(
            IREE_STATUS_INVALID_ARGUMENT,
            "image size: %dx%dx%d, expected: %" PRIdim "x%" PRIdim, img_dims[0],
            img_dims[1], img_dims[2], shape[1], shape[0]);
      }
      break;
    }
    case 3: {  // Assume tensor <height x width x channel>.
      if (shape[0] != img_dims[1] || shape[1] != img_dims[0] ||
          shape[2] != img_dims[2]) {
        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                                "image size: %dx%dx%d, expected: %" PRIdim
                                "x%" PRIdim "x%" PRIdim,
                                img_dims[0], img_dims[1], img_dims[2], shape[1],
                                shape[0], shape[2]);
      }
      break;
    }
    case 4: {  // Assume tensor <batch x height x width x channel>.
      if (shape[1] != img_dims[1] || shape[2] != img_dims[0] ||
          shape[3] != img_dims[2]) {
        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                                "image size: %dx%dx%d, expected: %" PRIdim
                                "x%" PRIdim "x%" PRIdim,
                                img_dims[0], img_dims[1], img_dims[2], shape[2],
                                shape[1], shape[3]);
      }
      break;
    }
    default:
      return iree_make_status(
          IREE_STATUS_INVALID_ARGUMENT,
          "input buffer shape rank %" PRIhsz " not supported", shape_rank);
  }
  // Drop the alpha channel if present.
  int req_ch = (img_dims[2] >= 3) ? 3 : 0;
  *out_pixel_data = stbi_load(filename.data, img_dims, &(img_dims[1]),
                              &(img_dims[2]), req_ch);
  if (*out_pixel_data == NULL) {
    return iree_make_status(IREE_STATUS_NOT_FOUND, "can't load image %.*s",
                            (int)filename.size, filename.data);
  }
  *out_buffer_length =
      img_dims[0] * img_dims[1] * (img_dims[2] > 3 ? 3 : img_dims[2]);
  return iree_ok_status();
}

iree_status_t iree_tools_utils_load_pixel_data(
    const iree_string_view_t filename, const iree_hal_dim_t* shape,
    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
    uint8_t** out_pixel_data, iree_host_size_t* out_buffer_length) {
  IREE_TRACE_ZONE_BEGIN(z0);
  iree_status_t result = iree_tools_utils_load_pixel_data_impl(
      filename, shape, shape_rank, element_type, out_pixel_data,
      out_buffer_length);
  IREE_TRACE_ZONE_END(z0);
  return result;
}

iree_status_t iree_tools_utils_buffer_view_from_image(
    const iree_string_view_t filename, const iree_hal_dim_t* shape,
    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
    iree_hal_allocator_t* allocator, iree_hal_buffer_view_t** out_buffer_view) {
  IREE_TRACE_ZONE_BEGIN(z0);
  *out_buffer_view = NULL;
  if (element_type != IREE_HAL_ELEMENT_TYPE_SINT_8 &&
      element_type != IREE_HAL_ELEMENT_TYPE_UINT_8) {
    IREE_TRACE_ZONE_END(z0);
    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                            "element type should be i8 or u8");
  }

  iree_status_t result;
  uint8_t* pixel_data = NULL;
  iree_host_size_t buffer_length;
  result = iree_tools_utils_load_pixel_data(
      filename, shape, shape_rank, element_type, &pixel_data, &buffer_length);
  if (iree_status_is_ok(result)) {
    iree_host_size_t element_byte =
        iree_hal_element_dense_byte_count(element_type);
    // SINT_8 and UINT_8 perform a direct buffer wrap.
    result = iree_hal_buffer_view_allocate_buffer(
        allocator, shape_rank, shape, element_type,
        IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR,
        (iree_hal_buffer_params_t){
            .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
            .access = IREE_HAL_MEMORY_ACCESS_READ,
            .usage = IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE |
                     IREE_HAL_BUFFER_USAGE_TRANSFER,
        },
        iree_make_const_byte_span(pixel_data, element_byte * buffer_length),
        out_buffer_view);
  }
  stbi_image_free(pixel_data);
  IREE_TRACE_ZONE_END(z0);
  return result;
}

typedef struct iree_tools_utils_buffer_view_load_params_t {
  const uint8_t* pixel_data;
  iree_host_size_t pixel_data_length;
  const float* input_range;
  iree_host_size_t input_range_length;
} iree_tools_utils_buffer_view_load_params_t;

static iree_status_t iree_tools_utils_buffer_view_load_image_rescaled(
    iree_hal_buffer_mapping_t* mapping, void* user_data) {
  iree_tools_utils_buffer_view_load_params_t* params =
      (iree_tools_utils_buffer_view_load_params_t*)user_data;
  return iree_tools_utils_pixel_rescaled_to_buffer(
      params->pixel_data, params->pixel_data_length, params->input_range,
      params->input_range_length, (float*)mapping->contents.data);
}

iree_status_t iree_tools_utils_buffer_view_from_image_rescaled(
    const iree_string_view_t filename, const iree_hal_dim_t* shape,
    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
    iree_hal_allocator_t* allocator, const float* input_range,
    iree_host_size_t input_range_length,
    iree_hal_buffer_view_t** out_buffer_view) {
  IREE_TRACE_ZONE_BEGIN(z0);
  *out_buffer_view = NULL;
  if (element_type != IREE_HAL_ELEMENT_TYPE_FLOAT_32) {
    IREE_TRACE_ZONE_END(z0);
    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                            "element type should be f32");
  }

  // Classic row-major image layout.
  iree_hal_encoding_type_t encoding_type =
      IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR;

  // Load pixel data from the file into a new host memory allocation (the only
  // interface stb_image provides). A real application would want to use the
  // generation callback to directly decode the image into the target mapped
  // device buffer.
  uint8_t* pixel_data = NULL;
  iree_host_size_t buffer_length = 0;
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_tools_utils_load_pixel_data(filename, shape, shape_rank,
                                           element_type, &pixel_data,
                                           &buffer_length));

  iree_tools_utils_buffer_view_load_params_t params = {
      .pixel_data = pixel_data,
      .pixel_data_length = buffer_length,
      .input_range = input_range,
      .input_range_length = input_range_length,
  };
  iree_status_t status = iree_hal_buffer_view_generate_buffer(
      allocator, shape_rank, shape, element_type, encoding_type,
      (iree_hal_buffer_params_t){
          .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL |
                  IREE_HAL_MEMORY_TYPE_HOST_VISIBLE,
          .usage = IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE |
                   IREE_HAL_BUFFER_USAGE_TRANSFER |
                   IREE_HAL_BUFFER_USAGE_MAPPING,
      },
      iree_tools_utils_buffer_view_load_image_rescaled, &params,
      out_buffer_view);

  stbi_image_free(pixel_data);
  IREE_TRACE_ZONE_END(z0);
  return status;
}
77
cpp/vision_inference/image_util.h
Normal file
@@ -0,0 +1,77 @@
// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef IREE_SAMPLES_VISION_INFERENCE_IMAGE_UTIL_H_
#define IREE_SAMPLES_VISION_INFERENCE_IMAGE_UTIL_H_

#include "iree/base/api.h"
#include "iree/hal/api.h"
#include "iree/hal/buffer_view.h"

#if __cplusplus
extern "C" {
#endif  // __cplusplus

// Loads the image at |filename| into |out_pixel_data| and sets
// |out_buffer_length| to its length.
//
// The image dimensions must match the width, height, and channel count in
// |shape|, with 2 <= |shape_rank| <= 4 to match the image tensor format.
//
// The file must be in a format supported by stb_image.h.
// The returned |out_pixel_data| buffer must be released by the caller.
iree_status_t iree_tools_utils_load_pixel_data(
    const iree_string_view_t filename, const iree_hal_dim_t* shape,
    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
    uint8_t** out_pixel_data, iree_host_size_t* out_buffer_length);

// Parses the content of the image file at |filename| into a HAL buffer view
// |out_buffer_view|. |out_buffer_view| properties are defined by |shape|,
// |shape_rank|, and |element_type|, and it is allocated by |allocator|.
//
// The |element_type| has to be SINT_8 or UINT_8. For FLOAT_32, use
// |iree_tools_utils_buffer_view_from_image_rescaled| instead.
//
// The returned |out_buffer_view| must be released by the caller.
iree_status_t iree_tools_utils_buffer_view_from_image(
    const iree_string_view_t filename, const iree_hal_dim_t* shape,
    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
    iree_hal_allocator_t* allocator, iree_hal_buffer_view_t** out_buffer_view);

// Parses the content of the image file at |filename| into a HAL buffer view
// |out_buffer_view|. |out_buffer_view| properties are defined by |shape|,
// |shape_rank|, and |element_type|, and it is allocated by |allocator|.
// The values in |out_buffer_view| are rescaled to |input_range|.
//
// The |element_type| has to be FLOAT_32. For SINT_8 or UINT_8, use
// |iree_tools_utils_buffer_view_from_image| instead.
//
// The returned |out_buffer_view| must be released by the caller.
iree_status_t iree_tools_utils_buffer_view_from_image_rescaled(
    const iree_string_view_t filename, const iree_hal_dim_t* shape,
    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
    iree_hal_allocator_t* allocator, const float* input_range,
    iree_host_size_t input_range_length,
    iree_hal_buffer_view_t** out_buffer_view);

// Normalizes uint8_t |pixel_data| of size |pixel_count| into the float buffer
// |out_buffer| using the range |input_range|:
//
//   float32_x = (uint8_x - 127.5) / 127.5 * input_scale + input_offset, where
//   input_scale  = abs(input_range[1] - input_range[0]) / 2
//   input_offset = (input_range[0] + input_range[1]) / 2
//
// |out_buffer| needs to be allocated before the call.
iree_status_t iree_tools_utils_pixel_rescaled_to_buffer(
    const uint8_t* pixel_data, iree_host_size_t pixel_count,
    const float* input_range, iree_host_size_t input_range_length,
    float* out_buffer);

#if __cplusplus
}
#endif  // __cplusplus

#endif  // IREE_SAMPLES_VISION_INFERENCE_IMAGE_UTIL_H_
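As a sanity check on the rescale formula documented above, a small Python sketch (illustrative only, not part of this commit) shows that the endpoints of the uint8 range map exactly onto input_range:

```python
def rescale(x, lo, hi):
    # Mirrors iree_tools_utils_pixel_rescaled_to_buffer for a single pixel.
    scale = abs(hi - lo) / 2.0
    offset = (lo + hi) / 2.0
    return (x - 127.5) / 127.5 * scale + offset

# With input_range = [0.0, 1.0], as used by the MNIST sample:
print(rescale(0, 0.0, 1.0))    # -> 0.0
print(rescale(255, 0.0, 1.0))  # -> 1.0
```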
121
cpp/vision_inference/iree-run-mnist-module.c
Normal file
@@ -0,0 +1,121 @@
// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// This sample uses image_util to load a hand-written image as an
// iree_hal_buffer_view_t then passes it to the bytecode module built from
// mnist.mlir on the CPU backend with the local-task driver.

#include <float.h>
#include <stdio.h>

#include "image_util.h"
#include "iree/runtime/api.h"
#include "mnist_bytecode_module_c.h"

iree_status_t Run(const iree_string_view_t image_path) {
  iree_runtime_instance_options_t instance_options;
  iree_runtime_instance_options_initialize(IREE_API_VERSION_LATEST,
                                           &instance_options);
  iree_runtime_instance_options_use_all_available_drivers(&instance_options);
  iree_runtime_instance_t* instance = NULL;
  IREE_RETURN_IF_ERROR(iree_runtime_instance_create(
      &instance_options, iree_allocator_system(), &instance));

  // TODO(#5724): move device selection into the compiled modules.
  iree_hal_device_t* device = NULL;
  IREE_RETURN_IF_ERROR(iree_runtime_instance_try_create_default_device(
      instance, iree_make_cstring_view("local-task"), &device));

  // Create one session per loaded module to hold the module state.
  iree_runtime_session_options_t session_options;
  iree_runtime_session_options_initialize(&session_options);
  iree_runtime_session_t* session = NULL;
  IREE_RETURN_IF_ERROR(iree_runtime_session_create_with_device(
      instance, &session_options, device,
      iree_runtime_instance_host_allocator(instance), &session));
  iree_hal_device_release(device);

  const struct iree_file_toc_t* module_file =
      iree_samples_vision_inference_mnist_bytecode_module_create();

  IREE_RETURN_IF_ERROR(iree_runtime_session_append_bytecode_module_from_memory(
      session, iree_make_const_byte_span(module_file->data, module_file->size),
      iree_allocator_null()));

  iree_runtime_call_t call;
  IREE_RETURN_IF_ERROR(iree_runtime_call_initialize_by_name(
      session, iree_make_cstring_view("module.predict"), &call));

  // Prepare the input HAL buffer view with the image_util library.
  // The input of the mnist model is a single 28x28 pixel image as a
  // tensor<1x28x28x1xf32>, with pixels in [0.0, 1.0].
  iree_hal_buffer_view_t* buffer_view = NULL;
  iree_hal_dim_t buffer_shape[] = {1, 28, 28, 1};
  iree_hal_element_type_t hal_element_type = IREE_HAL_ELEMENT_TYPE_FLOAT_32;
  float input_range[2] = {0.0f, 1.0f};
  IREE_RETURN_IF_ERROR(
      iree_tools_utils_buffer_view_from_image_rescaled(
          image_path, buffer_shape, IREE_ARRAYSIZE(buffer_shape),
          hal_element_type, iree_hal_device_allocator(device), input_range,
          IREE_ARRAYSIZE(input_range), &buffer_view),
      "load image");
  IREE_RETURN_IF_ERROR(
      iree_runtime_call_inputs_push_back_buffer_view(&call, buffer_view));
  iree_hal_buffer_view_release(buffer_view);

  IREE_RETURN_IF_ERROR(iree_runtime_call_invoke(&call, /*flags=*/0));

  // Get the result buffers from the invocation.
  iree_hal_buffer_view_t* ret_buffer_view = NULL;
  IREE_RETURN_IF_ERROR(
      iree_runtime_call_outputs_pop_front_buffer_view(&call, &ret_buffer_view));

  // Read back the results. The output of the mnist model is a 1x10 tensor of
  // prediction confidence values, one for each digit in [0, 9].
  float predictions[1 * 10] = {0.0f};
  IREE_RETURN_IF_ERROR(iree_hal_device_transfer_d2h(
      iree_runtime_session_device(session),
      iree_hal_buffer_view_buffer(ret_buffer_view), 0, predictions,
      sizeof(predictions), IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT,
      iree_infinite_timeout()));
  iree_hal_buffer_view_release(ret_buffer_view);

  // Get the index of the highest value in the output. Start from -FLT_MAX
  // (FLT_MIN is the smallest *positive* float) so all-negative outputs are
  // still handled correctly.
  float result_val = -FLT_MAX;
  int result_idx = 0;
  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(predictions); ++i) {
    if (predictions[i] > result_val) {
      result_val = predictions[i];
      result_idx = (int)i;
    }
  }
  fprintf(stdout, "Detected number: %d\n", result_idx);

  iree_runtime_call_deinitialize(&call);
  iree_runtime_session_release(session);
  iree_runtime_instance_release(instance);
  return iree_ok_status();
}

int main(int argc, char** argv) {
  if (argc > 2) {
    fprintf(stderr, "Usage: iree-run-mnist-module <image file>\n");
    return -1;
  }
  iree_string_view_t image_path;
  if (argc == 1) {
    image_path = iree_make_cstring_view("mnist_test.png");
  } else {
    image_path = iree_make_cstring_view(argv[1]);
  }
  iree_status_t result = Run(image_path);
  if (!iree_status_is_ok(result)) {
    iree_status_fprint(stderr, result);
    iree_status_ignore(result);
    return -1;
  }
  iree_status_ignore(result);
  return 0;
}
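The post-processing at the end of Run() is just an argmax over the ten confidence values. The same step in Python (illustrative only, with made-up confidence values), e.g. for cross-checking the sample's output against another runtime:

```python
# Illustrative mirror of the C post-processing: pick the most confident digit.
predictions = [0.01, 0.02, 0.90, 0.01, 0.00, 0.00, 0.02, 0.02, 0.01, 0.01]
detected = max(range(10), key=lambda i: predictions[i])
print("Detected number:", detected)  # -> 2
```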
BIN
cpp/vision_inference/mnist_test.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 261 B
116
cpp/vulkan_gui/CMakeLists.txt
Normal file
@@ -0,0 +1,116 @@
# Copyright 2022 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

if(NOT IREE_TARGET_BACKEND_VULKAN_SPIRV OR
   NOT IREE_HAL_DRIVER_VULKAN)
  message(STATUS "Missing Vulkan backend and/or driver, skipping vulkan_gui sample")
  return()
endif()

# This target statically links against Vulkan.
# One way to achieve this is by installing the Vulkan SDK from
# https://vulkan.lunarg.com/.
include(FindVulkan)
if(NOT Vulkan_FOUND)
  message(STATUS "Could not find Vulkan, skipping vulkan_gui sample")
  return()
endif()

# vcpkg install sdl2[vulkan]
# tested with versions 2.0.14#4 - 2.0.22#1
find_package(SDL2)
if(NOT SDL2_FOUND)
  message(STATUS "Could not find SDL2, skipping vulkan_gui sample")
  return()
endif()

FetchContent_Declare(
  imgui
  GIT_REPOSITORY https://github.com/ocornut/imgui
  GIT_TAG master
)

FetchContent_MakeAvailable(imgui)

# Dear ImGui
set(IMGUI_DIR ${CMAKE_BINARY_DIR}/_deps/imgui-src)
message("Looking for Imgui in ${IMGUI_DIR}")
include_directories(${IMGUI_DIR} ${IMGUI_DIR}/backends ..)


function(iree_vulkan_sample)
  cmake_parse_arguments(
    _RULE
    ""
    "NAME"
    "SRCS"
    ${ARGN}
  )

  # Define the sample executable.
  set(_NAME "${_RULE_NAME}")
  set(SRCS "${_RULE_SRCS}")
  add_executable(${_NAME} "")
  target_sources(${_NAME}
    PRIVATE
      ${SRCS}
      "${IMGUI_DIR}/backends/imgui_impl_sdl.cpp"
      "${IMGUI_DIR}/backends/imgui_impl_vulkan.cpp"
      "${IMGUI_DIR}/imgui.cpp"
      "${IMGUI_DIR}/imgui_draw.cpp"
      "${IMGUI_DIR}/imgui_demo.cpp"
      "${IMGUI_DIR}/imgui_tables.cpp"
      "${IMGUI_DIR}/imgui_widgets.cpp"
  )
  set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "${_NAME}")
  target_include_directories(${_NAME} PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
  )
  target_link_libraries(${_NAME}
    SDL2::SDL2
    Vulkan::Vulkan
    iree_runtime_runtime
    iree_base_internal_main
    iree_hal_drivers_vulkan_registration_registration
    iree_modules_hal_hal
    iree_vm_vm
    iree_vm_bytecode_module
    iree_vm_cc
    iree_tooling_vm_util_cc
    iree_tooling_context_util
  )

  if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
    set(_GUI_LINKOPTS "-SUBSYSTEM:CONSOLE")
  else()
    set(_GUI_LINKOPTS "")
  endif()

  target_link_options(${_NAME}
    PRIVATE
      ${_GUI_LINKOPTS}
  )
endfunction()

iree_vulkan_sample(
  NAME
    iree-samples-resnet-vulkan-gui
  SRCS
    vulkan_resnet_inference_gui.cc
)

iree_vulkan_sample(
  NAME
    iree-vulkan-gui
  SRCS
    vulkan_inference_gui.cc
)

message(STATUS "Configured vulkan_gui sample successfully")
4
cpp/vulkan_gui/simple_mul.mlir
Normal file
@@ -0,0 +1,4 @@
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
  %0 = "arith.mulf"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
  return %0 : tensor<4xf32>
}
BIN
cpp/vulkan_gui/snail_imagenet.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 14 KiB
7897
cpp/vulkan_gui/stb_image.h
Normal file
File diff suppressed because it is too large.
957
cpp/vulkan_gui/vulkan_inference_gui.cc
Normal file
@@ -0,0 +1,957 @@
// Copyright 2019 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Vulkan Graphics + IREE API Integration Sample.

#include <SDL.h>
#include <SDL_vulkan.h>
#include <imgui.h>
#include <imgui_impl_sdl.h>
#include <imgui_impl_vulkan.h>
#include <vulkan/vulkan.h>

#include <array>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iterator>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "iree/hal/drivers/vulkan/api.h"

// IREE's C API:
#include "iree/base/api.h"
#include "iree/hal/api.h"
#include "iree/hal/drivers/vulkan/registration/driver_module.h"
#include "iree/modules/hal/module.h"
#include "iree/vm/api.h"
#include "iree/vm/bytecode_module.h"
#include "iree/vm/ref_cc.h"

// iree-run-module
#include "iree/base/internal/flags.h"
#include "iree/base/status_cc.h"
#include "iree/base/tracing.h"
#include "iree/modules/hal/types.h"
#include "iree/tooling/comparison.h"
#include "iree/tooling/context_util.h"
#include "iree/tooling/vm_util_cc.h"

// Other dependencies (helpers, etc.)
#include "iree/base/internal/main.h"

#define IMGUI_UNLIMITED_FRAME_RATE

#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"

IREE_FLAG(string, entry_function, "",
          "Name of a function contained in the module specified by module_file "
          "to run.");

// TODO(benvanik): move --function_input= flag into a util.
static iree_status_t parse_function_io(iree_string_view_t flag_name,
                                       void* storage,
                                       iree_string_view_t value) {
  auto* list = (std::vector<std::string>*)storage;
  list->push_back(std::string(value.data, value.size));
  return iree_ok_status();
}
static void print_function_io(iree_string_view_t flag_name, void* storage,
                              FILE* file) {
  auto* list = (std::vector<std::string>*)storage;
  if (list->empty()) {
    fprintf(file, "# --%.*s=\n", (int)flag_name.size, flag_name.data);
  } else {
    for (size_t i = 0; i < list->size(); ++i) {
      fprintf(file, "--%.*s=\"%s\"\n", (int)flag_name.size, flag_name.data,
              list->at(i).c_str());
    }
  }
}
static std::vector<std::string> FLAG_function_inputs;
IREE_FLAG_CALLBACK(
    parse_function_io, print_function_io, &FLAG_function_inputs, function_input,
    "An input (a) value or (b) buffer of the format:\n"
    "  (a) scalar value\n"
    "     value\n"
    "     e.g.: --function_input=\"3.14\"\n"
    "  (b) buffer:\n"
    "     [shape]xtype=[value]\n"
    "     e.g.: --function_input=\"2x2xi32=1 2 3 4\"\n"
    "Optionally, brackets may be used to separate the element values:\n"
    "  2x2xi32=[[1 2][3 4]]\n"
    "Raw binary files can be read to provide buffer contents:\n"
    "  2x2xi32=@some/file.bin\n"
    "numpy npy files (from numpy.save) can be read to provide 1+ values:\n"
    "  @some.npy\n"
    "Each occurrence of the flag indicates an input in the order they were\n"
    "specified on the command line.");
typedef struct iree_file_toc_t {
  const char* name;  // the file's original name
  char* data;        // beginning of the file
  size_t size;       // length of the file
} iree_file_toc_t;

bool load_file(const char* filename, char** pOut, size_t* pSize) {
  FILE* f = fopen(filename, "rb");
  if (f == NULL) {
    fprintf(stderr, "Can't open %s\n", filename);
    return false;
  }

  fseek(f, 0L, SEEK_END);
  *pSize = ftell(f);
  fseek(f, 0L, SEEK_SET);

  *pOut = (char*)malloc(*pSize);

  size_t size = fread(*pOut, *pSize, 1, f);

  fclose(f);

  return size != 0;
}

static VkAllocationCallbacks* g_Allocator = NULL;
static VkInstance g_Instance = VK_NULL_HANDLE;
static VkPhysicalDevice g_PhysicalDevice = VK_NULL_HANDLE;
static VkDevice g_Device = VK_NULL_HANDLE;
static uint32_t g_QueueFamily = (uint32_t)-1;
static VkQueue g_Queue = VK_NULL_HANDLE;
static VkPipelineCache g_PipelineCache = VK_NULL_HANDLE;
static VkDescriptorPool g_DescriptorPool = VK_NULL_HANDLE;

static ImGui_ImplVulkanH_Window g_MainWindowData;
static uint32_t g_MinImageCount = 2;
static bool g_SwapChainRebuild = false;
static int g_SwapChainResizeWidth = 0;
static int g_SwapChainResizeHeight = 0;

static void check_vk_result(VkResult err) {
  if (err == 0) return;
  fprintf(stderr, "VkResult: %d\n", err);
  abort();
}

// Returns the names of the Vulkan layers used for the given IREE
// |extensibility_set| and |features|.
std::vector<const char*> GetIreeLayers(
    iree_hal_vulkan_extensibility_set_t extensibility_set,
    iree_hal_vulkan_features_t features) {
  iree_host_size_t required_count;
  iree_hal_vulkan_query_extensibility_set(
      features, extensibility_set, /*string_capacity=*/0, &required_count,
      /*out_string_values=*/NULL);
  std::vector<const char*> layers(required_count);
  iree_hal_vulkan_query_extensibility_set(features, extensibility_set,
                                          layers.size(), &required_count,
                                          layers.data());
  return layers;
}

// Returns the names of the Vulkan extensions used for the given IREE
// |extensibility_set| and |features|.
std::vector<const char*> GetIreeExtensions(
    iree_hal_vulkan_extensibility_set_t extensibility_set,
    iree_hal_vulkan_features_t features) {
  iree_host_size_t required_count;
  iree_hal_vulkan_query_extensibility_set(
      features, extensibility_set, /*string_capacity=*/0, &required_count,
      /*out_string_values=*/NULL);
  std::vector<const char*> extensions(required_count);
  iree_hal_vulkan_query_extensibility_set(features, extensibility_set,
                                          extensions.size(), &required_count,
                                          extensions.data());
  return extensions;
}

// Returns the names of the Vulkan extensions used for the given IREE
// |vulkan_features|.
std::vector<const char*> GetDeviceExtensions(
    VkPhysicalDevice physical_device,
    iree_hal_vulkan_features_t vulkan_features) {
  std::vector<const char*> iree_required_extensions = GetIreeExtensions(
      IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_REQUIRED,
      vulkan_features);
  std::vector<const char*> iree_optional_extensions = GetIreeExtensions(
      IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
      vulkan_features);

  uint32_t extension_count = 0;
  check_vk_result(vkEnumerateDeviceExtensionProperties(
      physical_device, nullptr, &extension_count, nullptr));
  std::vector<VkExtensionProperties> extension_properties(extension_count);
  check_vk_result(vkEnumerateDeviceExtensionProperties(
      physical_device, nullptr, &extension_count, extension_properties.data()));

  // Merge extension lists, including optional and required, for simplicity.
  std::set<const char*> ext_set;
  ext_set.insert("VK_KHR_swapchain");
  ext_set.insert(iree_required_extensions.begin(),
                 iree_required_extensions.end());
  for (int i = 0; i < iree_optional_extensions.size(); ++i) {
    const char* optional_extension = iree_optional_extensions[i];
    for (int j = 0; j < extension_count; ++j) {
      if (strcmp(optional_extension, extension_properties[j].extensionName) ==
          0) {
        ext_set.insert(optional_extension);
        break;
      }
    }
  }
  std::vector<const char*> extensions(ext_set.begin(), ext_set.end());
  return extensions;
}

std::vector<const char*> GetInstanceLayers(
    iree_hal_vulkan_features_t vulkan_features) {
  // Query the layers that IREE wants / needs.
  std::vector<const char*> required_layers = GetIreeLayers(
      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_REQUIRED, vulkan_features);
  std::vector<const char*> optional_layers = GetIreeLayers(
      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_OPTIONAL, vulkan_features);

  // Query the layers that are available on the Vulkan ICD.
  uint32_t layer_property_count = 0;
  check_vk_result(
      vkEnumerateInstanceLayerProperties(&layer_property_count, NULL));
  std::vector<VkLayerProperties> layer_properties(layer_property_count);
  check_vk_result(vkEnumerateInstanceLayerProperties(&layer_property_count,
                                                     layer_properties.data()));

  // Match between optional/required and available layers.
  std::vector<const char*> layers;
  for (const char* layer_name : required_layers) {
    bool found = false;
    for (const auto& layer_property : layer_properties) {
      if (std::strcmp(layer_name, layer_property.layerName) == 0) {
        found = true;
        layers.push_back(layer_name);
        break;
      }
    }
    if (!found) {
      fprintf(stderr, "Required layer %s not available\n", layer_name);
      abort();
    }
  }
  for (const char* layer_name : optional_layers) {
    for (const auto& layer_property : layer_properties) {
      if (std::strcmp(layer_name, layer_property.layerName) == 0) {
        layers.push_back(layer_name);
        break;
      }
    }
  }

  return layers;
}

std::vector<const char*> GetInstanceExtensions(
    SDL_Window* window, iree_hal_vulkan_features_t vulkan_features) {
  // Ask SDL for its list of required instance extensions.
  uint32_t sdl_extensions_count = 0;
  SDL_Vulkan_GetInstanceExtensions(window, &sdl_extensions_count, NULL);
  std::vector<const char*> sdl_extensions(sdl_extensions_count);
  SDL_Vulkan_GetInstanceExtensions(window, &sdl_extensions_count,
                                   sdl_extensions.data());

  std::vector<const char*> iree_required_extensions = GetIreeExtensions(
      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_REQUIRED,
      vulkan_features);
  std::vector<const char*> iree_optional_extensions = GetIreeExtensions(
      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_OPTIONAL,
      vulkan_features);

  // Merge extension lists, including optional and required, for simplicity.
  std::set<const char*> ext_set;
  ext_set.insert(sdl_extensions.begin(), sdl_extensions.end());
  ext_set.insert(iree_required_extensions.begin(),
                 iree_required_extensions.end());
  ext_set.insert(iree_optional_extensions.begin(),
                 iree_optional_extensions.end());
  std::vector<const char*> extensions(ext_set.begin(), ext_set.end());
  return extensions;
}

void SetupVulkan(iree_hal_vulkan_features_t vulkan_features,
                 const char** instance_layers, uint32_t instance_layers_count,
                 const char** instance_extensions,
                 uint32_t instance_extensions_count,
                 const VkAllocationCallbacks* allocator, VkInstance* instance,
                 uint32_t* queue_family_index,
                 VkPhysicalDevice* physical_device, VkQueue* queue,
                 VkDevice* device, VkDescriptorPool* descriptor_pool) {
  VkResult err;

  // Create Vulkan Instance
  {
    VkInstanceCreateInfo create_info = {};
    create_info.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
    create_info.enabledLayerCount = instance_layers_count;
    create_info.ppEnabledLayerNames = instance_layers;
    create_info.enabledExtensionCount = instance_extensions_count;
    create_info.ppEnabledExtensionNames = instance_extensions;
    err = vkCreateInstance(&create_info, allocator, instance);
    check_vk_result(err);
  }

  // Select GPU
  {
    uint32_t gpu_count;
    err = vkEnumeratePhysicalDevices(*instance, &gpu_count, NULL);
    check_vk_result(err);
    IM_ASSERT(gpu_count > 0);

    VkPhysicalDevice* gpus =
        (VkPhysicalDevice*)malloc(sizeof(VkPhysicalDevice) * gpu_count);
    err = vkEnumeratePhysicalDevices(*instance, &gpu_count, gpus);
    check_vk_result(err);

    // Use the first reported GPU for simplicity.
    *physical_device = gpus[0];

    VkPhysicalDeviceProperties properties;
    vkGetPhysicalDeviceProperties(*physical_device, &properties);
    fprintf(stdout, "Selected Vulkan device: '%s'\n", properties.deviceName);
    free(gpus);
  }

  // Select queue family. We want a single queue with graphics and compute for
  // simplicity, but we could also discover and use separate queues for each.
  {
    uint32_t count;
    vkGetPhysicalDeviceQueueFamilyProperties(*physical_device, &count, NULL);
    VkQueueFamilyProperties* queues = (VkQueueFamilyProperties*)malloc(
        sizeof(VkQueueFamilyProperties) * count);
    vkGetPhysicalDeviceQueueFamilyProperties(*physical_device, &count, queues);
    for (uint32_t i = 0; i < count; i++) {
      // Require both graphics and compute on the same queue family.
      const VkQueueFlags wanted = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
      if ((queues[i].queueFlags & wanted) == wanted) {
        *queue_family_index = i;
        break;
      }
    }
    free(queues);
    IM_ASSERT(*queue_family_index != (uint32_t)-1);
  }

  // Create Logical Device (with 1 queue)
  {
    std::vector<const char*> device_extensions =
        GetDeviceExtensions(*physical_device, vulkan_features);
    const float queue_priority[] = {1.0f};
    VkDeviceQueueCreateInfo queue_info = {};
    queue_info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
    queue_info.queueFamilyIndex = *queue_family_index;
    queue_info.queueCount = 1;
    queue_info.pQueuePriorities = queue_priority;
    VkDeviceCreateInfo create_info = {};
    create_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
    create_info.queueCreateInfoCount = 1;
    create_info.pQueueCreateInfos = &queue_info;
    create_info.enabledExtensionCount =
        static_cast<uint32_t>(device_extensions.size());
    create_info.ppEnabledExtensionNames = device_extensions.data();

    // Enable timeline semaphores.
    VkPhysicalDeviceFeatures2 features2;
    memset(&features2, 0, sizeof(features2));
    features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
    create_info.pNext = &features2;
    VkPhysicalDeviceTimelineSemaphoreFeatures semaphore_features;
    memset(&semaphore_features, 0, sizeof(semaphore_features));
    semaphore_features.sType =
        VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES;
    semaphore_features.pNext = features2.pNext;
    features2.pNext = &semaphore_features;
    semaphore_features.timelineSemaphore = VK_TRUE;

    err = vkCreateDevice(*physical_device, &create_info, allocator, device);
    check_vk_result(err);
    vkGetDeviceQueue(*device, *queue_family_index, 0, queue);
  }

  // Create Descriptor Pool
  {
    VkDescriptorPoolSize pool_sizes[] = {
        {VK_DESCRIPTOR_TYPE_SAMPLER, 1000},
        {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1000},
        {VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 1000},
        {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1000},
        {VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, 1000},
        {VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, 1000},
        {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1000},
        {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1000},
        {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, 1000},
        {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC, 1000},
        {VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT, 1000}};
    VkDescriptorPoolCreateInfo pool_info = {};
    pool_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
    pool_info.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT;
    pool_info.maxSets = 1000 * IREE_ARRAYSIZE(pool_sizes);
    pool_info.poolSizeCount = (uint32_t)IREE_ARRAYSIZE(pool_sizes);
    pool_info.pPoolSizes = pool_sizes;
    err =
        vkCreateDescriptorPool(*device, &pool_info, allocator, descriptor_pool);
    check_vk_result(err);
  }
}

void SetupVulkanWindow(ImGui_ImplVulkanH_Window* wd,
                       const VkAllocationCallbacks* allocator,
                       VkInstance instance, uint32_t queue_family_index,
                       VkPhysicalDevice physical_device, VkDevice device,
                       VkSurfaceKHR surface, int width, int height,
                       uint32_t min_image_count) {
  wd->Surface = surface;

  // Check for WSI support
  VkBool32 res;
  vkGetPhysicalDeviceSurfaceSupportKHR(physical_device, queue_family_index,
                                       wd->Surface, &res);
  if (res != VK_TRUE) {
    fprintf(stderr, "Error: no WSI support on physical device 0\n");
    exit(-1);
  }

  // Select Surface Format
  const VkFormat requestSurfaceImageFormat[] = {
      VK_FORMAT_B8G8R8A8_UNORM, VK_FORMAT_R8G8B8A8_UNORM,
      VK_FORMAT_B8G8R8_UNORM, VK_FORMAT_R8G8B8_UNORM};
  const VkColorSpaceKHR requestSurfaceColorSpace =
      VK_COLORSPACE_SRGB_NONLINEAR_KHR;
  wd->SurfaceFormat = ImGui_ImplVulkanH_SelectSurfaceFormat(
      physical_device, wd->Surface, requestSurfaceImageFormat,
      (size_t)IREE_ARRAYSIZE(requestSurfaceImageFormat),
      requestSurfaceColorSpace);

  // Select Present Mode
#ifdef IMGUI_UNLIMITED_FRAME_RATE
  VkPresentModeKHR present_modes[] = {VK_PRESENT_MODE_MAILBOX_KHR,
                                      VK_PRESENT_MODE_IMMEDIATE_KHR,
                                      VK_PRESENT_MODE_FIFO_KHR};
#else
  VkPresentModeKHR present_modes[] = {VK_PRESENT_MODE_FIFO_KHR};
#endif
  wd->PresentMode = ImGui_ImplVulkanH_SelectPresentMode(
      physical_device, wd->Surface, &present_modes[0],
      IREE_ARRAYSIZE(present_modes));

  // Create SwapChain, RenderPass, Framebuffer, etc.
  IM_ASSERT(min_image_count >= 2);
  ImGui_ImplVulkanH_CreateOrResizeWindow(instance, physical_device, device, wd,
                                         queue_family_index, allocator, width,
                                         height, min_image_count);

  // Set clear color.
  ImVec4 clear_color = ImVec4(0.45f, 0.55f, 0.60f, 1.00f);
  memcpy(&wd->ClearValue.color.float32[0], &clear_color, 4 * sizeof(float));
}

void RenderFrame(ImGui_ImplVulkanH_Window* wd, VkDevice device, VkQueue queue) {
|
||||
VkResult err;
|
||||
|
||||
VkSemaphore image_acquired_semaphore =
|
||||
wd->FrameSemaphores[wd->SemaphoreIndex].ImageAcquiredSemaphore;
|
||||
VkSemaphore render_complete_semaphore =
|
||||
wd->FrameSemaphores[wd->SemaphoreIndex].RenderCompleteSemaphore;
|
||||
err = vkAcquireNextImageKHR(device, wd->Swapchain, UINT64_MAX,
|
||||
image_acquired_semaphore, VK_NULL_HANDLE,
|
||||
&wd->FrameIndex);
|
||||
check_vk_result(err);
|
||||
|
||||
ImGui_ImplVulkanH_Frame* fd = &wd->Frames[wd->FrameIndex];
|
||||
{
|
||||
err = vkWaitForFences(
|
||||
device, 1, &fd->Fence, VK_TRUE,
|
||||
UINT64_MAX); // wait indefinitely instead of periodically checking
|
||||
check_vk_result(err);
|
||||
|
||||
err = vkResetFences(device, 1, &fd->Fence);
|
||||
check_vk_result(err);
|
||||
}
|
||||
{
|
||||
err = vkResetCommandPool(device, fd->CommandPool, 0);
|
||||
check_vk_result(err);
|
||||
VkCommandBufferBeginInfo info = {};
|
||||
info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
|
||||
info.flags |= VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
|
||||
err = vkBeginCommandBuffer(fd->CommandBuffer, &info);
|
||||
check_vk_result(err);
|
||||
}
|
||||
{
|
||||
VkRenderPassBeginInfo info = {};
|
||||
info.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
|
||||
info.renderPass = wd->RenderPass;
|
||||
info.framebuffer = fd->Framebuffer;
|
||||
info.renderArea.extent.width = wd->Width;
|
||||
info.renderArea.extent.height = wd->Height;
|
||||
info.clearValueCount = 1;
|
||||
info.pClearValues = &wd->ClearValue;
|
||||
vkCmdBeginRenderPass(fd->CommandBuffer, &info, VK_SUBPASS_CONTENTS_INLINE);
|
||||
}
|
||||
|
||||
// Record Imgui Draw Data and draw funcs into command buffer
|
||||
ImGui_ImplVulkan_RenderDrawData(ImGui::GetDrawData(), fd->CommandBuffer);
|
||||
|
||||
// Submit command buffer
|
||||
vkCmdEndRenderPass(fd->CommandBuffer);
|
||||
{
|
||||
VkPipelineStageFlags wait_stage =
|
||||
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
|
||||
VkSubmitInfo info = {};
|
||||
info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
|
||||
info.waitSemaphoreCount = 1;
|
||||
info.pWaitSemaphores = &image_acquired_semaphore;
|
||||
info.pWaitDstStageMask = &wait_stage;
|
||||
info.commandBufferCount = 1;
|
||||
info.pCommandBuffers = &fd->CommandBuffer;
|
||||
info.signalSemaphoreCount = 1;
|
||||
info.pSignalSemaphores = &render_complete_semaphore;
|
||||
|
||||
err = vkEndCommandBuffer(fd->CommandBuffer);
|
||||
check_vk_result(err);
|
||||
err = vkQueueSubmit(queue, 1, &info, fd->Fence);
|
||||
check_vk_result(err);
|
||||
}
|
||||
}
|
||||
|
||||
void PresentFrame(ImGui_ImplVulkanH_Window* wd, VkQueue queue) {
  VkSemaphore render_complete_semaphore =
      wd->FrameSemaphores[wd->SemaphoreIndex].RenderCompleteSemaphore;
  VkPresentInfoKHR info = {};
  info.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
  info.waitSemaphoreCount = 1;
  info.pWaitSemaphores = &render_complete_semaphore;
  info.swapchainCount = 1;
  info.pSwapchains = &wd->Swapchain;
  info.pImageIndices = &wd->FrameIndex;
  VkResult err = vkQueuePresentKHR(queue, &info);
  check_vk_result(err);
  wd->SemaphoreIndex =
      (wd->SemaphoreIndex + 1) %
      wd->ImageCount;  // Now we can use the next set of semaphores
}

static void CleanupVulkan() {
  vkDestroyDescriptorPool(g_Device, g_DescriptorPool, g_Allocator);

  vkDestroyDevice(g_Device, g_Allocator);
  vkDestroyInstance(g_Instance, g_Allocator);
}

static void CleanupVulkanWindow() {
  ImGui_ImplVulkanH_DestroyWindow(g_Instance, g_Device, &g_MainWindowData,
                                  g_Allocator);
}

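// ---------------------------------------------------------------------------
// Entry point: bring up SDL + Vulkan + Dear ImGui, wrap the same
// VkInstance/VkDevice in an IREE HAL device, then run inference from inside
// the render loop.
// ---------------------------------------------------------------------------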
namespace iree {

extern "C" int iree_main(int argc, char** argv) {
  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv);
  if (argc > 1) {
    // Avoid iree-run-module spinning endlessly on stdin if the user uses
    // single dashes for flags.
    printf(
        "[ERROR] unexpected positional argument (expected none)."
        " Did you pass a flag with a single dash ('-')?"
        " Use '--' instead.\n");
    return 1;
  }

  // --------------------------------------------------------------------------
  // Create a window.
  if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) != 0) {
    fprintf(stderr, "Failed to initialize SDL\n");
    abort();
    return 1;
  }

  // Setup window
  // clang-format off
  SDL_WindowFlags window_flags = (SDL_WindowFlags)(
      SDL_WINDOW_VULKAN | SDL_WINDOW_RESIZABLE | SDL_WINDOW_ALLOW_HIGHDPI);
  // clang-format on
  SDL_Window* window = SDL_CreateWindow(
      "IREE Samples - Vulkan Inference GUI", SDL_WINDOWPOS_CENTERED,
      SDL_WINDOWPOS_CENTERED, 1280, 720, window_flags);
  if (window == nullptr) {
    const char* sdl_err = SDL_GetError();
    fprintf(stderr, "Error, SDL_CreateWindow returned: %s\n", sdl_err);
    abort();
    return 1;
  }

  // Setup Vulkan
  iree_hal_vulkan_features_t iree_vulkan_features =
      static_cast<iree_hal_vulkan_features_t>(
          IREE_HAL_VULKAN_FEATURE_ENABLE_VALIDATION_LAYERS |
          IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS);
  std::vector<const char*> layers = GetInstanceLayers(iree_vulkan_features);
  std::vector<const char*> extensions =
      GetInstanceExtensions(window, iree_vulkan_features);
  SetupVulkan(iree_vulkan_features, layers.data(),
              static_cast<uint32_t>(layers.size()), extensions.data(),
              static_cast<uint32_t>(extensions.size()), g_Allocator,
              &g_Instance, &g_QueueFamily, &g_PhysicalDevice, &g_Queue,
              &g_Device, &g_DescriptorPool);

  // Create Window Surface
  VkSurfaceKHR surface;
  VkResult err;
  if (SDL_Vulkan_CreateSurface(window, g_Instance, &surface) == 0) {
    fprintf(stderr, "Failed to create Vulkan surface.\n");
    abort();
    return 1;
  }

  // Create Framebuffers
  int w, h;
  SDL_GetWindowSize(window, &w, &h);
  ImGui_ImplVulkanH_Window* wd = &g_MainWindowData;
  SetupVulkanWindow(wd, g_Allocator, g_Instance, g_QueueFamily,
                    g_PhysicalDevice, g_Device, surface, w, h, g_MinImageCount);

  // Setup Dear ImGui context
  IMGUI_CHECKVERSION();
  ImGui::CreateContext();
  ImGuiIO& io = ImGui::GetIO();
  (void)io;

  ImGui::StyleColorsDark();

  // Setup Platform/Renderer bindings
  ImGui_ImplSDL2_InitForVulkan(window);
  ImGui_ImplVulkan_InitInfo init_info = {};
  init_info.Instance = g_Instance;
  init_info.PhysicalDevice = g_PhysicalDevice;
  init_info.Device = g_Device;
  init_info.QueueFamily = g_QueueFamily;
  init_info.Queue = g_Queue;
  init_info.PipelineCache = g_PipelineCache;
  init_info.DescriptorPool = g_DescriptorPool;
  init_info.Allocator = g_Allocator;
  init_info.MinImageCount = g_MinImageCount;
  init_info.ImageCount = wd->ImageCount;
  init_info.CheckVkResultFn = check_vk_result;
  ImGui_ImplVulkan_Init(&init_info, wd->RenderPass);

  // Upload Fonts
  {
    // Use any command queue
    VkCommandPool command_pool = wd->Frames[wd->FrameIndex].CommandPool;
    VkCommandBuffer command_buffer = wd->Frames[wd->FrameIndex].CommandBuffer;

    err = vkResetCommandPool(g_Device, command_pool, 0);
    check_vk_result(err);
    VkCommandBufferBeginInfo begin_info = {};
    begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
    begin_info.flags |= VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
    err = vkBeginCommandBuffer(command_buffer, &begin_info);
    check_vk_result(err);

    ImGui_ImplVulkan_CreateFontsTexture(command_buffer);

    VkSubmitInfo end_info = {};
    end_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    end_info.commandBufferCount = 1;
    end_info.pCommandBuffers = &command_buffer;
    err = vkEndCommandBuffer(command_buffer);
    check_vk_result(err);
    err = vkQueueSubmit(g_Queue, 1, &end_info, VK_NULL_HANDLE);
    check_vk_result(err);

    err = vkDeviceWaitIdle(g_Device);
    check_vk_result(err);
    ImGui_ImplVulkan_DestroyFontUploadObjects();
  }

  // Demo state.
  bool show_iree_window = true;
  // --------------------------------------------------------------------------
  // Setup IREE.

  // Check API version.
  iree_api_version_t actual_version;
  iree_status_t status =
      iree_api_version_check(IREE_API_VERSION_LATEST, &actual_version);
  if (iree_status_is_ok(status)) {
    fprintf(stdout, "IREE runtime API version: %d\n", actual_version);
  } else {
    fprintf(stderr, "Unsupported runtime API version: %d\n", actual_version);
    abort();
  }

  // Create a runtime Instance.
  iree_vm_instance_t* iree_instance = nullptr;
  IREE_CHECK_OK(
      iree_vm_instance_create(iree_allocator_system(), &iree_instance));

  // Register HAL drivers and VM module types.
  IREE_CHECK_OK(iree_hal_vulkan_driver_module_register(
      iree_hal_driver_registry_default()));
  IREE_CHECK_OK(iree_hal_module_register_all_types(iree_instance));

  // Create IREE Vulkan Driver and Device, sharing our VkInstance/VkDevice.
  fprintf(stdout, "Creating Vulkan driver/device\n");
  // Load symbols from our static `vkGetInstanceProcAddr` for IREE to use.
  iree_hal_vulkan_syms_t* iree_vk_syms = nullptr;
  IREE_CHECK_OK(iree_hal_vulkan_syms_create(
      reinterpret_cast<void*>(&vkGetInstanceProcAddr), iree_allocator_system(),
      &iree_vk_syms));
  // Create the driver sharing our VkInstance.
  iree_hal_driver_t* iree_vk_driver = nullptr;
  iree_string_view_t driver_identifier = iree_make_cstring_view("vulkan");
  iree_hal_vulkan_driver_options_t driver_options;
  driver_options.api_version = VK_API_VERSION_1_0;
  driver_options.requested_features = static_cast<iree_hal_vulkan_features_t>(
      IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS);
  IREE_CHECK_OK(iree_hal_vulkan_driver_create_using_instance(
      driver_identifier, &driver_options, iree_vk_syms, g_Instance,
      iree_allocator_system(), &iree_vk_driver));
  // Create a device sharing our VkDevice and queue.
  // We could also create a separate (possibly low priority) compute queue for
  // IREE, and/or provide a dedicated transfer queue.
  iree_string_view_t device_identifier = iree_make_cstring_view("vulkan");
  iree_hal_vulkan_queue_set_t compute_queue_set;
  compute_queue_set.queue_family_index = g_QueueFamily;
  compute_queue_set.queue_indices = 1 << 0;
  iree_hal_vulkan_queue_set_t transfer_queue_set;
  transfer_queue_set.queue_indices = 0;
  iree_hal_device_t* iree_vk_device = nullptr;
  IREE_CHECK_OK(iree_hal_vulkan_wrap_device(
      device_identifier, &driver_options.device_options, iree_vk_syms,
      g_Instance, g_PhysicalDevice, g_Device, &compute_queue_set,
      &transfer_queue_set, iree_allocator_system(), &iree_vk_device));
  // Create a HAL module using the HAL device.
  iree_vm_module_t* hal_module = nullptr;
  IREE_CHECK_OK(iree_hal_module_create(iree_instance, iree_vk_device,
                                       IREE_HAL_MODULE_FLAG_NONE,
                                       iree_allocator_system(), &hal_module));

  // Load bytecode module
  // iree_file_toc_t module_file_toc;
  // const char network_model[] = "resnet50_tf.vmfb";
  // fprintf(stdout, "Loading: %s\n", network_model);
  // if (load_file(network_model, &module_file_toc.data,
  //               &module_file_toc.size) == false) {
  //   abort();
  //   return 1;
  // }
  // fprintf(stdout, "module size: %zu\n", module_file_toc.size);

  iree_vm_module_t* bytecode_module = nullptr;
  iree_status_t module_status = iree_tooling_load_module_from_flags(
      iree_instance, iree_allocator_system(), &bytecode_module);
  if (!iree_status_is_ok(module_status)) return -1;
  // IREE_CHECK_OK(iree_vm_bytecode_module_create(
  //     iree_instance,
  //     iree_const_byte_span_t{
  //         reinterpret_cast<const uint8_t*>(module_file_toc.data),
  //         module_file_toc.size},
  //     iree_allocator_null(), iree_allocator_system(), &bytecode_module));
  // // Query for details about what is in the loaded module.
  // iree_vm_module_signature_t bytecode_module_signature =
  //     iree_vm_module_signature(bytecode_module);
  // fprintf(stdout, "Module loaded, have <%" PRIhsz "> exported functions:\n",
  //         bytecode_module_signature.export_function_count);
  // for (int i = 0; i < bytecode_module_signature.export_function_count; ++i) {
  //   iree_vm_function_t function;
  //   IREE_CHECK_OK(iree_vm_module_lookup_function_by_ordinal(
  //       bytecode_module, IREE_VM_FUNCTION_LINKAGE_EXPORT, i, &function));
  //   auto function_name = iree_vm_function_name(&function);
  //   auto function_signature = iree_vm_function_signature(&function);
  //
  //   fprintf(stdout, "  %d: '%.*s' with calling convention '%.*s'\n", i,
  //           (int)function_name.size, function_name.data,
  //           (int)function_signature.calling_convention.size,
  //           function_signature.calling_convention.data);
  // }

  // Allocate a context that will hold the module state across invocations.
  iree_vm_context_t* iree_context = nullptr;
  std::vector<iree_vm_module_t*> modules = {hal_module, bytecode_module};
  IREE_CHECK_OK(iree_vm_context_create_with_modules(
      iree_instance, IREE_VM_CONTEXT_FLAG_NONE, modules.size(), modules.data(),
      iree_allocator_system(), &iree_context));
  fprintf(stdout, "Context with modules is ready for use\n");

  // Lookup the entry point function.
  iree_vm_function_t main_function;
  const char kMainFunctionName[] = "module.forward";
  IREE_CHECK_OK(iree_vm_context_resolve_function(
      iree_context,
      iree_string_view_t{kMainFunctionName, sizeof(kMainFunctionName) - 1},
      &main_function));
  iree_string_view_t main_function_name = iree_vm_function_name(&main_function);
  fprintf(stdout, "Resolved main function named '%.*s'\n",
          (int)main_function_name.size, main_function_name.data);

  // --------------------------------------------------------------------------

  // Write inputs into mappable buffers.
  iree_hal_allocator_t* allocator = iree_hal_device_allocator(iree_vk_device);
  // iree_hal_memory_type_t input_memory_type =
  //     static_cast<iree_hal_memory_type_t>(
  //         IREE_HAL_MEMORY_TYPE_HOST_LOCAL |
  //         IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE);
  // iree_hal_buffer_usage_t input_buffer_usage =
  //     static_cast<iree_hal_buffer_usage_t>(IREE_HAL_BUFFER_USAGE_DEFAULT);
  // iree_hal_buffer_params_t buffer_params;
  // buffer_params.type = input_memory_type;
  // buffer_params.usage = input_buffer_usage;
  // buffer_params.access =
  //     IREE_HAL_MEMORY_ACCESS_READ | IREE_HAL_MEMORY_ACCESS_WRITE;

  // Wrap input buffers in buffer views.
  vm::ref<iree_vm_list_t> inputs;
  iree_status_t input_status = ParseToVariantList(
      allocator,
      iree::span<const std::string>{FLAG_function_inputs.data(),
                                    FLAG_function_inputs.size()},
      iree_allocator_system(), &inputs);
  if (!iree_status_is_ok(input_status)) return -1;
  // vm::ref<iree_vm_list_t> inputs;
  // IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, 6,
  //                                   iree_allocator_system(), &inputs));

  // iree_hal_buffer_view_t* input0_buffer_view = nullptr;
  // constexpr iree_hal_dim_t input_buffer_shape[] = {1, 224, 224, 3};
  // IREE_CHECK_OK(iree_hal_buffer_view_allocate_buffer(
  //     allocator,
  //     /*shape_rank=*/4, /*shape=*/input_buffer_shape,
  //     IREE_HAL_ELEMENT_TYPE_FLOAT_32,
  //     IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params,
  //     iree_make_const_byte_span(&input_res50, sizeof(input_res50)),
  //     &input0_buffer_view));

  // auto input0_buffer_view_ref =
  //     iree_hal_buffer_view_move_ref(input0_buffer_view);
  // IREE_CHECK_OK(
  //     iree_vm_list_push_ref_move(inputs.get(), &input0_buffer_view_ref));

  // Prepare outputs list to accept results from the invocation.
  vm::ref<iree_vm_list_t> outputs;
  constexpr iree_hal_dim_t kOutputCount = 1000;
  IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr,
                                    kOutputCount * sizeof(float),
                                    iree_allocator_system(), &outputs));
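  // Note: iree_vm_list_create's second argument is an initial capacity in
  // elements, not bytes, so the sizeof(float) factor above merely
  // over-reserves; the list grows on demand either way.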
  // --------------------------------------------------------------------------

  // Main loop.
  bool done = false;
  while (!done) {
    SDL_Event event;

    while (SDL_PollEvent(&event)) {
      ImGui_ImplSDL2_ProcessEvent(&event);
      if (event.type == SDL_QUIT) done = true;
      if (event.type == SDL_WINDOWEVENT &&
          event.window.event == SDL_WINDOWEVENT_RESIZED &&
          event.window.windowID == SDL_GetWindowID(window)) {
        g_SwapChainResizeWidth = (int)event.window.data1;
        g_SwapChainResizeHeight = (int)event.window.data2;
        g_SwapChainRebuild = true;
      }
    }

    if (g_SwapChainRebuild) {
      g_SwapChainRebuild = false;
      ImGui_ImplVulkan_SetMinImageCount(g_MinImageCount);
      ImGui_ImplVulkanH_CreateOrResizeWindow(
          g_Instance, g_PhysicalDevice, g_Device, &g_MainWindowData,
          g_QueueFamily, g_Allocator, g_SwapChainResizeWidth,
          g_SwapChainResizeHeight, g_MinImageCount);
      g_MainWindowData.FrameIndex = 0;
    }

    // Start the Dear ImGui frame
    ImGui_ImplVulkan_NewFrame();
    ImGui_ImplSDL2_NewFrame(window);
    ImGui::NewFrame();

    // Custom window.
    {
      ImGui::Begin("IREE Vulkan Integration Demo", &show_iree_window);

      ImGui::Separator();

      // Keep `dirty` true so the computation re-runs every frame; running
      // continuously lets tools like RenderDoc or RGP capture the workload.
      static bool dirty = true;
      if (dirty) {
        // Synchronously invoke the function.
        IREE_CHECK_OK(iree_vm_invoke(iree_context, main_function,
                                     IREE_VM_INVOCATION_FLAG_NONE,
                                     /*policy=*/nullptr, inputs.get(),
                                     outputs.get(), iree_allocator_system()));

        dirty = true;
      }

      // Framerate counter.
      ImGui::Text("Application average %.3f ms/frame (%.1f FPS)",
                  1000.0f / ImGui::GetIO().Framerate, ImGui::GetIO().Framerate);

      ImGui::End();
    }

    // Rendering
    ImGui::Render();
    RenderFrame(wd, g_Device, g_Queue);

    PresentFrame(wd, g_Queue);
  }
  // --------------------------------------------------------------------------

  // --------------------------------------------------------------------------
  // Cleanup
  iree_vm_module_release(hal_module);
  iree_vm_module_release(bytecode_module);
  iree_vm_context_release(iree_context);
  iree_hal_device_release(iree_vk_device);
  iree_hal_allocator_release(allocator);
  iree_hal_driver_release(iree_vk_driver);
  iree_hal_vulkan_syms_release(iree_vk_syms);
  iree_vm_instance_release(iree_instance);

  err = vkDeviceWaitIdle(g_Device);
  check_vk_result(err);
  ImGui_ImplVulkan_Shutdown();
  ImGui_ImplSDL2_Shutdown();
  ImGui::DestroyContext();

  CleanupVulkanWindow();
  CleanupVulkan();

  SDL_DestroyWindow(window);
  SDL_Quit();
  // --------------------------------------------------------------------------

  return 0;
}

}  // namespace iree
1160  cpp/vulkan_gui/vulkan_resnet_inference_gui.cc  Normal file
File diff suppressed because it is too large

27  dataset/README.md  Normal file
@@ -0,0 +1,27 @@
# Dataset annotation tool

SHARK annotator for adding or modifying the prompts of dataset images.

## Set up

Activate the SHARK Python virtual environment and install the additional packages:

```shell
source ../shark.venv/bin/activate
pip install -r requirements.txt
```

## Run annotator

```shell
python annotation_tool.py --gs_url "gs://<bucket>/<datasets-prefix>"
```

<img width="1280" alt="annotator" src="https://user-images.githubusercontent.com/49575973/214521137-7ef6ae10-7cd8-46e6-b270-b6c0445157f1.png">

* Select a dataset from the `Dataset` dropdown list
* Select an image from the `Image` dropdown list
* The image and its existing prompts will be loaded
* Select a prompt from the `Prompts` dropdown list to modify it, or "Add new" to add a prompt
* Click `Save` to save changes; click `Delete` to delete the selected prompt
* Click `Back` or `Next` to switch images; you can also pick any image directly from `Image`
* Click `Finish` when you are done annotating, or before switching to another dataset
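The annotator persists prompts per dataset as a `metadata.jsonl` file, one JSON object per image, which `annotation_tool.py` (next file) reads and rewrites. A minimal sketch of that format; the file names and prompt strings here are invented for illustration:

```python
import json

# One record per image: "text" is a plain string for a single prompt and a
# list of strings for several, matching what annotation_tool.py expects.
records = [
    {"file_name": "cats/001.png", "text": "a cat sitting on a windowsill"},
    {"file_name": "cats/002.png", "text": ["a sleeping cat", "studio lighting"]},
]

with open("metadata.jsonl", "w") as f:
    for record in records:
        f.write(json.dumps(record))
        f.write("\n")
```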
247  dataset/annotation_tool.py  Normal file
@@ -0,0 +1,247 @@
import gradio as gr
import json
import jsonlines
import os
from args import args
from pathlib import Path
from PIL import Image
from utils import get_datasets


shark_root = Path(__file__).parent.parent
demo_css = shark_root.joinpath("web/demo.css").resolve()
nodlogo_loc = shark_root.joinpath(
    "web/models/stable_diffusion/logos/nod-logo.png"
)


with gr.Blocks(title="Dataset Annotation Tool", css=demo_css) as shark_web:
    with gr.Row(elem_id="ui_title"):
        nod_logo = Image.open(nodlogo_loc)
        with gr.Column(scale=1, elem_id="demo_title_outer"):
            gr.Image(
                value=nod_logo,
                show_label=False,
                interactive=False,
                elem_id="top_logo",
            ).style(width=150, height=100)

    datasets, images, ds_w_prompts = get_datasets(args.gs_url)
    prompt_data = dict()

    with gr.Row(elem_id="ui_body"):
        # TODO: add multiselect dataset, there is a gradio version conflict
        dataset = gr.Dropdown(label="Dataset", choices=datasets)
        image_name = gr.Dropdown(label="Image", choices=[])

    with gr.Row(elem_id="ui_body"):
        # TODO: add ability to search image by typing
        with gr.Column(scale=1, min_width=600):
            image = gr.Image(type="filepath").style(height=512)

        with gr.Column(scale=1, min_width=600):
            prompts = gr.Dropdown(
                label="Prompts",
                choices=[],
            )
            prompt = gr.Textbox(
                label="Editor",
                lines=3,
            )
            with gr.Row():
                save = gr.Button("Save")
                delete = gr.Button("Delete")
            with gr.Row():
                back_image = gr.Button("Back")
                next_image = gr.Button("Next")
            finish = gr.Button("Finish")

    def filter_datasets(dataset):
        if dataset is None:
            return gr.Dropdown.update(value=None, choices=[])

        # create the dataset dir if it doesn't exist and download the prompt file
        dataset_path = str(shark_root) + "/dataset/" + dataset
        if not os.path.exists(dataset_path):
            os.mkdir(dataset_path)

        # read prompt jsonlines file
        prompt_data.clear()
        if dataset in ds_w_prompts:
            prompt_gs_path = args.gs_url + "/" + dataset + "/metadata.jsonl"
            os.system(f'gsutil cp "{prompt_gs_path}" "{dataset_path}"/')
            with jsonlines.open(dataset_path + "/metadata.jsonl") as reader:
                for line in reader.iter(type=dict, skip_invalid=True):
                    prompt_data[line["file_name"]] = (
                        [line["text"]]
                        if type(line["text"]) is str
                        else line["text"]
                    )

        return gr.Dropdown.update(choices=images[dataset])

    dataset.change(fn=filter_datasets, inputs=dataset, outputs=image_name)

    def display_image(dataset, image_name):
        if dataset is None or image_name is None:
            return gr.Image.update(value=None), gr.Dropdown.update(value=None)

        # download and load the image
        img_gs_path = args.gs_url + "/" + dataset + "/" + image_name
        img_sub_path = "/".join(image_name.split("/")[:-1])
        img_dst_path = (
            str(shark_root) + "/dataset/" + dataset + "/" + img_sub_path + "/"
        )
        if not os.path.exists(img_dst_path):
            os.mkdir(img_dst_path)
        os.system(f'gsutil cp "{img_gs_path}" "{img_dst_path}"')
        img = Image.open(img_dst_path + image_name.split("/")[-1])

        if image_name not in prompt_data.keys():
            prompt_data[image_name] = []
        prompt_choices = ["Add new"]
        prompt_choices += prompt_data[image_name]
        return gr.Image.update(value=img), gr.Dropdown.update(
            choices=prompt_choices
        )

    image_name.change(
        fn=display_image,
        inputs=[dataset, image_name],
        outputs=[image, prompts],
    )

    def edit_prompt(prompts):
        if prompts == "Add new":
            return gr.Textbox.update(value=None)

        return gr.Textbox.update(value=prompts)

    prompts.change(fn=edit_prompt, inputs=prompts, outputs=prompt)

    def save_prompt(dataset, image_name, prompts, prompt):
        if (
            dataset is None
            or image_name is None
            or prompts is None
            or prompt is None
        ):
            return

        if prompts == "Add new":
            prompt_data[image_name].append(prompt)
        else:
            idx = prompt_data[image_name].index(prompts)
            prompt_data[image_name][idx] = prompt

        prompt_path = (
            str(shark_root) + "/dataset/" + dataset + "/metadata.jsonl"
        )
        # rewrite the prompt jsonlines file from the in-memory prompt_data
        with open(prompt_path, "w") as f:
            for key, value in prompt_data.items():
                if not value:
                    continue
                v = value if len(value) > 1 else value[0]
                f.write(json.dumps({"file_name": key, "text": v}))
                f.write("\n")

        prompt_choices = ["Add new"]
        prompt_choices += prompt_data[image_name]
        return gr.Dropdown.update(choices=prompt_choices, value=None)

    save.click(
        fn=save_prompt,
        inputs=[dataset, image_name, prompts, prompt],
        outputs=prompts,
    )

    def delete_prompt(dataset, image_name, prompts):
        if dataset is None or image_name is None or prompts is None:
            return
        if prompts == "Add new":
            return

        prompt_data[image_name].remove(prompts)
        prompt_path = (
            str(shark_root) + "/dataset/" + dataset + "/metadata.jsonl"
        )
        # rewrite the prompt jsonlines file from the in-memory prompt_data
        with open(prompt_path, "w") as f:
            for key, value in prompt_data.items():
                if not value:
                    continue
                v = value if len(value) > 1 else value[0]
                f.write(json.dumps({"file_name": key, "text": v}))
                f.write("\n")

        prompt_choices = ["Add new"]
        prompt_choices += prompt_data[image_name]
        return gr.Dropdown.update(choices=prompt_choices, value=None)

    delete.click(
        fn=delete_prompt,
        inputs=[dataset, image_name, prompts],
        outputs=prompts,
    )

    def get_back_image(dataset, image_name):
        if dataset is None or image_name is None:
            return

        # remove the local image
        img_path = str(shark_root) + "/dataset/" + dataset + "/" + image_name
        os.system(f'rm "{img_path}"')
        # get the index of the previous image
        idx = images[dataset].index(image_name)
        if idx == 0:
            return gr.Dropdown.update(value=None)

        return gr.Dropdown.update(value=images[dataset][idx - 1])

    back_image.click(
        fn=get_back_image, inputs=[dataset, image_name], outputs=image_name
    )

    def get_next_image(dataset, image_name):
        if dataset is None or image_name is None:
            return

        # remove the local image
        img_path = str(shark_root) + "/dataset/" + dataset + "/" + image_name
        os.system(f'rm "{img_path}"')
        # get the index of the next image
        idx = images[dataset].index(image_name)
        if idx == len(images[dataset]) - 1:
            return gr.Dropdown.update(value=None)

        return gr.Dropdown.update(value=images[dataset][idx + 1])

    next_image.click(
        fn=get_next_image, inputs=[dataset, image_name], outputs=image_name
    )

    def finish_annotation(dataset):
        if dataset is None:
            return

        # upload the prompt file and remove local data
        dataset_path = str(shark_root) + "/dataset/" + dataset
        dataset_gs_path = args.gs_url + "/" + dataset + "/"
        os.system(
            f'gsutil cp "{dataset_path}/metadata.jsonl" "{dataset_gs_path}"'
        )
        os.system(f'rm -rf "{dataset_path}"')

        return gr.Dropdown.update(value=None)

    finish.click(fn=finish_annotation, inputs=dataset, outputs=dataset)


if __name__ == "__main__":
    shark_web.launch(
        share=args.share,
        inbrowser=True,
        server_name="0.0.0.0",
        server_port=args.server_port,
    )
34  dataset/args.py  Normal file
@@ -0,0 +1,34 @@
import argparse

p = argparse.ArgumentParser(
    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

##############################################################################
### Dataset Annotator flags
##############################################################################

p.add_argument(
    "--gs_url",
    type=str,
    required=True,
    help="URL to datasets in GS bucket",
)

p.add_argument(
    "--share",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="flag for generating a public URL",
)

p.add_argument(
    "--server_port",
    type=int,
    default=8080,
    help="flag for setting server port",
)

##############################################################################

args = p.parse_args()
3  dataset/requirements.txt  Normal file
@@ -0,0 +1,3 @@
# SHARK Annotator
gradio==3.15.0
jsonlines
29  dataset/utils.py  Normal file
@@ -0,0 +1,29 @@
from google.cloud import storage


def get_datasets(gs_url):
    datasets = set()
    images = dict()
    ds_w_prompts = []

    storage_client = storage.Client()
    bucket_name = gs_url.split("/")[2]
    source_blob_name = "/".join(gs_url.split("/")[3:])
    blobs = storage_client.list_blobs(bucket_name, prefix=source_blob_name)

    for blob in blobs:
        dataset_name = blob.name.split("/")[1]
        if dataset_name == "":
            continue
        datasets.add(dataset_name)
        if dataset_name not in images.keys():
            images[dataset_name] = []

        # check if image or jsonl
        file_sub_path = "/".join(blob.name.split("/")[2:])
        if "/" in file_sub_path:
            images[dataset_name] += [file_sub_path]
        elif "metadata.jsonl" in file_sub_path:
            ds_w_prompts.append(dataset_name)

    return list(datasets), images, ds_w_prompts
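`get_datasets` assumes the URL is shaped like `gs://<bucket>/<prefix>`: element 2 of the `/`-split is the bucket name and the remainder is the blob prefix. A quick usage sketch, with a made-up bucket and dataset layout:

```python
from utils import get_datasets

# "gs://my-bucket/datasets".split("/") -> ["gs:", "", "my-bucket", "datasets"],
# so index 2 is the bucket and everything after it is the prefix.
datasets, images, ds_w_prompts = get_datasets("gs://my-bucket/datasets")

print(datasets)        # e.g. ["cats", "dogs"]
print(images["cats"])  # image paths relative to the dataset directory
print(ds_w_prompts)    # datasets that already carry a metadata.jsonl
```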
282  generate_sharktank.py  Normal file
@@ -0,0 +1,282 @@
# Lint as: python3
"""SHARK Tank"""
# For `python generate_sharktank.py` you have to provide a csv file with
# [model_name, model_download_url]; it will generate a local shark tank
# folder like this:
# HOME
#   /.local
#     /shark_tank
#       /albert_lite_base
#       /...model_name...
#

import os
import csv
import argparse
from shark.shark_importer import SharkImporter
from shark.parser import shark_args
import subprocess as sp
import hashlib
import numpy as np
from pathlib import Path
from shark.examples.shark_inference.stable_diffusion import (
    model_wrappers as mw,
)
from shark.examples.shark_inference.stable_diffusion.stable_args import (
    args,
)


def create_hash(file_name):
    with open(file_name, "rb") as f:
        file_hash = hashlib.blake2b()
        while chunk := f.read(2**20):
            file_hash.update(chunk)

    return file_hash.hexdigest()


def save_torch_model(torch_model_list):
    from tank.model_utils import (
        get_hf_model,
        get_vision_model,
        get_hf_img_cls_model,
        get_fp16_model,
    )

    with open(torch_model_list) as csvfile:
        torch_reader = csv.reader(csvfile, delimiter=",")
        fields = next(torch_reader)
        for row in torch_reader:
            torch_model_name = row[0]
            tracing_required = row[1]
            model_type = row[2]
            is_dynamic = row[3]

            tracing_required = False if tracing_required == "False" else True
            is_dynamic = False if is_dynamic == "False" else True

            model = None
            input = None
            if model_type == "stable_diffusion":
                args.use_tuned = False
                args.import_mlir = True
                args.local_tank_cache = WORKDIR

                precision_values = ["fp16"]
                seq_lengths = [64, 77]
                for precision_value in precision_values:
                    args.precision = precision_value
                    for length in seq_lengths:
                        model = mw.SharkifyStableDiffusionModel(
                            model_id=torch_model_name,
                            custom_weights="",
                            precision=precision_value,
                            max_len=length,
                            width=512,
                            height=512,
                            use_base_vae=False,
                            debug=True,
                            sharktank_dir=WORKDIR,
                            generate_vmfb=False,
                        )
                        model()
                continue
            if model_type == "vision":
                model, input, _ = get_vision_model(torch_model_name)
            elif model_type == "hf":
                model, input, _ = get_hf_model(torch_model_name)
            elif model_type == "hf_img_cls":
                model, input, _ = get_hf_img_cls_model(torch_model_name)
            elif model_type == "fp16":
                model, input, _ = get_fp16_model(torch_model_name)
            torch_model_name = torch_model_name.replace("/", "_")
            torch_model_dir = os.path.join(
                WORKDIR, str(torch_model_name) + "_torch"
            )
            os.makedirs(torch_model_dir, exist_ok=True)

            mlir_importer = SharkImporter(
                model,
                (input,),
                frontend="torch",
            )
            mlir_importer.import_debug(
                is_dynamic=False,
                tracing_required=tracing_required,
                dir=torch_model_dir,
                model_name=torch_model_name,
            )
            mlir_hash = create_hash(
                os.path.join(
                    torch_model_dir, torch_model_name + "_torch" + ".mlir"
                )
            )
            np.save(os.path.join(torch_model_dir, "hash"), np.array(mlir_hash))
            # Generate torch dynamic models.
            if is_dynamic:
                mlir_importer.import_debug(
                    is_dynamic=True,
                    tracing_required=tracing_required,
                    dir=torch_model_dir,
                    model_name=torch_model_name + "_dynamic",
                )


def save_tf_model(tf_model_list):
    from tank.model_utils_tf import (
        get_causal_image_model,
        get_causal_lm_model,
        get_keras_model,
        get_TFhf_model,
    )
    import tensorflow as tf

    visible_default = tf.config.list_physical_devices("GPU")
    try:
        tf.config.set_visible_devices([], "GPU")
        visible_devices = tf.config.get_visible_devices()
        for device in visible_devices:
            assert device.device_type != "GPU"
    except:
        # Invalid device or cannot modify virtual devices once initialized.
        pass

    with open(tf_model_list) as csvfile:
        tf_reader = csv.reader(csvfile, delimiter=",")
        fields = next(tf_reader)
        for row in tf_reader:
            tf_model_name = row[0]
            model_type = row[1]

            model = None
            input = None
            print(f"Generating artifacts for model {tf_model_name}")
            if model_type == "hf":
                model, input, _ = get_causal_lm_model(tf_model_name)
            if model_type == "img":
                model, input, _ = get_causal_image_model(tf_model_name)
            if model_type == "keras":
                model, input, _ = get_keras_model(tf_model_name)
            if model_type == "TFhf":
                model, input, _ = get_TFhf_model(tf_model_name)

            tf_model_name = tf_model_name.replace("/", "_")
            tf_model_dir = os.path.join(WORKDIR, str(tf_model_name) + "_tf")
            os.makedirs(tf_model_dir, exist_ok=True)

            mlir_importer = SharkImporter(
                model,
                input,
                frontend="tf",
            )
            mlir_importer.import_debug(
                dir=tf_model_dir,
                model_name=tf_model_name,
            )
            mlir_hash = create_hash(
                os.path.join(tf_model_dir, tf_model_name + "_tf" + ".mlir")
            )
            np.save(os.path.join(tf_model_dir, "hash"), np.array(mlir_hash))


def save_tflite_model(tflite_model_list):
    from shark.tflite_utils import TFLitePreprocessor

    with open(tflite_model_list) as csvfile:
        tflite_reader = csv.reader(csvfile, delimiter=",")
        for row in tflite_reader:
            print("\n")
            tflite_model_name = row[0]
            tflite_model_link = row[1]
            print("tflite_model_name", tflite_model_name)
            print("tflite_model_link", tflite_model_link)
            tflite_model_name_dir = os.path.join(
                WORKDIR, str(tflite_model_name) + "_tflite"
            )
            os.makedirs(tflite_model_name_dir, exist_ok=True)
            print(f"TMP_TFLITE_MODELNAME_DIR = {tflite_model_name_dir}")

            # Preprocess to get SharkImporter input args
            tflite_preprocessor = TFLitePreprocessor(str(tflite_model_name))
            raw_model_file_path = tflite_preprocessor.get_raw_model_file()
            inputs = tflite_preprocessor.get_inputs()
            tflite_interpreter = tflite_preprocessor.get_interpreter()

            # Use SharkImporter to get SharkInference input args
            my_shark_importer = SharkImporter(
                module=tflite_interpreter,
                inputs=inputs,
                frontend="tflite",
                raw_model_file=raw_model_file_path,
            )
            my_shark_importer.import_debug(
                dir=tflite_model_name_dir,
                model_name=tflite_model_name,
                func_name="main",
            )
            mlir_hash = create_hash(
                os.path.join(
                    tflite_model_name_dir,
                    tflite_model_name + "_tflite" + ".mlir",
                )
            )
            np.save(
                os.path.join(tflite_model_name_dir, "hash"),
                np.array(mlir_hash),
            )


# Validates whether the file is present or not.
def is_valid_file(arg):
    if not os.path.exists(arg):
        return None
    else:
        return arg


if __name__ == "__main__":
    # Note: all of these flags are overridden by the import of args from
    # stable_args.py; the flags are duplicated temporarily to preserve
    # functionality.
    # parser = argparse.ArgumentParser()
    # parser.add_argument(
    #     "--torch_model_csv",
    #     type=lambda x: is_valid_file(x),
    #     default="./tank/torch_model_list.csv",
    #     help="""Contains the file with torch_model name and args.
    #          Please see: https://github.com/nod-ai/SHARK/blob/main/tank/torch_model_list.csv""",
    # )
    # parser.add_argument(
    #     "--tf_model_csv",
    #     type=lambda x: is_valid_file(x),
    #     default="./tank/tf_model_list.csv",
    #     help="Contains the file with tf model name and args.",
    # )
    # parser.add_argument(
    #     "--tflite_model_csv",
    #     type=lambda x: is_valid_file(x),
    #     default="./tank/tflite/tflite_model_list.csv",
    #     help="Contains the file with tf model name and args.",
    # )
    # parser.add_argument(
    #     "--ci_tank_dir",
    #     type=bool,
    #     default=False,
    # )
    # parser.add_argument("--upload", type=bool, default=False)

    # old_args = parser.parse_args()

    home = str(Path.home())
    if args.ci_tank_dir == True:
        WORKDIR = os.path.join(os.path.dirname(__file__), "gen_shark_tank")
    else:
        WORKDIR = os.path.join(home, ".local/shark_tank/")

    if args.torch_model_csv:
        save_torch_model(args.torch_model_csv)

    if args.tf_model_csv:
        save_tf_model(args.tf_model_csv)

    if args.tflite_model_csv:
        save_tflite_model(args.tflite_model_csv)
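For reference, `save_torch_model` above skips a header row and then reads four columns per row: model name, `tracing_required`, `model_type`, and `is_dynamic` (the canonical list lives at `tank/torch_model_list.csv`). A sketch of a compatible CSV, with invented model rows:

```python
import csv

# Hypothetical rows; row[0]=model name, row[1]=tracing_required,
# row[2]=model_type, row[3]=is_dynamic, as consumed by save_torch_model.
rows = [
    ["model_name", "tracing_required", "model_type", "is_dynamic"],  # header, skipped
    ["resnet50", "False", "vision", "False"],
    ["bert-base-uncased", "True", "hf", "True"],
]

with open("torch_model_list.csv", "w", newline="") as f:
    csv.writer(f).writerows(rows)
```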
192  inference/CMakeLists.txt  Normal file
@@ -0,0 +1,192 @@
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

cmake_minimum_required(VERSION 3.17)

project(sharkbackend LANGUAGES C CXX)

#
# Options
#

option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON)
option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)

set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")

if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

#
# Dependencies
#
# FetchContent requires us to include the transitive closure of all
# repos that we depend on so that we can override the tags.
#
include(FetchContent)

FetchContent_Declare(
  repo-common
  GIT_REPOSITORY https://github.com/triton-inference-server/common.git
  GIT_TAG ${TRITON_COMMON_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_Declare(
  repo-core
  GIT_REPOSITORY https://github.com/triton-inference-server/core.git
  GIT_TAG ${TRITON_CORE_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_Declare(
  repo-backend
  GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
  GIT_TAG ${TRITON_BACKEND_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_MakeAvailable(repo-common repo-core repo-backend)

#
# The backend must be built into a shared library. Use an ldscript to
# hide all symbols except for the TRITONBACKEND API.
#
configure_file(src/libtriton_dshark.ldscript libtriton_dshark.ldscript COPYONLY)

add_library(
  triton-dshark-backend SHARED
  src/dshark.cc
  #src/dshark_driver_module.c
)

add_library(
  SharkBackend::triton-dshark-backend ALIAS triton-dshark-backend
)

target_include_directories(
  triton-dshark-backend
  PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/src
)

list(APPEND CMAKE_MODULE_PATH "${PROJECT_BINARY_DIR}/lib/cmake/mlir")

add_subdirectory(thirdparty/shark-runtime EXCLUDE_FROM_ALL)

target_link_libraries(triton-dshark-backend PRIVATE iree_base_base
                                                    iree_hal_hal
                                                    iree_hal_cuda_cuda
                                                    iree_hal_cuda_registration_registration
                                                    iree_hal_vmvx_registration_registration
                                                    iree_hal_dylib_registration_registration
                                                    iree_modules_hal_hal
                                                    iree_vm_vm
                                                    iree_vm_bytecode_module
                                                    iree_hal_local_loaders_system_library_loader
                                                    iree_hal_local_loaders_vmvx_module_loader
)

target_compile_features(triton-dshark-backend PRIVATE cxx_std_11)

target_link_libraries(
  triton-dshark-backend
  PRIVATE
    triton-core-serverapi   # from repo-core
    triton-core-backendapi  # from repo-core
    triton-core-serverstub  # from repo-core
    triton-backend-utils    # from repo-backend
)

if(WIN32)
  set_target_properties(
    triton-dshark-backend PROPERTIES
      POSITION_INDEPENDENT_CODE ON
      OUTPUT_NAME triton_dshark
  )
else()
  set_target_properties(
    triton-dshark-backend PROPERTIES
      POSITION_INDEPENDENT_CODE ON
      OUTPUT_NAME triton_dshark
      LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_dshark.ldscript
      LINK_FLAGS "-Wl,--version-script libtriton_dshark.ldscript"
  )
endif()

#
# Install
#
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/SharkBackend)

install(
  TARGETS
    triton-dshark-backend
  EXPORT
    triton-dshark-backend-targets
  LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/dshark
  RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/dshark
)

install(
  EXPORT
    triton-dshark-backend-targets
  FILE
    SharkBackendTargets.cmake
  NAMESPACE
    SharkBackend::
  DESTINATION
    ${INSTALL_CONFIGDIR}
)

include(CMakePackageConfigHelpers)
configure_package_config_file(
  ${CMAKE_CURRENT_LIST_DIR}/cmake/SharkBackendConfig.cmake.in
  ${CMAKE_CURRENT_BINARY_DIR}/SharkBackendConfig.cmake
  INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)

install(
  FILES
    ${CMAKE_CURRENT_BINARY_DIR}/SharkBackendConfig.cmake
  DESTINATION ${INSTALL_CONFIGDIR}
)

#
# Export from build tree
#
export(
  EXPORT triton-dshark-backend-targets
  FILE ${CMAKE_CURRENT_BINARY_DIR}/SharkBackendTargets.cmake
  NAMESPACE SharkBackend::
)

export(PACKAGE SharkBackend)
100  inference/README.md  Normal file
@@ -0,0 +1,100 @@
# SHARK Triton Backend

The Triton backend for SHARK.

# Build

Install SHARK

```
git clone https://github.com/nod-ai/SHARK.git
# skip the above step if dshark is already installed
cd SHARK/inference
```

Install dependencies

```
apt-get install patchelf rapidjson-dev python3-dev
git submodule update --init
```

Update the submodules of IREE

```
cd thirdparty/shark-runtime
git submodule update --init
```

Next, make the backend and install it

```
cd ../..
mkdir build && cd build
cmake -DTRITON_ENABLE_GPU=ON \
      -DIREE_HAL_DRIVER_CUDA=ON \
      -DIREE_TARGET_BACKEND_CUDA=ON \
      -DMLIR_ENABLE_CUDA_RUNNER=ON \
      -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \
      -DTRITON_BACKEND_REPO_TAG=r22.02 \
      -DTRITON_CORE_REPO_TAG=r22.02 \
      -DTRITON_COMMON_REPO_TAG=r22.02 ..
make install
```

# Incorporating into Triton

There are much more in-depth explanations of the following steps in Triton's documentation:
https://github.com/triton-inference-server/server/blob/main/docs/compose.md#triton-with-unsupported-and-custom-backends

There should be a file at `build/install/backends/dshark/libtriton_dshark.so`. You will need to copy it into your Triton server image.
More documentation is in the link above, but to create the docker image you need to run the `compose.py` command in the Triton server repo.

To first build your image, clone the tritonserver repo.

```
git clone https://github.com/triton-inference-server/server.git
```

Then run `compose.py` to build a docker compose file

```
cd server
python3 compose.py --repoagent checksum --dry-run
```

Because dshark is a third-party backend, you will need to manually modify the generated `Dockerfile.compose` to include the dshark backend. To do this, copy the line below into the `Dockerfile.compose` file it produced;
the dshark backend will be located in the build folder from earlier under `build/install/backends`.

```
COPY /path/to/build/install/backends/dshark /opt/tritonserver/backends/dshark
```

Next run

```
docker build -t tritonserver_custom -f Dockerfile.compose .
docker run -it --gpus=1 --net=host -v/path/to/model_repos:/models tritonserver_custom:latest tritonserver --model-repository=/models
```

where `path/to/model_repos` is where you are storing the models you want to run.

If you're not using GPUs, omit `--gpus=1`:

```
docker run -it --net=host -v/path/to/model_repos:/models tritonserver_custom:latest tritonserver --model-repository=/models
```

# Setting up a model

To include a model in your backend, add a directory with your model name to your model repository directory. Examples of models can be seen here: https://github.com/triton-inference-server/backend/tree/main/examples/model_repos/minimal_models

Make sure to adjust the input correctly in the `config.pbtxt` file, and save a vmfb file under `1/model.vmfb`. A client-side sketch for querying such a model follows this README.

# CUDA

If you're having issues with CUDA, make sure the correct drivers are installed, that `nvidia-smi` works, and that the `nvcc` compiler is on the path.
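Once the server is running, a model served by this backend can be queried with Triton's standard HTTP client. A minimal sketch, assuming a model named `minimal` whose `config.pbtxt` declares an FP32 input `INPUT0` of shape `[1, 4]` and an output `OUTPUT0`; all of these names and shapes are illustrative, not taken from this repo:

```python
import numpy as np
import tritonclient.http as httpclient

# Assumed model and tensor names; match them to your config.pbtxt.
client = httpclient.InferenceServerClient(url="localhost:8000")

inp = httpclient.InferInput("INPUT0", [1, 4], "FP32")
inp.set_data_from_numpy(np.ones((1, 4), dtype=np.float32))

# Run a synchronous inference and read back the named output tensor.
result = client.infer("minimal", inputs=[inp])
print(result.as_numpy("OUTPUT0"))
```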
39  inference/cmake/SharkBackendConfig.cmake.in  Normal file
@@ -0,0 +1,39 @@
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

include(CMakeFindDependencyMacro)

get_filename_component(
  SHARKBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
)

list(APPEND CMAKE_MODULE_PATH ${SHARKBACKEND_CMAKE_DIR})

if(NOT TARGET SharkBackend::triton-dshark-backend)
  include("${SHARKBACKEND_CMAKE_DIR}/SharkBackendTargets.cmake")
endif()

set(SHARKBACKEND_LIBRARIES SharkBackend::triton-dshark-backend)
1409  inference/src/dshark.cc  Normal file
File diff suppressed because it is too large

30  inference/src/libtriton_dshark.ldscript  Normal file
@@ -0,0 +1,30 @@
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
{
  global:
    TRITONBACKEND_*;
  local: *;
};
1  inference/thirdparty/shark-runtime  vendored  Submodule
Submodule inference/thirdparty/shark-runtime added at 7b82d90c72
@@ -1,45 +0,0 @@
<!DOCTYPE html>
<html>
<body>
<a href='https://github.com/nod-ai/SHARK/releases/download/20230130.481/shark_sd_20230130_481.exe'>shark_sd_20230130_481.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230130.481/shark_sd_cli_20230130_481.exe'>shark_sd_cli_20230130_481.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.479/shark_sd_20230129_479.exe'>shark_sd_20230129_479.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.479/shark_sd_cli_20230129_479.exe'>shark_sd_cli_20230129_479.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.480/shark_sd_20230129_480.exe'>shark_sd_20230129_480.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.480/shark_sd_cli_20230129_480.exe'>shark_sd_cli_20230129_480.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.478/shark_sd_20230129_478.exe'>shark_sd_20230129_478.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.478/shark_sd_cli_20230129_478.exe'>shark_sd_cli_20230129_478.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230128.477/shark_sd_20230128_477.exe'>shark_sd_20230128_477.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230128.477/shark_sd_cli_20230128_477.exe'>shark_sd_cli_20230128_477.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230127.476/shark_sd_20230127_476.exe'>shark_sd_20230127_476.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230127.476/shark_sd_cli_20230127_476.exe'>shark_sd_cli_20230127_476.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230126.475/shark_sd_20230126_475.exe'>shark_sd_20230126_475.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230126.475/shark_sd_cli_20230126_475.exe'>shark_sd_cli_20230126_475.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.474/shark_sd_20230125_474.exe'>shark_sd_20230125_474.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.474/shark_sd_cli_20230125_474.exe'>shark_sd_cli_20230125_474.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.473/shark_sd_20230125_473.exe'>shark_sd_20230125_473.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.473/shark_sd_cli_20230125_473.exe'>shark_sd_cli_20230125_473.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.472/shark_sd_20230125_472.exe'>shark_sd_20230125_472.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.471/shark_sd_20230125_471.exe'>shark_sd_20230125_471.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.468/shark_sd_20230125_468.exe'>shark_sd_20230125_468.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.470/shark_sd_20230124_470.exe'>shark_sd_20230124_470.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.470/shark_sd_cli_20230124_470.exe'>shark_sd_cli_20230124_470.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.469/shark_sd_20230124_469.exe'>shark_sd_20230124_469.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.467/shark_sd_20230124_467.exe'>shark_sd_20230124_467.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.466/shark_sd_20230124_466.exe'>shark_sd_20230124_466.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.462/shark_sd_20230124_462.exe'>shark_sd_20230124_462.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230123.461/shark_sd_20230123_461.exe'>shark_sd_20230123_461.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230123.460/shark_sd_20230123_460.exe'>shark_sd_20230123_460.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230122.459/shark_sd_20230122_459.exe'>shark_sd_20230122_459.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230122.458/shark_sd_20230122_458.exe'>shark_sd_20230122_458.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230122.457/shark_sd_20230122_457.exe'>shark_sd_20230122_457.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230121.456/shark_sd_20230121_456.exe'>shark_sd_20230121_456.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230120.455/shark_sd_20230120_455.exe'>shark_sd_20230120_455.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230119.454/shark_sd_20230119_454.exe'>shark_sd_20230119_454.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230118.453/shark_sd_20230118_453.exe'>shark_sd_20230118_453.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230117.452/shark_sd_20230117_452.exe'>shark_sd_20230117_452.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230116.451/shark_sd_20230116_451.exe'>shark_sd_20230116_451.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230115.450/shark_sd_20230115_450.exe'>shark_sd_20230115_450.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230114.449/shark_sd_20230114_449.exe'>shark_sd_20230114_449.exe</a><br />
</body>
</html>
12 pyproject.toml Normal file
@@ -0,0 +1,12 @@
[build-system]
requires = [
    "setuptools>=42",
    "wheel",
    "packaging",

    "numpy>=1.22.4",
    "torch-mlir>=20221021.633",
    "iree-compiler>=20221022.190",
    "iree-runtime>=20221022.190",
]
build-backend = "setuptools.build_meta"
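For context, the `[build-system]` table above is what PEP 517 front ends read before building; a minimal sketch of exercising it from Python (the output directory and flags are illustrative, not taken from the repo):

```python
# Build a wheel of this checkout; pip first installs the build
# requirements pinned in [build-system], then invokes
# setuptools.build_meta as declared in build-backend.
import subprocess

subprocess.run(
    ["python", "-m", "pip", "wheel", ".", "--no-deps", "-w", "dist/"],
    check=True,  # raise if the build backend fails
)
```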
3 pytest.ini Normal file
@@ -0,0 +1,3 @@
[pytest]
addopts = --verbose -p no:warnings
norecursedirs = inference tank/tflite examples benchmarks shark
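pytest picks these options up automatically when run from the repo root; a minimal sketch of the equivalent explicit invocation (purely illustrative — `norecursedirs` is easiest to keep in the ini file):

```python
# Same behavior as addopts above: verbose output with the warnings
# plugin disabled. pytest.main accepts the usual CLI args as a list.
import pytest

pytest.main(["--verbose", "-p", "no:warnings"])
```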
45 requirements-importer-macos.txt Normal file
@@ -0,0 +1,45 @@
-f https://download.pytorch.org/whl/nightly/cpu/
--pre

numpy
torch
torchvision

tqdm

#iree-compiler | iree-runtime should already be installed
#these don't work on osx
#iree-tools-tflite
#iree-tools-xla
#iree-tools-tf

# TensorFlow and JAX.
gin-config
tensorflow-macos
tensorflow-metal
#tf-models-nightly
#tensorflow-text-nightly
transformers
tensorflow-probability
#jax[cpu]

# tflitehub dependencies.
Pillow

# web dependencies.
gradio
altair

# Testing and support.
#lit
#pyyaml

#ONNX and ORT for benchmarking
#--extra-index-url https://test.pypi.org/simple/
#protobuf
#coloredlogs
#flatbuffers
#sympy
#psutil
#onnx-weekly
#ort-nightly
50 requirements-importer.txt Normal file
@@ -0,0 +1,50 @@
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
--pre

numpy==1.22.4
torchvision
pytorch-triton
tabulate

tqdm

#iree-compiler | iree-runtime should already be installed
iree-tools-tflite
iree-tools-xla
iree-tools-tf

# TensorFlow and JAX.
gin-config
tensorflow==2.10.1
keras==2.10
#tf-models-nightly
#tensorflow-text-nightly
transformers
diffusers
#tensorflow-probability
#jax[cpu]


# tflitehub dependencies.
Pillow

# Testing and support.
lit
pyyaml
python-dateutil
sacremoses

# web dependencies.
gradio
altair
scipy

#ONNX and ORT for benchmarking
#--extra-index-url https://test.pypi.org/simple/
#protobuf
#coloredlogs
#flatbuffers
#sympy
#psutil
#onnx-weekly
#ort-nightly
29 requirements.txt Normal file
@@ -0,0 +1,29 @@
setuptools
wheel

# SHARK Runner
tqdm

# SHARK Downloader
google-cloud-storage

# Testing
pytest
pytest-xdist
pytest-forked
Pillow
parameterized

# Add transformers, diffusers and scipy since they are the most commonly used
transformers
diffusers
scipy
ftfy
gradio
altair
omegaconf
safetensors

# Keep PyInstaller at the end. Sometimes Windows Defender flags it but most folks can continue even if it errors
pefile
pyinstaller
45 setup.py Normal file
@@ -0,0 +1,45 @@
from setuptools import find_packages
from setuptools import setup

import os
import glob

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

PACKAGE_VERSION = os.environ.get("SHARK_PACKAGE_VERSION") or "0.0.5"
backend_deps = []
if "NO_BACKEND" not in os.environ.keys():  # mirror setup_venv.sh: pin IREE unless NO_BACKEND is set
    backend_deps = [
        "iree-compiler>=20221022.190",
        "iree-runtime>=20221022.190",
    ]

setup(
    name="nodai-SHARK",
    version=f"{PACKAGE_VERSION}",
    description="SHARK provides a High Performance Machine Learning Framework",
    author="nod.ai",
    author_email="stdin@nod.ai",
    url="https://nod.ai",
    long_description=long_description,
    long_description_content_type="text/markdown",
    project_urls={
        "Code": "https://github.com/nod-ai/SHARK",
        "Bug Tracker": "https://github.com/nod-ai/SHARK/issues",
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    packages=find_packages(exclude=("examples",)),
    python_requires=">=3.9",
    data_files=glob.glob("apps/stable_diffusion/resources/**"),
    install_requires=[
        "numpy",
        "PyYAML",
        "torch-mlir>=20221021.633",
    ]
    + backend_deps,
)
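A hedged sketch of how the two environment variables read by setup.py above drive an editable install (the version string here is illustrative, not from the repo):

```python
# SHARK_PACKAGE_VERSION overrides the default "0.0.5"; setting
# NO_BACKEND leaves the pinned IREE packages out of install_requires,
# matching the NO_BACKEND=1 behavior of setup_venv.sh.
import os
import subprocess

env = dict(os.environ, SHARK_PACKAGE_VERSION="0.0.6.dev0", NO_BACKEND="1")
subprocess.run(["python", "-m", "pip", "install", "-e", "."], env=env, check=True)
```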
45 setup_venv.ps1 Normal file
@@ -0,0 +1,45 @@
param([string]$arguments)

if ($arguments -eq "--update-src"){
    git pull
}

#Write-Host "Installing python"

#Start-Process winget install Python.Python.3.10 '/quiet InstallAllUsers=1 PrependPath=1' -wait -NoNewWindow

#Write-Host "python installation completed successfully"

#Write-Host "Reload environment variables"
#$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
#Write-Host "Reloaded environment variables"


# redirect stderr into stdout
$p = &{python -V} 2>&1
# check if an ErrorRecord was returned
$version = if($p -is [System.Management.Automation.ErrorRecord])
{
    # grab the version string from the error message
    $p.Exception.Message
}
else
{
    # otherwise return as is
    $p
}

Write-Host "Python version found is"
Write-Host $p


Write-Host "Installing Build Dependencies"
python -m venv .\shark.venv\
.\shark.venv\Scripts\activate
pip install -r requirements.txt
pip install --pre torch-mlir torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/
pip install --upgrade -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html iree-compiler iree-runtime
Write-Host "Building SHARK..."
pip install -e . -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
Write-Host "Build and installation completed successfully"
Write-Host "Source your venv with ./shark.venv/Scripts/activate"
154 setup_venv.sh Executable file
@@ -0,0 +1,154 @@
#!/bin/bash
# Sets up a venv suitable for running samples.
# e.g.:
# ./setup_venv.sh #setup a default $PYTHON3 shark.venv
# Environment variables read by the script:
# PYTHON=$PYTHON3.10 ./setup_venv.sh #pass a version of $PYTHON to use
# VENV_DIR=myshark.venv #create a venv called myshark.venv
# USE_IREE=1 #use stock IREE instead of Nod.ai's SHARK build
# IMPORTER=1 #Install importer deps
# BENCHMARK=1 #Install benchmark deps
# NO_BACKEND=1 #Don't install iree or shark backend
# if you run the script from a conda env it will install in your conda env

TD="$(cd $(dirname $0) && pwd)"
if [ -z "$PYTHON" ]; then
  PYTHON="$(which python3)"
fi

function die() {
  echo "Error executing command: $*"
  exit 1
}

PYTHON_VERSION_X_Y=`${PYTHON} -c 'import sys; version=sys.version_info[:2]; print("{0}.{1}".format(*version))'`

echo "Python: $PYTHON"
echo "Python version: $PYTHON_VERSION_X_Y"

if [[ -z "${CONDA_PREFIX}" ]]; then
  # Not a conda env. So create a new VENV dir
  VENV_DIR=${VENV_DIR:-shark.venv}
  echo "Using pip venv.. Setting up venv dir: $VENV_DIR"
  $PYTHON -m venv "$VENV_DIR" || die "Could not create venv."
  source "$VENV_DIR/bin/activate" || die "Could not activate venv"
  PYTHON="$(which python3)"
else
  echo "Found conda env $CONDA_DEFAULT_ENV. Running pip install inside the conda env"
fi

Red=`tput setaf 1`
Green=`tput setaf 2`
Yellow=`tput setaf 3`

# Assume no binary torch-mlir.
# Currently available for macOS m1&intel (3.10) and Linux(3.7,3.8,3.9,3.10)
torch_mlir_bin=false
if [[ $(uname -s) = 'Darwin' ]]; then
  echo "${Yellow}Apple macOS detected"
  if [[ $(uname -m) == 'arm64' ]]; then
    echo "${Yellow}Apple M1 Detected"
    hash rustc 2>/dev/null
    if [ $? -eq 0 ];then
      echo "${Green}rustc found to compile HF tokenizers"
    else
      echo "${Red}Could not find rustc" >&2
      echo "${Red}Please run:"
      echo "${Red}curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh"
      exit 1
    fi
  fi
  echo "${Yellow}Run the following commands to setup your SSL certs for your Python version if you see SSL errors with tests"
  echo "${Yellow}/Applications/Python\ 3.XX/Install\ Certificates.command"
  if [ "$PYTHON_VERSION_X_Y" == "3.10" ]; then
    torch_mlir_bin=true
  fi
elif [[ $(uname -s) = 'Linux' ]]; then
  echo "${Yellow}Linux detected"
  if [ "$PYTHON_VERSION_X_Y" == "3.7" ] || [ "$PYTHON_VERSION_X_Y" == "3.8" ] || [ "$PYTHON_VERSION_X_Y" == "3.9" ] || [ "$PYTHON_VERSION_X_Y" == "3.10" ] ; then
    torch_mlir_bin=true
  fi
else
  echo "${Red}OS not detected. Pray and Play"
fi

# Upgrade pip and install requirements.
$PYTHON -m pip install --upgrade pip || die "Could not upgrade pip"
$PYTHON -m pip install --upgrade -r "$TD/requirements.txt"
if [ "$torch_mlir_bin" = true ]; then
  if [[ $(uname -s) = 'Darwin' ]]; then
    echo "MacOS detected. Installing torch-mlir from .whl, to avoid dependency problems with torch."
    $PYTHON -m pip install --pre --no-cache-dir torch-mlir -f https://llvm.github.io/torch-mlir/package-index/ -f https://download.pytorch.org/whl/nightly/torch/
  else
    $PYTHON -m pip install --pre torch-mlir -f https://llvm.github.io/torch-mlir/package-index/
    if [ $? -eq 0 ];then
      echo "Successfully Installed torch-mlir"
    else
      echo "Could not install torch-mlir" >&2
    fi
  fi
else
  echo "${Red}No binaries found for Python $PYTHON_VERSION_X_Y on $(uname -s)"
  echo "${Yellow}Python 3.10 supported on macOS and 3.7,3.8,3.9 and 3.10 on Linux"
  echo "${Red}Please build torch-mlir from source in your environment"
  exit 1
fi
if [[ -z "${USE_IREE}" ]]; then
  rm -f .use-iree
  RUNTIME="https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html"
else
  touch ./.use-iree
  RUNTIME="https://iree-org.github.io/iree/pip-release-links.html"
fi
if [[ -z "${NO_BACKEND}" ]]; then
  echo "Installing ${RUNTIME}..."
  $PYTHON -m pip install --upgrade --find-links ${RUNTIME} iree-compiler iree-runtime
else
  echo "Not installing a backend, please make sure to add your backend to PYTHONPATH"
fi

if [[ ! -z "${IMPORTER}" ]]; then
  echo "${Yellow}Installing importer tools.."
  if [[ $(uname -s) = 'Linux' ]]; then
    echo "${Yellow}Linux detected.. installing Linux importer tools"
    #Always get the importer tools from upstream IREE
    $PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer.txt" -f https://iree-org.github.io/iree/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
  elif [[ $(uname -s) = 'Darwin' ]]; then
    echo "${Yellow}macOS detected.. installing macOS importer tools"
    #Conda seems to have some problems installing these packages and hope they get resolved upstream.
    $PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer-macos.txt" -f ${RUNTIME} --extra-index-url https://download.pytorch.org/whl/nightly/cpu
  fi
fi

$PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f https://download.pytorch.org/whl/nightly/torch/

if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
  T_VER=$($PYTHON -m pip show torch | grep Version)
  TORCH_VERSION=${T_VER:9:17}
  TV_VER=$($PYTHON -m pip show torchvision | grep Version)
  TV_VERSION=${TV_VER:9:18}
  $PYTHON -m pip uninstall -y torch torchvision
  $PYTHON -m pip install -U --pre --no-warn-conflicts triton
  $PYTHON -m pip install --no-deps https://download.pytorch.org/whl/nightly/cu117/torch-${TORCH_VERSION}%2Bcu117-cp310-cp310-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu117/torchvision-${TV_VERSION}%2Bcu117-cp310-cp310-linux_x86_64.whl
  if [ $? -eq 0 ];then
    echo "Successfully Installed torch + cu117."
  else
    echo "Could not install torch + cu117." >&2
  fi
fi

if [[ ! -z "${ONNX}" ]]; then
  echo "${Yellow}Installing ONNX and onnxruntime for benchmarks..."
  $PYTHON -m pip install onnx onnxruntime psutil
  if [ $? -eq 0 ];then
    echo "Successfully installed ONNX and ONNX runtime."
  else
    echo "Could not install ONNX." >&2
  fi
fi

if [[ -z "${CONDA_PREFIX}" ]]; then
  echo "${Green}Before running examples activate venv with:"
  echo "  ${Green}source $VENV_DIR/bin/activate"
fi
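The script is driven entirely by the environment variables listed in its header comment; a small sketch of invoking it programmatically with a custom venv name and the importer deps (the particular combination shown is illustrative):

```python
# Run setup_venv.sh with VENV_DIR and IMPORTER set, as documented in the
# script's header; PYTHON, USE_IREE, BENCHMARK, or NO_BACKEND can be
# added to the env dict the same way.
import os
import subprocess

env = dict(os.environ, VENV_DIR="myshark.venv", IMPORTER="1")
subprocess.run(["bash", "./setup_venv.sh"], env=env, check=True)
```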
0 shark/__init__.py Normal file
78 shark/backward_makefx.py Normal file
@@ -0,0 +1,78 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from torch._decomp import get_decompositions
from torch.fx.experimental.proxy_tensor import make_fx
from torch.nn.utils import _stateless

from torch import fx
import tempfile


class MakeFxModule:
    def __init__(self, model, inputs, labels=None, custom_inference_fn=None):
        self.model = model
        self.inputs = inputs
        self.custom_inference_fn = custom_inference_fn
        self.training_graph = None

    # Doesn't replace the None type.
    def change_fx_graph_return_to_tuple(self, fx_g: fx.GraphModule):
        for node in fx_g.graph.nodes:
            if node.op == "output":
                # output nodes always have one argument
                node_arg = node.args[0]
                out_nodes = []
                if isinstance(node_arg, list):
                    # Don't return NoneType elements.
                    for out_node in node_arg:
                        if not isinstance(out_node, type(None)):
                            out_nodes.append(out_node)
                    # If there is a single tensor/element to be returned don't
                    # create a tuple for it.
                    if len(out_nodes) == 1:
                        node.args = out_nodes
                    else:
                        node.args = (tuple(out_nodes),)
        fx_g.graph.lint()
        fx_g.recompile()
        return fx_g

    def generate_graph(self):
        fx_g = make_fx(
            self.custom_inference_fn,
            decomposition_table=get_decompositions(
                [
                    torch.ops.aten.embedding_dense_backward,
                    torch.ops.aten.native_layer_norm_backward,
                    torch.ops.aten.slice_backward,
                    torch.ops.aten.select_backward,
                ]
            ),
        )(
            dict(self.model.named_parameters()),
            dict(self.model.named_buffers()),
            self.inputs,
        )
        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
        fx_g.recompile()
        fx_g = self.change_fx_graph_return_to_tuple(fx_g)
        ts_g = torch.jit.script(fx_g)
        temp = tempfile.NamedTemporaryFile(
            suffix="_shark_ts", prefix="temp_ts_"
        )
        ts_g.save(temp.name)
        new_ts = torch.jit.load(temp.name)
        self.training_graph = new_ts
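The diff only defines the class, so here is a hedged usage sketch; the toy model and `inference_fn` are illustrative, and `_stateless.functional_call` follows the import already present in the file:

```python
import torch
from torch.nn.utils import _stateless

model = torch.nn.Linear(4, 2)
inputs = torch.randn(1, 4)

# generate_graph() calls make_fx with (params, buffers, inputs), so the
# custom inference fn must accept that triple and run the model
# functionally against the traced parameters.
def inference_fn(params, buffers, args):
    return _stateless.functional_call(model, {**params, **buffers}, args)

m = MakeFxModule(model, inputs, custom_inference_fn=inference_fn)
m.generate_graph()
print(type(m.training_graph))  # TorchScript module round-tripped via a temp file
```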
70 shark/examples/shark_dynamo/basic_examples.py Normal file
@@ -0,0 +1,70 @@
import torchdynamo
import torch
import torch_mlir
from shark.sharkdynamo.utils import make_shark_compiler


import warnings, logging

warnings.simplefilter("ignore")
torchdynamo.config.log_level = logging.ERROR


torchdynamo.reset()


@torchdynamo.optimize(
    make_shark_compiler(use_tracing=False, device="cuda", verbose=False)
)
def foo(t):
    return 2 * t


example_input = torch.rand((2, 3))
x = foo(example_input)
print(x)


torchdynamo.reset()


@torchdynamo.optimize(
    make_shark_compiler(use_tracing=False, device="cuda", verbose=False)
)
def foo(a, b):
    x = a / (a + 1)
    if b.sum() < 0:
        b = b * -1
    return x * b


print(foo(torch.rand((2, 3)), -torch.rand((2, 3))))


torchdynamo.reset()


@torchdynamo.optimize(
    make_shark_compiler(use_tracing=False, device="cuda", verbose=True)
)
def foo(a):
    for i in range(10):
        a += 1.0
    return a


print(foo(torch.rand((1, 2))))

torchdynamo.reset()


@torchdynamo.optimize(
    make_shark_compiler(use_tracing=False, device="cuda", verbose=True)
)
def test_unsupported_types(t, y):
    return t, 2 * y


str_input = "hello"
tensor_input = torch.randn(2)
print(test_unsupported_types(str_input, tensor_input))
309 shark/examples/shark_eager/dynamo_demo.ipynb Normal file
@@ -0,0 +1,309 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/mlevental/miniconda3/envs/torch-mlir/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "# standard imports\n",
    "import torch\n",
    "from shark.iree_utils import get_iree_compiled_module"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [],
   "source": [
    "# torch dynamo related imports\n",
    "try:\n",
    "    import torchdynamo\n",
    "    from torchdynamo.optimizations.backends import create_backend\n",
    "    from torchdynamo.optimizations.subgraph import SubGraph\n",
    "except ModuleNotFoundError:\n",
    "    print(\n",
    "        \"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo\"\n",
    "    )\n",
    "    exit()\n",
    "\n",
    "# torch-mlir imports for compiling\n",
    "from torch_mlir import compile, OutputType"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "[TorchDynamo](https://github.com/pytorch/torchdynamo) is a compiler for PyTorch programs that uses the [frame evaluation API](https://www.python.org/dev/peps/pep-0523/) in CPython to dynamically modify Python bytecode right before it is executed. It creates this FX Graph through bytecode analysis and is designed to mix Python execution with compiled backends."
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [
    "def toy_example(*args):\n",
    "    a, b = args\n",
    "\n",
    "    x = a / (torch.abs(a) + 1)\n",
    "    if b.sum() < 0:\n",
    "        b = b * -1\n",
    "    return x * b"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [],
   "source": [
    "# compiler that lowers fx_graph through MLIR\n",
    "def __torch_mlir(fx_graph, *args, **kwargs):\n",
    "    assert isinstance(\n",
    "        fx_graph, torch.fx.GraphModule\n",
    "    ), \"Model must be an FX GraphModule.\"\n",
    "\n",
    "    def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule):\n",
    "        \"\"\"Replace tuple with tuple element in functions that return one-element tuples.\"\"\"\n",
    "\n",
    "        for node in fx_g.graph.nodes:\n",
    "            if node.op == \"output\":\n",
    "                assert (\n",
    "                    len(node.args) == 1\n",
    "                ), \"Output node must have a single argument\"\n",
    "                node_arg = node.args[0]\n",
    "                if isinstance(node_arg, tuple) and len(node_arg) == 1:\n",
    "                    node.args = (node_arg[0],)\n",
    "        fx_g.graph.lint()\n",
    "        fx_g.recompile()\n",
    "        return fx_g\n",
    "\n",
    "    fx_graph = _unwrap_single_tuple_return(fx_graph)\n",
    "    ts_graph = torch.jit.script(fx_graph)\n",
    "\n",
    "    # torchdynamo munges the args differently depending on whether you use\n",
    "    # the @torchdynamo.optimize decorator or the context manager\n",
    "    if isinstance(args, tuple):\n",
    "        args = list(args)\n",
    "    assert isinstance(args, list)\n",
    "    if len(args) == 1 and isinstance(args[0], list):\n",
    "        args = args[0]\n",
    "\n",
    "    linalg_module = compile(\n",
    "        ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS\n",
    "    )\n",
    "    callable, _ = get_iree_compiled_module(\n",
    "        linalg_module, \"cuda\", func_name=\"forward\"\n",
    "    )\n",
    "\n",
    "    def forward(*inputs):\n",
    "        return callable(*inputs)\n",
    "\n",
    "    return forward"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "Simplest way to use TorchDynamo is with the `torchdynamo.optimize` context manager:"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 1 device(s).\n",
      "Device: 0\n",
      " Name: NVIDIA GeForce RTX 3080\n",
      " Compute Capability: 8.6\n",
      "[-0.40066046 -0.4210303 0.03225489 -0.44849953 0.10370405 -0.04422468\n",
      " 0.33262825 -0.20109026 0.02102537 -0.24882983]\n",
      "[-0.07824923 -0.17004533 0.06439921 -0.06163602 0.26633525 -1.1560082\n",
      " -0.06660341 0.24227881 0.1462235 -0.32055548]\n",
      "[-0.01464001 0.442209 -0.0607936 -0.5477967 -0.25226554 -0.08588809\n",
      " -0.30497575 0.00061084 -0.50069696 0.2317973 ]\n",
      "[ 0.25726247 0.39388427 -0.24093066 0.12316308 -0.01981307 0.5661146\n",
      " 0.26199922 0.8123446 -0.01576749 0.30846444]\n",
      "[ 0.7878203 -0.45975062 -0.29956317 -0.07032048 -0.55817443 -0.62506855\n",
      " -1.6837492 -0.38442805 0.28220773 -1.5325156 ]\n",
      "[ 0.07975311 0.67754704 -0.30927914 0.00347631 -0.07326564 0.01893554\n",
      " -0.7518105 -0.03078967 -0.07623022 0.38865626]\n",
      "[-0.7751679 -0.5841397 -0.6622711 0.18574935 -0.6049372 0.02844244\n",
      " -0.20471913 0.3337415 -0.3619432 -0.35087156]\n",
      "[-0.08569919 -0.10775139 -0.02338934 0.21933547 -0.46712473 0.00062137\n",
      " -0.58207744 0.06457533 0.18276742 0.03866556]\n",
      "[-0.2311981 -0.43036282 0.20561649 -0.10363232 -0.13248594 0.02885137\n",
      " -0.31241602 -0.36907142 0.08861586 0.2331427 ]\n",
      "[-0.07273526 -0.31246194 -0.24218291 -0.24145737 0.0364486 0.14382267\n",
      " -0.00531162 0.15447603 -0.5220248 -0.09016377]\n"
     ]
    }
   ],
   "source": [
    "with torchdynamo.optimize(__torch_mlir):\n",
    "    for _ in range(10):\n",
    "        print(toy_example(torch.randn(10), torch.randn(10)))"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "It can also be used through a decorator:"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [],
   "source": [
    "@create_backend\n",
    "def torch_mlir(subgraph, *args, **kwargs):\n",
    "    assert isinstance(subgraph, SubGraph), \"Model must be a dynamo SubGraph.\"\n",
    "    return __torch_mlir(subgraph.model, *list(subgraph.example_inputs))\n",
    "\n",
    "\n",
    "@torchdynamo.optimize(\"torch_mlir\")\n",
    "def toy_example2(*args):\n",
    "    a, b = args\n",
    "\n",
    "    x = a / (torch.abs(a) + 1)\n",
    "    if b.sum() < 0:\n",
    "        b = b * -1\n",
    "    return x * b"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 1 device(s).\n",
      "Device: 0\n",
      " Name: NVIDIA GeForce RTX 3080\n",
      " Compute Capability: 8.6\n",
      "[-0.35494277 0.03409214 -0.02271946 0.7335942 0.03122527 -0.41881397\n",
      " -0.6609761 -0.6418614 0.29336175 -0.01973678]\n",
      "[-2.7246824e-01 -3.5543957e-01 6.0087401e-01 -7.4570496e-03\n",
      " -4.2481605e-02 -5.0296803e-04 7.2928613e-01 -1.4673788e-03\n",
      " -2.7621329e-01 -6.0995776e-02]\n",
      "[-0.03165906 0.3889693 0.24052973 0.27279532 -0.02773128 -0.12602475\n",
      " -1.0124422 0.5720256 -0.35437614 -0.20992722]\n",
      "[-0.41831446 0.5525326 -0.29749998 -0.17044766 0.11804754 -0.05210691\n",
      " -0.46145165 -0.8776549 0.10090438 0.17463352]\n",
      "[ 0.02194221 0.20959911 0.26973712 0.12551276 -0.0020404 0.1490246\n",
      " -0.04456685 1.1100804 0.8105744 0.6676846 ]\n",
      "[ 0.06528181 -0.13591261 0.5370964 -0.4398162 -0.03372452 0.9691372\n",
      " -0.01120087 0.2947028 0.4804801 -0.3324341 ]\n",
      "[ 0.33549032 -0.23001772 -0.08681437 0.16490957 -0.11223086 0.09168988\n",
      " 0.02403045 0.17344482 0.46406478 -0.00129451]\n",
      "[-0.27475086 0.42384806 1.9090122 -0.41147137 -0.6888369 0.08435658\n",
      " -0.26628923 -0.17436793 -0.8058869 -0.02582378]\n",
      "[-0.10109414 0.08681287 -0.10055986 0.6858881 0.29267687 -0.02797117\n",
      " -0.01425194 0.4882803 0.3551982 -0.858935 ]\n",
      "[-0.22086617 0.524994 0.17721705 -0.03813264 -0.54570735 -0.4421502\n",
      " 0.11938014 -0.01122053 0.39294165 -0.61770755]\n"
     ]
    }
   ],
   "source": [
    "for _ in range(10):\n",
    "    print(toy_example2(torch.randn(10), torch.randn(10)))"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
92 shark/examples/shark_eager/dynamo_demo.py Normal file
@@ -0,0 +1,92 @@
import torch
from torch_mlir import compile, OutputType

from shark.iree_utils import get_iree_compiled_module

try:
    import torchdynamo
    from torchdynamo.optimizations.backends import create_backend
    from torchdynamo.optimizations.subgraph import SubGraph
except ModuleNotFoundError:
    print(
        "Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo"
    )
    exit()

NUM_ITERS = 10


def __torch_mlir(fx_graph, *args, **kwargs):
    assert isinstance(
        fx_graph, torch.fx.GraphModule
    ), "Model must be an FX GraphModule."

    def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule):
        """Replace tuple with tuple element in functions that return one-element tuples."""

        for node in fx_g.graph.nodes:
            if node.op == "output":
                assert (
                    len(node.args) == 1
                ), "Output node must have a single argument"
                node_arg = node.args[0]
                if isinstance(node_arg, tuple) and len(node_arg) == 1:
                    node.args = (node_arg[0],)
        fx_g.graph.lint()
        fx_g.recompile()
        return fx_g

    fx_graph = _unwrap_single_tuple_return(fx_graph)
    ts_graph = torch.jit.script(fx_graph)

    if isinstance(args, tuple):
        args = list(args)
    assert isinstance(args, list)
    if len(args) == 1 and isinstance(args[0], list):
        args = args[0]

    linalg_module = compile(
        ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS
    )
    callable, _ = get_iree_compiled_module(
        linalg_module, "cuda", func_name="forward"
    )

    def forward(*inputs):
        return callable(*inputs)

    return forward


def toy_example(*args):
    a, b = args

    x = a / (torch.abs(a) + 1)
    if b.sum() < 0:
        b = b * -1
    return x * b


with torchdynamo.optimize(__torch_mlir):
    for _ in range(NUM_ITERS):
        print(toy_example(torch.randn(10), torch.randn(10)))


@create_backend
def torch_mlir(subgraph, *args, **kwargs):
    assert isinstance(subgraph, SubGraph), "Model must be a dynamo SubGraph."
    return __torch_mlir(subgraph.model, *list(subgraph.example_inputs))


@torchdynamo.optimize("torch_mlir")
def toy_example2(*args):
    a, b = args

    x = a / (torch.abs(a) + 1)
    if b.sum() < 0:
        b = b * -1
    return x * b


for _ in range(NUM_ITERS):
    print(toy_example2(torch.randn(10), torch.randn(10)))
805 shark/examples/shark_eager/eager_mode.ipynb Normal file
@@ -0,0 +1,805 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/mlevental/miniconda3/envs/torch-mlir/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "# standard imports\n",
    "import torch\n",
    "from torch_mlir.eager_mode import torch_mlir_tensor"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [],
   "source": [
    "# eager mode imports\n",
    "from torch_mlir.eager_mode.torch_mlir_tensor import TorchMLIRTensor\n",
    "from shark.iree_eager_backend import EagerModeIREELinalgOnTensorsBackend"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "The simplest way of using Eager Mode (through IREE) requires setting a \"backend\":"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [
    "torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend(\"cpu\")"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "and wrapping all your `torch.Tensor`s:"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
      "TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n"
     ]
    }
   ],
   "source": [
    "NUM_ITERS = 10\n",
    "\n",
    "t = torch.ones((10, 10))\n",
    "u = 2 * torch.ones((10, 10))\n",
    "\n",
    "tt = TorchMLIRTensor(t)\n",
    "print(tt)\n",
    "uu = TorchMLIRTensor(u)\n",
    "print(uu)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "`TorchMLIRTensor` is a \"tensor wrapper subclass\" (more info [here](https://github.com/albanD/subclass_zoo)) that keeps the IREE `DeviceArray` in a field `elem`:"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n"
     ]
    }
   ],
   "source": [
    "for i in range(NUM_ITERS):\n",
    "    yy = tt + uu\n",
    "    print(type(yy))\n",
    "    print(yy.elem.to_host())\n",
    "    yy = tt * uu\n",
    "    print(type(yy))\n",
    "    print(yy.elem.to_host())"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "If you have a GPU (and CUDA installed) that works too (you can verify by having `watch -n1 nvidia-smi` up in a terminal while running the next cell):"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
      "TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
      "[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
      " [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n"
     ]
    }
   ],
   "source": [
    "torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend(\"gpu\")\n",
    "\n",
    "t = torch.ones((10, 10))\n",
    "u = 2 * torch.ones((10, 10))\n",
    "\n",
    "tt = TorchMLIRTensor(t)\n",
    "print(tt)\n",
    "uu = TorchMLIRTensor(u)\n",
    "print(uu)\n",
    "\n",
    "yy = tt + uu\n",
    "print(yy.elem.to_host())\n",
    "yy = tt * uu\n",
    "print(yy.elem.to_host())"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "There is a convenience class `SharkEagerMode` that will handle both the installation of the backend and the wrapping of `torch.Tensor`s:"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
      "TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
      " [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
      "<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
      "[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
      " [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
|
||||
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
|
||||
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
|
||||
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
|
||||
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
|
||||
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
|
||||
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
|
||||
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
|
||||
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
|
||||
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
|
||||
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
|
||||
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
|
||||
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
|
||||
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
|
||||
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
|
||||
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
|
||||
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
|
||||
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
|
||||
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
|
||||
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
|
||||
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
|
||||
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
|
||||
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
|
||||
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n"
]
}
],
"source": [
"# eager mode RAII\n",
"from shark.shark_runner import SharkEagerMode\n",
"\n",
"shark_eager_mode = SharkEagerMode(\"cpu\")\n",
"\n",
"t = torch.ones((10, 10))\n",
"u = torch.ones((10, 10))\n",
"\n",
"print(t)\n",
"print(u)\n",
"\n",
"for i in range(NUM_ITERS):\n",
"    yy = t + u\n",
"    print(type(yy))\n",
"    print(yy.elem.to_host())\n",
"    yy = t * u\n",
"    print(type(yy))\n",
"    print(yy.elem.to_host())"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"The `SharkEagerMode` class is a hacky take on [RAII](https://en.wikipedia.org/wiki/Resource_acquisition_is_initialization): it registers a \"deleter\" that runs when an instance of `SharkEagerMode` is garbage collected. The takeaway is that if you want to turn off `SharkEagerMode`, or switch backends, you need to `del` the instance:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
"  ...(eight identical rows elided)...\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
"  ...(eight identical rows elided)...\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n"
]
}
],
"source": [
"del shark_eager_mode\n",
"shark_eager_mode = SharkEagerMode(\"cuda\")\n",
"\n",
"t = torch.ones((10, 10))\n",
"u = torch.ones((10, 10))\n",
"\n",
"print(t)\n",
"print(u)\n",
"\n",
"yy = t + u\n",
"print(type(yy))\n",
"print(yy.elem.to_host())\n",
"yy = t * u\n",
"print(type(yy))\n",
"print(yy.elem.to_host())"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
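For readers curious how the "deleter" described in the notebook above can work, here is a minimal, self-contained sketch of RAII-via-garbage-collection using `weakref.finalize`. The `install_backend`/`uninstall_backend` helpers and the `EagerModeSketch` name are hypothetical stand-ins for illustration, not SHARK's actual internals.

import weakref


def install_backend(device):
    # Stand-in for whatever SharkEagerMode does on construction.
    print(f"eager mode enabled on {device}")


def uninstall_backend():
    # Stand-in for the "deleter" that tears eager mode back down.
    print("eager mode disabled")


class EagerModeSketch:
    def __init__(self, device):
        install_backend(device)
        # weakref.finalize runs uninstall_backend once this instance is
        # garbage collected -- exactly the `del` behavior shown above.
        self._finalizer = weakref.finalize(self, uninstall_backend)


mode = EagerModeSketch("cpu")
del mode  # prints "eager mode disabled" (immediately, on CPython)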
148  shark/examples/shark_eager/eager_mode.py  Normal file
@@ -0,0 +1,148 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from torch.utils.cpp_extension import load_inline, include_paths
from torch_mlir.eager_mode import torch_mlir_tensor
from torch_mlir.eager_mode.torch_mlir_tensor import TorchMLIRTensor

from shark.iree_eager_backend import EagerModeIREELinalgOnTensorsBackend
from shark.shark_runner import SharkEagerMode


def test_cpu():
    # Route eager-mode execution through IREE's linalg-on-tensors CPU backend.
    torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend("cpu")

    t = torch.ones((10, 10), device="cpu")
    u = 2 * torch.ones((10, 10), device="cpu")

    tt = TorchMLIRTensor(t)
    print(tt)
    uu = TorchMLIRTensor(u)
    print(uu)

    for i in range(NUM_ITERS):
        yy = tt + uu
        print(type(yy))
        print(yy.elem.to_host())
        yy = tt * uu
        print(type(yy))
        print(yy.elem.to_host())


def test_gpu():
    # Small inline CUDA extension used to report GPU memory usage (in MiB)
    # after each iteration.
    source = """
    #include <iostream>
    #include "cuda.h"
    #include "cuda_runtime_api.h"

    using namespace std;

    void print_free_mem() {
      size_t free, total;
      cudaSetDevice(0);
      int id;
      cudaGetDevice(&id);
      cudaMemGetInfo(&free, &total);
      cout << "GPU " << id << " memory: used=" << (total - free) / (1 << 20) << endl;
    }
    """
    gpu_stats = load_inline(
        name="inline_extension",
        cpp_sources=[source],
        extra_include_paths=include_paths(cuda=True),
        functions=["print_free_mem"],
    )
    torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend("gpu")

    t = torch.ones((10, 10), device="cpu")
    u = 2 * torch.ones((10, 10), device="cpu")

    tt = TorchMLIRTensor(t)
    print(tt)
    uu = TorchMLIRTensor(u)
    print(uu)

    for i in range(NUM_ITERS):
        yy = tt + uu
        print(yy.elem.to_host())
        yy = tt * uu
        print(yy.elem.to_host())
        gpu_stats.print_free_mem()


def test_python_mode_ref_backend():
    # Keeping the instance alive (anywhere) keeps eager mode active; it is
    # torn down when this object is garbage collected.
    _ = SharkEagerMode("refbackend")

    t = torch.ones((10, 10), device="cpu")
    u = torch.ones((10, 10), device="cpu")

    print(t)
    print(u)

    for i in range(NUM_ITERS):
        print(i)
        yy = t + u
        print(yy.elem)
        yy = t * u
        print(yy.elem)


def test_python_mode_iree_cpu():
    _ = SharkEagerMode("cpu")

    t = torch.ones((10, 10), device="cpu")
    u = torch.ones((10, 10), device="cpu")

    print(t)
    print(u)

    for i in range(NUM_ITERS):
        yy = t + u
        print(type(yy))
        print(yy.elem.to_host())
        yy = t * u
        print(type(yy))
        print(yy.elem.to_host())


def test_python_mode_iree_gpu():
    _ = SharkEagerMode("gpu")

    t = torch.ones((10, 10), device="cpu")
    u = torch.ones((10, 10), device="cpu")

    print(t)
    print(u)

    for i in range(NUM_ITERS):
        yy = t + u
        print(type(yy))
        print(yy.elem.to_host())
        yy = t * u
        print(type(yy))
        print(yy.elem.to_host())


if __name__ == "__main__":
    NUM_ITERS = 10
    test_cpu()
    if torch.cuda.is_available():
        test_gpu()
    test_python_mode_ref_backend()
    test_python_mode_iree_cpu()
    test_python_mode_iree_gpu()
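As a rough illustration of the mechanism behind wrapper types like `TorchMLIRTensor` -- intercept each op, run it on a backing `elem`, and re-wrap the result -- here is a minimal sketch using PyTorch's documented `__torch_function__` protocol for tensor-like types. `WrapperTensor` is an illustrative name only; the real `TorchMLIRTensor` lowers ops through torch-mlir/IREE rather than calling back into PyTorch.

import torch


class WrapperTensor:
    """Minimal tensor-like wrapper: holds a backing tensor in `.elem`."""

    def __init__(self, elem):
        self.elem = elem

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        # Unwrap wrapper arguments, run the real op, and re-wrap the result.
        unwrapped = [a.elem if isinstance(a, WrapperTensor) else a for a in args]
        return WrapperTensor(func(*unwrapped, **kwargs))


t = WrapperTensor(torch.ones(10, 10))
u = WrapperTensor(torch.ones(10, 10))
y = torch.add(t, u)  # dispatches through WrapperTensor.__torch_function__
print(type(y), y.elem[0, 0].item())  # <class '__main__.WrapperTensor'> 2.0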
73  shark/examples/shark_eager/squeezenet_lockstep.py  Normal file
@@ -0,0 +1,73 @@
import torch
import numpy as np

model = torch.hub.load(
    "pytorch/vision:v0.10.0", "squeezenet1_0", pretrained=True
)
model.eval()

# from PIL import Image
# from torchvision import transforms
# import urllib
#
# url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg")
# try: urllib.URLopener().retrieve(url, filename)
# except: urllib.request.urlretrieve(url, filename)
#
# input_image = Image.open(filename)
# preprocess = transforms.Compose([
#     transforms.Resize(256),
#     transforms.CenterCrop(224),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
# ])
# input_tensor = preprocess(input_image)
# input_batch = input_tensor.unsqueeze(0)  # create a mini-batch as expected by the model
# print(input_batch.shape)  # size = [1, 3, 224, 224]

# The above is code for generating sample inputs from an image; random values
# suffice for accuracy testing, though.
input_batch = torch.randn(1, 3, 224, 224)

# Focus on CPU for now.
if False and torch.cuda.is_available():
    input_batch = input_batch.to("cuda")
    model.to("cuda")

with torch.no_grad():
    output = model(input_batch)
    # Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes.
    golden_confidences = output[0]
    # The output has unnormalized scores. To get probabilities, run a softmax on it.
    golden_probabilities = torch.nn.functional.softmax(
        golden_confidences, dim=0
    ).numpy()

golden_confidences = golden_confidences.numpy()

from shark.torch_mlir_lockstep_tensor import TorchMLIRLockstepTensor

input_detached_clone = input_batch.clone()
eager_input_batch = TorchMLIRLockstepTensor(input_detached_clone)

print("getting torch-mlir result")

output = model(eager_input_batch)

static_output = output.elem
confidences = static_output[0]
probabilities = torch.nn.functional.softmax(
    torch.from_numpy(confidences), dim=0
).numpy()

print("The obtained result via shark is: ", confidences)
print("The golden result is:", golden_confidences)

np.testing.assert_allclose(
    golden_confidences, confidences, rtol=1e-02, atol=1e-03
)
np.testing.assert_allclose(
    golden_probabilities, probabilities, rtol=1e-02, atol=1e-03
)
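A note on the tolerances used above: `np.testing.assert_allclose(actual, desired, rtol, atol)` passes elementwise iff |actual - desired| <= atol + rtol * |desired|, so rtol=1e-02 permits roughly 1% relative drift on large logits while atol=1e-03 covers values near zero. A quick self-contained check (values illustrative):

import numpy as np

desired = np.array([1.0, 100.0])
actual = desired + np.array([1e-3, 0.9])  # errors: 1e-3 and 0.9

# Elementwise bound: atol + rtol * |desired| = [0.011, 1.001] -- both errors fit.
np.testing.assert_allclose(actual, desired, rtol=1e-02, atol=1e-03)
print("within tolerance")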
65  shark/examples/shark_inference/CLIPModel_tf.py  Normal file
@@ -0,0 +1,65 @@
from PIL import Image
import requests

from transformers import CLIPProcessor, TFCLIPModel
import tensorflow as tf
from shark.shark_inference import SharkInference

# Create a set of input signatures.
clip_vit_inputs = [
    tf.TensorSpec(shape=[2, 7], dtype=tf.int32),
    tf.TensorSpec(shape=[2, 7], dtype=tf.int32),
    tf.TensorSpec(shape=[1, 3, 224, 224], dtype=tf.float32),
]


class CLIPModule(tf.Module):
    def __init__(self):
        super(CLIPModule, self).__init__()
        self.m = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")

        self.m.predict = lambda x, y, z: self.m(
            input_ids=x, attention_mask=y, pixel_values=z
        )

    @tf.function(input_signature=clip_vit_inputs, jit_compile=True)
    def forward(self, input_ids, attention_mask, pixel_values):
        return self.m.predict(
            input_ids, attention_mask, pixel_values
        ).logits_per_image


if __name__ == "__main__":
    # Prepping data.
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    inputs = processor(
        text=["a photo of a cat", "a photo of a dog"],
        images=image,
        return_tensors="tf",
        padding=True,
    )

    shark_module = SharkInference(
        CLIPModule(),
        (
            inputs["input_ids"],
            inputs["attention_mask"],
            inputs["pixel_values"],
        ),
    )
    shark_module.set_frontend("tensorflow")
    shark_module.compile()

    print(
        shark_module.forward(
            (
                inputs["input_ids"],
                inputs["attention_mask"],
                inputs["pixel_values"],
            )
        )
    )
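The script prints raw `logits_per_image` (shape [1, 2] here: one image scored against two prompts). To turn those scores into per-prompt probabilities, a softmax over the text axis is the usual post-processing step -- a small sketch with made-up logit values:

import tensorflow as tf

logits_per_image = tf.constant([[24.0, 19.0]])  # illustrative values only
probs = tf.nn.softmax(logits_per_image, axis=-1)
print(probs.numpy())  # ~[[0.993, 0.007]] -> "a photo of a cat" wins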
Some files were not shown because too many files have changed in this diff.