Mirror of https://github.com/nod-ai/SHARK-Studio.git (synced 2026-01-11 14:58:11 -05:00)

Compare commits: minilmLoad...decomp (1460 commits)

.flake8 (new file, 5 lines)
@@ -0,0 +1,5 @@
[flake8]
count = 1
show-source = 1
select = E9,F63,F7,F82
exclude = lit.cfg.py, apps/language_models/scripts/vicuna.py, apps/language_models/src/pipelines/minigpt4_pipeline.py, apps/language_models/langchain/h2oai_pipeline.py

.github/workflows/gh-pages-releases.yml (vendored, new file, 37 lines)
@@ -0,0 +1,37 @@
# See: https://github.com/llvm/torch-mlir/issues/1374
name: Publish releases page

on:
  workflow_dispatch:

jobs:
  scrape_and_publish_releases:
    name: "Scrape and publish releases"
    runs-on: ubuntu-latest

    # Don't run this in everyone's forks.
    if: github.repository == 'nod-ai/SHARK'

    steps:
      - name: Checking out repository
        uses: actions/checkout@v2
        with:
          token: ${{ secrets.NODAI_INVOCATION_TOKEN }}
      - name: Run scrape releases script
        run: python ./build_tools/scrape_releases.py nod-ai SHARK > /tmp/index.html
        shell: bash
      - run: git fetch --all
      - run: git switch github-pages
      - run: git config --global user.email "none@none.com"
      - run: git config --global user.name "nod-ai"
      - run: mv /tmp/index.html package-index/index.html
      - run: git add package-index/index.html

      # Only try to make a commit if the file has changed.
      - run: git diff --cached --exit-code || git commit -m "Update releases."

      - name: GitHub Push
        uses: ad-m/github-push-action@v0.6.0
        with:
          github_token: ${{ secrets.NODAI_INVOCATION_TOKEN }}
          branch: github-pages

.github/workflows/nightly.yml (vendored, 155 lines changed)
@@ -9,13 +9,80 @@ on:
  workflow_dispatch:

jobs:
  build:

    runs-on: ubuntu-latest
  windows-build:
    runs-on: 7950X
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.10"]
        python-version: ["3.11"]

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}

      - name: Compute version
        shell: powershell
        run: |
          $package_version = $(Get-Date -UFormat "%Y%m%d")+"."+${{ github.run_number }}
          $package_version_ = $(Get-Date -UFormat "%Y%m%d")+"_"+${{ github.run_number }}
          $tag_name=$package_version
          echo "package_version=$package_version" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
          echo "package_version_=$package_version_" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
          echo "tag_name=$tag_name" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append

      - name: Create Release
        id: create_release
        uses: actions/create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
        with:
          tag_name: ${{ env.tag_name }}
          release_name: nod.ai SHARK ${{ env.tag_name }}
          body: |
            Automatic snapshot release of nod.ai SHARK.
          draft: true
          prerelease: true

      - name: Build Package
        shell: powershell
        run: |
          ./setup_venv.ps1
          $env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
          pip wheel -v -w dist . --pre -f https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html
          python process_skipfiles.py
          pyinstaller .\apps\stable_diffusion\shark_sd.spec
          mv ./dist/nodai_shark_studio.exe ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe
          signtool sign /f c:\g\shark_02152023.cer /fd certHash /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe

      - name: Upload Release Assets
        id: upload-release-assets
        uses: dwenegar/upload-release-assets@v1
        env:
          GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
        with:
          release_id: ${{ steps.create_release.outputs.id }}
          assets_path: ./dist/nodai*
          #asset_content_type: application/vnd.microsoft.portable-executable

      - name: Publish Release
        id: publish_release
        uses: eregon/publish-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
        with:
          release_id: ${{ steps.create_release.outputs.id }}

  linux-build:

    runs-on: a100
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.11"]
        backend: [IREE, SHARK]

    steps:
      - uses: actions/checkout@v3
@@ -31,64 +98,56 @@ jobs:
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-

      - name: Compute version
        run: |
          package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
          tag_name="${package_version}"
          echo "package_version=${package_version}" >> $GITHUB_ENV
          echo "tag_name=${tag_name}" >> $GITHUB_ENV
      - name: Create Release
        id: create_release
        uses: actions/create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
        with:
          tag_name: ${{ env.tag_name }}
          release_name: nod.ai SHARK ${{ env.tag_name }}
          body: |
            Automatic snapshot release of nod.ai SHARK.
          draft: true
          prerelease: false

      - name: Install dependencies
        run: |
          echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
          python -m pip install --upgrade pip
          python -m pip install flake8 pytest yapf toml
          if [ -f requirements.txt ]; then pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/SHARK-Runtime/releases; fi
          python -m pip install flake8 pytest toml
          if [ -f requirements.txt ]; then pip install -r requirements.txt -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude shark.venv,lit.cfg.py
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude shark.venv,lit.cfg.py
          yapf -i --style .style.yapf shark/*.py

      - name: Build and validate the package
      - name: Build and validate the IREE package
        if: ${{ matrix.backend == 'IREE' }}
        continue-on-error: true
        run: |
          cd $GITHUB_WORKSPACE
          IMPORTER=1 ./setup_venv.sh
          source shark.venv/bin/activate
          USE_IREE=1 VENV_DIR=iree.venv ./setup_venv.sh
          source iree.venv/bin/activate
          package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
          SHARK_PACKAGE_VERSION=${package_version} \
          pip wheel -v -w wheelhouse . --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/SHARK-Runtime/releases
          pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://openxla.github.io/iree/pip-release-links.html
          # Install the built wheel
          pip install ./wheelhouse/nodai*
          # Validate the Models
          pytest -k 'not benchmark' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/

      - name: Upload Release Assets
        id: upload-release-assets
        uses: dwenegar/upload-release-assets@v1
        env:
          GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
        with:
          release_id: ${{ steps.create_release.outputs.id }}
          assets_path: ./wheelhouse/nodai_*.whl
          /bin/bash "$GITHUB_WORKSPACE/build_tools/populate_sharktank_ci.sh"
          pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" -k "not metal" |
          tail -n 1 |
          tee -a pytest_results.txt
          if !(grep -Fxq " failed" pytest_results.txt)
          then
          export SHA=$(git log -1 --format='%h')
          gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/${DATE}_$SHA
          gsutil -m cp -r gs://shark_tank/${DATE}_$SHA/* gs://shark_tank/nightly/
          fi
          rm -rf ./wheelhouse/nodai*

      - name: Publish Release
        id: publish_release
        uses: eregon/publish-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
        with:
          release_id: ${{ steps.create_release.outputs.id }}
      - name: Build and validate the SHARK Runtime package
        if: ${{ matrix.backend == 'SHARK' }}
        run: |
          cd $GITHUB_WORKSPACE
          ./setup_venv.sh
          source shark.venv/bin/activate
          package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
          SHARK_PACKAGE_VERSION=${package_version} \
          pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html
          # Install the built wheel
          pip install ./wheelhouse/nodai*
          # Validate the Models
          pytest --ci --ci_sha=${SHORT_SHA} -k "not metal" |
          tail -n 1 |
          tee -a pytest_results.txt

.github/workflows/test-models.yml (vendored, deleted, 102 lines)
@@ -1,102 +0,0 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Validate torch-models on Shark Runtime

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:

jobs:
  build-linux:

    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.10"]

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}

      - name: Setup pip cache
        uses: actions/cache@v3
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install flake8 pytest yapf toml

      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude lit.cfg.py
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude lit.cfg.py
          yapf -i --style .style.yapf shark/*.py

      - name: Validate Models
        run: |
          cd $GITHUB_WORKSPACE
          IMPORTER=1 ./setup_venv.sh
          source shark.venv/bin/activate
          pytest -k 'not benchmark' --ignore=tank/tf/ --ignore=shark/tests/test_shark_importer.py

  perf-macOS:
    runs-on: MacStudio
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.10"]

    steps:
      - uses: actions/checkout@v3
      - name: Validate Models dependencies
        run: |
          cd $GITHUB_WORKSPACE
          PYTHON=python3.10 IMPORTER=1 ./setup_venv.sh
          source shark.venv/bin/activate
          pytest -k 'not benchmark' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=tank/tf/ --ignore=shark/tests/test_shark_importer.py

  perf-linux:
    runs-on: a100
    timeout-minutes: 45
    continue-on-error: true
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.10"]

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}

      - name: Setup pip cache
        uses: actions/cache@v3
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-

      - name: Validate Models
        run: |
          cd $GITHUB_WORKSPACE
          IMPORTER=1 ./setup_venv.sh
          source shark.venv/bin/activate
          pytest --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/

.github/workflows/test-studio.yml (vendored, new file, 86 lines)
@@ -0,0 +1,86 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Validate Shark Studio

on:
  push:
    branches: [ main ]
    paths-ignore:
      - '**.md'
      - 'shark/examples/**'
  pull_request:
    branches: [ main ]
    paths-ignore:
      - '**.md'
      - 'shark/examples/**'
  workflow_dispatch:

# Ensure that only a single job or workflow using the same
# concurrency group will run at a time. This would cancel
# any in-progress jobs in the same github workflow and github
# ref (e.g. refs/heads/main or refs/pull/<pr_number>/merge).
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  build-validate:
    strategy:
      fail-fast: true
      matrix:
        os: [nodai-ubuntu-builder-large]
        suite: [cpu] #,cuda,vulkan]
        python-version: ["3.11"]
        include:
          - os: nodai-ubuntu-builder-large
            suite: lint

    runs-on: ${{ matrix.os }}

    steps:
      - uses: actions/checkout@v3

      - name: Set Environment Variables
        run: |
          echo "SHORT_SHA=`git rev-parse --short=4 HEAD`" >> $GITHUB_ENV
          echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV

      - name: Set up Python Version File ${{ matrix.python-version }}
        run: |
          echo ${{ matrix.python-version }} >> $GITHUB_WORKSPACE/.python-version

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: '${{ matrix.python-version }}'

      - name: Install dependencies
        if: matrix.suite == 'lint'
        run: |
          python -m pip install --upgrade pip
          python -m pip install flake8 pytest toml black

      - name: Lint with flake8
        if: matrix.suite == 'lint'
        run: |
          # black format check
          black --version
          black --check apps/shark_studio
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --isolated --count --exit-zero --max-complexity=10 --max-line-length=127 \
            --statistics --exclude lit.cfg.py

      - name: Validate Models on CPU
        if: matrix.suite == 'cpu'
        run: |
          cd $GITHUB_WORKSPACE
          python${{ matrix.python-version }} -m venv shark.venv
          source shark.venv/bin/activate
          pip install -r requirements.txt --no-cache-dir
          pip install -e .
          pip uninstall -y torch
          pip install torch==2.1.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
          python apps/shark_studio/tests/api_test.py

.gitignore (vendored, 40 lines changed)
@@ -2,6 +2,8 @@
__pycache__/
*.py[cod]
*$py.class
*.mlir
*.vmfb

# C extensions
*.so
@@ -31,7 +33,6 @@ MANIFEST
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
@@ -158,11 +159,46 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/

# vscode related
.vscode

# Shark related artefacts
*venv/
shark_tmp/
*.vmfb
.use-iree
tank/dict_configs.py
*.csv
reproducers/

# ORT related artefacts
cache_models/
onnx_models/

# Generated images
generated_imgs/

# Custom model related artefacts
variants.json
/models/

# models folder
apps/stable_diffusion/web/models/

# Stencil annotators.
stencil_annotator/

# For DocuChat
apps/language_models/langchain/user_path/
db_dir_UserData

# Embeded browser cache and other
apps/stable_diffusion/web/EBWebView/

# Llama2 tokenizer configs
llama2_tokenizer_configs/

# Webview2 runtime artefacts
EBWebView/

.gitmodules (vendored, 2 lines changed)
@@ -1,4 +1,4 @@
[submodule "inference/thirdparty/shark-runtime"]
path = inference/thirdparty/shark-runtime
url =https://github.com/nod-ai/SHARK-Runtime.git
url =https://github.com/nod-ai/SRT.git
branch = shark-06032022

@@ -1,3 +0,0 @@
[style]
based_on_style = google
column_limit = 80

LICENSE (new file, 218 lines)
@@ -0,0 +1,218 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


---- LLVM Exceptions to the Apache 2.0 License ----

As an exception, if, as a result of your compiling your source code, portions
of this Software are embedded into an Object form of such source code, you
may redistribute such embedded portions in such Object form without complying
with the conditions of Sections 4(a), 4(b) and 4(d) of the License.

In addition, if you combine or link compiled forms of this Software with
software that is licensed under the GPLv2 ("Combined Software") and if a
court of competent jurisdiction determines that the patent provision (Section
3), the indemnity provision (Section 9) or other Section of the License
|
||||
conflicts with the conditions of the GPLv2, you may retroactively and
|
||||
prospectively choose to deem waived or otherwise exclude such Section(s) of
|
||||
the License, but only in their entirety and only with respect to the Combined
|
||||
Software.
|
||||
400
README.md
400
README.md
@@ -1,29 +1,161 @@
|
||||
# SHARK
|
||||
|
||||
High Performance Machine Learning and Data Analytics for CPUs, GPUs, Accelerators and Heterogeneous Clusters
|
||||
High Performance Machine Learning Distribution
|
||||
|
||||
[](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml)
|
||||
[](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml)
|
||||
|
||||
## Communication Channels
|
||||
|
||||
* [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real time discussions with the SHARK team and other users
|
||||
* [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bugs etc
|
||||
|
||||
|
||||
## Installation
|
||||
|
||||
<details>
|
||||
<summary>Installation (Linux and macOS)</summary>
|
||||
<summary>Prerequisites - Drivers </summary>
|
||||
|
||||
#### Install your Windows hardware drivers
|
||||
* [AMD RDNA Users] Download the latest driver (23.2.1 is the oldest supported) [here](https://www.amd.com/en/support).
|
||||
* [macOS Users] Download and install the 1.3.216 Vulkan SDK from [here](https://sdk.lunarg.com/sdk/download/1.3.216.0/mac/vulkansdk-macos-1.3.216.0.dmg). Newer versions of the SDK will not work.
|
||||
* [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)
|
||||
|
||||
#### Linux Drivers
|
||||
* MESA / RADV drivers won't work with FP16. Please use the latest AMDGPU-PRO drivers (non-pro OSS drivers also won't work) or the latest NVIDIA Linux drivers.
|
||||
|
||||
Other users, please ensure you have the latest vendor drivers and the Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home), and if you are using Vulkan, check that `vulkaninfo` works in a terminal window.
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
|
||||
### Quick Start for SHARK Stable Diffusion for Windows 10/11 Users
|
||||
|
||||
Install the Driver from [Prerequisites](https://github.com/nod-ai/SHARK#install-your-hardware-drivers) above
|
||||
|
||||
Download the [stable release](https://github.com/nod-ai/shark/releases/latest)
|
||||
|
||||
Double-click the .exe and you should see the [UI](http://localhost:8080/) in your browser.
|
||||
|
||||
If you have custom models, put them in a `models/` directory next to the .exe.
|
||||
|
||||
Enjoy.
|
||||
|
||||
<details>
|
||||
<summary>More installation notes</summary>
|
||||
* We recommend downloading the EXE into a new folder whenever you download a new EXE version. If you download it into the same folder as a previous install, delete the old `*.vmfb` files with `rm *.vmfb`. You can also use the `--clear_all` flag once to clean up all the old files.
|
||||
* If you recently updated the driver or this binary (EXE file), we recommend clearing all the local artifacts with `--clear_all` (see the example below).
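A minimal sketch of that cleanup in PowerShell (the EXE name below is only an example; use the name of the file you downloaded):
```powershell
# run these from the folder that contains the EXE
rm *.vmfb                       # remove compiled artifacts left over from a previous install
.\shark_studio.exe --clear_all  # or clear all cached artifacts once after a driver/EXE update
```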
|
||||
|
||||
## Running
|
||||
|
||||
* Open a Command Prompt or PowerShell terminal, change folder (`cd`) to the .exe folder, then run the EXE from the command prompt (see the example below). That way, if an error occurs, you'll be able to cut-and-paste it when asking for help. (If it always runs without errors, you may simply double-click the EXE.)
|
||||
* The first run may take a few minutes while the models are downloaded and compiled. Your patience is appreciated. The download can be about 5GB.
|
||||
* You will likely see a Windows Defender message asking you to give permission to open a web server port. Accept it.
|
||||
* Open a browser to access the Stable Diffusion web server. By default, the port is 8080, so you can go to http://localhost:8080/.
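For example, a typical launch from PowerShell might look like this (the folder and EXE name are placeholders):
```powershell
cd C:\path\to\shark    # folder containing the downloaded EXE
.\shark_studio.exe     # run from the terminal so any error output stays visible
# then browse to http://localhost:8080/
```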
|
||||
|
||||
## Stopping
|
||||
|
||||
* Select the command prompt that's running the EXE. Press CTRL-C and wait a moment or close the terminal.
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>Advanced Installation (Only for developers)</summary>
|
||||
|
||||
## Advanced Installation (Windows, Linux and macOS) for developers
|
||||
|
||||
## Check out the code
|
||||
|
||||
```shell
|
||||
git clone https://github.com/nod-ai/SHARK.git
|
||||
cd SHARK
|
||||
```
|
||||
|
||||
## Setup your Python VirtualEnvironment and Dependencies
|
||||
|
||||
### Windows 10/11 Users
|
||||
|
||||
* Install the latest Python 3.11.x version from [here](https://www.python.org/downloads/windows/)
|
||||
|
||||
* Install Git for Windows from [here](https://git-scm.com/download/win)
|
||||
|
||||
#### Allow the install script to run in Powershell
|
||||
```powershell
|
||||
set-executionpolicy remotesigned
|
||||
```
|
||||
|
||||
#### Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...)
|
||||
```powershell
|
||||
./setup_venv.ps1 #You can re-run this script to get the latest version
|
||||
```
|
||||
|
||||
### Linux / macOS Users
|
||||
|
||||
```shell
|
||||
./setup_venv.sh
|
||||
source shark.venv/bin/activate
|
||||
```
|
||||
|
||||
|
||||
### Run Stable Diffusion on your device - WebUI
|
||||
|
||||
#### Windows 10/11 Users
|
||||
```powershell
|
||||
(shark.venv) PS C:\g\shark> cd .\apps\stable_diffusion\web\
|
||||
(shark.venv) PS C:\g\shark\apps\stable_diffusion\web> python .\index.py
|
||||
```
|
||||
#### Linux / macOS Users
|
||||
```shell
|
||||
(shark.venv) > cd apps/stable_diffusion/web
|
||||
(shark.venv) > python index.py
|
||||
```
|
||||
|
||||
#### Access Stable Diffusion on http://localhost:8080/?__theme=dark
|
||||
|
||||
|
||||
<img width="1607" alt="webui" src="https://user-images.githubusercontent.com/74956/204939260-b8308bc2-8dc4-47f6-9ac0-f60b66edab99.png">
|
||||
|
||||
|
||||
|
||||
### Run Stable Diffusion on your device - Commandline
|
||||
|
||||
#### Windows 10/11 Users
|
||||
```powershell
|
||||
(shark.venv) PS C:\g\shark> python .\apps\stable_diffusion\scripts\main.py --app="txt2img" --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
|
||||
```
|
||||
|
||||
#### Linux / macOS Users
|
||||
```shell
|
||||
python3.11 apps/stable_diffusion/scripts/main.py --app=txt2img --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
|
||||
```
|
||||
|
||||
You can replace `vulkan` with `cpu` to run on your CPU, or with `cuda` to run on CUDA devices. If you have multiple Vulkan devices, you can address them with `--device=vulkan://1`, etc.
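A sketch of device selection with the same script (the prompt is just an example):
```shell
# run on CPU
python3.11 apps/stable_diffusion/scripts/main.py --app=txt2img --precision=fp16 --device=cpu --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
# run on the second Vulkan device
python3.11 apps/stable_diffusion/scripts/main.py --app=txt2img --precision=fp16 --device=vulkan://1 --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
```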
|
||||
</details>
|
||||
|
||||
The output on an AMD 7900XTX would look something like:
|
||||
|
||||
```shell
|
||||
Average step time: 47.19188690185547ms/it
|
||||
Clip Inference time (ms) = 109.531
|
||||
VAE Inference time (ms): 78.590
|
||||
|
||||
Total image generation time: 2.5788655281066895sec
|
||||
```
|
||||
|
||||
Here are some samples generated:
|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
|
||||
Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware.
|
||||
|
||||
|
||||
<details>
|
||||
<summary>Binary Installation</summary>
|
||||
|
||||
### Setup a new pip Virtual Environment
|
||||
|
||||
This step sets up a new VirtualEnv for Python
|
||||
|
||||
|
||||
```shell
|
||||
python --version #Check you have 3.7->3.10 on Linux or 3.10 on macOS
|
||||
python --version #Check you have 3.11 on Linux, macOS or Windows Powershell
|
||||
python -m venv shark_venv
|
||||
source shark_venv/bin/activate
|
||||
source shark_venv/bin/activate # Use shark_venv/Scripts/activate on Windows
|
||||
|
||||
# If you are using conda create and activate a new conda env
|
||||
|
||||
@@ -31,91 +163,143 @@ source shark_venv/bin/activate
|
||||
python -m pip install --upgrade pip
|
||||
```
|
||||
|
||||
*macOS Metal* users please install https://sdk.lunarg.com/sdk/download/latest/mac/vulkan-sdk.dmg
|
||||
*macOS Metal* users please install https://sdk.lunarg.com/sdk/download/latest/mac/vulkan-sdk.dmg and enable "System wide install"
|
||||
|
||||
### Install SHARK
|
||||
|
||||
This step pip installs SHARK and related packages on Linux Python 3.7, 3.8, 3.9, 3.10 and macOS Python 3.10
|
||||
|
||||
This step pip installs SHARK and related packages on Linux Python 3.8, 3.10 and 3.11 and macOS / Windows Python 3.11
|
||||
|
||||
```shell
|
||||
pip install nodai-shark -f https://github.com/nod-ai/SHARK/releases -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/shark-runtime/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
pip install nodai-shark -f https://nod-ai.github.io/SHARK/package-index/ -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
```
|
||||
If you are on an Intel macOS machine you need this [workaround](https://github.com/nod-ai/SHARK/issues/102) for an upstream issue.
|
||||
|
||||
### Run shark tank model tests.
|
||||
```shell
|
||||
pytest tank/test_models.py
|
||||
```
|
||||
See tank/README.md for a more detailed walkthrough of our pytest suite and CLI.
|
||||
|
||||
### Download and run Resnet50 sample
|
||||
|
||||
|
||||
```shell
|
||||
curl -O https://raw.githubusercontent.com/nod-ai/SHARK/main/shark/examples/shark_inference/resnet50_script.py
|
||||
#Install deps for test script
|
||||
pip install --pre torch torchvision torchaudio tqdm pillow --extra-index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
python ./resnet50_script.py --device="cpu" #use cuda or vulkan or metal
|
||||
pip install --pre torch torchvision torchaudio tqdm pillow gsutil --extra-index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
python ./resnet50_script.py --device="cpu" #use cuda or vulkan or metal
|
||||
```
|
||||
|
||||
|
||||
### Download and run BERT (MiniLM) sample
|
||||
```shell
|
||||
curl -O https://raw.githubusercontent.com/nod-ai/SHARK/main/shark/examples/shark_inference/minilm_jit.py
|
||||
#Install deps for test script
|
||||
pip install transformers torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
python ./minilm_jit.py --device="cpu" #use cuda or vulkan or metal
|
||||
python ./minilm_jit.py --device="cpu" #use cuda or vulkan or metal
|
||||
```
|
||||
</details>
|
||||
|
||||
|
||||
|
||||
<details>
|
||||
<summary>Source Installation</summary>
|
||||
<summary>Development, Testing and Benchmarks</summary>
|
||||
|
||||
## Check out the code
|
||||
|
||||
```shell
|
||||
git clone https://github.com/nod-ai/SHARK.git
|
||||
If you want to use Python 3.11 together with the TF import tools, you can set environment variables like:
|
||||
Set `USE_IREE=1` to use upstream IREE
|
||||
```
|
||||
# PYTHON=python3.11 VENV_DIR=0617_venv IMPORTER=1 ./setup_venv.sh
|
||||
```
|
||||
|
||||
## Setup your Python VirtualEnvironment and Dependencies
|
||||
```shell
|
||||
# Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...).
|
||||
./setup_venv.sh
|
||||
# Please activate the venv after installation.
|
||||
```
|
||||
|
||||
### Run a demo script
|
||||
### Run any of the hundreds of SHARK tank models via the test framework
|
||||
```shell
|
||||
python -m shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
|
||||
# Or a pytest
|
||||
pytest tank/test_models.py -k "MiniLM"
|
||||
```
|
||||
|
||||
### How to use your locally built IREE / Torch-MLIR with SHARK
|
||||
If you are a *Torch-MLIR or IREE developer* and want to test local changes, you can uninstall
|
||||
the provided packages with `pip uninstall torch-mlir` and/or `pip uninstall iree-compiler iree-runtime`, build locally
|
||||
with Python bindings, and set your PYTHONPATH as described [here](https://github.com/iree-org/iree/tree/main/docs/api_docs/python#install-iree-binaries)
|
||||
for IREE and [here](https://github.com/llvm/torch-mlir/blob/main/development.md#setup-python-environment-to-export-the-built-python-packages)
|
||||
for Torch-MLIR. A sketch of this workflow is shown below.
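A minimal sketch of that workflow, assuming both projects are built locally with Python bindings enabled (the paths are placeholders for your local build directories):
```shell
pip uninstall torch-mlir
pip uninstall iree-compiler iree-runtime
# point PYTHONPATH at your local builds (the exact layout depends on your build configuration)
export PYTHONPATH=/path/to/torch-mlir/build/tools/torch-mlir/python_packages/torch_mlir:/path/to/iree-build/python_bindings:$PYTHONPATH
```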
|
||||
|
||||
|
||||
### Run all model tests on CPU/GPU/VULKAN/Metal
|
||||
How to use your locally built Torch-MLIR with SHARK:
|
||||
```shell
|
||||
pytest shark/tests/models
|
||||
1.) Run `./setup_venv.sh` in SHARK and activate the `shark.venv` virtual env.
|
||||
2.) Run `pip uninstall torch-mlir`.
|
||||
3.) Go to your local Torch-MLIR directory.
|
||||
4.) Activate the `mlir_venv` virtual environment.
|
||||
5.) Run `pip uninstall -r requirements.txt`.
|
||||
6.) Run `pip install -r requirements.txt`.
|
||||
7.) Build Torch-MLIR.
|
||||
8.) Activate the `shark.venv` virtual environment from the Torch-MLIR directory.
|
||||
9.) In the Torch-MLIR directory, run `export PYTHONPATH=$(pwd)/build/tools/torch-mlir/python_packages/torch_mlir:$(pwd)/examples`.
|
||||
10.) Go to the SHARK directory.
|
||||
```
|
||||
Now SHARK will use your locally built Torch-MLIR repo.
|
||||
|
||||
# If on Linux for quicker results:
|
||||
pytest shark/tests/models -n auto
|
||||
|
||||
## Benchmarking Dispatches
|
||||
|
||||
To produce benchmarks of individual dispatches, you can add `--dispatch_benchmarks=All --dispatch_benchmarks_dir=<output_dir>` to your pytest command line.
|
||||
If you only want to compile specific dispatches, you can specify them with a space-separated string instead of `"All"`, e.g. `--dispatch_benchmarks="0 1 2 10"`.
|
||||
|
||||
For example, to generate and run dispatch benchmarks for MiniLM on CUDA:
|
||||
```
|
||||
pytest -k "MiniLM and torch and static and cuda" --benchmark_dispatches=All -s --dispatch_benchmarks_dir=./my_dispatch_benchmarks
|
||||
```
|
||||
The given command will populate `<dispatch_benchmarks_dir>/<model_name>/` with an `ordered_dispatches.txt` that lists and orders the dispatches and their latencies, as well as folders for each dispatch that contain .mlir, .vmfb, and results of the benchmark for that dispatch.
|
||||
|
||||
If you want to incorporate this into a Python script instead, you can pass the `dispatch_benchmarks` and `dispatch_benchmarks_dir` arguments when initializing `SharkInference`, and the benchmarks will be generated at compile time, e.g.:
|
||||
|
||||
```
|
||||
shark_module = SharkInference(
|
||||
mlir_model,
|
||||
device=args.device,
|
||||
mlir_dialect="tm_tensor",
|
||||
dispatch_benchmarks="all",
|
||||
dispatch_benchmarks_dir="results"
|
||||
)
|
||||
```
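After construction, compilation proceeds as in the other examples; a minimal continuation of the sketch above (assuming `mlir_model` and `input` are defined as in the earlier examples):
```
shark_module.compile()            # dispatch benchmarks are generated during compilation
result = shark_module.forward((input,))
# per-dispatch artifacts and benchmark output are written under the "results" directory
```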
|
||||
|
||||
### Run all model benchmark tests on CPU/GPU/VULKAN/Metal
|
||||
```shell
|
||||
pytest shark/tests/benchmarks
|
||||
```
|
||||
Output will include:
|
||||
- An ordered list, `ordered-dispatches.txt`, of all the dispatches with their runtimes
|
||||
- Inside the specified directory, there will be a directory for each dispatch (there will be mlir files for all dispatches, but only compiled binaries and benchmark data for the specified dispatches)
|
||||
- An .mlir file containing the dispatch benchmark
|
||||
- A compiled .vmfb file containing the dispatch benchmark
|
||||
- An .mlir file containing just the hal executable
|
||||
- A compiled .vmfb file of the hal executable
|
||||
- A .txt file containing benchmark output
|
||||
|
||||
|
||||
See tank/README.md for further instructions on how to run model tests and benchmarks from the SHARK tank.
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
<details>
|
||||
<summary>API Reference</summary>
|
||||
|
||||
### Shark Inference API
|
||||
|
||||
```
|
||||
from shark_runner import SharkInference
|
||||
|
||||
shark_module = SharkInference(
|
||||
module = model class.
|
||||
(input,) = inputs to model (must be a torch-tensor)
|
||||
dynamic (boolean) = Pass the input shapes as static or dynamic.
|
||||
device = `cpu`, `gpu` or `vulkan` is supported.
|
||||
tracing_required = (boolean) = Jit trace the module with the given input, useful in the case where jit.script doesn't work. )
|
||||
shark_module.set_frontend("pytorch") # Use tensorflow, mhlo, linalg, tosa
|
||||
from shark.shark_importer import SharkImporter
|
||||
|
||||
# SharkImporter imports mlir file from the torch, tensorflow or tf-lite module.
|
||||
|
||||
mlir_importer = SharkImporter(
|
||||
torch_module,
|
||||
(input),
|
||||
frontend="torch", #tf, #tf-lite
|
||||
)
|
||||
torch_mlir, func_name = mlir_importer.import_mlir(tracing_required=True)
|
||||
|
||||
# SharkInference accepts mlir in linalg, mhlo, and tosa dialect.
|
||||
|
||||
from shark.shark_inference import SharkInference
|
||||
shark_module = SharkInference(torch_mlir, device="cpu", mlir_dialect="linalg")
|
||||
shark_module.compile()
|
||||
result = shark_module.forward((input))
|
||||
|
||||
result = shark_module.forward(inputs)
|
||||
```
|
||||
|
||||
|
||||
@@ -135,104 +319,40 @@ mhlo_ir = r"""builtin.module {
|
||||
|
||||
arg0 = np.ones((1, 4)).astype(np.float32)
|
||||
arg1 = np.ones((4, 1)).astype(np.float32)
|
||||
|
||||
shark_module = SharkInference(mhlo_ir, (arg0, arg1))
|
||||
shark_module.set_frontend("mhlo")
|
||||
shark_module = SharkInference(mhlo_ir, device="cpu", mlir_dialect="mhlo")
|
||||
shark_module.compile()
|
||||
print(shark_module.forward((arg0, arg1)))
|
||||
result = shark_module.forward((arg0, arg1))
|
||||
```
|
||||
</details>
|
||||
|
||||
## Examples Using the REST API
|
||||
|
||||
* [Setting up SHARK for use with Blender](./docs/shark_sd_blender.md)
|
||||
* [Setting up SHARK for use with Koboldcpp](./docs/shark_sd_koboldcpp.md)
|
||||
|
||||
## Supported and Validated Models
|
||||
|
||||
<details>
|
||||
<summary>PyTorch Models</summary>
|
||||
SHARK is maintained to support the latest innovations in ML Models:
|
||||
|
||||
### Huggingface PyTorch Models
|
||||
| TF HuggingFace Models | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|
||||
|---------------------|----------|----------|-------------|
|
||||
| BERT | :green_heart: | :green_heart: | :green_heart: |
|
||||
| DistilBERT | :green_heart: | :green_heart: | :green_heart: |
|
||||
| GPT2 | :green_heart: | :green_heart: | :green_heart: |
|
||||
| BLOOM | :green_heart: | :green_heart: | :green_heart: |
|
||||
| Stable Diffusion | :green_heart: | :green_heart: | :green_heart: |
|
||||
| Vision Transformer | :green_heart: | :green_heart: | :green_heart: |
|
||||
| ResNet50 | :green_heart: | :green_heart: | :green_heart: |
|
||||
|
||||
| Hugging Face Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|
||||
|---------------------|----------------------|----------|----------|-------------|
|
||||
| BERT | :heavy_check_mark: (JIT) | :heavy_check_mark: | | |
|
||||
| Albert | :heavy_check_mark: (JIT) | :heavy_check_mark: | | |
|
||||
| BigBird | :heavy_check_mark: (AOT) | | | |
|
||||
| DistilBERT | :heavy_check_mark: (JIT) | :heavy_check_mark: | | |
|
||||
| GPT2 | :x: (AOT) | | | |
|
||||
For a complete list of the models supported in SHARK, please refer to [tank/README.md](https://github.com/nod-ai/SHARK/blob/main/tank/README.md).
|
||||
|
||||
### Torchvision Models
|
||||
|
||||
| TORCHVISION Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|
||||
|--------------------|----------------------|----------|----------|-------------|
|
||||
| AlexNet | :heavy_check_mark: (Script) | :heavy_check_mark: | :heavy_check_mark: | |
|
||||
| DenseNet121 | :heavy_check_mark: (Script) | | | |
|
||||
| MNasNet1_0 | :heavy_check_mark: (Script) | | | |
|
||||
| MobileNetV2 | :heavy_check_mark: (Script) | | | |
|
||||
| MobileNetV3 | :heavy_check_mark: (Script) | | | |
|
||||
| Unet | :x: (Script) | | | |
|
||||
| Resnet18 | :heavy_check_mark: (Script) | :heavy_check_mark: | :heavy_check_mark: | |
|
||||
| Resnet50 | :heavy_check_mark: (Script) | :heavy_check_mark: | :heavy_check_mark: | |
|
||||
| Resnet101 | :heavy_check_mark: (Script) | :heavy_check_mark: | :heavy_check_mark: | |
|
||||
| Resnext50_32x4d | :heavy_check_mark: (Script) | | | |
|
||||
| ShuffleNet_v2 | :x: (Script) | | | |
|
||||
| SqueezeNet | :heavy_check_mark: (Script) | :heavy_check_mark: | :heavy_check_mark: | |
|
||||
| EfficientNet | :heavy_check_mark: (Script) | | | |
|
||||
| Regnet | :heavy_check_mark: (Script) | | | |
|
||||
| Resnest | :x: (Script) | | | |
|
||||
| Vision Transformer | :heavy_check_mark: (Script) | | | |
|
||||
| VGG 16 | :heavy_check_mark: (Script) | :heavy_check_mark: | :heavy_check_mark: | |
|
||||
| Wide Resnet | :heavy_check_mark: (Script) | :heavy_check_mark: | :heavy_check_mark: | |
|
||||
| RAFT | :x: (JIT) | | | |
|
||||
## Communication Channels
|
||||
|
||||
For more information refer to [MODEL TRACKING SHEET](https://docs.google.com/spreadsheets/d/15PcjKeHZIrB5LfDyuw7DGEEE8XnQEX2aX8lm8qbxV8A/edit#gid=0)
|
||||
|
||||
### PyTorch Training Models
|
||||
|
||||
| Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|
||||
|---------------------|----------------------|----------|----------|-------------|
|
||||
| BERT | :x: | :x: | | |
|
||||
| FullyConnected | :heavy_check_mark: | :heavy_check_mark: | | |
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>JAX Models</summary>
|
||||
|
||||
|
||||
### JAX Models
|
||||
|
||||
| Models | JAX-MHLO lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|
||||
|---------------------|----------------------|----------|----------|-------------|
|
||||
| DALL-E | :x: | :x: | | |
|
||||
| FullyConnected | :heavy_check_mark: | :heavy_check_mark: | | |
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>TFLite Models</summary>
|
||||
|
||||
### TFLite Models
|
||||
|
||||
| Models | TOSA/LinAlg | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|
||||
|---------------------|----------------------|----------|----------|-------------|
|
||||
| BERT | :x: | :x: | | |
|
||||
| FullyConnected | :heavy_check_mark: | :heavy_check_mark: | | |
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>TF Models</summary>
|
||||
|
||||
### Tensorflow Models
|
||||
|
||||
| Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|
||||
|---------------------|----------------------|----------|----------|-------------|
|
||||
| BERT | :x: | :x: | | |
|
||||
| FullyConnected | :heavy_check_mark: | :heavy_check_mark: | | |
|
||||
|
||||
</details>
|
||||
* [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real time discussions with the SHARK team and other users
|
||||
* [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bugs etc
|
||||
|
||||
## Related Projects
|
||||
|
||||
|
||||
<details>
|
||||
<summary>IREE Project Channels</summary>
|
||||
|
||||
@@ -243,7 +363,7 @@ For more information refer to [MODEL TRACKING SHEET](https://docs.google.com/spr
|
||||
* [iree-discuss email list](https://groups.google.com/forum/#!forum/iree-discuss):
|
||||
Announcements, general and low-priority discussion
|
||||
</details>
|
||||
|
||||
|
||||
<details>
|
||||
<summary>MLIR and Torch-MLIR Project Channels</summary>
|
||||
|
||||
|
||||
0
apps/__init__.py
Normal file
0
apps/__init__.py
Normal file
179
apps/shark_studio/api/llm.py
Normal file
179
apps/shark_studio/api/llm.py
Normal file
@@ -0,0 +1,179 @@
|
||||
from turbine_models.custom_models import stateless_llama
|
||||
import time
|
||||
from shark.iree_utils.compile_utils import (
|
||||
get_iree_compiled_module,
|
||||
load_vmfb_using_mmap,
|
||||
)
|
||||
from apps.shark_studio.api.utils import get_resource_path
|
||||
import iree.runtime as ireert
|
||||
from itertools import chain
|
||||
import gc
|
||||
import os
|
||||
import torch
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
llm_model_map = {
|
||||
"llama2_7b": {
|
||||
"initializer": stateless_llama.export_transformer_model,
|
||||
"hf_model_name": "meta-llama/Llama-2-7b-chat-hf",
|
||||
"stop_token": 2,
|
||||
"max_tokens": 4096,
|
||||
"system_prompt": """<s>[INST] <<SYS>>Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>>""",
|
||||
},
|
||||
"Trelis/Llama-2-7b-chat-hf-function-calling-v2": {
|
||||
"initializer": stateless_llama.export_transformer_model,
|
||||
"hf_model_name": "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
|
||||
"stop_token": 2,
|
||||
"max_tokens": 4096,
|
||||
"system_prompt": """<s>[INST] <<SYS>>Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>>""",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
class LanguageModel:
|
||||
def __init__(
|
||||
self,
|
||||
model_name,
|
||||
hf_auth_token=None,
|
||||
device=None,
|
||||
precision="fp32",
|
||||
external_weights=None,
|
||||
use_system_prompt=True,
|
||||
):
|
||||
print(llm_model_map[model_name])
|
||||
self.hf_model_name = llm_model_map[model_name]["hf_model_name"]
|
||||
self.tempfile_name = get_resource_path("llm.torch.tempfile")
|
||||
self.vmfb_name = get_resource_path("llm.vmfb.tempfile")
|
||||
self.device = device
|
||||
self.precision = precision
|
||||
self.safe_name = self.hf_model_name.strip("/").replace("/", "_")
|
||||
self.max_tokens = llm_model_map[model_name]["max_tokens"]
|
||||
self.iree_module_dict = None
|
||||
self.external_weight_file = None
|
||||
if external_weights is not None:
|
||||
self.external_weight_file = get_resource_path(
|
||||
self.safe_name + "." + external_weights
|
||||
)
|
||||
self.use_system_prompt = use_system_prompt
|
||||
self.global_iter = 0
|
||||
if os.path.exists(self.vmfb_name) and (
|
||||
external_weights is None or os.path.exists(str(self.external_weight_file))
|
||||
):
|
||||
self.iree_module_dict = dict()
|
||||
(
|
||||
self.iree_module_dict["vmfb"],
|
||||
self.iree_module_dict["config"],
|
||||
self.iree_module_dict["temp_file_to_unlink"],
|
||||
) = load_vmfb_using_mmap(
|
||||
self.vmfb_name,
|
||||
device,
|
||||
device_idx=0,
|
||||
rt_flags=[],
|
||||
external_weight_file=self.external_weight_file,
|
||||
)
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(
|
||||
self.hf_model_name,
|
||||
use_fast=False,
|
||||
use_auth_token=hf_auth_token,
|
||||
)
|
||||
elif not os.path.exists(self.tempfile_name):
|
||||
self.torch_ir, self.tokenizer = llm_model_map[model_name]["initializer"](
|
||||
self.hf_model_name,
|
||||
hf_auth_token,
|
||||
compile_to="torch",
|
||||
external_weights=external_weights,
|
||||
external_weight_file=self.external_weight_file,
|
||||
)
|
||||
with open(self.tempfile_name, "w+") as f:
|
||||
f.write(self.torch_ir)
|
||||
del self.torch_ir
|
||||
gc.collect()
|
||||
self.compile()
|
||||
else:
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(
|
||||
self.hf_model_name,
|
||||
use_fast=False,
|
||||
use_auth_token=hf_auth_token,
|
||||
)
|
||||
self.compile()
|
||||
|
||||
def compile(self) -> None:
|
||||
# this comes with keys: "vmfb", "config", and "temp_file_to_unlink".
|
||||
self.iree_module_dict = get_iree_compiled_module(
|
||||
self.tempfile_name,
|
||||
device=self.device,
|
||||
mmap=True,
|
||||
frontend="torch",
|
||||
external_weight_file=self.external_weight_file,
|
||||
write_to=self.vmfb_name,
|
||||
extra_args=["--iree-global-opt-enable-quantized-matmul-reassociation"],
|
||||
)
|
||||
# TODO: delete the temp file
|
||||
|
||||
def sanitize_prompt(self, prompt):
|
||||
print(prompt)
|
||||
if isinstance(prompt, list):
|
||||
prompt = list(chain.from_iterable(prompt))
|
||||
prompt = " ".join([x for x in prompt if isinstance(x, str)])
|
||||
prompt = prompt.replace("\n", " ")
|
||||
prompt = prompt.replace("\t", " ")
|
||||
prompt = prompt.replace("\r", " ")
|
||||
if self.use_system_prompt and self.global_iter == 0:
|
||||
prompt = llm_model_map["llama2_7b"]["system_prompt"] + prompt
|
||||
prompt += " [/INST]"
|
||||
print(prompt)
|
||||
return prompt
|
||||
|
||||
def chat(self, prompt):
|
||||
prompt = self.sanitize_prompt(prompt)
|
||||
|
||||
input_tensor = self.tokenizer(prompt, return_tensors="pt").input_ids
|
||||
|
||||
def format_out(results):
|
||||
return torch.tensor(results.to_host()[0][0])
|
||||
|
||||
history = []
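        # Decode loop: the first iteration runs prefill via "run_initialize" on the
        # full prompt; later iterations feed the last generated token to "run_forward".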
|
||||
for iter in range(self.max_tokens):
|
||||
st_time = time.time()
|
||||
if iter == 0:
|
||||
device_inputs = [
|
||||
ireert.asdevicearray(
|
||||
self.iree_module_dict["config"].device, input_tensor
|
||||
)
|
||||
]
|
||||
token = self.iree_module_dict["vmfb"]["run_initialize"](*device_inputs)
|
||||
else:
|
||||
device_inputs = [
|
||||
ireert.asdevicearray(
|
||||
self.iree_module_dict["config"].device,
|
||||
token,
|
||||
)
|
||||
]
|
||||
token = self.iree_module_dict["vmfb"]["run_forward"](*device_inputs)
|
||||
|
||||
total_time = time.time() - st_time
|
||||
history.append(format_out(token))
|
||||
yield self.tokenizer.decode(history), total_time
|
||||
|
||||
if format_out(token) == llm_model_map["llama2_7b"]["stop_token"]:
|
||||
break
|
||||
|
||||
for i in range(len(history)):
|
||||
if type(history[i]) != int:
|
||||
history[i] = int(history[i])
|
||||
result_output = self.tokenizer.decode(history)
|
||||
self.global_iter += 1
|
||||
return result_output, total_time
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
lm = LanguageModel(
|
||||
"Trelis/Llama-2-7b-chat-hf-function-calling-v2",
|
||||
hf_auth_token=None,
|
||||
device="cpu-task",
|
||||
external_weights="safetensors",
|
||||
)
|
||||
|
||||
print("model loaded")
|
||||
for i in lm.chat("hi, what are you?"):
|
||||
print(i)
|
||||
12
apps/shark_studio/api/utils.py
Normal file
12
apps/shark_studio/api/utils.py
Normal file
@@ -0,0 +1,12 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def get_available_devices():
|
||||
return ["cpu-task"]
|
||||
|
||||
|
||||
def get_resource_path(relative_path):
|
||||
"""Get absolute path to resource, works for dev and for PyInstaller"""
|
||||
base_path = getattr(sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__)))
|
||||
return os.path.join(base_path, relative_path)
|
||||
34
apps/shark_studio/tests/api_test.py
Normal file
34
apps/shark_studio/tests/api_test.py
Normal file
@@ -0,0 +1,34 @@
|
||||
# Copyright 2023 Nod Labs, Inc
|
||||
#
|
||||
# Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
# See https://llvm.org/LICENSE.txt for license information.
|
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
from apps.shark_studio.api.llm import LanguageModel
|
||||
|
||||
|
||||
class LLMAPITest(unittest.TestCase):
|
||||
def testLLMSimple(self):
|
||||
lm = LanguageModel(
|
||||
"Trelis/Llama-2-7b-chat-hf-function-calling-v2",
|
||||
hf_auth_token=None,
|
||||
device="cpu-task",
|
||||
external_weights="safetensors",
|
||||
)
|
||||
count = 0
|
||||
for msg, _ in lm.chat("hi, what are you?"):
|
||||
# skip first token output
|
||||
if count == 0:
|
||||
count += 1
|
||||
continue
|
||||
assert (
|
||||
msg.strip(" ") == "Hello"
|
||||
), f"LLM API failed to return correct response, expected 'Hello', received {msg}"
|
||||
break
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
unittest.main()
|
||||
426
apps/shark_studio/web/index.py
Normal file
426
apps/shark_studio/web/index.py
Normal file
@@ -0,0 +1,426 @@
|
||||
from multiprocessing import Process, freeze_support
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
from ui.chat import chat_element
|
||||
|
||||
if sys.platform == "darwin":
|
||||
os.environ["DYLD_LIBRARY_PATH"] = "/usr/local/lib"
|
||||
# import before IREE to avoid MLIR library issues
|
||||
import torch_mlir
|
||||
|
||||
# import PIL, transformers, sentencepiece # ensures inclusion in pyinstaller exe generation
|
||||
# from apps.stable_diffusion.src import args, clear_all
|
||||
# import apps.stable_diffusion.web.utils.global_obj as global_obj
|
||||
|
||||
|
||||
def launch_app(address):
|
||||
from tkinter import Tk
|
||||
import webview
|
||||
|
||||
window = Tk()
|
||||
|
||||
# get screen width and height of display and make it more reasonably
|
||||
# sized as we aren't making it full-screen or maximized
|
||||
width = int(window.winfo_screenwidth() * 0.81)
|
||||
height = int(window.winfo_screenheight() * 0.91)
|
||||
webview.create_window(
|
||||
"SHARK AI Studio",
|
||||
url=address,
|
||||
width=width,
|
||||
height=height,
|
||||
text_select=True,
|
||||
)
|
||||
webview.start(private_mode=False, storage_path=os.getcwd())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# if args.debug:
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
# required to do multiprocessing in a pyinstaller freeze
|
||||
freeze_support()
|
||||
# if args.api or "api" in args.ui.split(","):
|
||||
# from apps.stable_diffusion.web.ui import (
|
||||
# txt2img_api,
|
||||
# img2img_api,
|
||||
# upscaler_api,
|
||||
# inpaint_api,
|
||||
# outpaint_api,
|
||||
# llm_chat_api,
|
||||
# )
|
||||
#
|
||||
# from fastapi import FastAPI, APIRouter
|
||||
# import uvicorn
|
||||
#
|
||||
# # init global sd pipeline and config
|
||||
# global_obj._init()
|
||||
#
|
||||
# app = FastAPI()
|
||||
# app.add_api_route("/sdapi/v1/txt2img", txt2img_api, methods=["post"])
|
||||
# app.add_api_route("/sdapi/v1/img2img", img2img_api, methods=["post"])
|
||||
# app.add_api_route("/sdapi/v1/inpaint", inpaint_api, methods=["post"])
|
||||
# app.add_api_route("/sdapi/v1/outpaint", outpaint_api, methods=["post"])
|
||||
# app.add_api_route("/sdapi/v1/upscaler", upscaler_api, methods=["post"])
|
||||
#
|
||||
# # chat APIs needed for compatibility with multiple extensions using OpenAI API
|
||||
# app.add_api_route(
|
||||
# "/v1/chat/completions", llm_chat_api, methods=["post"]
|
||||
# )
|
||||
# app.add_api_route("/v1/completions", llm_chat_api, methods=["post"])
|
||||
# app.add_api_route("/chat/completions", llm_chat_api, methods=["post"])
|
||||
# app.add_api_route("/completions", llm_chat_api, methods=["post"])
|
||||
# app.add_api_route(
|
||||
# "/v1/engines/codegen/completions", llm_chat_api, methods=["post"]
|
||||
# )
|
||||
# app.include_router(APIRouter())
|
||||
# uvicorn.run(app, host="0.0.0.0", port=args.server_port)
|
||||
# sys.exit(0)
|
||||
#
|
||||
# Setup to use shark_tmp for gradio's temporary image files and clear any
|
||||
# existing temporary images there if they exist. Then we can import gradio.
|
||||
# It has to be in this order or gradio ignores what we've set up.
|
||||
# from apps.stable_diffusion.web.utils.gradio_configs import (
|
||||
# config_gradio_tmp_imgs_folder,
|
||||
# )
|
||||
|
||||
# config_gradio_tmp_imgs_folder()
|
||||
import gradio as gr
|
||||
|
||||
# Create custom models folders if they don't exist
|
||||
# from apps.stable_diffusion.web.ui.utils import create_custom_models_folders
|
||||
|
||||
# create_custom_models_folders()
|
||||
|
||||
def resource_path(relative_path):
|
||||
"""Get absolute path to resource, works for dev and for PyInstaller"""
|
||||
base_path = getattr(sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__)))
|
||||
return os.path.join(base_path, relative_path)
|
||||
|
||||
dark_theme = resource_path("ui/css/sd_dark_theme.css")
|
||||
|
||||
# from apps.stable_diffusion.web.ui import (
|
||||
# txt2img_web,
|
||||
# txt2img_custom_model,
|
||||
# txt2img_gallery,
|
||||
# txt2img_png_info_img,
|
||||
# txt2img_status,
|
||||
# txt2img_sendto_img2img,
|
||||
# txt2img_sendto_inpaint,
|
||||
# txt2img_sendto_outpaint,
|
||||
# txt2img_sendto_upscaler,
|
||||
## h2ogpt_upload,
|
||||
## h2ogpt_web,
|
||||
# img2img_web,
|
||||
# img2img_custom_model,
|
||||
# img2img_gallery,
|
||||
# img2img_init_image,
|
||||
# img2img_status,
|
||||
# img2img_sendto_inpaint,
|
||||
# img2img_sendto_outpaint,
|
||||
# img2img_sendto_upscaler,
|
||||
# inpaint_web,
|
||||
# inpaint_custom_model,
|
||||
# inpaint_gallery,
|
||||
# inpaint_init_image,
|
||||
# inpaint_status,
|
||||
# inpaint_sendto_img2img,
|
||||
# inpaint_sendto_outpaint,
|
||||
# inpaint_sendto_upscaler,
|
||||
# outpaint_web,
|
||||
# outpaint_custom_model,
|
||||
# outpaint_gallery,
|
||||
# outpaint_init_image,
|
||||
# outpaint_status,
|
||||
# outpaint_sendto_img2img,
|
||||
# outpaint_sendto_inpaint,
|
||||
# outpaint_sendto_upscaler,
|
||||
# upscaler_web,
|
||||
# upscaler_custom_model,
|
||||
# upscaler_gallery,
|
||||
# upscaler_init_image,
|
||||
# upscaler_status,
|
||||
# upscaler_sendto_img2img,
|
||||
# upscaler_sendto_inpaint,
|
||||
# upscaler_sendto_outpaint,
|
||||
## lora_train_web,
|
||||
## model_web,
|
||||
## model_config_web,
|
||||
# hf_models,
|
||||
# modelmanager_sendto_txt2img,
|
||||
# modelmanager_sendto_img2img,
|
||||
# modelmanager_sendto_inpaint,
|
||||
# modelmanager_sendto_outpaint,
|
||||
# modelmanager_sendto_upscaler,
|
||||
# stablelm_chat,
|
||||
# minigpt4_web,
|
||||
# outputgallery_web,
|
||||
# outputgallery_tab_select,
|
||||
# outputgallery_watch,
|
||||
# outputgallery_filename,
|
||||
# outputgallery_sendto_txt2img,
|
||||
# outputgallery_sendto_img2img,
|
||||
# outputgallery_sendto_inpaint,
|
||||
# outputgallery_sendto_outpaint,
|
||||
# outputgallery_sendto_upscaler,
|
||||
# )
|
||||
|
||||
# init global sd pipeline and config
|
||||
# global_obj._init()
|
||||
|
||||
def register_button_click(button, selectedid, inputs, outputs):
|
||||
button.click(
|
||||
lambda x: (
|
||||
x[0]["name"] if len(x) != 0 else None,
|
||||
gr.Tabs.update(selected=selectedid),
|
||||
),
|
||||
inputs,
|
||||
outputs,
|
||||
)
|
||||
|
||||
def register_modelmanager_button(button, selectedid, inputs, outputs):
|
||||
button.click(
|
||||
lambda x: (
|
||||
"None",
|
||||
x,
|
||||
gr.Tabs.update(selected=selectedid),
|
||||
),
|
||||
inputs,
|
||||
outputs,
|
||||
)
|
||||
|
||||
def register_outputgallery_button(button, selectedid, inputs, outputs):
|
||||
button.click(
|
||||
lambda x: (
|
||||
x,
|
||||
gr.Tabs.update(selected=selectedid),
|
||||
),
|
||||
inputs,
|
||||
outputs,
|
||||
)
|
||||
|
||||
with gr.Blocks(
|
||||
css=dark_theme, analytics_enabled=False, title="Shark Studio 2.0 Beta"
|
||||
) as sd_web:
|
||||
with gr.Tabs() as tabs:
|
||||
# NOTE: If adding, removing, or re-ordering tabs, make sure that they
|
||||
# have a unique id that doesn't clash with any of the other tabs,
|
||||
# and that the order in the code here is the order they should
|
||||
# appear in the ui, as the id value doesn't determine the order.
|
||||
|
||||
# Where possible, avoid changing the id of any tab that is the
|
||||
# destination of one of the 'send to' buttons. If you do have to change
|
||||
# that id, make sure you update the relevant register_button_click calls
|
||||
# further down with the new id.
|
||||
# with gr.TabItem(label="Text-to-Image", id=0):
|
||||
# txt2img_web.render()
|
||||
# with gr.TabItem(label="Image-to-Image", id=1):
|
||||
# img2img_web.render()
|
||||
# with gr.TabItem(label="Inpainting", id=2):
|
||||
# inpaint_web.render()
|
||||
# with gr.TabItem(label="Outpainting", id=3):
|
||||
# outpaint_web.render()
|
||||
# with gr.TabItem(label="Upscaler", id=4):
|
||||
# upscaler_web.render()
|
||||
# if args.output_gallery:
|
||||
# with gr.TabItem(label="Output Gallery", id=5) as og_tab:
|
||||
# outputgallery_web.render()
|
||||
|
||||
# # extra output gallery configuration
|
||||
# outputgallery_tab_select(og_tab.select)
|
||||
# outputgallery_watch(
|
||||
# [
|
||||
# txt2img_status,
|
||||
# img2img_status,
|
||||
# inpaint_status,
|
||||
# outpaint_status,
|
||||
# upscaler_status,
|
||||
# ]
|
||||
# )
|
||||
## with gr.TabItem(label="Model Manager", id=6):
|
||||
## model_web.render()
|
||||
## with gr.TabItem(label="LoRA Training (Experimental)", id=7):
|
||||
## lora_train_web.render()
|
||||
with gr.TabItem(label="Chat Bot", id=0):
|
||||
chat_element.render()
|
||||
## with gr.TabItem(
|
||||
## label="Generate Sharding Config (Experimental)", id=9
|
||||
## ):
|
||||
## model_config_web.render()
|
||||
# with gr.TabItem(label="MultiModal (Experimental)", id=10):
|
||||
# minigpt4_web.render()
|
||||
# with gr.TabItem(label="DocuChat Upload", id=11):
|
||||
# h2ogpt_upload.render()
|
||||
# with gr.TabItem(label="DocuChat(Experimental)", id=12):
|
||||
# h2ogpt_web.render()
|
||||
|
||||
# send to buttons
|
||||
# register_button_click(
|
||||
# txt2img_sendto_img2img,
|
||||
# 1,
|
||||
# [txt2img_gallery],
|
||||
# [img2img_init_image, tabs],
|
||||
# )
|
||||
# register_button_click(
|
||||
# txt2img_sendto_inpaint,
|
||||
# 2,
|
||||
# [txt2img_gallery],
|
||||
# [inpaint_init_image, tabs],
|
||||
# )
|
||||
# register_button_click(
|
||||
# txt2img_sendto_outpaint,
|
||||
# 3,
|
||||
# [txt2img_gallery],
|
||||
# [outpaint_init_image, tabs],
|
||||
# )
|
||||
# register_button_click(
|
||||
# txt2img_sendto_upscaler,
|
||||
# 4,
|
||||
# [txt2img_gallery],
|
||||
# [upscaler_init_image, tabs],
|
||||
# )
|
||||
# register_button_click(
|
||||
# img2img_sendto_inpaint,
|
||||
# 2,
|
||||
# [img2img_gallery],
|
||||
# [inpaint_init_image, tabs],
|
||||
# )
|
||||
# register_button_click(
|
||||
# img2img_sendto_outpaint,
|
||||
# 3,
|
||||
# [img2img_gallery],
|
||||
# [outpaint_init_image, tabs],
|
||||
# )
|
||||
# register_button_click(
|
||||
# img2img_sendto_upscaler,
|
||||
# 4,
|
||||
# [img2img_gallery],
|
||||
# [upscaler_init_image, tabs],
|
||||
# )
|
||||
# register_button_click(
|
||||
# inpaint_sendto_img2img,
|
||||
# 1,
|
||||
# [inpaint_gallery],
|
||||
# [img2img_init_image, tabs],
|
||||
# )
|
||||
# register_button_click(
|
||||
# inpaint_sendto_outpaint,
|
||||
# 3,
|
||||
# [inpaint_gallery],
|
||||
# [outpaint_init_image, tabs],
|
||||
# )
|
||||
# register_button_click(
|
||||
# inpaint_sendto_upscaler,
|
||||
# 4,
|
||||
# [inpaint_gallery],
|
||||
# [upscaler_init_image, tabs],
|
||||
# )
|
||||
# register_button_click(
|
||||
# outpaint_sendto_img2img,
|
||||
# 1,
|
||||
# [outpaint_gallery],
|
||||
# [img2img_init_image, tabs],
|
||||
# )
|
||||
# register_button_click(
|
||||
# outpaint_sendto_inpaint,
|
||||
# 2,
|
||||
# [outpaint_gallery],
|
||||
# [inpaint_init_image, tabs],
|
||||
# )
|
||||
# register_button_click(
|
||||
# outpaint_sendto_upscaler,
|
||||
# 4,
|
||||
# [outpaint_gallery],
|
||||
# [upscaler_init_image, tabs],
|
||||
# )
|
||||
# register_button_click(
|
||||
# upscaler_sendto_img2img,
|
||||
# 1,
|
||||
# [upscaler_gallery],
|
||||
# [img2img_init_image, tabs],
|
||||
# )
|
||||
# register_button_click(
|
||||
# upscaler_sendto_inpaint,
|
||||
# 2,
|
||||
# [upscaler_gallery],
|
||||
# [inpaint_init_image, tabs],
|
||||
# )
|
||||
# register_button_click(
|
||||
# upscaler_sendto_outpaint,
|
||||
# 3,
|
||||
# [upscaler_gallery],
|
||||
# [outpaint_init_image, tabs],
|
||||
# )
|
||||
# if args.output_gallery:
|
||||
# register_outputgallery_button(
|
||||
# outputgallery_sendto_txt2img,
|
||||
# 0,
|
||||
# [outputgallery_filename],
|
||||
# [txt2img_png_info_img, tabs],
|
||||
# )
|
||||
# register_outputgallery_button(
|
||||
# outputgallery_sendto_img2img,
|
||||
# 1,
|
||||
# [outputgallery_filename],
|
||||
# [img2img_init_image, tabs],
|
||||
# )
|
||||
# register_outputgallery_button(
|
||||
# outputgallery_sendto_inpaint,
|
||||
# 2,
|
||||
# [outputgallery_filename],
|
||||
# [inpaint_init_image, tabs],
|
||||
# )
|
||||
# register_outputgallery_button(
|
||||
# outputgallery_sendto_outpaint,
|
||||
# 3,
|
||||
# [outputgallery_filename],
|
||||
# [outpaint_init_image, tabs],
|
||||
# )
|
||||
# register_outputgallery_button(
|
||||
# outputgallery_sendto_upscaler,
|
||||
# 4,
|
||||
# [outputgallery_filename],
|
||||
# [upscaler_init_image, tabs],
|
||||
# )
|
||||
# register_modelmanager_button(
|
||||
# modelmanager_sendto_txt2img,
|
||||
# 0,
|
||||
# [hf_models],
|
||||
# [txt2img_custom_model, tabs],
|
||||
# )
|
||||
# register_modelmanager_button(
|
||||
# modelmanager_sendto_img2img,
|
||||
# 1,
|
||||
# [hf_models],
|
||||
# [img2img_custom_model, tabs],
|
||||
# )
|
||||
# register_modelmanager_button(
|
||||
# modelmanager_sendto_inpaint,
|
||||
# 2,
|
||||
# [hf_models],
|
||||
# [inpaint_custom_model, tabs],
|
||||
# )
|
||||
# register_modelmanager_button(
|
||||
# modelmanager_sendto_outpaint,
|
||||
# 3,
|
||||
# [hf_models],
|
||||
# [outpaint_custom_model, tabs],
|
||||
# )
|
||||
# register_modelmanager_button(
|
||||
# modelmanager_sendto_upscaler,
|
||||
# 4,
|
||||
# [hf_models],
|
||||
# [upscaler_custom_model, tabs],
|
||||
# )
|
||||
|
||||
sd_web.queue()
|
||||
# if args.ui == "app":
|
||||
# t = Process(
|
||||
# target=launch_app, args=[f"http://localhost:{args.server_port}"]
|
||||
# )
|
||||
# t.start()
|
||||
sd_web.launch(
|
||||
share=True,
|
||||
inbrowser=True,
|
||||
server_name="0.0.0.0",
|
||||
server_port=11911, # args.server_port,
|
||||
)
|
||||
0
apps/shark_studio/web/ui/__init__.py
Normal file
0
apps/shark_studio/web/ui/__init__.py
Normal file
298
apps/shark_studio/web/ui/chat.py
Normal file
298
apps/shark_studio/web/ui/chat.py
Normal file
@@ -0,0 +1,298 @@
|
||||
import gradio as gr
|
||||
import time
|
||||
import os
|
||||
from pathlib import Path
|
||||
from datetime import datetime as dt
|
||||
import json
|
||||
import sys
|
||||
from apps.shark_studio.api.utils import (
|
||||
get_available_devices,
|
||||
)
|
||||
from apps.shark_studio.api.llm import (
|
||||
llm_model_map,
|
||||
LanguageModel,
|
||||
)
|
||||
|
||||
|
||||
def user(message, history):
|
||||
# Append the user's message to the conversation history
|
||||
return "", history + [[message, ""]]
|
||||
|
||||
|
||||
language_model = None
|
||||
|
||||
|
||||
def create_prompt(model_name, history, prompt_prefix):
|
||||
return ""
|
||||
|
||||
|
||||
def get_default_config():
|
||||
return False
|
||||
|
||||
|
||||
# model_vmfb_key = ""
|
||||
|
||||
|
||||
def chat_fn(
|
||||
prompt_prefix,
|
||||
history,
|
||||
model,
|
||||
device,
|
||||
precision,
|
||||
download_vmfb,
|
||||
config_file,
|
||||
cli=False,
|
||||
):
|
||||
global language_model
|
||||
if language_model is None:
|
||||
history[-1][-1] = "Getting the model ready..."
|
||||
yield history, ""
|
||||
language_model = LanguageModel(
|
||||
model,
|
||||
device=device,
|
||||
precision=precision,
|
||||
external_weights="safetensors",
|
||||
external_weight_file="llama2_7b.safetensors",
|
||||
use_system_prompt=prompt_prefix,
|
||||
)
|
||||
history[-1][-1] = "Getting the model ready... Done"
|
||||
yield history, ""
|
||||
history[-1][-1] = ""
|
||||
token_count = 0
|
||||
total_time = 0.001 # In order to avoid divide by zero error
|
||||
prefill_time = 0
|
||||
is_first = True
|
||||
for text, exec_time in language_model.chat(history):
|
||||
history[-1][-1] = text
|
||||
if is_first:
|
||||
prefill_time = exec_time
|
||||
is_first = False
|
||||
yield history, f"Prefill: {prefill_time:.2f}"
|
||||
else:
|
||||
total_time += exec_time
|
||||
token_count += 1
|
||||
tokens_per_sec = token_count / total_time
|
||||
yield history, f"Prefill: {prefill_time:.2f} seconds\n Decode: {tokens_per_sec:.2f} tokens/sec"
|
||||
|
||||
|
||||
def llm_chat_api(InputData: dict):
|
||||
return None
|
||||
print(f"Input keys : {InputData.keys()}")
|
||||
# print(f"model : {InputData['model']}")
|
||||
is_chat_completion_api = (
|
||||
"messages" in InputData.keys()
|
||||
) # else it is the legacy `completion` api
|
||||
# For Debugging input data from API
|
||||
# if is_chat_completion_api:
|
||||
# print(f"message -> role : {InputData['messages'][0]['role']}")
|
||||
# print(f"message -> content : {InputData['messages'][0]['content']}")
|
||||
# else:
|
||||
# print(f"prompt : {InputData['prompt']}")
|
||||
# print(f"max_tokens : {InputData['max_tokens']}") # Default to 128 for now
|
||||
global vicuna_model
|
||||
model_name = InputData["model"] if "model" in InputData.keys() else "codegen"
|
||||
model_path = llm_model_map[model_name]
|
||||
device = "cpu-task"
|
||||
precision = "fp16"
|
||||
max_toks = None if "max_tokens" not in InputData.keys() else InputData["max_tokens"]
|
||||
if max_toks is None:
|
||||
max_toks = 128 if model_name == "codegen" else 512
|
||||
|
||||
# make it working for codegen first
|
||||
from apps.language_models.scripts.vicuna import (
|
||||
UnshardedVicuna,
|
||||
)
|
||||
|
||||
device_id = None
|
||||
if vicuna_model == 0:
|
||||
if "cuda" in device:
|
||||
device = "cuda"
|
||||
elif "sync" in device:
|
||||
device = "cpu-sync"
|
||||
elif "task" in device:
|
||||
device = "cpu-task"
|
||||
elif "vulkan" in device:
|
||||
device_id = int(device.split("://")[1])
|
||||
device = "vulkan"
|
||||
else:
|
||||
print("unrecognized device")
|
||||
|
||||
vicuna_model = UnshardedVicuna(
|
||||
model_name,
|
||||
hf_model_path=model_path,
|
||||
device=device,
|
||||
precision=precision,
|
||||
max_num_tokens=max_toks,
|
||||
download_vmfb=True,
|
||||
load_mlir_from_shark_tank=True,
|
||||
device_id=device_id,
|
||||
)
|
||||
|
||||
# TODO: add role dict for different models
|
||||
if is_chat_completion_api:
|
||||
# TODO: add functionality for multiple messages
|
||||
prompt = create_prompt(model_name, [(InputData["messages"][0]["content"], "")])
|
||||
else:
|
||||
prompt = InputData["prompt"]
|
||||
print("prompt = ", prompt)
|
||||
|
||||
res = vicuna_model.generate(prompt)
|
||||
res_op = None
|
||||
for op in res:
|
||||
res_op = op
|
||||
|
||||
if is_chat_completion_api:
|
||||
choices = [
|
||||
{
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": res_op, # since we are yeilding the result
|
||||
},
|
||||
"finish_reason": "stop", # or length
|
||||
}
|
||||
]
|
||||
else:
|
||||
choices = [
|
||||
{
|
||||
"text": res_op,
|
||||
"index": 0,
|
||||
"logprobs": None,
|
||||
"finish_reason": "stop", # or length
|
||||
}
|
||||
]
|
||||
end_time = dt.now().strftime("%Y%m%d%H%M%S%f")
|
||||
return {
|
||||
"id": end_time,
|
||||
"object": "chat.completion" if is_chat_completion_api else "text_completion",
|
||||
"created": int(end_time),
|
||||
"choices": choices,
|
||||
}
|
||||
|
||||
|
||||
def view_json_file(file_obj):
|
||||
content = ""
|
||||
with open(file_obj.name, "r") as fopen:
|
||||
content = fopen.read()
|
||||
return content
|
||||
|
||||
|
||||
with gr.Blocks(title="Chat") as chat_element:
|
||||
with gr.Row():
|
||||
model_choices = list(llm_model_map.keys())
|
||||
model = gr.Dropdown(
|
||||
label="Select Model",
|
||||
value=model_choices[0],
|
||||
choices=model_choices,
|
||||
allow_custom_value=True,
|
||||
)
|
||||
supported_devices = get_available_devices()
|
||||
enabled = True
|
||||
if len(supported_devices) == 0:
|
||||
supported_devices = ["cpu-task"]
|
||||
supported_devices = [x for x in supported_devices if "sync" not in x]
|
||||
device = gr.Dropdown(
|
||||
label="Device",
|
||||
value=supported_devices[0],
|
||||
choices=supported_devices,
|
||||
interactive=enabled,
|
||||
allow_custom_value=True,
|
||||
)
|
||||
precision = gr.Radio(
|
||||
label="Precision",
|
||||
value="int4",
|
||||
choices=[
|
||||
# "int4",
|
||||
# "int8",
|
||||
# "fp16",
|
||||
"fp32",
|
||||
],
|
||||
visible=False,
|
||||
)
|
||||
tokens_time = gr.Textbox(label="Tokens generated per second")
|
||||
with gr.Column():
|
||||
download_vmfb = gr.Checkbox(
|
||||
label="Download vmfb from Shark tank if available",
|
||||
value=True,
|
||||
interactive=True,
|
||||
)
|
||||
prompt_prefix = gr.Checkbox(
|
||||
label="Add System Prompt",
|
||||
value=False,
|
||||
interactive=True,
|
||||
)
|
||||
|
||||
chatbot = gr.Chatbot(height=500)
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
msg = gr.Textbox(
|
||||
label="Chat Message Box",
|
||||
placeholder="Chat Message Box",
|
||||
show_label=False,
|
||||
interactive=enabled,
|
||||
container=False,
|
||||
)
|
||||
with gr.Column():
|
||||
with gr.Row():
|
||||
submit = gr.Button("Submit", interactive=enabled)
|
||||
stop = gr.Button("Stop", interactive=enabled)
|
||||
clear = gr.Button("Clear", interactive=enabled)
|
||||
|
||||
with gr.Row(visible=False):
|
||||
with gr.Group():
|
||||
config_file = gr.File(label="Upload sharding configuration", visible=False)
|
||||
json_view_button = gr.Button(label="View as JSON", visible=False)
|
||||
json_view = gr.JSON(interactive=True, visible=False)
|
||||
json_view_button.click(
|
||||
fn=view_json_file, inputs=[config_file], outputs=[json_view]
|
||||
)
|
||||
    submit_event = msg.submit(
        fn=user,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        show_progress=False,
        queue=False,
    ).then(
        fn=chat_fn,
        inputs=[
            prompt_prefix,
            chatbot,
            model,
            device,
            precision,
            download_vmfb,
            config_file,
        ],
        outputs=[chatbot, tokens_time],
        show_progress=False,
        queue=True,
    )
    submit_click_event = submit.click(
        fn=user,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        show_progress=False,
        queue=False,
    ).then(
        fn=chat_fn,
        inputs=[
            prompt_prefix,
            chatbot,
            model,
            device,
            precision,
            download_vmfb,
            config_file,
        ],
        outputs=[chatbot, tokens_time],
        show_progress=False,
        queue=True,
    )
    stop.click(
        fn=None,
        inputs=None,
        outputs=None,
        cancels=[submit_event, submit_click_event],
        queue=False,
    )
    clear.click(lambda: None, None, [chatbot], queue=False)
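For reference, a minimal sketch (not part of the diff above) of the OpenAI-style payload assembled by the completion helper; the id and the choice text are illustrative values only:

```python
# Illustrative only: shape of the payload built above, with made-up values.
example_response = {
    "id": "20240101120000000000",
    "object": "chat.completion",  # "text_completion" when not using the chat API
    "created": 20240101120000000000,
    "choices": [
        {
            "text": "Hello! How can I help?",
            "index": 0,
            "logprobs": None,
            "finish_reason": "stop",
        }
    ],
}

# A client would typically read the generated text like this:
print(example_response["choices"][0]["text"])
```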
@@ -6,16 +6,16 @@ parser.add_argument(
|
||||
"--model_name",
|
||||
type=str,
|
||||
required=True,
|
||||
help=
"Specifies name of HF model to benchmark. (For example \"microsoft/MiniLM-L12-H384-uncased\""
help='Specifies name of HF model to benchmark. (For example "microsoft/MiniLM-L12-H384-uncased"',
|
||||
)
|
||||
load_args, unknown = parser.parse_known_args()
|
||||
|
||||
if __name__ == "__main__":
|
||||
model_name = load_args.model_name
|
||||
test_input = torch.randint(2, (1, 128))
|
||||
shark_module = SharkHFBenchmarkRunner(model_name, (test_input,),
|
||||
jit_trace=True)
|
||||
shark_module = SharkHFBenchmarkRunner(
|
||||
model_name, (test_input,), jit_trace=True
|
||||
)
|
||||
shark_module.benchmark_c()
|
||||
shark_module.benchmark_python((test_input,))
|
||||
shark_module.benchmark_torch(test_input)
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
import torch
|
||||
from shark.shark_runner import SharkBenchmarkRunner
|
||||
from shark.shark_benchmark_runner import SharkBenchmarkRunner
|
||||
from shark.parser import shark_args
|
||||
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
||||
from onnxruntime.transformers.benchmark import run_pytorch, run_tensorflow, run_onnxruntime
|
||||
from onnxruntime.transformers.benchmark import (
|
||||
run_pytorch,
|
||||
run_tensorflow,
|
||||
run_onnxruntime,
|
||||
)
|
||||
from onnxruntime.transformers.huggingface_models import MODELS
|
||||
from onnxruntime.transformers.benchmark_helper import ConfigModifier, Precision
|
||||
import os
|
||||
@@ -10,7 +14,6 @@ import psutil
|
||||
|
||||
|
||||
class OnnxFusionOptions(object):
|
||||
|
||||
def __init__(self):
|
||||
self.disable_gelu = False
|
||||
self.disable_layer_norm = False
|
||||
@@ -25,17 +28,13 @@ class OnnxFusionOptions(object):
|
||||
|
||||
|
||||
class HuggingFaceLanguage(torch.nn.Module):
|
||||
|
||||
def __init__(self, hf_model_name):
|
||||
super().__init__()
|
||||
self.model = AutoModelForSequenceClassification.from_pretrained(
|
||||
hf_model_name, # The pretrained model.
|
||||
num_labels=
|
||||
2, # The number of output labels--2 for binary classification.
|
||||
output_attentions=
|
||||
False, # Whether the model returns attentions weights.
|
||||
output_hidden_states=
|
||||
False, # Whether the model returns all hidden-states.
|
||||
num_labels=2, # The number of output labels--2 for binary classification.
|
||||
output_attentions=False, # Whether the model returns attentions weights.
|
||||
output_hidden_states=False, # Whether the model returns all hidden-states.
|
||||
torchscript=True,
|
||||
)
|
||||
|
||||
@@ -62,8 +61,16 @@ class SharkHFBenchmarkRunner(SharkBenchmarkRunner):
|
||||
)
|
||||
self.model_name = model_name
|
||||
model = HuggingFaceLanguage(model_name)
|
||||
SharkBenchmarkRunner.__init__(self, model, input, dynamic, self.device,
|
||||
jit_trace, from_aot, frontend)
|
||||
SharkBenchmarkRunner.__init__(
|
||||
self,
|
||||
model,
|
||||
input,
|
||||
dynamic,
|
||||
self.device,
|
||||
jit_trace,
|
||||
from_aot,
|
||||
frontend,
|
||||
)
|
||||
|
||||
def benchmark_torch(self, inputs):
|
||||
use_gpu = self.device == "gpu"
|
||||
@@ -74,10 +81,20 @@ class SharkHFBenchmarkRunner(SharkBenchmarkRunner):
|
||||
sequence_lengths = [inputs.shape[-1]]
|
||||
cache_dir = os.path.join(".", "cache_models")
|
||||
verbose = False
|
||||
result = run_pytorch(use_gpu, [self.model_name], None, config_modifier,
|
||||
Precision.FLOAT32, num_threads, batch_sizes,
|
||||
sequence_lengths, shark_args.num_iterations, False,
|
||||
cache_dir, verbose)
|
||||
result = run_pytorch(
|
||||
use_gpu,
|
||||
[self.model_name],
|
||||
None,
|
||||
config_modifier,
|
||||
Precision.FLOAT32,
|
||||
num_threads,
|
||||
batch_sizes,
|
||||
sequence_lengths,
|
||||
shark_args.num_iterations,
|
||||
False,
|
||||
cache_dir,
|
||||
verbose,
|
||||
)
|
||||
print(
|
||||
f"ONNX Pytorch-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
|
||||
)
|
||||
@@ -92,10 +109,19 @@ class SharkHFBenchmarkRunner(SharkBenchmarkRunner):
|
||||
sequence_lengths = [inputs.shape[-1]]
|
||||
cache_dir = os.path.join(".", "cache_models")
|
||||
verbose = False
|
||||
result = run_tensorflow(use_gpu, [self.model_name], None,
|
||||
config_modifier, Precision.FLOAT32, num_threads,
|
||||
batch_sizes, sequence_lengths,
|
||||
shark_args.num_iterations, cache_dir, verbose)
|
||||
result = run_tensorflow(
|
||||
use_gpu,
|
||||
[self.model_name],
|
||||
None,
|
||||
config_modifier,
|
||||
Precision.FLOAT32,
|
||||
num_threads,
|
||||
batch_sizes,
|
||||
sequence_lengths,
|
||||
shark_args.num_iterations,
|
||||
cache_dir,
|
||||
verbose,
|
||||
)
|
||||
print(
|
||||
f"ONNX TF-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
|
||||
)
|
||||
@@ -105,7 +131,8 @@ class SharkHFBenchmarkRunner(SharkBenchmarkRunner):
|
||||
print(
|
||||
f"{self.model_name} is currently not supported in ORT's HF. Check \
|
||||
https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/huggingface_models.py \
|
||||
for currently supported models. Exiting benchmark ONNX.")
|
||||
for currently supported models. Exiting benchmark ONNX."
|
||||
)
|
||||
return
|
||||
use_gpu = self.device == "gpu"
|
||||
num_threads = psutil.cpu_count(logical=False)
|
||||
@@ -121,17 +148,34 @@ for currently supported models. Exiting benchmark ONNX.")
|
||||
use_raw_attention_mask = True
|
||||
model_fusion_statistics = {}
|
||||
overwrite = False
|
||||
model_source = "pt" #Either "pt" or "tf"
|
||||
model_source = "pt" # Either "pt" or "tf"
|
||||
provider = None
|
||||
config_modifier = ConfigModifier(None)
|
||||
onnx_args = OnnxFusionOptions()
|
||||
result = run_onnxruntime(
|
||||
use_gpu, provider, [self.model_name], None, config_modifier,
|
||||
Precision.FLOAT32, num_threads, batch_sizes, sequence_lengths,
|
||||
shark_args.num_iterations, input_counts, optimize_onnx,
|
||||
validate_onnx, cache_dir, onnx_dir, verbose, overwrite,
|
||||
disable_ort_io_binding, use_raw_attention_mask,
|
||||
model_fusion_statistics, model_source, onnx_args)
|
||||
use_gpu,
|
||||
provider,
|
||||
[self.model_name],
|
||||
None,
|
||||
config_modifier,
|
||||
Precision.FLOAT32,
|
||||
num_threads,
|
||||
batch_sizes,
|
||||
sequence_lengths,
|
||||
shark_args.num_iterations,
|
||||
input_counts,
|
||||
optimize_onnx,
|
||||
validate_onnx,
|
||||
cache_dir,
|
||||
onnx_dir,
|
||||
verbose,
|
||||
overwrite,
|
||||
disable_ort_io_binding,
|
||||
use_raw_attention_mask,
|
||||
model_fusion_statistics,
|
||||
model_source,
|
||||
onnx_args,
|
||||
)
|
||||
print(
|
||||
f"ONNX ORT-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
|
||||
)
|
||||
|
||||
@@ -1,19 +1,23 @@
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.iree_utils import check_device_drivers
|
||||
from shark.iree_utils._common import check_device_drivers
|
||||
|
||||
import torch
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
import torchvision.models as models
|
||||
from transformers import AutoModelForSequenceClassification, BertTokenizer, TFBertModel
|
||||
from transformers import (
|
||||
AutoModelForSequenceClassification,
|
||||
BertTokenizer,
|
||||
TFBertModel,
|
||||
)
|
||||
import importlib
|
||||
import pytest
|
||||
import unittest
|
||||
|
||||
torch.manual_seed(0)
|
||||
gpus = tf.config.experimental.list_physical_devices('GPU')
|
||||
gpus = tf.config.experimental.list_physical_devices("GPU")
|
||||
for gpu in gpus:
|
||||
tf.config.experimental.set_memory_growth(gpu, True)
|
||||
tf.config.experimental.set_memory_growth(gpu, True)
|
||||
|
||||
##################### Tensorflow Hugging Face LM Models ###################################
|
||||
MAX_SEQUENCE_LENGTH = 512
|
||||
@@ -23,12 +27,11 @@ BATCH_SIZE = 1
|
||||
tf_bert_input = [
|
||||
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
|
||||
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
|
||||
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32)
|
||||
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
|
||||
]
|
||||
|
||||
|
||||
class TFHuggingFaceLanguage(tf.Module):
|
||||
|
||||
def __init__(self, hf_model_name):
|
||||
super(TFHuggingFaceLanguage, self).__init__()
|
||||
# Create a BERT trainer with the created network.
|
||||
@@ -36,9 +39,10 @@ class TFHuggingFaceLanguage(tf.Module):
|
||||
|
||||
# Invoke the trainer model on the inputs. This causes the layer to be built.
|
||||
self.m.predict = lambda x, y, z: self.m.call(
|
||||
input_ids=x, attention_mask=y, token_type_ids=z, training=False)
|
||||
input_ids=x, attention_mask=y, token_type_ids=z, training=False
|
||||
)
|
||||
|
||||
@tf.function(input_signature=tf_bert_input)
|
||||
@tf.function(input_signature=tf_bert_input, jit_compile=True)
|
||||
def forward(self, input_ids, attention_mask, token_type_ids):
|
||||
return self.m.predict(input_ids, attention_mask, token_type_ids)
|
||||
|
||||
@@ -47,15 +51,21 @@ def get_TFhf_model(name):
|
||||
model = TFHuggingFaceLanguage(name)
|
||||
tokenizer = BertTokenizer.from_pretrained(name)
|
||||
text = "Replace me by any text you'd like."
|
||||
encoded_input = tokenizer(text,
|
||||
padding='max_length',
|
||||
truncation=True,
|
||||
max_length=MAX_SEQUENCE_LENGTH)
|
||||
encoded_input = tokenizer(
|
||||
text,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
max_length=MAX_SEQUENCE_LENGTH,
|
||||
)
|
||||
for key in encoded_input:
|
||||
encoded_input[key] = tf.expand_dims(
|
||||
tf.convert_to_tensor(encoded_input[key]), 0)
|
||||
test_input = (encoded_input["input_ids"], encoded_input["attention_mask"],
|
||||
encoded_input["token_type_ids"])
|
||||
tf.convert_to_tensor(encoded_input[key]), 0
|
||||
)
|
||||
test_input = (
|
||||
encoded_input["input_ids"],
|
||||
encoded_input["attention_mask"],
|
||||
encoded_input["token_type_ids"],
|
||||
)
|
||||
actual_out = model.forward(*test_input)
|
||||
return model, test_input, actual_out
|
||||
|
||||
@@ -64,17 +74,13 @@ def get_TFhf_model(name):
|
||||
|
||||
|
||||
class HuggingFaceLanguage(torch.nn.Module):
|
||||
|
||||
def __init__(self, hf_model_name):
|
||||
super().__init__()
|
||||
self.model = AutoModelForSequenceClassification.from_pretrained(
|
||||
hf_model_name, # The pretrained model.
|
||||
num_labels=
|
||||
2, # The number of output labels--2 for binary classification.
|
||||
output_attentions=
|
||||
False, # Whether the model returns attentions weights.
|
||||
output_hidden_states=
|
||||
False, # Whether the model returns all hidden-states.
|
||||
num_labels=2, # The number of output labels--2 for binary classification.
|
||||
output_attentions=False, # Whether the model returns attentions weights.
|
||||
output_hidden_states=False, # Whether the model returns all hidden-states.
|
||||
torchscript=True,
|
||||
)
|
||||
|
||||
@@ -96,7 +102,6 @@ def get_hf_model(name):
|
||||
|
||||
|
||||
class VisionModule(torch.nn.Module):
|
||||
|
||||
def __init__(self, model):
|
||||
super().__init__()
|
||||
self.model = model
|
||||
@@ -117,46 +122,56 @@ def get_vision_model(torch_model):
|
||||
############################# Benchmark Tests ####################################
|
||||
|
||||
pytest_benchmark_param = pytest.mark.parametrize(
|
||||
('dynamic', 'device'),
|
||||
("dynamic", "device"),
|
||||
[
|
||||
pytest.param(False, 'cpu'),
|
||||
pytest.param(False, "cpu"),
|
||||
# TODO: Language models are failing for dynamic case..
|
||||
pytest.param(True, 'cpu', marks=pytest.mark.skip),
|
||||
pytest.param(False,
|
||||
'gpu',
|
||||
marks=pytest.mark.skipif(check_device_drivers("gpu"),
|
||||
reason="nvidia-smi not found")),
|
||||
pytest.param(True,
|
||||
'gpu',
|
||||
marks=pytest.mark.skip),
|
||||
pytest.param(True, "cpu", marks=pytest.mark.skip),
|
||||
pytest.param(
|
||||
False,
|
||||
'vulkan',
|
||||
"cuda",
|
||||
marks=pytest.mark.skipif(
|
||||
check_device_drivers("cuda"), reason="nvidia-smi not found"
|
||||
),
|
||||
),
|
||||
pytest.param(True, "cuda", marks=pytest.mark.skip),
|
||||
pytest.param(
|
||||
False,
|
||||
"vulkan",
|
||||
marks=pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"),
|
||||
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
|
||||
)),
|
||||
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases",
|
||||
),
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
'vulkan',
|
||||
"vulkan",
|
||||
marks=pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"),
|
||||
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
|
||||
)),
|
||||
])
|
||||
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases",
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(importlib.util.find_spec("iree.tools") is None,
|
||||
reason="Cannot find tools to import TF")
|
||||
@pytest.mark.skipif(
|
||||
importlib.util.find_spec("iree.tools") is None,
|
||||
reason="Cannot find tools to import TF",
|
||||
)
|
||||
@pytest_benchmark_param
|
||||
def test_bench_minilm_torch(dynamic, device):
|
||||
model, test_input, act_out = get_hf_model(
|
||||
"microsoft/MiniLM-L12-H384-uncased")
|
||||
shark_module = SharkInference(model, (test_input,),
|
||||
device=device,
|
||||
dynamic=dynamic,
|
||||
jit_trace=True,
|
||||
benchmark_mode=True)
|
||||
"microsoft/MiniLM-L12-H384-uncased"
|
||||
)
|
||||
shark_module = SharkInference(
|
||||
model,
|
||||
(test_input,),
|
||||
device=device,
|
||||
dynamic=dynamic,
|
||||
jit_trace=True,
|
||||
benchmark_mode=True,
|
||||
)
|
||||
try:
|
||||
# If benchmarking is successful, assert success/True.
|
||||
shark_module.compile()
|
||||
@@ -167,17 +182,21 @@ def test_bench_minilm_torch(dynamic, device):
|
||||
assert False
|
||||
|
||||
|
||||
@pytest.mark.skipif(importlib.util.find_spec("iree.tools") is None,
|
||||
reason="Cannot find tools to import TF")
|
||||
@pytest.mark.skipif(
|
||||
importlib.util.find_spec("iree.tools") is None,
|
||||
reason="Cannot find tools to import TF",
|
||||
)
|
||||
@pytest_benchmark_param
|
||||
def test_bench_distilbert(dynamic, device):
|
||||
model, test_input, act_out = get_TFhf_model("distilbert-base-uncased")
|
||||
shark_module = SharkInference(model,
|
||||
test_input,
|
||||
device=device,
|
||||
dynamic=dynamic,
|
||||
jit_trace=True,
|
||||
benchmark_mode=True)
|
||||
shark_module = SharkInference(
|
||||
model,
|
||||
test_input,
|
||||
device=device,
|
||||
dynamic=dynamic,
|
||||
jit_trace=True,
|
||||
benchmark_mode=True,
|
||||
)
|
||||
try:
|
||||
# If benchmarking is successful, assert success/True.
|
||||
shark_module.set_frontend("tensorflow")
|
||||
@@ -193,12 +212,14 @@ def test_bench_distilbert(dynamic, device):
|
||||
@pytest_benchmark_param
|
||||
def test_bench_xlm_roberta(dynamic, device):
|
||||
model, test_input, act_out = get_TFhf_model("xlm-roberta-base")
|
||||
shark_module = SharkInference(model,
|
||||
test_input,
|
||||
device=device,
|
||||
dynamic=dynamic,
|
||||
jit_trace=True,
|
||||
benchmark_mode=True)
|
||||
shark_module = SharkInference(
|
||||
model,
|
||||
test_input,
|
||||
device=device,
|
||||
dynamic=dynamic,
|
||||
jit_trace=True,
|
||||
benchmark_mode=True,
|
||||
)
|
||||
try:
|
||||
# If becnhmarking succesful, assert success/True.
|
||||
shark_module.set_frontend("tensorflow")
|
||||
|
||||
@@ -9,25 +9,31 @@ torch.manual_seed(0)
|
||||
|
||||
# Test running benchmark module without failing.
|
||||
pytest_benchmark_param = pytest.mark.parametrize(
|
||||
('dynamic', 'device'),
|
||||
("dynamic", "device"),
|
||||
[
|
||||
pytest.param(False, 'cpu'),
|
||||
pytest.param(False, "cpu"),
|
||||
# TODO: Language models are failing for dynamic case..
|
||||
pytest.param(True, 'cpu', marks=pytest.mark.skip),
|
||||
])
|
||||
pytest.param(True, "cpu", marks=pytest.mark.skip),
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(importlib.util.find_spec("onnxruntime") is None,
|
||||
reason="Cannot find ONNXRUNTIME.")
|
||||
@pytest.mark.skipif(
|
||||
importlib.util.find_spec("onnxruntime") is None,
|
||||
reason="Cannot find ONNXRUNTIME.",
|
||||
)
|
||||
@pytest_benchmark_param
|
||||
def test_HFbench_minilm_torch(dynamic, device):
|
||||
model_name = "bert-base-uncased"
|
||||
test_input = torch.randint(2, (1, 128))
|
||||
try:
|
||||
shark_module = SharkHFBenchmarkRunner(model_name, (test_input,),
|
||||
jit_trace=True,
|
||||
dynamic=dynamic,
|
||||
device=device)
|
||||
shark_module = SharkHFBenchmarkRunner(
|
||||
model_name,
|
||||
(test_input,),
|
||||
jit_trace=True,
|
||||
dynamic=dynamic,
|
||||
device=device,
|
||||
)
|
||||
shark_module.benchmark_c()
|
||||
shark_module.benchmark_python((test_input,))
|
||||
shark_module.benchmark_torch(test_input)
|
||||
|
||||
build_tools/docker/Dockerfile-ubuntu-22.04
@@ -0,0 +1,88 @@
|
||||
ARG IMAGE_NAME
|
||||
FROM ${IMAGE_NAME}:12.2.0-runtime-ubuntu22.04 as base
|
||||
|
||||
ENV NV_CUDA_LIB_VERSION "12.2.0-1"
|
||||
|
||||
FROM base as base-amd64
|
||||
|
||||
ENV NV_CUDA_CUDART_DEV_VERSION 12.2.53-1
|
||||
ENV NV_NVML_DEV_VERSION 12.2.81-1
|
||||
ENV NV_LIBCUSPARSE_DEV_VERSION 12.1.1.53-1
|
||||
ENV NV_LIBNPP_DEV_VERSION 12.1.1.14-1
|
||||
ENV NV_LIBNPP_DEV_PACKAGE libnpp-dev-12-2=${NV_LIBNPP_DEV_VERSION}
|
||||
|
||||
ENV NV_LIBCUBLAS_DEV_VERSION 12.2.1.16-1
|
||||
ENV NV_LIBCUBLAS_DEV_PACKAGE_NAME libcublas-dev-12-2
|
||||
ENV NV_LIBCUBLAS_DEV_PACKAGE ${NV_LIBCUBLAS_DEV_PACKAGE_NAME}=${NV_LIBCUBLAS_DEV_VERSION}
|
||||
|
||||
ENV NV_CUDA_NSIGHT_COMPUTE_VERSION 12.2.0-1
|
||||
ENV NV_CUDA_NSIGHT_COMPUTE_DEV_PACKAGE cuda-nsight-compute-12-2=${NV_CUDA_NSIGHT_COMPUTE_VERSION}
|
||||
|
||||
ENV NV_NVPROF_VERSION 12.2.60-1
|
||||
ENV NV_NVPROF_DEV_PACKAGE cuda-nvprof-12-2=${NV_NVPROF_VERSION}
|
||||
FROM base as base-arm64
|
||||
|
||||
ENV NV_CUDA_CUDART_DEV_VERSION 12.2.53-1
|
||||
ENV NV_NVML_DEV_VERSION 12.2.81-1
|
||||
ENV NV_LIBCUSPARSE_DEV_VERSION 12.1.1.53-1
|
||||
ENV NV_LIBNPP_DEV_VERSION 12.1.1.14-1
|
||||
ENV NV_LIBNPP_DEV_PACKAGE libnpp-dev-12-2=${NV_LIBNPP_DEV_VERSION}
|
||||
|
||||
ENV NV_LIBCUBLAS_DEV_PACKAGE_NAME libcublas-dev-12-2
|
||||
ENV NV_LIBCUBLAS_DEV_VERSION 12.2.1.16-1
|
||||
ENV NV_LIBCUBLAS_DEV_PACKAGE ${NV_LIBCUBLAS_DEV_PACKAGE_NAME}=${NV_LIBCUBLAS_DEV_VERSION}
|
||||
|
||||
ENV NV_CUDA_NSIGHT_COMPUTE_VERSION 12.2.0-1
|
||||
ENV NV_CUDA_NSIGHT_COMPUTE_DEV_PACKAGE cuda-nsight-compute-12-2=${NV_CUDA_NSIGHT_COMPUTE_VERSION}
|
||||
|
||||
FROM base-${TARGETARCH}
|
||||
|
||||
ARG TARGETARCH
|
||||
|
||||
LABEL maintainer "SHARK<stdin@nod.com>"
|
||||
|
||||
# Register the ROCM package repository, and install rocm-dev package
|
||||
ARG ROCM_VERSION=5.6
|
||||
ARG AMDGPU_VERSION=5.6
|
||||
|
||||
ARG APT_PREF
|
||||
RUN echo "$APT_PREF" > /etc/apt/preferences.d/rocm-pin-600
|
||||
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates curl libnuma-dev gnupg \
|
||||
&& curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \
|
||||
&& printf "deb [arch=amd64] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list \
|
||||
&& printf "deb [arch=amd64] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list \
|
||||
&& apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
||||
sudo \
|
||||
libelf1 \
|
||||
kmod \
|
||||
file \
|
||||
python3 \
|
||||
python3-pip \
|
||||
rocm-dev \
|
||||
rocm-libs \
|
||||
rocm-hip-libraries \
|
||||
build-essential && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN groupadd -g 109 render
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
cuda-cudart-dev-12-2=${NV_CUDA_CUDART_DEV_VERSION} \
|
||||
cuda-command-line-tools-12-2=${NV_CUDA_LIB_VERSION} \
|
||||
cuda-minimal-build-12-2=${NV_CUDA_LIB_VERSION} \
|
||||
cuda-libraries-dev-12-2=${NV_CUDA_LIB_VERSION} \
|
||||
cuda-nvml-dev-12-2=${NV_NVML_DEV_VERSION} \
|
||||
${NV_NVPROF_DEV_PACKAGE} \
|
||||
${NV_LIBNPP_DEV_PACKAGE} \
|
||||
libcusparse-dev-12-2=${NV_LIBCUSPARSE_DEV_VERSION} \
|
||||
${NV_LIBCUBLAS_DEV_PACKAGE} \
|
||||
${NV_CUDA_NSIGHT_COMPUTE_DEV_PACKAGE} \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN apt install rocm-hip-libraries
|
||||
|
||||
# Keep apt from auto upgrading the cublas and nccl packages. See https://gitlab.com/nvidia/container-images/cuda/-/issues/88
|
||||
RUN apt-mark hold ${NV_LIBCUBLAS_DEV_PACKAGE_NAME}
|
||||
ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs
|
||||
|
||||
build_tools/docker/README.md
@@ -0,0 +1,41 @@
|
||||
On your host, install your Nvidia or AMD GPU drivers.
|
||||
|
||||
**HOST Setup**
|
||||
|
||||
*Ubuntu 23.04 Nvidia*
|
||||
```
|
||||
sudo ubuntu-drivers install
|
||||
```
|
||||
|
||||
Install [docker](https://docs.docker.com/engine/install/ubuntu/) and follow the post-install steps so it can be run as a regular [user](https://docs.docker.com/engine/install/linux-postinstall/).
|
||||
|
||||
Install the Nvidia [Container Toolkit and register it](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). On Ubuntu 23.04 systems, follow [this workaround](https://github.com/NVIDIA/nvidia-container-toolkit/issues/72#issuecomment-1584574298).
|
||||
|
||||
|
||||
Build the Docker image with:
|
||||
|
||||
```
|
||||
docker build . -f Dockerfile-ubuntu-22.04 -t shark/dev-22.04:5.6 --build-arg=ROCM_VERSION=5.6 --build-arg=AMDGPU_VERSION=5.6 --build-arg=APT_PREF="Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" --build-arg=IMAGE_NAME=nvidia/cuda --build-arg=TARGETARCH=amd64
|
||||
```
|
||||
|
||||
Run with:
|
||||
|
||||
*CPU*
|
||||
|
||||
```
|
||||
docker run -it docker.io/shark/dev-22.04:5.6
|
||||
```
|
||||
|
||||
*Nvidia GPU*
|
||||
|
||||
```
|
||||
docker run --rm -it --gpus all docker.io/shark/dev-22.04:5.6
|
||||
```
|
||||
|
||||
*AMD GPUs*
|
||||
|
||||
```
|
||||
docker run --device /dev/kfd --device /dev/dri docker.io/shark/dev-22.04:5.6
|
||||
```
|
||||
|
||||
More AMD instructions are available [here](https://docs.amd.com/en/latest/deploy/docker.html).
|
||||
build_tools/image_comparison.py
@@ -0,0 +1,51 @@
|
||||
import argparse
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
|
||||
import requests
|
||||
import shutil
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument("-n", "--newfile")
|
||||
parser.add_argument(
|
||||
"-g",
|
||||
"--golden_url",
|
||||
default="https://storage.googleapis.com/shark_tank/testdata/cyberpunk_fores_42_0_230119_021148.png",
|
||||
)
|
||||
|
||||
|
||||
def get_image(url, local_filename):
|
||||
res = requests.get(url, stream=True)
|
||||
if res.status_code == 200:
|
||||
with open(local_filename, "wb") as f:
|
||||
shutil.copyfileobj(res.raw, f)
|
||||
|
||||
|
||||
def compare_images(new_filename, golden_filename, upload=False):
|
||||
new = np.array(Image.open(new_filename)) / 255.0
|
||||
golden = np.array(Image.open(golden_filename)) / 255.0
|
||||
diff = np.abs(new - golden)
|
||||
mean = np.mean(diff)
|
||||
if mean > 0.1:
|
||||
if os.name != "nt" and upload == True:
|
||||
subprocess.run(
|
||||
[
|
||||
"gsutil",
|
||||
"cp",
|
||||
new_filename,
|
||||
"gs://shark_tank/testdata/builder/",
|
||||
]
|
||||
)
|
||||
raise AssertionError("new and golden not close")
|
||||
else:
|
||||
print("SUCCESS")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
tempfile_name = os.path.join(os.getcwd(), "golden.png")
|
||||
get_image(args.golden_url, tempfile_name)
|
||||
compare_images(args.newfile, tempfile_name)
|
||||
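A minimal usage sketch of the comparison helper above (the file names are placeholders, not from the repo):

```python
from image_comparison import compare_images

# Raises AssertionError when the mean absolute pixel difference exceeds 0.1;
# prints "SUCCESS" otherwise. upload=False skips the gsutil upload path.
compare_images("candidate.png", "golden.png", upload=False)
```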
build_tools/populate_sharktank_ci.sh
@@ -0,0 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
IMPORTER=1 BENCHMARK=1 NO_BREVITAS=1 ./setup_venv.sh
|
||||
source $GITHUB_WORKSPACE/shark.venv/bin/activate
|
||||
python build_tools/stable_diffusion_testing.py --gen
|
||||
python tank/generate_sharktank.py
|
||||
build_tools/scrape_releases.py
@@ -0,0 +1,37 @@
|
||||
"""Scrapes the github releases API to generate a static pip-install-able releases page.
|
||||
|
||||
See https://github.com/llvm/torch-mlir/issues/1374
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
|
||||
import requests
|
||||
|
||||
# Parse arguments
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("owner", type=str)
|
||||
parser.add_argument("repo", type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Get releases
|
||||
response = requests.get(
|
||||
f"https://api.github.com/repos/{args.owner}/{args.repo}/releases"
|
||||
)
|
||||
body = json.loads(response.content)
|
||||
|
||||
# Parse releases
|
||||
releases = []
|
||||
for row in body:
|
||||
for asset in row["assets"]:
|
||||
releases.append((asset["name"], asset["browser_download_url"]))
|
||||
|
||||
# Output HTML
|
||||
html = """<!DOCTYPE html>
|
||||
<html>
|
||||
<body>
|
||||
"""
|
||||
for name, url in releases:
|
||||
html += f" <a href='{url}'>{name}</a><br />\n"
|
||||
html += """ </body>
|
||||
</html>"""
|
||||
print(html)
|
||||
build_tools/stable_diffusion_testing.py
@@ -0,0 +1,284 @@
|
||||
import os
|
||||
from sys import executable
|
||||
import subprocess
|
||||
from apps.stable_diffusion.src.utils.resources import (
|
||||
get_json_file,
|
||||
)
|
||||
from datetime import datetime as dt
|
||||
from shark.shark_downloader import download_public_file
|
||||
from image_comparison import compare_images
|
||||
import argparse
|
||||
from glob import glob
|
||||
import shutil
|
||||
import requests
|
||||
|
||||
model_config_dicts = get_json_file(
|
||||
os.path.join(
|
||||
os.getcwd(),
|
||||
"apps/stable_diffusion/src/utils/resources/model_config.json",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def parse_sd_out(filename, command, device, use_tune, model_name, import_mlir):
|
||||
with open(filename, "r+") as f:
|
||||
lines = f.readlines()
|
||||
metrics = {}
|
||||
vals_to_read = [
|
||||
"Clip Inference time",
|
||||
"Average step",
|
||||
"VAE Inference time",
|
||||
"Total image generation",
|
||||
]
|
||||
for line in lines:
|
||||
for val in vals_to_read:
|
||||
if val in line:
|
||||
metrics[val] = line.split(" ")[-1].strip("\n")
|
||||
|
||||
metrics["Average step"] = metrics["Average step"].strip("ms/it")
|
||||
metrics["Total image generation"] = metrics["Total image generation"].strip("sec")
|
||||
metrics["device"] = device
|
||||
metrics["use_tune"] = use_tune
|
||||
metrics["model_name"] = model_name
|
||||
metrics["import_mlir"] = import_mlir
|
||||
metrics["command"] = command
|
||||
return metrics
|
||||
|
||||
|
||||
def get_inpaint_inputs():
|
||||
os.mkdir("./test_images/inputs")
|
||||
img_url = (
|
||||
"https://huggingface.co/datasets/diffusers/test-arrays/resolve"
|
||||
"/main/stable_diffusion_inpaint/input_bench_image.png"
|
||||
)
|
||||
mask_url = (
|
||||
"https://huggingface.co/datasets/diffusers/test-arrays/resolve"
|
||||
"/main/stable_diffusion_inpaint/input_bench_mask.png"
|
||||
)
|
||||
img = requests.get(img_url)
|
||||
mask = requests.get(mask_url)
|
||||
open("./test_images/inputs/image.png", "wb").write(img.content)
|
||||
open("./test_images/inputs/mask.png", "wb").write(mask.content)
|
||||
|
||||
|
||||
def test_loop(
|
||||
device="vulkan",
|
||||
beta=False,
|
||||
extra_flags=[],
|
||||
upload_bool=True,
|
||||
exit_on_fail=True,
|
||||
do_gen=False,
|
||||
):
|
||||
# Get golden values from tank
|
||||
shutil.rmtree("./test_images", ignore_errors=True)
|
||||
model_metrics = []
|
||||
os.mkdir("./test_images")
|
||||
os.mkdir("./test_images/golden")
|
||||
get_inpaint_inputs()
|
||||
hf_model_names = model_config_dicts[0].values()
|
||||
tuned_options = [
|
||||
"--no-use_tuned",
|
||||
"--use_tuned",
|
||||
]
|
||||
import_options = ["--import_mlir", "--no-import_mlir"]
|
||||
prompt_text = "--prompt=cyberpunk forest by Salvador Dali"
|
||||
inpaint_prompt_text = (
|
||||
"--prompt=Face of a yellow cat, high resolution, sitting on a park bench"
|
||||
)
|
||||
if os.name == "nt":
|
||||
prompt_text = '--prompt="cyberpunk forest by Salvador Dali"'
|
||||
inpaint_prompt_text = (
|
||||
'--prompt="Face of a yellow cat, high resolution, sitting on a park bench"'
|
||||
)
|
||||
if beta:
|
||||
extra_flags.append("--beta_models=True")
|
||||
extra_flags.append("--no-progress_bar")
|
||||
if do_gen:
|
||||
extra_flags.append("--import_debug")
|
||||
to_skip = [
|
||||
"Linaqruf/anything-v3.0",
|
||||
"prompthero/openjourney",
|
||||
"wavymulder/Analog-Diffusion",
|
||||
"dreamlike-art/dreamlike-diffusion-1.0",
|
||||
]
|
||||
counter = 0
|
||||
for import_opt in import_options:
|
||||
for model_name in hf_model_names:
|
||||
if model_name in to_skip:
|
||||
continue
|
||||
for use_tune in tuned_options:
|
||||
if (
|
||||
model_name == "stabilityai/stable-diffusion-2-1"
|
||||
and use_tune == tuned_options[0]
|
||||
):
|
||||
continue
|
||||
elif (
|
||||
model_name == "stabilityai/stable-diffusion-2-1-base"
|
||||
and use_tune == tuned_options[1]
|
||||
):
|
||||
continue
|
||||
elif use_tune == tuned_options[1]:
|
||||
continue
|
||||
command = (
|
||||
[
|
||||
executable, # executable is the python from the venv used to run this
|
||||
"apps/stable_diffusion/scripts/txt2img.py",
|
||||
"--device=" + device,
|
||||
prompt_text,
|
||||
"--negative_prompts=" + '""',
|
||||
"--seed=42",
|
||||
import_opt,
|
||||
"--output_dir="
|
||||
+ os.path.join(os.getcwd(), "test_images", model_name),
|
||||
"--hf_model_id=" + model_name,
|
||||
use_tune,
|
||||
]
|
||||
if "inpainting" not in model_name
|
||||
else [
|
||||
executable,
|
||||
"apps/stable_diffusion/scripts/inpaint.py",
|
||||
"--device=" + device,
|
||||
inpaint_prompt_text,
|
||||
"--negative_prompts=" + '""',
|
||||
"--img_path=./test_images/inputs/image.png",
|
||||
"--mask_path=./test_images/inputs/mask.png",
|
||||
"--seed=42",
|
||||
"--import_mlir",
|
||||
"--output_dir="
|
||||
+ os.path.join(os.getcwd(), "test_images", model_name),
|
||||
"--hf_model_id=" + model_name,
|
||||
use_tune,
|
||||
]
|
||||
)
|
||||
command += extra_flags
|
||||
if os.name == "nt":
|
||||
command = " ".join(command)
|
||||
dumpfile_name = "_".join(model_name.split("/")) + ".txt"
|
||||
dumpfile_name = os.path.join(os.getcwd(), dumpfile_name)
|
||||
with open(dumpfile_name, "w+") as f:
|
||||
generated_image = not subprocess.call(
|
||||
command,
|
||||
stdout=f,
|
||||
stderr=f,
|
||||
)
|
||||
if os.name != "nt":
|
||||
command = " ".join(command)
|
||||
if generated_image:
|
||||
model_metrics.append(
|
||||
parse_sd_out(
|
||||
dumpfile_name,
|
||||
command,
|
||||
device,
|
||||
use_tune,
|
||||
model_name,
|
||||
import_opt,
|
||||
)
|
||||
)
|
||||
print(command)
|
||||
print("Successfully generated image")
|
||||
os.makedirs("./test_images/golden/" + model_name, exist_ok=True)
|
||||
download_public_file(
|
||||
"gs://shark_tank/testdata/golden/" + model_name,
|
||||
"./test_images/golden/" + model_name,
|
||||
)
|
||||
test_file_path = os.path.join(
|
||||
os.getcwd(),
|
||||
"test_images",
|
||||
model_name,
|
||||
"generated_imgs",
|
||||
dt.now().strftime("%Y%m%d"),
|
||||
"*.png",
|
||||
)
|
||||
test_file = glob(test_file_path)[0]
|
||||
|
||||
golden_path = "./test_images/golden/" + model_name + "/*.png"
|
||||
golden_file = glob(golden_path)[0]
|
||||
try:
|
||||
compare_images(test_file, golden_file, upload=upload_bool)
|
||||
except AssertionError as e:
|
||||
print(e)
|
||||
if exit_on_fail == True:
|
||||
raise
|
||||
else:
|
||||
print(command)
|
||||
print("failed to generate image for this configuration")
|
||||
with open(dumpfile_name, "r+") as f:
|
||||
output = f.readlines()
|
||||
print("\n".join(output))
|
||||
exit(1)
|
||||
if os.name == "nt":
|
||||
counter += 1
|
||||
if counter % 2 == 0:
|
||||
extra_flags.append(
|
||||
"--iree_vulkan_target_triple=rdna2-unknown-windows"
|
||||
)
|
||||
else:
|
||||
if counter != 1:
|
||||
extra_flags.remove(
|
||||
"--iree_vulkan_target_triple=rdna2-unknown-windows"
|
||||
)
|
||||
if do_gen:
|
||||
prepare_artifacts()
|
||||
|
||||
with open(os.path.join(os.getcwd(), "sd_testing_metrics.csv"), "w+") as f:
|
||||
header = "model_name;device;use_tune;import_opt;Clip Inference time(ms);Average Step (ms/it);VAE Inference time(ms);total image generation(s);command\n"
|
||||
f.write(header)
|
||||
for metric in model_metrics:
|
||||
output = [
|
||||
metric["model_name"],
|
||||
metric["device"],
|
||||
metric["use_tune"],
|
||||
metric["import_mlir"],
|
||||
metric["Clip Inference time"],
|
||||
metric["Average step"],
|
||||
metric["VAE Inference time"],
|
||||
metric["Total image generation"],
|
||||
metric["command"],
|
||||
]
|
||||
f.write(";".join(output) + "\n")
|
||||
|
||||
|
||||
def prepare_artifacts():
|
||||
gen_path = os.path.join(os.getcwd(), "gen_shark_tank")
|
||||
if not os.path.isdir(gen_path):
|
||||
os.mkdir(gen_path)
|
||||
for dirname in os.listdir(os.getcwd()):
|
||||
for modelname in ["clip", "unet", "vae"]:
|
||||
if modelname in dirname and "vmfb" not in dirname:
|
||||
if not os.path.isdir(os.path.join(gen_path, dirname)):
|
||||
shutil.move(os.path.join(os.getcwd(), dirname), gen_path)
|
||||
print(f"Moved dir: {dirname} to {gen_path}.")
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument("-d", "--device", default="vulkan")
|
||||
parser.add_argument(
|
||||
"-b", "--beta", action=argparse.BooleanOptionalAction, default=False
|
||||
)
|
||||
parser.add_argument("-e", "--extra_args", type=str, default=None)
|
||||
parser.add_argument(
|
||||
"-u", "--upload", action=argparse.BooleanOptionalAction, default=True
|
||||
)
|
||||
parser.add_argument(
|
||||
"-x", "--exit_on_fail", action=argparse.BooleanOptionalAction, default=True
|
||||
)
|
||||
parser.add_argument("-g", "--gen", action=argparse.BooleanOptionalAction, default=False)
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
extra_args = []
|
||||
if args.extra_args:
|
||||
for arg in args.extra_args.split(","):
|
||||
extra_args.append(arg)
|
||||
test_loop(
|
||||
args.device,
|
||||
args.beta,
|
||||
extra_args,
|
||||
args.upload,
|
||||
args.exit_on_fail,
|
||||
args.gen,
|
||||
)
|
||||
if args.gen:
|
||||
prepare_artifacts()
|
||||
build_tools/vicuna_testing.py
@@ -0,0 +1,14 @@
|
||||
import os
|
||||
from sys import executable
|
||||
import subprocess
|
||||
from apps.language_models.scripts import vicuna
|
||||
|
||||
|
||||
def test_loop():
|
||||
precisions = ["fp16", "int8", "int4"]
|
||||
devices = ["cpu"]
|
||||
for precision in precisions:
|
||||
for device in devices:
|
||||
model = vicuna.UnshardedVicuna(device=device, precision=precision)
|
||||
model.compile()
|
||||
del model
|
||||
conftest.py
@@ -0,0 +1,92 @@
|
||||
def pytest_addoption(parser):
|
||||
# Attaches SHARK command-line arguments to the pytest machinery.
|
||||
parser.addoption(
|
||||
"--benchmark",
|
||||
action="store",
|
||||
type=str,
|
||||
default=None,
|
||||
choices=("baseline", "native", "all"),
|
||||
help="Benchmarks specified engine(s) and writes bench_results.csv.",
|
||||
)
|
||||
parser.addoption(
|
||||
"--onnx_bench",
|
||||
action="store_true",
|
||||
default="False",
|
||||
help="Add ONNX benchmark results to pytest benchmarks.",
|
||||
)
|
||||
parser.addoption(
|
||||
"--tf32",
|
||||
action="store_true",
|
||||
default="False",
|
||||
help="Use TensorFloat-32 calculations.",
|
||||
)
|
||||
parser.addoption(
|
||||
"--save_repro",
|
||||
action="store_true",
|
||||
default="False",
|
||||
help="Pass option to save reproduction artifacts to SHARK/shark_tmp/test_case/",
|
||||
)
|
||||
parser.addoption(
|
||||
"--save_fails",
|
||||
action="store_true",
|
||||
default="False",
|
||||
help="Save reproduction artifacts for a test case only if it fails. Default is False.",
|
||||
)
|
||||
parser.addoption(
|
||||
"--ci",
|
||||
action="store_true",
|
||||
default="False",
|
||||
help="Enables uploading of reproduction artifacts upon test case failure during iree-compile or validation. Must be passed with --ci_sha option ",
|
||||
)
|
||||
parser.addoption(
|
||||
"--update_tank",
|
||||
action="store_true",
|
||||
default="False",
|
||||
help="Update local shark tank with latest artifacts if model artifact hash mismatched.",
|
||||
)
|
||||
parser.addoption(
|
||||
"--force_update_tank",
|
||||
action="store_true",
|
||||
default="False",
|
||||
help="Force-update local shark tank with artifacts from specified shark_tank URL (defaults to nightly).",
|
||||
)
|
||||
parser.addoption(
|
||||
"--ci_sha",
|
||||
action="store",
|
||||
default="None",
|
||||
help="Passes the github SHA of the CI workflow to include in google storage directory for reproduction artifacts.",
|
||||
)
|
||||
parser.addoption(
|
||||
"--local_tank_cache",
|
||||
action="store",
|
||||
default=None,
|
||||
help="Specify the directory in which all downloaded shark_tank artifacts will be cached.",
|
||||
)
|
||||
parser.addoption(
|
||||
"--tank_url",
|
||||
type=str,
|
||||
default="gs://shark_tank/nightly",
|
||||
help="URL to bucket from which to download SHARK tank artifacts. Default is gs://shark_tank/latest",
|
||||
)
|
||||
parser.addoption(
|
||||
"--tank_prefix",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Prefix to gs://shark_tank/ model directories from which to download SHARK tank artifacts. Default is nightly.",
|
||||
)
|
||||
parser.addoption(
|
||||
"--benchmark_dispatches",
|
||||
default=None,
|
||||
help="Benchmark individual dispatch kernels produced by IREE compiler. Use 'All' for all, or specific dispatches e.g. '0 1 2 10'",
|
||||
)
|
||||
parser.addoption(
|
||||
"--dispatch_benchmarks_dir",
|
||||
default="./temp_dispatch_benchmarks",
|
||||
help="Directory in which dispatch benchmarks are saved.",
|
||||
)
|
||||
parser.addoption(
|
||||
"--batchsize",
|
||||
default=1,
|
||||
type=int,
|
||||
help="Batch size for the tested model.",
|
||||
)
|
||||
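As a side note, a hedged sketch (not from the repo) of how a test or fixture could read one of the options registered above, using pytest's standard `request` fixture:

```python
import pytest


@pytest.fixture
def batch_size(request):
    # Reads the --batchsize option registered in conftest.py (defaults to 1).
    return request.config.getoption("--batchsize")


def test_batch_size_is_positive(batch_size):
    assert batch_size >= 1
```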
cpp/.gitignore
@@ -0,0 +1,3 @@
|
||||
*.mlir
|
||||
*.vmfb
|
||||
*.ini
|
||||
cpp/CMakeLists.txt
@@ -0,0 +1,52 @@
|
||||
# Copyright 2022 The IREE Authors
|
||||
#
|
||||
# Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
# See https://llvm.org/LICENSE.txt for license information.
|
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
|
||||
cmake_minimum_required(VERSION 3.21...3.23)
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# Project configuration
|
||||
#-------------------------------------------------------------------------------
|
||||
|
||||
project(iree-samples C CXX)
|
||||
set(CMAKE_C_STANDARD 11)
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# Core project dependency
|
||||
#-------------------------------------------------------------------------------
|
||||
|
||||
message(STATUS "Fetching core IREE repo (this may take a few minutes)...")
|
||||
# Note: for log output, set -DFETCHCONTENT_QUIET=OFF,
|
||||
# see https://gitlab.kitware.com/cmake/cmake/-/issues/18238#note_440475
|
||||
|
||||
include(FetchContent)
|
||||
|
||||
FetchContent_Declare(
|
||||
iree
|
||||
GIT_REPOSITORY https://github.com/nod-ai/srt.git
|
||||
GIT_TAG shark
|
||||
GIT_SUBMODULES_RECURSE OFF
|
||||
GIT_SHALLOW OFF
|
||||
GIT_PROGRESS ON
|
||||
USES_TERMINAL_DOWNLOAD ON
|
||||
)
|
||||
|
||||
# Extend module path to find MLIR CMake modules.
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_BINARY_DIR}/lib/cmake/mlir")
|
||||
|
||||
# Disable core project features not needed for these out of tree samples.
|
||||
set(IREE_BUILD_TESTS OFF CACHE BOOL "" FORCE)
|
||||
set(IREE_BUILD_SAMPLES OFF CACHE BOOL "" FORCE)
|
||||
|
||||
FetchContent_MakeAvailable(iree)
|
||||
FetchContent_GetProperties(iree SOURCE_DIR IREE_SOURCE_DIR)
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# Individual samples
|
||||
#-------------------------------------------------------------------------------
|
||||
|
||||
add_subdirectory(vulkan_gui)
|
||||
cpp/README.md
@@ -0,0 +1,82 @@
|
||||
# SHARK C/C++ Samples
|
||||
|
||||
These C/C++ samples can be built using CMake. The samples depend on the main
|
||||
SHARK-Runtime project's C/C++ sources, including both the runtime and the compiler.
|
||||
|
||||
Individual samples may require additional dependencies. Watch CMake's output
|
||||
for information about which ones you are missing.
|
||||
|
||||
On Windows we recommend using https://github.com/microsoft/vcpkg to download packages for
|
||||
your system. The general setup flow looks like:
|
||||
|
||||
*Install and activate SHARK*
|
||||
|
||||
```bash
|
||||
source shark.venv/bin/activate #follow main repo instructions to setup your venv
|
||||
```
|
||||
|
||||
*Install Dependencies*
|
||||
|
||||
```bash
|
||||
vcpkg install [library] --triplet [your platform]
|
||||
vcpkg integrate install
|
||||
|
||||
# Then pass `-DCMAKE_TOOLCHAIN_FILE=[check logs for path]` when configuring CMake
|
||||
```
|
||||
|
||||
On Ubuntu Linux you can install the needed package with:
|
||||
|
||||
```bash
|
||||
sudo apt install libsdl2-dev
|
||||
```
|
||||
|
||||
*Build*
|
||||
```bash
|
||||
cd cpp
|
||||
cmake -GNinja -B build/
|
||||
cmake --build build/
|
||||
```
|
||||
|
||||
*Prepare the model*
|
||||
```bash
|
||||
wget https://storage.googleapis.com/shark_tank/latest/resnet50_tf/resnet50_tf.mlir
|
||||
iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvmcpu-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux resnet50_tf.mlir -o resnet50_tf.vmfb
|
||||
```
|
||||
*Prepare the input*
|
||||
|
||||
```bash
|
||||
python save_img.py
|
||||
```
|
||||
Note that this requires tensorflow, e.g.
|
||||
```bash
|
||||
python -m pip install tensorflow
|
||||
```
|
||||
|
||||
*Run the vulkan_gui*
|
||||
```bash
|
||||
./build/vulkan_gui/iree-samples-resnet-vulkan-gui
|
||||
```
|
||||
|
||||
## Other models
|
||||
A tool for benchmarking other models is also built; it can be invoked with a command like the following:
|
||||
```bash
|
||||
./build/vulkan_gui/iree-vulkan-gui --module-file=path/to/.vmfb --function_input=...
|
||||
```
|
||||
See `./build/vulkan_gui/iree-vulkan-gui --help` for an explanation of the function input. For example, the Stable Diffusion UNet can be tested with the following commands:
|
||||
```bash
|
||||
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/stable_diff_tf.mlir
|
||||
iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux stable_diff_tf.mlir -o stable_diff_tf.vmfb
|
||||
./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=2x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32
|
||||
```
|
||||
The VAE and CLIP autoencoder are also available:
|
||||
```bash
|
||||
# VAE
|
||||
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/vae_tf/vae.mlir
|
||||
iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux vae.mlir -o vae.vmfb
|
||||
./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=1x4x64x64xf32
|
||||
|
||||
# CLIP Autoencoder
|
||||
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/clip_tf/clip_autoencoder.mlir
|
||||
iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux clip_autoencoder.mlir -o clip_autoencoder.vmfb
|
||||
./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=1x77xi32 --function_input=1x77xi32
|
||||
```
|
||||
cpp/dog_imagenet.jpg (binary file, 26 KiB, not shown)
cpp/save_img.py
@@ -0,0 +1,18 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from shark.shark_inference import SharkInference
|
||||
|
||||
|
||||
def load_and_preprocess_image(fname: str):
|
||||
image = tf.io.read_file(fname)
|
||||
image = tf.image.decode_image(image, channels=3)
|
||||
image = tf.image.resize(image, (224, 224))
|
||||
image = image[tf.newaxis, :]
|
||||
# preprocessing pipeline
|
||||
input_tensor = tf.keras.applications.resnet50.preprocess_input(image)
|
||||
return input_tensor
|
||||
|
||||
|
||||
data = load_and_preprocess_image("dog_imagenet.jpg").numpy()
|
||||
|
||||
data.tofile("dog.bin")
|
||||
cpp/vision_inference/CMakeLists.txt
@@ -0,0 +1,84 @@
|
||||
# Copyright 2022 The IREE Authors
|
||||
#
|
||||
# Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
# See https://llvm.org/LICENSE.txt for license information.
|
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
|
||||
if(NOT IREE_TARGET_BACKEND_LLVM_CPU OR
|
||||
NOT IREE_HAL_EXECUTABLE_LOADER_EMBEDDED_ELF)
|
||||
message(STATUS "Missing LLVM backend and/or embedded elf loader, skipping vision_inference sample")
|
||||
return()
|
||||
endif()
|
||||
|
||||
# vcpkg install stb
|
||||
# tested with version 2021-09-10
|
||||
find_package(Stb)
|
||||
if(NOT Stb_FOUND)
|
||||
message(STATUS "Could not find Stb, skipping vision inference sample")
|
||||
return()
|
||||
endif()
|
||||
|
||||
# Compile mnist.mlir to mnist.vmfb.
|
||||
set(_COMPILE_TOOL_EXECUTABLE $<TARGET_FILE:iree-compile>)
|
||||
set(_COMPILE_ARGS)
|
||||
list(APPEND _COMPILE_ARGS "--iree-input-type=auto")
|
||||
list(APPEND _COMPILE_ARGS "--iree-hal-target-backends=llvm-cpu")
|
||||
list(APPEND _COMPILE_ARGS "${IREE_SOURCE_DIR}/samples/models/mnist.mlir")
|
||||
list(APPEND _COMPILE_ARGS "-o")
|
||||
list(APPEND _COMPILE_ARGS "mnist.vmfb")
|
||||
add_custom_command(
|
||||
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/mnist.vmfb
|
||||
COMMAND ${_COMPILE_TOOL_EXECUTABLE} ${_COMPILE_ARGS}
|
||||
DEPENDS ${_COMPILE_TOOL_EXECUTABLE} "${IREE_SOURCE_DIR}/samples/models/mnist.mlir"
|
||||
)
|
||||
# Embed mnist.vmfb into a C file as mnist_bytecode_module_c.[h/c]
|
||||
set(_EMBED_DATA_EXECUTABLE $<TARGET_FILE:generate_embed_data>)
|
||||
set(_EMBED_ARGS)
|
||||
list(APPEND _EMBED_ARGS "--output_header=mnist_bytecode_module_c.h")
|
||||
list(APPEND _EMBED_ARGS "--output_impl=mnist_bytecode_module_c.c")
|
||||
list(APPEND _EMBED_ARGS "--identifier=iree_samples_vision_inference_mnist_bytecode_module")
|
||||
list(APPEND _EMBED_ARGS "--flatten")
|
||||
list(APPEND _EMBED_ARGS "${CMAKE_CURRENT_BINARY_DIR}/mnist.vmfb")
|
||||
add_custom_command(
|
||||
OUTPUT "mnist_bytecode_module_c.h" "mnist_bytecode_module_c.c"
|
||||
COMMAND ${_EMBED_DATA_EXECUTABLE} ${_EMBED_ARGS}
|
||||
DEPENDS ${_EMBED_DATA_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/mnist.vmfb
|
||||
)
|
||||
# Define a library target for mnist_bytecode_module_c.
|
||||
add_library(iree_samples_vision_inference_mnist_bytecode_module_c OBJECT)
|
||||
target_sources(iree_samples_vision_inference_mnist_bytecode_module_c
|
||||
PRIVATE
|
||||
mnist_bytecode_module_c.h
|
||||
mnist_bytecode_module_c.c
|
||||
)
|
||||
|
||||
# Define the sample executable.
|
||||
set(_NAME "iree-run-mnist-module")
|
||||
add_executable(${_NAME} "")
|
||||
target_sources(${_NAME}
|
||||
PRIVATE
|
||||
"image_util.h"
|
||||
"image_util.c"
|
||||
"iree-run-mnist-module.c"
|
||||
)
|
||||
set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "iree-run-mnist-module")
|
||||
target_include_directories(${_NAME} PUBLIC
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
|
||||
)
|
||||
target_include_directories(${_NAME} PRIVATE
|
||||
${Stb_INCLUDE_DIR}
|
||||
)
|
||||
target_link_libraries(${_NAME}
|
||||
iree_base_base
|
||||
iree_base_tracing
|
||||
iree_hal_hal
|
||||
iree_runtime_runtime
|
||||
iree_samples_vision_inference_mnist_bytecode_module_c
|
||||
)
|
||||
|
||||
# Define a target that copies the test image into the build directory.
|
||||
add_custom_target(iree_samples_vision_inference_test_image
|
||||
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/mnist_test.png" "${CMAKE_CURRENT_BINARY_DIR}/mnist_test.png")
|
||||
add_dependencies(${_NAME} iree_samples_vision_inference_test_image)
|
||||
|
||||
message(STATUS "Configured vision_inference sample successfully")
|
||||
cpp/vision_inference/README.md
@@ -0,0 +1,8 @@
|
||||
# Vision Inference Sample (C code)
|
||||
|
||||
This sample demonstrates how to run an MNIST handwritten digit detection vision
|
||||
model on an image using IREE's C API.
|
||||
|
||||
A similar sample is implemented using a Python script and IREE's command line
|
||||
tools over in the primary iree repository at
|
||||
https://github.com/iree-org/iree/tree/main/samples/vision_inference
|
||||
cpp/vision_inference/image_util.c
@@ -0,0 +1,224 @@
|
||||
// Copyright 2021 The IREE Authors
|
||||
//
|
||||
// Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
|
||||
#include "image_util.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include "iree/base/internal/flags.h"
|
||||
#include "iree/base/tracing.h"
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb_image.h"
|
||||
|
||||
iree_status_t iree_tools_utils_pixel_rescaled_to_buffer(
|
||||
const uint8_t* pixel_data, iree_host_size_t buffer_length,
|
||||
const float* input_range, iree_host_size_t range_length,
|
||||
float* out_buffer) {
|
||||
IREE_TRACE_ZONE_BEGIN(z0);
|
||||
if (range_length != 2) {
|
||||
IREE_TRACE_ZONE_END(z0);
|
||||
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
|
||||
"range defined as 2-element [min, max] array.");
|
||||
}
|
||||
float input_scale = fabsf(input_range[1] - input_range[0]) / 2.0f;
|
||||
float input_offset = (input_range[0] + input_range[1]) / 2.0f;
|
||||
const float kUint8Mean = 127.5f;
|
||||
for (int i = 0; i < buffer_length; ++i) {
|
||||
out_buffer[i] =
|
||||
(((float)(pixel_data[i])) - kUint8Mean) / kUint8Mean * input_scale +
|
||||
input_offset;
|
||||
}
|
||||
IREE_TRACE_ZONE_END(z0);
|
||||
return iree_ok_status();
|
||||
}
|
||||
|
||||
iree_status_t iree_tools_utils_load_pixel_data_impl(
|
||||
const iree_string_view_t filename, const iree_hal_dim_t* shape,
|
||||
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
|
||||
uint8_t** out_pixel_data, iree_host_size_t* out_buffer_length) {
|
||||
int img_dims[3];
|
||||
if (stbi_info(filename.data, img_dims, &(img_dims[1]), &(img_dims[2])) == 0) {
|
||||
return iree_make_status(IREE_STATUS_NOT_FOUND, "can't load image %.*s",
|
||||
(int)filename.size, filename.data);
|
||||
}
|
||||
if (!(element_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32 ||
|
||||
element_type == IREE_HAL_ELEMENT_TYPE_SINT_8 ||
|
||||
element_type == IREE_HAL_ELEMENT_TYPE_UINT_8)) {
|
||||
char element_type_str[16];
|
||||
IREE_RETURN_IF_ERROR(iree_hal_format_element_type(
|
||||
element_type, sizeof(element_type_str), element_type_str, NULL));
|
||||
return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
|
||||
"element type %s not supported", element_type_str);
|
||||
}
|
||||
switch (shape_rank) {
|
||||
case 2: { // Assume tensor <height x width>
|
||||
if (img_dims[2] != 1 || (shape[0] != img_dims[1]) ||
|
||||
(shape[1] != img_dims[0])) {
|
||||
return iree_make_status(
|
||||
IREE_STATUS_INVALID_ARGUMENT,
|
||||
"image size: %dx%dx%d, expected: %" PRIdim "x%" PRIdim, img_dims[0],
|
||||
img_dims[1], img_dims[2], shape[1], shape[0]);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 3: { // Assume tensor <height x width x channel>
|
||||
if (shape[0] != img_dims[1] || shape[1] != img_dims[0] ||
|
||||
shape[2] != img_dims[2]) {
|
||||
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
|
||||
"image size: %dx%dx%d, expected: %" PRIdim
|
||||
"x%" PRIdim "x%" PRIdim,
|
||||
img_dims[0], img_dims[1], img_dims[2], shape[1],
|
||||
shape[0], shape[2]);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 4: { // Assume tensor <batch x height x width x channel>
|
||||
if (shape[1] != img_dims[1] || shape[2] != img_dims[0] ||
|
||||
shape[3] != img_dims[2]) {
|
||||
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
|
||||
"image size: %dx%dx%d, expected: %" PRIdim
|
||||
"x%" PRIdim "x%" PRIdim,
|
||||
img_dims[0], img_dims[1], img_dims[2], shape[2],
|
||||
shape[1], shape[3]);
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
return iree_make_status(
|
||||
IREE_STATUS_INVALID_ARGUMENT,
|
||||
"Input buffer shape rank %" PRIhsz " not supported", shape_rank);
|
||||
}
|
||||
// Drop the alpha channel if present.
|
||||
int req_ch = (img_dims[2] >= 3) ? 3 : 0;
|
||||
*out_pixel_data = stbi_load(filename.data, img_dims, &(img_dims[1]),
|
||||
&(img_dims[2]), req_ch);
|
||||
if (*out_pixel_data == NULL) {
|
||||
return iree_make_status(IREE_STATUS_NOT_FOUND, "can't load image %.*s",
|
||||
(int)filename.size, filename.data);
|
||||
}
|
||||
*out_buffer_length =
|
||||
img_dims[0] * img_dims[1] * (img_dims[2] > 3 ? 3 : img_dims[2]);
|
||||
return iree_ok_status();
|
||||
}
|
||||
|
||||
iree_status_t iree_tools_utils_load_pixel_data(
|
||||
const iree_string_view_t filename, const iree_hal_dim_t* shape,
|
||||
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
|
||||
uint8_t** out_pixel_data, iree_host_size_t* out_buffer_length) {
|
||||
IREE_TRACE_ZONE_BEGIN(z0);
|
||||
iree_status_t result = iree_tools_utils_load_pixel_data_impl(
|
||||
filename, shape, shape_rank, element_type, out_pixel_data,
|
||||
out_buffer_length);
|
||||
IREE_TRACE_ZONE_END(z0);
|
||||
return result;
|
||||
}
|
||||
|
||||
iree_status_t iree_tools_utils_buffer_view_from_image(
|
||||
const iree_string_view_t filename, const iree_hal_dim_t* shape,
|
||||
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
|
||||
iree_hal_allocator_t* allocator, iree_hal_buffer_view_t** out_buffer_view) {
|
||||
IREE_TRACE_ZONE_BEGIN(z0);
|
||||
*out_buffer_view = NULL;
|
||||
if (element_type != IREE_HAL_ELEMENT_TYPE_SINT_8 &&
|
||||
element_type != IREE_HAL_ELEMENT_TYPE_UINT_8) {
|
||||
IREE_TRACE_ZONE_END(z0);
|
||||
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
|
||||
"element type should be i8 or u8");
|
||||
}
|
||||
|
||||
iree_status_t result;
|
||||
uint8_t* pixel_data = NULL;
|
||||
iree_host_size_t buffer_length;
|
||||
result = iree_tools_utils_load_pixel_data(
|
||||
filename, shape, shape_rank, element_type, &pixel_data, &buffer_length);
|
||||
if (iree_status_is_ok(result)) {
|
||||
iree_host_size_t element_byte =
|
||||
iree_hal_element_dense_byte_count(element_type);
|
||||
// SINT_8 and UINT_8 perform direct buffer wrap.
|
||||
result = iree_hal_buffer_view_allocate_buffer(
|
||||
allocator, shape_rank, shape, element_type,
|
||||
IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR,
|
||||
(iree_hal_buffer_params_t){
|
||||
.type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
|
||||
.access = IREE_HAL_MEMORY_ACCESS_READ,
|
||||
.usage = IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE |
|
||||
IREE_HAL_BUFFER_USAGE_TRANSFER,
|
||||
},
|
||||
iree_make_const_byte_span(pixel_data, element_byte * buffer_length),
|
||||
out_buffer_view);
|
||||
}
|
||||
stbi_image_free(pixel_data);
|
||||
IREE_TRACE_ZONE_END(z0);
|
||||
return result;
|
||||
}
|
||||
|
||||
typedef struct iree_tools_utils_buffer_view_load_params_t {
|
||||
const uint8_t* pixel_data;
|
||||
iree_host_size_t pixel_data_length;
|
||||
const float* input_range;
|
||||
iree_host_size_t input_range_length;
|
||||
} iree_tools_utils_buffer_view_load_params_t;
|
||||
static iree_status_t iree_tools_utils_buffer_view_load_image_rescaled(
|
||||
iree_hal_buffer_mapping_t* mapping, void* user_data) {
|
||||
iree_tools_utils_buffer_view_load_params_t* params =
|
||||
(iree_tools_utils_buffer_view_load_params_t*)user_data;
|
||||
return iree_tools_utils_pixel_rescaled_to_buffer(
|
||||
params->pixel_data, params->pixel_data_length, params->input_range,
|
||||
params->input_range_length, (float*)mapping->contents.data);
|
||||
}
|
||||
|
||||
iree_status_t iree_tools_utils_buffer_view_from_image_rescaled(
|
||||
const iree_string_view_t filename, const iree_hal_dim_t* shape,
|
||||
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
|
||||
iree_hal_allocator_t* allocator, const float* input_range,
|
||||
iree_host_size_t input_range_length,
|
||||
iree_hal_buffer_view_t** out_buffer_view) {
|
||||
IREE_TRACE_ZONE_BEGIN(z0);
|
||||
*out_buffer_view = NULL;
|
||||
if (element_type != IREE_HAL_ELEMENT_TYPE_FLOAT_32) {
|
||||
IREE_TRACE_ZONE_END(z0);
|
||||
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
|
||||
"element type should be f32");
|
||||
}
|
||||
|
||||
// Classic row-major image layout.
|
||||
iree_hal_encoding_type_t encoding_type =
|
||||
IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR;
|
||||
|
||||
// Load pixel data from the file into a new host memory allocation (the only
|
||||
// interface stb_image provides). A real application would want to use the
|
||||
// generation callback to directly decode the image into the target mapped
|
||||
// device buffer.
|
||||
uint8_t* pixel_data = NULL;
|
||||
iree_host_size_t buffer_length = 0;
|
||||
IREE_RETURN_AND_END_ZONE_IF_ERROR(
|
||||
z0, iree_tools_utils_load_pixel_data(filename, shape, shape_rank,
|
||||
element_type, &pixel_data,
|
||||
&buffer_length));
|
||||
|
||||
iree_tools_utils_buffer_view_load_params_t params = {
|
||||
.pixel_data = pixel_data,
|
||||
.pixel_data_length = buffer_length,
|
||||
.input_range = input_range,
|
||||
.input_range_length = input_range_length,
|
||||
};
|
||||
iree_status_t status = iree_hal_buffer_view_generate_buffer(
|
||||
allocator, shape_rank, shape, element_type, encoding_type,
|
||||
(iree_hal_buffer_params_t){
|
||||
.type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL |
|
||||
IREE_HAL_MEMORY_TYPE_HOST_VISIBLE,
|
||||
.usage = IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE |
|
||||
IREE_HAL_BUFFER_USAGE_TRANSFER |
|
||||
IREE_HAL_BUFFER_USAGE_MAPPING,
|
||||
},
|
||||
iree_tools_utils_buffer_view_load_image_rescaled, ¶ms,
|
||||
out_buffer_view);
|
||||
|
||||
stbi_image_free(pixel_data);
|
||||
IREE_TRACE_ZONE_END(z0);
|
||||
return status;
|
||||
}
|
||||
77  cpp/vision_inference/image_util.h  Normal file
@@ -0,0 +1,77 @@
|
||||
// Copyright 2021 The IREE Authors
|
||||
//
|
||||
// Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
|
||||
#ifndef IREE_SAMPLES_VISION_INFERENCE_IMAGE_UTIL_H_
|
||||
#define IREE_SAMPLES_VISION_INFERENCE_IMAGE_UTIL_H_
|
||||
|
||||
#include "iree/base/api.h"
|
||||
#include "iree/hal/api.h"
|
||||
#include "iree/hal/buffer_view.h"
|
||||
|
||||
#if __cplusplus
|
||||
extern "C" {
|
||||
#endif // __cplusplus
|
||||
|
||||
// Loads the image at |filename| into |out_pixel_data| and sets
|
||||
// |out_buffer_length| to its length.
|
||||
//
|
||||
// The image dimensions must match the width, height, and channel count in |shape|,
|
||||
// while 2 <= |shape_rank| <= 4 to match the image tensor format.
|
||||
//
|
||||
// The file must be in a format supported by stb_image.h.
|
||||
// The returned |out_pixel_data| buffer must be released by the caller.
|
||||
iree_status_t iree_tools_utils_load_pixel_data(
|
||||
const iree_string_view_t filename, const iree_hal_dim_t* shape,
|
||||
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
|
||||
uint8_t** out_pixel_data, iree_host_size_t* out_buffer_length);
|
||||
|
||||
// Parses the contents of the image file |filename| into a HAL buffer view
|
||||
// |out_buffer_view|. |out_buffer_view| properties are defined by |shape|,
|
||||
// |shape_rank|, and |element_type|, while being allocated by |allocator|.
|
||||
//
|
||||
// The |element_type| has to be SINT_8 or UINT_8. For FLOAT_32, use
|
||||
// |iree_tools_utils_buffer_view_from_image_rescaled| instead.
|
||||
//
|
||||
// The returned |out_buffer_view| must be released by the caller.
|
||||
iree_status_t iree_tools_utils_buffer_view_from_image(
|
||||
const iree_string_view_t filename, const iree_hal_dim_t* shape,
|
||||
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
|
||||
iree_hal_allocator_t* allocator, iree_hal_buffer_view_t** out_buffer_view);
|
||||
|
||||
// Parses the contents of the image file |filename| into a HAL buffer view
|
||||
// |out_buffer_view|. |out_buffer_view| properties are defined by |shape|,
|
||||
// |shape_rank|, and |element_type|, while being allocated by |allocator|.
|
||||
// The value in |out_buffer_view| is rescaled with |input_range|.
|
||||
//
|
||||
// The |element_type| has to be FLOAT_32. For SINT_8 or UINT_8, use
|
||||
// |iree_tools_utils_buffer_view_from_image| instead.
|
||||
//
|
||||
// The returned |out_buffer_view| must be released by the caller.
|
||||
iree_status_t iree_tools_utils_buffer_view_from_image_rescaled(
|
||||
const iree_string_view_t filename, const iree_hal_dim_t* shape,
|
||||
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
|
||||
iree_hal_allocator_t* allocator, const float* input_range,
|
||||
iree_host_size_t input_range_length,
|
||||
iree_hal_buffer_view_t** out_buffer_view);
|
||||
|
||||
// Normalizes the uint8_t |pixel_data| of |pixel_count| elements into the float
// buffer |out_buffer| using the range |input_range|:
//
//   float32_x = (uint8_x - 127.5) / 127.5 * input_scale + input_offset, where
//     input_scale = abs(|input_range[1]| - |input_range[0]|) / 2
//     input_offset = (|input_range[0]| + |input_range[1]|) / 2
|
||||
//
|
||||
// |out_buffer| needs to be allocated before the call.
|
||||
iree_status_t iree_tools_utils_pixel_rescaled_to_buffer(
|
||||
const uint8_t* pixel_data, iree_host_size_t pixel_count,
|
||||
const float* input_range, iree_host_size_t input_range_length,
|
||||
float* out_buffer);
|
||||
|
||||
#if __cplusplus
|
||||
}
|
||||
#endif // __cplusplus
|
||||
|
||||
#endif // IREE_SAMPLES_VISION_INFERENCE_IMAGE_UTIL_H_
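As a quick sanity check on the rescale formula documented above, here is a minimal standalone sketch of the same arithmetic (illustrative only; `rescale_pixel` is a made-up name and not part of image_util):

```c
#include <math.h>
#include <stdio.h>

// Hypothetical standalone reference for the rescale formula above:
//   float32_x = (uint8_x - 127.5) / 127.5 * input_scale + input_offset
static float rescale_pixel(unsigned char x, const float input_range[2]) {
  float input_scale = fabsf(input_range[1] - input_range[0]) / 2.0f;
  float input_offset = (input_range[0] + input_range[1]) / 2.0f;
  return ((float)x - 127.5f) / 127.5f * input_scale + input_offset;
}

int main(void) {
  const float range[2] = {0.0f, 1.0f};  // the range the mnist sample uses
  // Prints 0.000000 and 1.000000: pixel 0 maps to the low end, 255 to the high end.
  printf("%f %f\n", rescale_pixel(0, range), rescale_pixel(255, range));
  return 0;
}
```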
|
||||
121  cpp/vision_inference/iree-run-mnist-module.c  Normal file
@@ -0,0 +1,121 @@
|
||||
// Copyright 2021 The IREE Authors
|
||||
//
|
||||
// Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
|
||||
// This sample uses image_util to load a hand-written image as an
|
||||
// iree_hal_buffer_view_t and then passes it to the bytecode module built from
|
||||
// mnist.mlir on the CPU backend with the local-task driver.
|
||||
|
||||
#include <float.h>
|
||||
|
||||
#include "image_util.h"
|
||||
#include "iree/runtime/api.h"
|
||||
#include "mnist_bytecode_module_c.h"
|
||||
|
||||
iree_status_t Run(const iree_string_view_t image_path) {
|
||||
iree_runtime_instance_options_t instance_options;
|
||||
iree_runtime_instance_options_initialize(IREE_API_VERSION_LATEST,
|
||||
&instance_options);
|
||||
iree_runtime_instance_options_use_all_available_drivers(&instance_options);
|
||||
iree_runtime_instance_t* instance = NULL;
|
||||
IREE_RETURN_IF_ERROR(iree_runtime_instance_create(
|
||||
&instance_options, iree_allocator_system(), &instance));
|
||||
|
||||
// TODO(#5724): move device selection into the compiled modules.
|
||||
iree_hal_device_t* device = NULL;
|
||||
IREE_RETURN_IF_ERROR(iree_runtime_instance_try_create_default_device(
|
||||
instance, iree_make_cstring_view("local-task"), &device));
|
||||
|
||||
// Create one session per loaded module to hold the module state.
|
||||
iree_runtime_session_options_t session_options;
|
||||
iree_runtime_session_options_initialize(&session_options);
|
||||
iree_runtime_session_t* session = NULL;
|
||||
IREE_RETURN_IF_ERROR(iree_runtime_session_create_with_device(
|
||||
instance, &session_options, device,
|
||||
iree_runtime_instance_host_allocator(instance), &session));
|
||||
iree_hal_device_release(device);
|
||||
|
||||
const struct iree_file_toc_t* module_file =
|
||||
iree_samples_vision_inference_mnist_bytecode_module_create();
|
||||
|
||||
IREE_RETURN_IF_ERROR(iree_runtime_session_append_bytecode_module_from_memory(
|
||||
session, iree_make_const_byte_span(module_file->data, module_file->size),
|
||||
iree_allocator_null()));
|
||||
|
||||
iree_runtime_call_t call;
|
||||
IREE_RETURN_IF_ERROR(iree_runtime_call_initialize_by_name(
|
||||
session, iree_make_cstring_view("module.predict"), &call));
|
||||
|
||||
// Prepare the input hal buffer view with image_util library.
|
||||
// The input of the mnist model is a single 28x28 pixel image as a
|
||||
// tensor<1x28x28x1xf32>, with pixels in [0.0, 1.0].
|
||||
iree_hal_buffer_view_t* buffer_view = NULL;
|
||||
iree_hal_dim_t buffer_shape[] = {1, 28, 28, 1};
|
||||
iree_hal_element_type_t hal_element_type = IREE_HAL_ELEMENT_TYPE_FLOAT_32;
|
||||
float input_range[2] = {0.0f, 1.0f};
|
||||
IREE_RETURN_IF_ERROR(
|
||||
iree_tools_utils_buffer_view_from_image_rescaled(
|
||||
image_path, buffer_shape, IREE_ARRAYSIZE(buffer_shape),
|
||||
hal_element_type, iree_hal_device_allocator(device), input_range,
|
||||
IREE_ARRAYSIZE(input_range), &buffer_view),
|
||||
"load image");
|
||||
IREE_RETURN_IF_ERROR(
|
||||
iree_runtime_call_inputs_push_back_buffer_view(&call, buffer_view));
|
||||
iree_hal_buffer_view_release(buffer_view);
|
||||
|
||||
IREE_RETURN_IF_ERROR(iree_runtime_call_invoke(&call, /*flags=*/0));
|
||||
|
||||
// Get the result buffers from the invocation.
|
||||
iree_hal_buffer_view_t* ret_buffer_view = NULL;
|
||||
IREE_RETURN_IF_ERROR(
|
||||
iree_runtime_call_outputs_pop_front_buffer_view(&call, &ret_buffer_view));
|
||||
|
||||
// Read back the results. The output of the mnist model is a 1x10 tensor of
// prediction confidence values, one for each digit in [0, 9].
|
||||
float predictions[1 * 10] = {0.0f};
|
||||
IREE_RETURN_IF_ERROR(iree_hal_device_transfer_d2h(
|
||||
iree_runtime_session_device(session),
|
||||
iree_hal_buffer_view_buffer(ret_buffer_view), 0, predictions,
|
||||
sizeof(predictions), IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT,
|
||||
iree_infinite_timeout()));
|
||||
iree_hal_buffer_view_release(ret_buffer_view);
|
||||
|
||||
// Find the index of the highest-confidence prediction in the output.
float result_val = -FLT_MAX;  // -FLT_MAX (not FLT_MIN) so non-positive scores are handled correctly.
|
||||
int result_idx = 0;
|
||||
for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(predictions); ++i) {
|
||||
if (predictions[i] > result_val) {
|
||||
result_val = predictions[i];
|
||||
result_idx = i;
|
||||
}
|
||||
}
|
||||
fprintf(stdout, "Detected number: %d\n", result_idx);
|
||||
|
||||
iree_runtime_call_deinitialize(&call);
|
||||
iree_runtime_session_release(session);
|
||||
iree_runtime_instance_release(instance);
|
||||
return iree_ok_status();
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
if (argc > 2) {
|
||||
fprintf(stderr, "Usage: iree-run-mnist-module <image file>\n");
|
||||
return -1;
|
||||
}
|
||||
iree_string_view_t image_path;
|
||||
if (argc == 1) {
|
||||
image_path = iree_make_cstring_view("mnist_test.png");
|
||||
} else {
|
||||
image_path = iree_make_cstring_view(argv[1]);
|
||||
}
|
||||
iree_status_t result = Run(image_path);
|
||||
if (!iree_status_is_ok(result)) {
|
||||
iree_status_fprint(stderr, result);
|
||||
iree_status_ignore(result);
|
||||
return -1;
|
||||
}
|
||||
iree_status_ignore(result);
|
||||
return 0;
|
||||
}
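For reference, the program above falls back to `mnist_test.png` (added in this same change) when no image path is given, so a typical invocation is simply the built binary with an optional image argument, e.g. `iree-run-mnist-module my_digit.png`; the exact executable name and build target come from the sample's CMake rules, which are not part of this diff.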
|
||||
BIN  cpp/vision_inference/mnist_test.png  Normal file (binary file not shown; 261 B)
116  cpp/vulkan_gui/CMakeLists.txt  Normal file
@@ -0,0 +1,116 @@
|
||||
# Copyright 2022 The IREE Authors
|
||||
#
|
||||
# Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
# See https://llvm.org/LICENSE.txt for license information.
|
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
|
||||
if(NOT IREE_TARGET_BACKEND_VULKAN_SPIRV OR
|
||||
NOT IREE_HAL_DRIVER_VULKAN)
|
||||
message(STATUS "Missing Vulkan backend and/or driver, skipping vulkan_gui sample")
|
||||
return()
|
||||
endif()
|
||||
|
||||
# This target statically links against Vulkan.
|
||||
# One way to achieve this is by installing the Vulkan SDK from
|
||||
# https://vulkan.lunarg.com/.
|
||||
include(FindVulkan)
|
||||
if(NOT Vulkan_FOUND)
|
||||
message(STATUS "Could not find Vulkan, skipping vulkan_gui sample")
|
||||
return()
|
||||
endif()
|
||||
|
||||
# vcpkg install sdl2[vulkan]
|
||||
# tested with versions 2.0.14#4 - 2.0.22#1
|
||||
find_package(SDL2)
|
||||
if(NOT SDL2_FOUND)
|
||||
message(STATUS "Could not find SDL2, skipping vulkan_gui sample")
|
||||
return()
|
||||
endif()
|
||||
|
||||
FetchContent_Declare(
|
||||
imgui
|
||||
GIT_REPOSITORY https://github.com/ocornut/imgui
|
||||
GIT_TAG master
|
||||
)
|
||||
|
||||
FetchContent_MakeAvailable(imgui)
|
||||
|
||||
# Dear ImGui
|
||||
set(IMGUI_DIR ${CMAKE_BINARY_DIR}/_deps/imgui-src)
|
||||
message("Looking for Imgui in ${IMGUI_DIR}")
|
||||
include_directories(${IMGUI_DIR} ${IMGUI_DIR}/backends ..)
|
||||
|
||||
|
||||
function(iree_vulkan_sample)
|
||||
|
||||
cmake_parse_arguments(
|
||||
_RULE
|
||||
""
|
||||
"NAME"
|
||||
"SRCS"
|
||||
${ARGN}
|
||||
)
|
||||
|
||||
|
||||
# Define the sample executable.
|
||||
set(_NAME "${_RULE_NAME}")
|
||||
set(SRCS "${_RULE_SRCS}")
|
||||
add_executable(${_NAME} "")
|
||||
target_sources(${_NAME}
|
||||
PRIVATE
|
||||
${SRCS}
|
||||
"${IMGUI_DIR}/backends/imgui_impl_sdl.cpp"
|
||||
"${IMGUI_DIR}/backends/imgui_impl_vulkan.cpp"
|
||||
"${IMGUI_DIR}/imgui.cpp"
|
||||
"${IMGUI_DIR}/imgui_draw.cpp"
|
||||
"${IMGUI_DIR}/imgui_demo.cpp"
|
||||
"${IMGUI_DIR}/imgui_tables.cpp"
|
||||
"${IMGUI_DIR}/imgui_widgets.cpp"
|
||||
)
|
||||
set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "${_NAME}")
|
||||
target_include_directories(${_NAME} PUBLIC
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
|
||||
)
|
||||
target_link_libraries(${_NAME}
|
||||
SDL2::SDL2
|
||||
Vulkan::Vulkan
|
||||
iree_runtime_runtime
|
||||
iree_base_internal_main
|
||||
iree_hal_drivers_vulkan_registration_registration
|
||||
iree_modules_hal_hal
|
||||
iree_vm_vm
|
||||
iree_vm_bytecode_module
|
||||
iree_vm_cc
|
||||
iree_tooling_vm_util_cc
|
||||
iree_tooling_context_util
|
||||
)
|
||||
|
||||
if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
|
||||
set(_GUI_LINKOPTS "-SUBSYSTEM:CONSOLE")
|
||||
else()
|
||||
set(_GUI_LINKOPTS "")
|
||||
endif()
|
||||
|
||||
target_link_options(${_NAME}
|
||||
PRIVATE
|
||||
${_GUI_LINKOPTS}
|
||||
)
|
||||
endfunction()
|
||||
|
||||
iree_vulkan_sample(
|
||||
NAME
|
||||
iree-samples-resnet-vulkan-gui
|
||||
|
||||
SRCS
|
||||
vulkan_resnet_inference_gui.cc
|
||||
)
|
||||
|
||||
iree_vulkan_sample(
|
||||
NAME
|
||||
iree-vulkan-gui
|
||||
|
||||
SRCS
|
||||
vulkan_inference_gui.cc
|
||||
)
|
||||
|
||||
message(STATUS "Configured vulkan_gui sample successfully")
|
||||
4  cpp/vulkan_gui/simple_mul.mlir  Normal file
@@ -0,0 +1,4 @@
|
||||
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
|
||||
%0 = "arith.mulf"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
|
||||
return %0 : tensor<4xf32>
|
||||
}
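For reference, the module above is just an element-wise multiply over `tensor<4xf32>`; a host-side C sketch of the same computation (illustrative only; the function name is made up):

```c
// Host-side reference for simple_mul.mlir: element-wise product of two
// 4-element float vectors, mirroring the tensor<4xf32> signature above.
static void simple_mul_reference(const float a[4], const float b[4], float out[4]) {
  for (int i = 0; i < 4; ++i) {
    out[i] = a[i] * b[i];
  }
}
// Example: a = {1, 2, 3, 4}, b = {5, 6, 7, 8}  ->  out = {5, 12, 21, 32}.
```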
|
||||
BIN  cpp/vulkan_gui/snail_imagenet.jpg  Normal file (binary file not shown; 14 KiB)
7897  cpp/vulkan_gui/stb_image.h  Normal file (diff suppressed because it is too large)
957  cpp/vulkan_gui/vulkan_inference_gui.cc  Normal file
@@ -0,0 +1,957 @@
|
||||
// Copyright 2019 The IREE Authors
|
||||
//
|
||||
// Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
|
||||
// Vulkan Graphics + IREE API Integration Sample.
|
||||
|
||||
#include <SDL.h>
|
||||
#include <SDL_vulkan.h>
|
||||
#include <imgui.h>
|
||||
#include <imgui_impl_sdl.h>
|
||||
#include <imgui_impl_vulkan.h>
|
||||
#include <vulkan/vulkan.h>
|
||||
|
||||
|
||||
#include <cstring>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
#include <array>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <iterator>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "iree/hal/drivers/vulkan/api.h"
|
||||
|
||||
// IREE's C API:
|
||||
#include "iree/base/api.h"
|
||||
#include "iree/hal/api.h"
|
||||
#include "iree/hal/drivers/vulkan/registration/driver_module.h"
|
||||
#include "iree/modules/hal/module.h"
|
||||
#include "iree/vm/api.h"
|
||||
#include "iree/vm/bytecode_module.h"
|
||||
#include "iree/vm/ref_cc.h"
|
||||
|
||||
// iree-run-module
|
||||
#include "iree/base/internal/flags.h"
|
||||
#include "iree/base/status_cc.h"
|
||||
#include "iree/base/tracing.h"
|
||||
#include "iree/modules/hal/types.h"
|
||||
#include "iree/tooling/comparison.h"
|
||||
#include "iree/tooling/context_util.h"
|
||||
#include "iree/tooling/vm_util_cc.h"
|
||||
|
||||
// Other dependencies (helpers, etc.)
|
||||
#include "iree/base/internal/main.h"
|
||||
|
||||
#define IMGUI_UNLIMITED_FRAME_RATE
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb_image.h"
|
||||
|
||||
IREE_FLAG(string, entry_function, "",
|
||||
"Name of a function contained in the module specified by module_file "
|
||||
"to run.");
|
||||
|
||||
// TODO(benvanik): move --function_input= flag into a util.
|
||||
static iree_status_t parse_function_io(iree_string_view_t flag_name,
|
||||
void* storage,
|
||||
iree_string_view_t value) {
|
||||
auto* list = (std::vector<std::string>*)storage;
|
||||
list->push_back(std::string(value.data, value.size));
|
||||
return iree_ok_status();
|
||||
}
|
||||
static void print_function_io(iree_string_view_t flag_name, void* storage,
|
||||
FILE* file) {
|
||||
auto* list = (std::vector<std::string>*)storage;
|
||||
if (list->empty()) {
|
||||
fprintf(file, "# --%.*s=\n", (int)flag_name.size, flag_name.data);
|
||||
} else {
|
||||
for (size_t i = 0; i < list->size(); ++i) {
|
||||
fprintf(file, "--%.*s=\"%s\"\n", (int)flag_name.size, flag_name.data,
|
||||
list->at(i).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
static std::vector<std::string> FLAG_function_inputs;
|
||||
IREE_FLAG_CALLBACK(
|
||||
parse_function_io, print_function_io, &FLAG_function_inputs, function_input,
|
||||
"An input (a) value or (b) buffer of the format:\n"
|
||||
" (a) scalar value\n"
|
||||
" value\n"
|
||||
" e.g.: --function_input=\"3.14\"\n"
|
||||
" (b) buffer:\n"
|
||||
" [shape]xtype=[value]\n"
|
||||
" e.g.: --function_input=\"2x2xi32=1 2 3 4\"\n"
|
||||
"Optionally, brackets may be used to separate the element values:\n"
|
||||
" 2x2xi32=[[1 2][3 4]]\n"
|
||||
"Raw binary files can be read to provide buffer contents:\n"
|
||||
" 2x2xi32=@some/file.bin\n"
|
||||
"numpy npy files (from numpy.save) can be read to provide 1+ values:\n"
|
||||
" @some.npy\n"
|
||||
"Each occurrence of the flag indicates an input in the order they were\n"
|
||||
"specified on the command line.");
|
||||
|
||||
typedef struct iree_file_toc_t {
|
||||
const char* name; // the file's original name
|
||||
char* data; // beginning of the file
|
||||
size_t size; // length of the file
|
||||
} iree_file_toc_t;
|
||||
|
||||
bool load_file(const char* filename, char** pOut, size_t* pSize)
|
||||
{
|
||||
FILE* f = fopen(filename, "rb");
|
||||
if (f == NULL)
|
||||
{
|
||||
fprintf(stderr, "Can't open %s\n", filename);
|
||||
return false;
|
||||
}
|
||||
|
||||
fseek(f, 0L, SEEK_END);
|
||||
*pSize = ftell(f);
|
||||
fseek(f, 0L, SEEK_SET);
|
||||
|
||||
*pOut = (char*)malloc(*pSize);
|
||||
|
||||
size_t size = fread(*pOut, *pSize, 1, f);
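  // With nmemb == 1, fread() returns 1 only if the whole file was read, so the
  // "size != 0" result below reports whether the read succeeded.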
|
||||
|
||||
fclose(f);
|
||||
|
||||
return size != 0;
|
||||
}
|
||||
|
||||
static VkAllocationCallbacks* g_Allocator = NULL;
|
||||
static VkInstance g_Instance = VK_NULL_HANDLE;
|
||||
static VkPhysicalDevice g_PhysicalDevice = VK_NULL_HANDLE;
|
||||
static VkDevice g_Device = VK_NULL_HANDLE;
|
||||
static uint32_t g_QueueFamily = (uint32_t)-1;
|
||||
static VkQueue g_Queue = VK_NULL_HANDLE;
|
||||
static VkPipelineCache g_PipelineCache = VK_NULL_HANDLE;
|
||||
static VkDescriptorPool g_DescriptorPool = VK_NULL_HANDLE;
|
||||
|
||||
static ImGui_ImplVulkanH_Window g_MainWindowData;
|
||||
static uint32_t g_MinImageCount = 2;
|
||||
static bool g_SwapChainRebuild = false;
|
||||
static int g_SwapChainResizeWidth = 0;
|
||||
static int g_SwapChainResizeHeight = 0;
|
||||
|
||||
static void check_vk_result(VkResult err) {
|
||||
if (err == 0) return;
|
||||
fprintf(stderr, "VkResult: %d\n", err);
|
||||
abort();
|
||||
}
|
||||
|
||||
// Returns the names of the Vulkan layers used for the given IREE
|
||||
// |extensibility_set| and |features|.
|
||||
std::vector<const char*> GetIreeLayers(
|
||||
iree_hal_vulkan_extensibility_set_t extensibility_set,
|
||||
iree_hal_vulkan_features_t features) {
|
||||
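  // Query with zero capacity first to learn the required count, then call
  // again below to fill the list.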
iree_host_size_t required_count;
|
||||
iree_hal_vulkan_query_extensibility_set(
|
||||
features, extensibility_set, /*string_capacity=*/0, &required_count,
|
||||
/*out_string_values=*/NULL);
|
||||
std::vector<const char*> layers(required_count);
|
||||
iree_hal_vulkan_query_extensibility_set(features, extensibility_set,
|
||||
layers.size(), &required_count,
|
||||
layers.data());
|
||||
return layers;
|
||||
}
|
||||
|
||||
// Returns the names of the Vulkan extensions used for the given IREE
|
||||
// |extensibility_set| and |features|.
|
||||
std::vector<const char*> GetIreeExtensions(
|
||||
iree_hal_vulkan_extensibility_set_t extensibility_set,
|
||||
iree_hal_vulkan_features_t features) {
|
||||
iree_host_size_t required_count;
|
||||
iree_hal_vulkan_query_extensibility_set(
|
||||
features, extensibility_set, /*string_capacity=*/0, &required_count,
|
||||
/*out_string_values=*/NULL);
|
||||
std::vector<const char*> extensions(required_count);
|
||||
iree_hal_vulkan_query_extensibility_set(features, extensibility_set,
|
||||
extensions.size(), &required_count,
|
||||
extensions.data());
|
||||
return extensions;
|
||||
}
|
||||
|
||||
// Returns the names of the Vulkan extensions used for the given IREE
|
||||
// |vulkan_features|.
|
||||
std::vector<const char*> GetDeviceExtensions(
|
||||
VkPhysicalDevice physical_device,
|
||||
iree_hal_vulkan_features_t vulkan_features) {
|
||||
std::vector<const char*> iree_required_extensions = GetIreeExtensions(
|
||||
IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_REQUIRED,
|
||||
vulkan_features);
|
||||
std::vector<const char*> iree_optional_extensions = GetIreeExtensions(
|
||||
IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
|
||||
vulkan_features);
|
||||
|
||||
uint32_t extension_count = 0;
|
||||
check_vk_result(vkEnumerateDeviceExtensionProperties(
|
||||
physical_device, nullptr, &extension_count, nullptr));
|
||||
std::vector<VkExtensionProperties> extension_properties(extension_count);
|
||||
check_vk_result(vkEnumerateDeviceExtensionProperties(
|
||||
physical_device, nullptr, &extension_count, extension_properties.data()));
|
||||
|
||||
// Merge extensions lists, including optional and required for simplicity.
|
||||
std::set<const char*> ext_set;
|
||||
ext_set.insert("VK_KHR_swapchain");
|
||||
ext_set.insert(iree_required_extensions.begin(),
|
||||
iree_required_extensions.end());
|
||||
for (int i = 0; i < iree_optional_extensions.size(); ++i) {
|
||||
const char* optional_extension = iree_optional_extensions[i];
|
||||
for (int j = 0; j < extension_count; ++j) {
|
||||
if (strcmp(optional_extension, extension_properties[j].extensionName) ==
|
||||
0) {
|
||||
ext_set.insert(optional_extension);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::vector<const char*> extensions(ext_set.begin(), ext_set.end());
|
||||
return extensions;
|
||||
}
|
||||
|
||||
std::vector<const char*> GetInstanceLayers(
|
||||
iree_hal_vulkan_features_t vulkan_features) {
|
||||
// Query the layers that IREE wants / needs.
|
||||
std::vector<const char*> required_layers = GetIreeLayers(
|
||||
IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_REQUIRED, vulkan_features);
|
||||
std::vector<const char*> optional_layers = GetIreeLayers(
|
||||
IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_OPTIONAL, vulkan_features);
|
||||
|
||||
// Query the layers that are available on the Vulkan ICD.
|
||||
uint32_t layer_property_count = 0;
|
||||
check_vk_result(
|
||||
vkEnumerateInstanceLayerProperties(&layer_property_count, NULL));
|
||||
std::vector<VkLayerProperties> layer_properties(layer_property_count);
|
||||
check_vk_result(vkEnumerateInstanceLayerProperties(&layer_property_count,
|
||||
layer_properties.data()));
|
||||
|
||||
// Match between optional/required and available layers.
|
||||
std::vector<const char*> layers;
|
||||
for (const char* layer_name : required_layers) {
|
||||
bool found = false;
|
||||
for (const auto& layer_property : layer_properties) {
|
||||
if (std::strcmp(layer_name, layer_property.layerName) == 0) {
|
||||
found = true;
|
||||
layers.push_back(layer_name);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
fprintf(stderr, "Required layer %s not available\n", layer_name);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
for (const char* layer_name : optional_layers) {
|
||||
for (const auto& layer_property : layer_properties) {
|
||||
if (std::strcmp(layer_name, layer_property.layerName) == 0) {
|
||||
layers.push_back(layer_name);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return layers;
|
||||
}
|
||||
|
||||
std::vector<const char*> GetInstanceExtensions(
|
||||
SDL_Window* window, iree_hal_vulkan_features_t vulkan_features) {
|
||||
// Ask SDL for its list of required instance extensions.
|
||||
uint32_t sdl_extensions_count = 0;
|
||||
SDL_Vulkan_GetInstanceExtensions(window, &sdl_extensions_count, NULL);
|
||||
std::vector<const char*> sdl_extensions(sdl_extensions_count);
|
||||
SDL_Vulkan_GetInstanceExtensions(window, &sdl_extensions_count,
|
||||
sdl_extensions.data());
|
||||
|
||||
std::vector<const char*> iree_required_extensions = GetIreeExtensions(
|
||||
IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_REQUIRED,
|
||||
vulkan_features);
|
||||
std::vector<const char*> iree_optional_extensions = GetIreeExtensions(
|
||||
IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_OPTIONAL,
|
||||
vulkan_features);
|
||||
|
||||
// Merge extensions lists, including optional and required for simplicity.
|
||||
std::set<const char*> ext_set;
|
||||
ext_set.insert(sdl_extensions.begin(), sdl_extensions.end());
|
||||
ext_set.insert(iree_required_extensions.begin(),
|
||||
iree_required_extensions.end());
|
||||
ext_set.insert(iree_optional_extensions.begin(),
|
||||
iree_optional_extensions.end());
|
||||
std::vector<const char*> extensions(ext_set.begin(), ext_set.end());
|
||||
return extensions;
|
||||
}
|
||||
|
||||
void SetupVulkan(iree_hal_vulkan_features_t vulkan_features,
|
||||
const char** instance_layers, uint32_t instance_layers_count,
|
||||
const char** instance_extensions,
|
||||
uint32_t instance_extensions_count,
|
||||
const VkAllocationCallbacks* allocator, VkInstance* instance,
|
||||
uint32_t* queue_family_index,
|
||||
VkPhysicalDevice* physical_device, VkQueue* queue,
|
||||
VkDevice* device, VkDescriptorPool* descriptor_pool) {
|
||||
VkResult err;
|
||||
|
||||
// Create Vulkan Instance
|
||||
{
|
||||
VkInstanceCreateInfo create_info = {};
|
||||
create_info.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
|
||||
create_info.enabledLayerCount = instance_layers_count;
|
||||
create_info.ppEnabledLayerNames = instance_layers;
|
||||
create_info.enabledExtensionCount = instance_extensions_count;
|
||||
create_info.ppEnabledExtensionNames = instance_extensions;
|
||||
err = vkCreateInstance(&create_info, allocator, instance);
|
||||
check_vk_result(err);
|
||||
}
|
||||
|
||||
// Select GPU
|
||||
{
|
||||
uint32_t gpu_count;
|
||||
err = vkEnumeratePhysicalDevices(*instance, &gpu_count, NULL);
|
||||
check_vk_result(err);
|
||||
IM_ASSERT(gpu_count > 0);
|
||||
|
||||
VkPhysicalDevice* gpus =
|
||||
(VkPhysicalDevice*)malloc(sizeof(VkPhysicalDevice) * gpu_count);
|
||||
err = vkEnumeratePhysicalDevices(*instance, &gpu_count, gpus);
|
||||
check_vk_result(err);
|
||||
|
||||
// Use the first reported GPU for simplicity.
|
||||
*physical_device = gpus[0];
|
||||
|
||||
VkPhysicalDeviceProperties properties;
|
||||
vkGetPhysicalDeviceProperties(*physical_device, &properties);
|
||||
fprintf(stdout, "Selected Vulkan device: '%s'\n", properties.deviceName);
|
||||
free(gpus);
|
||||
}
|
||||
|
||||
// Select queue family. We want a single queue with graphics and compute for
|
||||
// simplicity, but we could also discover and use separate queues for each.
|
||||
{
|
||||
uint32_t count;
|
||||
vkGetPhysicalDeviceQueueFamilyProperties(*physical_device, &count, NULL);
|
||||
VkQueueFamilyProperties* queues = (VkQueueFamilyProperties*)malloc(
|
||||
sizeof(VkQueueFamilyProperties) * count);
|
||||
vkGetPhysicalDeviceQueueFamilyProperties(*physical_device, &count, queues);
|
||||
for (uint32_t i = 0; i < count; i++) {
|
||||
if (queues[i].queueFlags &
|
||||
(VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT)) {
|
||||
*queue_family_index = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
free(queues);
|
||||
IM_ASSERT(*queue_family_index != (uint32_t)-1);
|
||||
}
|
||||
|
||||
// Create Logical Device (with 1 queue)
|
||||
{
|
||||
std::vector<const char*> device_extensions =
|
||||
GetDeviceExtensions(*physical_device, vulkan_features);
|
||||
const float queue_priority[] = {1.0f};
|
||||
VkDeviceQueueCreateInfo queue_info = {};
|
||||
queue_info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
|
||||
queue_info.queueFamilyIndex = *queue_family_index;
|
||||
queue_info.queueCount = 1;
|
||||
queue_info.pQueuePriorities = queue_priority;
|
||||
VkDeviceCreateInfo create_info = {};
|
||||
create_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
|
||||
create_info.queueCreateInfoCount = 1;
|
||||
create_info.pQueueCreateInfos = &queue_info;
|
||||
create_info.enabledExtensionCount =
|
||||
static_cast<uint32_t>(device_extensions.size());
|
||||
create_info.ppEnabledExtensionNames = device_extensions.data();
|
||||
|
||||
// Enable timeline semaphores.
|
||||
VkPhysicalDeviceFeatures2 features2;
|
||||
memset(&features2, 0, sizeof(features2));
|
||||
features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
|
||||
create_info.pNext = &features2;
|
||||
VkPhysicalDeviceTimelineSemaphoreFeatures semaphore_features;
|
||||
memset(&semaphore_features, 0, sizeof(semaphore_features));
|
||||
semaphore_features.sType =
|
||||
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES;
|
||||
semaphore_features.pNext = features2.pNext;
|
||||
features2.pNext = &semaphore_features;
|
||||
semaphore_features.timelineSemaphore = VK_TRUE;
|
||||
|
||||
err = vkCreateDevice(*physical_device, &create_info, allocator, device);
|
||||
check_vk_result(err);
|
||||
vkGetDeviceQueue(*device, *queue_family_index, 0, queue);
|
||||
}
|
||||
|
||||
// Create Descriptor Pool
|
||||
{
|
||||
VkDescriptorPoolSize pool_sizes[] = {
|
||||
{VK_DESCRIPTOR_TYPE_SAMPLER, 1000},
|
||||
{VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1000},
|
||||
{VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 1000},
|
||||
{VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1000},
|
||||
{VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, 1000},
|
||||
{VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, 1000},
|
||||
{VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1000},
|
||||
{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1000},
|
||||
{VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, 1000},
|
||||
{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC, 1000},
|
||||
{VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT, 1000}};
|
||||
VkDescriptorPoolCreateInfo pool_info = {};
|
||||
pool_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
|
||||
pool_info.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT;
|
||||
pool_info.maxSets = 1000 * IREE_ARRAYSIZE(pool_sizes);
|
||||
pool_info.poolSizeCount = (uint32_t)IREE_ARRAYSIZE(pool_sizes);
|
||||
pool_info.pPoolSizes = pool_sizes;
|
||||
err =
|
||||
vkCreateDescriptorPool(*device, &pool_info, allocator, descriptor_pool);
|
||||
check_vk_result(err);
|
||||
}
|
||||
}
|
||||
|
||||
void SetupVulkanWindow(ImGui_ImplVulkanH_Window* wd,
|
||||
const VkAllocationCallbacks* allocator,
|
||||
VkInstance instance, uint32_t queue_family_index,
|
||||
VkPhysicalDevice physical_device, VkDevice device,
|
||||
VkSurfaceKHR surface, int width, int height,
|
||||
uint32_t min_image_count) {
|
||||
wd->Surface = surface;
|
||||
|
||||
// Check for WSI support
|
||||
VkBool32 res;
|
||||
vkGetPhysicalDeviceSurfaceSupportKHR(physical_device, queue_family_index,
|
||||
wd->Surface, &res);
|
||||
if (res != VK_TRUE) {
|
||||
fprintf(stderr, "Error no WSI support on physical device 0\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// Select Surface Format
|
||||
const VkFormat requestSurfaceImageFormat[] = {
|
||||
VK_FORMAT_B8G8R8A8_UNORM, VK_FORMAT_R8G8B8A8_UNORM,
|
||||
VK_FORMAT_B8G8R8_UNORM, VK_FORMAT_R8G8B8_UNORM};
|
||||
const VkColorSpaceKHR requestSurfaceColorSpace =
|
||||
VK_COLORSPACE_SRGB_NONLINEAR_KHR;
|
||||
wd->SurfaceFormat = ImGui_ImplVulkanH_SelectSurfaceFormat(
|
||||
physical_device, wd->Surface, requestSurfaceImageFormat,
|
||||
(size_t)IREE_ARRAYSIZE(requestSurfaceImageFormat),
|
||||
requestSurfaceColorSpace);
|
||||
|
||||
// Select Present Mode
|
||||
#ifdef IMGUI_UNLIMITED_FRAME_RATE
|
||||
VkPresentModeKHR present_modes[] = {VK_PRESENT_MODE_MAILBOX_KHR,
|
||||
VK_PRESENT_MODE_IMMEDIATE_KHR,
|
||||
VK_PRESENT_MODE_FIFO_KHR};
|
||||
#else
|
||||
VkPresentModeKHR present_modes[] = {VK_PRESENT_MODE_FIFO_KHR};
|
||||
#endif
|
||||
wd->PresentMode = ImGui_ImplVulkanH_SelectPresentMode(
|
||||
physical_device, wd->Surface, &present_modes[0],
|
||||
IREE_ARRAYSIZE(present_modes));
|
||||
|
||||
// Create SwapChain, RenderPass, Framebuffer, etc.
|
||||
IM_ASSERT(min_image_count >= 2);
|
||||
ImGui_ImplVulkanH_CreateOrResizeWindow(instance, physical_device, device, wd,
|
||||
queue_family_index, allocator, width,
|
||||
height, min_image_count);
|
||||
|
||||
// Set clear color.
|
||||
ImVec4 clear_color = ImVec4(0.45f, 0.55f, 0.60f, 1.00f);
|
||||
memcpy(&wd->ClearValue.color.float32[0], &clear_color, 4 * sizeof(float));
|
||||
}
|
||||
|
||||
void RenderFrame(ImGui_ImplVulkanH_Window* wd, VkDevice device, VkQueue queue) {
|
||||
VkResult err;
|
||||
|
||||
VkSemaphore image_acquired_semaphore =
|
||||
wd->FrameSemaphores[wd->SemaphoreIndex].ImageAcquiredSemaphore;
|
||||
VkSemaphore render_complete_semaphore =
|
||||
wd->FrameSemaphores[wd->SemaphoreIndex].RenderCompleteSemaphore;
|
||||
err = vkAcquireNextImageKHR(device, wd->Swapchain, UINT64_MAX,
|
||||
image_acquired_semaphore, VK_NULL_HANDLE,
|
||||
&wd->FrameIndex);
|
||||
check_vk_result(err);
|
||||
|
||||
ImGui_ImplVulkanH_Frame* fd = &wd->Frames[wd->FrameIndex];
|
||||
{
|
||||
err = vkWaitForFences(
|
||||
device, 1, &fd->Fence, VK_TRUE,
|
||||
UINT64_MAX); // wait indefinitely instead of periodically checking
|
||||
check_vk_result(err);
|
||||
|
||||
err = vkResetFences(device, 1, &fd->Fence);
|
||||
check_vk_result(err);
|
||||
}
|
||||
{
|
||||
err = vkResetCommandPool(device, fd->CommandPool, 0);
|
||||
check_vk_result(err);
|
||||
VkCommandBufferBeginInfo info = {};
|
||||
info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
|
||||
info.flags |= VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
|
||||
err = vkBeginCommandBuffer(fd->CommandBuffer, &info);
|
||||
check_vk_result(err);
|
||||
}
|
||||
{
|
||||
VkRenderPassBeginInfo info = {};
|
||||
info.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
|
||||
info.renderPass = wd->RenderPass;
|
||||
info.framebuffer = fd->Framebuffer;
|
||||
info.renderArea.extent.width = wd->Width;
|
||||
info.renderArea.extent.height = wd->Height;
|
||||
info.clearValueCount = 1;
|
||||
info.pClearValues = &wd->ClearValue;
|
||||
vkCmdBeginRenderPass(fd->CommandBuffer, &info, VK_SUBPASS_CONTENTS_INLINE);
|
||||
}
|
||||
|
||||
// Record Imgui Draw Data and draw funcs into command buffer
|
||||
ImGui_ImplVulkan_RenderDrawData(ImGui::GetDrawData(), fd->CommandBuffer);
|
||||
|
||||
// Submit command buffer
|
||||
vkCmdEndRenderPass(fd->CommandBuffer);
|
||||
{
|
||||
VkPipelineStageFlags wait_stage =
|
||||
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
|
||||
VkSubmitInfo info = {};
|
||||
info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
|
||||
info.waitSemaphoreCount = 1;
|
||||
info.pWaitSemaphores = &image_acquired_semaphore;
|
||||
info.pWaitDstStageMask = &wait_stage;
|
||||
info.commandBufferCount = 1;
|
||||
info.pCommandBuffers = &fd->CommandBuffer;
|
||||
info.signalSemaphoreCount = 1;
|
||||
info.pSignalSemaphores = &render_complete_semaphore;
|
||||
|
||||
err = vkEndCommandBuffer(fd->CommandBuffer);
|
||||
check_vk_result(err);
|
||||
err = vkQueueSubmit(queue, 1, &info, fd->Fence);
|
||||
check_vk_result(err);
|
||||
}
|
||||
}
|
||||
|
||||
void PresentFrame(ImGui_ImplVulkanH_Window* wd, VkQueue queue) {
|
||||
VkSemaphore render_complete_semaphore =
|
||||
wd->FrameSemaphores[wd->SemaphoreIndex].RenderCompleteSemaphore;
|
||||
VkPresentInfoKHR info = {};
|
||||
info.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
|
||||
info.waitSemaphoreCount = 1;
|
||||
info.pWaitSemaphores = &render_complete_semaphore;
|
||||
info.swapchainCount = 1;
|
||||
info.pSwapchains = &wd->Swapchain;
|
||||
info.pImageIndices = &wd->FrameIndex;
|
||||
VkResult err = vkQueuePresentKHR(queue, &info);
|
||||
check_vk_result(err);
|
||||
wd->SemaphoreIndex =
|
||||
(wd->SemaphoreIndex + 1) %
|
||||
wd->ImageCount; // Now we can use the next set of semaphores
|
||||
}
|
||||
|
||||
static void CleanupVulkan() {
|
||||
vkDestroyDescriptorPool(g_Device, g_DescriptorPool, g_Allocator);
|
||||
|
||||
vkDestroyDevice(g_Device, g_Allocator);
|
||||
vkDestroyInstance(g_Instance, g_Allocator);
|
||||
}
|
||||
|
||||
static void CleanupVulkanWindow() {
|
||||
ImGui_ImplVulkanH_DestroyWindow(g_Instance, g_Device, &g_MainWindowData,
|
||||
g_Allocator);
|
||||
}
|
||||
|
||||
namespace iree {
|
||||
|
||||
extern "C" int iree_main(int argc, char** argv) {
|
||||
|
||||
iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv);
|
||||
if (argc > 1) {
|
||||
// Avoid iree-run-module spinning endlessly on stdin if the user uses single
|
||||
// dashes for flags.
|
||||
printf(
|
||||
"[ERROR] unexpected positional argument (expected none)."
|
||||
" Did you use pass a flag with a single dash ('-')?"
|
||||
" Use '--' instead.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Create a window.
|
||||
if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) != 0) {
|
||||
fprintf(stderr, "Failed to initialize SDL\n");
|
||||
abort();
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Setup window
|
||||
// clang-format off
|
||||
SDL_WindowFlags window_flags = (SDL_WindowFlags)(
|
||||
SDL_WINDOW_VULKAN | SDL_WINDOW_RESIZABLE | SDL_WINDOW_ALLOW_HIGHDPI);
|
||||
// clang-format on
|
||||
SDL_Window* window = SDL_CreateWindow(
|
||||
"IREE Samples - Vulkan Inference GUI", SDL_WINDOWPOS_CENTERED,
|
||||
SDL_WINDOWPOS_CENTERED, 1280, 720, window_flags);
|
||||
if (window == nullptr)
|
||||
{
|
||||
const char* sdl_err = SDL_GetError();
|
||||
fprintf(stderr, "Error, SDL_CreateWindow returned: %s\n", sdl_err);
|
||||
abort();
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Setup Vulkan
|
||||
iree_hal_vulkan_features_t iree_vulkan_features =
|
||||
static_cast<iree_hal_vulkan_features_t>(
|
||||
IREE_HAL_VULKAN_FEATURE_ENABLE_VALIDATION_LAYERS |
|
||||
IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS);
|
||||
std::vector<const char*> layers = GetInstanceLayers(iree_vulkan_features);
|
||||
std::vector<const char*> extensions =
|
||||
GetInstanceExtensions(window, iree_vulkan_features);
|
||||
SetupVulkan(iree_vulkan_features, layers.data(),
|
||||
static_cast<uint32_t>(layers.size()), extensions.data(),
|
||||
static_cast<uint32_t>(extensions.size()), g_Allocator,
|
||||
&g_Instance, &g_QueueFamily, &g_PhysicalDevice, &g_Queue,
|
||||
&g_Device, &g_DescriptorPool);
|
||||
|
||||
// Create Window Surface
|
||||
VkSurfaceKHR surface;
|
||||
VkResult err;
|
||||
if (SDL_Vulkan_CreateSurface(window, g_Instance, &surface) == 0) {
|
||||
fprintf(stderr, "Failed to create Vulkan surface.\n");
|
||||
abort();
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Create Framebuffers
|
||||
int w, h;
|
||||
SDL_GetWindowSize(window, &w, &h);
|
||||
ImGui_ImplVulkanH_Window* wd = &g_MainWindowData;
|
||||
SetupVulkanWindow(wd, g_Allocator, g_Instance, g_QueueFamily,
|
||||
g_PhysicalDevice, g_Device, surface, w, h, g_MinImageCount);
|
||||
|
||||
// Setup Dear ImGui context
|
||||
IMGUI_CHECKVERSION();
|
||||
ImGui::CreateContext();
|
||||
ImGuiIO& io = ImGui::GetIO();
|
||||
(void)io;
|
||||
|
||||
ImGui::StyleColorsDark();
|
||||
|
||||
// Setup Platform/Renderer bindings
|
||||
ImGui_ImplSDL2_InitForVulkan(window);
|
||||
ImGui_ImplVulkan_InitInfo init_info = {};
|
||||
init_info.Instance = g_Instance;
|
||||
init_info.PhysicalDevice = g_PhysicalDevice;
|
||||
init_info.Device = g_Device;
|
||||
init_info.QueueFamily = g_QueueFamily;
|
||||
init_info.Queue = g_Queue;
|
||||
init_info.PipelineCache = g_PipelineCache;
|
||||
init_info.DescriptorPool = g_DescriptorPool;
|
||||
init_info.Allocator = g_Allocator;
|
||||
init_info.MinImageCount = g_MinImageCount;
|
||||
init_info.ImageCount = wd->ImageCount;
|
||||
init_info.CheckVkResultFn = check_vk_result;
|
||||
ImGui_ImplVulkan_Init(&init_info, wd->RenderPass);
|
||||
|
||||
// Upload Fonts
|
||||
{
|
||||
// Use any command queue
|
||||
VkCommandPool command_pool = wd->Frames[wd->FrameIndex].CommandPool;
|
||||
VkCommandBuffer command_buffer = wd->Frames[wd->FrameIndex].CommandBuffer;
|
||||
|
||||
err = vkResetCommandPool(g_Device, command_pool, 0);
|
||||
check_vk_result(err);
|
||||
VkCommandBufferBeginInfo begin_info = {};
|
||||
begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
|
||||
begin_info.flags |= VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
|
||||
err = vkBeginCommandBuffer(command_buffer, &begin_info);
|
||||
check_vk_result(err);
|
||||
|
||||
ImGui_ImplVulkan_CreateFontsTexture(command_buffer);
|
||||
|
||||
VkSubmitInfo end_info = {};
|
||||
end_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
|
||||
end_info.commandBufferCount = 1;
|
||||
end_info.pCommandBuffers = &command_buffer;
|
||||
err = vkEndCommandBuffer(command_buffer);
|
||||
check_vk_result(err);
|
||||
err = vkQueueSubmit(g_Queue, 1, &end_info, VK_NULL_HANDLE);
|
||||
check_vk_result(err);
|
||||
|
||||
err = vkDeviceWaitIdle(g_Device);
|
||||
check_vk_result(err);
|
||||
ImGui_ImplVulkan_DestroyFontUploadObjects();
|
||||
}
|
||||
|
||||
// Demo state.
|
||||
bool show_iree_window = true;
|
||||
// --------------------------------------------------------------------------
|
||||
// Setup IREE.
|
||||
|
||||
// Check API version.
|
||||
iree_api_version_t actual_version;
|
||||
iree_status_t status =
|
||||
iree_api_version_check(IREE_API_VERSION_LATEST, &actual_version);
|
||||
if (iree_status_is_ok(status)) {
|
||||
fprintf(stdout, "IREE runtime API version: %d\n", actual_version);
|
||||
} else {
|
||||
fprintf(stderr, "Unsupported runtime API version: %d\n", actual_version);
|
||||
abort();
|
||||
}
|
||||
|
||||
// Create a runtime Instance.
|
||||
iree_vm_instance_t* iree_instance = nullptr;
|
||||
IREE_CHECK_OK(
|
||||
iree_vm_instance_create(iree_allocator_system(), &iree_instance));
|
||||
|
||||
// Register HAL drivers and VM module types.
|
||||
IREE_CHECK_OK(iree_hal_vulkan_driver_module_register(
|
||||
iree_hal_driver_registry_default()));
|
||||
IREE_CHECK_OK(iree_hal_module_register_all_types(iree_instance));
|
||||
|
||||
// Create IREE Vulkan Driver and Device, sharing our VkInstance/VkDevice.
|
||||
fprintf(stdout, "Creating Vulkan driver/device\n");
|
||||
// Load symbols from our static `vkGetInstanceProcAddr` for IREE to use.
|
||||
iree_hal_vulkan_syms_t* iree_vk_syms = nullptr;
|
||||
IREE_CHECK_OK(iree_hal_vulkan_syms_create(
|
||||
reinterpret_cast<void*>(&vkGetInstanceProcAddr), iree_allocator_system(),
|
||||
&iree_vk_syms));
|
||||
// Create the driver sharing our VkInstance.
|
||||
iree_hal_driver_t* iree_vk_driver = nullptr;
|
||||
iree_string_view_t driver_identifier = iree_make_cstring_view("vulkan");
|
||||
iree_hal_vulkan_driver_options_t driver_options;
|
||||
driver_options.api_version = VK_API_VERSION_1_0;
|
||||
driver_options.requested_features = static_cast<iree_hal_vulkan_features_t>(
|
||||
IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS);
|
||||
IREE_CHECK_OK(iree_hal_vulkan_driver_create_using_instance(
|
||||
driver_identifier, &driver_options, iree_vk_syms, g_Instance,
|
||||
iree_allocator_system(), &iree_vk_driver));
|
||||
// Create a device sharing our VkDevice and queue.
|
||||
// We could also create a separate (possibly low priority) compute queue for
|
||||
// IREE, and/or provide a dedicated transfer queue.
|
||||
iree_string_view_t device_identifier = iree_make_cstring_view("vulkan");
|
||||
iree_hal_vulkan_queue_set_t compute_queue_set;
|
||||
compute_queue_set.queue_family_index = g_QueueFamily;
|
||||
compute_queue_set.queue_indices = 1 << 0;
|
||||
iree_hal_vulkan_queue_set_t transfer_queue_set;
|
||||
transfer_queue_set.queue_indices = 0;
|
||||
iree_hal_device_t* iree_vk_device = nullptr;
|
||||
IREE_CHECK_OK(iree_hal_vulkan_wrap_device(
|
||||
device_identifier, &driver_options.device_options, iree_vk_syms,
|
||||
g_Instance, g_PhysicalDevice, g_Device, &compute_queue_set,
|
||||
&transfer_queue_set, iree_allocator_system(), &iree_vk_device));
|
||||
// Create a HAL module using the HAL device.
|
||||
iree_vm_module_t* hal_module = nullptr;
|
||||
IREE_CHECK_OK(iree_hal_module_create(iree_instance, iree_vk_device,
|
||||
IREE_HAL_MODULE_FLAG_NONE,
|
||||
iree_allocator_system(), &hal_module));
|
||||
|
||||
|
||||
// Load bytecode module
|
||||
//iree_file_toc_t module_file_toc;
|
||||
//const char network_model[] = "resnet50_tf.vmfb";
|
||||
//fprintf(stdout, "Loading: %s\n", network_model);
|
||||
//if (load_file(network_model, &module_file_toc.data, &module_file_toc.size) == false)
|
||||
//{
|
||||
// abort();
|
||||
// return 1;
|
||||
//}
|
||||
//fprintf(stdout, "module size: %zu\n", module_file_toc.size);
|
||||
|
||||
iree_vm_module_t* bytecode_module = nullptr;
|
||||
iree_status_t module_status = iree_tooling_load_module_from_flags(
|
||||
iree_instance, iree_allocator_system(), &bytecode_module);
|
||||
if (!iree_status_is_ok(module_status))
|
||||
return -1;
|
||||
//IREE_CHECK_OK(iree_vm_bytecode_module_create(
|
||||
// iree_instance,
|
||||
// iree_const_byte_span_t{
|
||||
// reinterpret_cast<const uint8_t*>(module_file_toc.data),
|
||||
// module_file_toc.size},
|
||||
// iree_allocator_null(), iree_allocator_system(), &bytecode_module));
|
||||
//// Query for details about what is in the loaded module.
|
||||
//iree_vm_module_signature_t bytecode_module_signature =
|
||||
// iree_vm_module_signature(bytecode_module);
|
||||
//fprintf(stdout, "Module loaded, have <%" PRIhsz "> exported functions:\n",
|
||||
// bytecode_module_signature.export_function_count);
|
||||
//for (int i = 0; i < bytecode_module_signature.export_function_count; ++i) {
|
||||
// iree_vm_function_t function;
|
||||
// IREE_CHECK_OK(iree_vm_module_lookup_function_by_ordinal(
|
||||
// bytecode_module, IREE_VM_FUNCTION_LINKAGE_EXPORT, i, &function));
|
||||
// auto function_name = iree_vm_function_name(&function);
|
||||
// auto function_signature = iree_vm_function_signature(&function);
|
||||
|
||||
// fprintf(stdout, " %d: '%.*s' with calling convention '%.*s'\n", i,
|
||||
// (int)function_name.size, function_name.data,
|
||||
// (int)function_signature.calling_convention.size,
|
||||
// function_signature.calling_convention.data);
|
||||
//}
|
||||
|
||||
// Allocate a context that will hold the module state across invocations.
|
||||
iree_vm_context_t* iree_context = nullptr;
|
||||
std::vector<iree_vm_module_t*> modules = {hal_module, bytecode_module};
|
||||
IREE_CHECK_OK(iree_vm_context_create_with_modules(
|
||||
iree_instance, IREE_VM_CONTEXT_FLAG_NONE, modules.size(), modules.data(),
|
||||
iree_allocator_system(), &iree_context));
|
||||
fprintf(stdout, "Context with modules is ready for use\n");
|
||||
|
||||
// Lookup the entry point function.
|
||||
iree_vm_function_t main_function;
|
||||
const char kMainFunctionName[] = "module.forward";
|
||||
IREE_CHECK_OK(iree_vm_context_resolve_function(
|
||||
iree_context,
|
||||
iree_string_view_t{kMainFunctionName, sizeof(kMainFunctionName) - 1},
|
||||
&main_function));
|
||||
iree_string_view_t main_function_name = iree_vm_function_name(&main_function);
|
||||
fprintf(stdout, "Resolved main function named '%.*s'\n",
|
||||
(int)main_function_name.size, main_function_name.data);
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
// Write inputs into mappable buffers.
|
||||
iree_hal_allocator_t* allocator =
|
||||
iree_hal_device_allocator(iree_vk_device);
|
||||
//iree_hal_memory_type_t input_memory_type =
|
||||
// static_cast<iree_hal_memory_type_t>(
|
||||
// IREE_HAL_MEMORY_TYPE_HOST_LOCAL |
|
||||
// IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE);
|
||||
//iree_hal_buffer_usage_t input_buffer_usage =
|
||||
// static_cast<iree_hal_buffer_usage_t>(IREE_HAL_BUFFER_USAGE_DEFAULT);
|
||||
//iree_hal_buffer_params_t buffer_params;
|
||||
//buffer_params.type = input_memory_type;
|
||||
//buffer_params.usage = input_buffer_usage;
|
||||
//buffer_params.access = IREE_HAL_MEMORY_ACCESS_READ | IREE_HAL_MEMORY_ACCESS_WRITE;
|
||||
|
||||
// Wrap input buffers in buffer views.
|
||||
|
||||
vm::ref<iree_vm_list_t> inputs;
|
||||
iree_status_t input_status = ParseToVariantList(
|
||||
allocator,
|
||||
iree::span<const std::string>{FLAG_function_inputs.data(),
|
||||
FLAG_function_inputs.size()},
|
||||
iree_allocator_system(), &inputs);
|
||||
if (!iree_status_is_ok(input_status))
|
||||
return -1;
|
||||
//vm::ref<iree_vm_list_t> inputs;
|
||||
//IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, 6, iree_allocator_system(), &inputs));
|
||||
|
||||
//iree_hal_buffer_view_t* input0_buffer_view = nullptr;
|
||||
//constexpr iree_hal_dim_t input_buffer_shape[] = {1, 224, 224, 3};
|
||||
//IREE_CHECK_OK(iree_hal_buffer_view_allocate_buffer(
|
||||
// allocator,
|
||||
// /*shape_rank=*/4, /*shape=*/input_buffer_shape,
|
||||
// IREE_HAL_ELEMENT_TYPE_FLOAT_32,
|
||||
// IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params,
|
||||
// iree_make_const_byte_span(&input_res50, sizeof(input_res50)),
|
||||
// &input0_buffer_view));
|
||||
|
||||
//auto input0_buffer_view_ref = iree_hal_buffer_view_move_ref(input0_buffer_view);
|
||||
//IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs.get(), &input0_buffer_view_ref));
|
||||
|
||||
// Prepare outputs list to accept results from the invocation.
|
||||
|
||||
vm::ref<iree_vm_list_t> outputs;
|
||||
constexpr iree_hal_dim_t kOutputCount = 1000;
|
||||
IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, kOutputCount * sizeof(float), iree_allocator_system(), &outputs));
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
// Main loop.
|
||||
bool done = false;
|
||||
while (!done) {
|
||||
SDL_Event event;
|
||||
|
||||
while (SDL_PollEvent(&event)) {
|
||||
if (event.type == SDL_QUIT) {
|
||||
done = true;
|
||||
}
|
||||
|
||||
ImGui_ImplSDL2_ProcessEvent(&event);
|
||||
if (event.type == SDL_QUIT) done = true;
|
||||
if (event.type == SDL_WINDOWEVENT &&
|
||||
event.window.event == SDL_WINDOWEVENT_RESIZED &&
|
||||
event.window.windowID == SDL_GetWindowID(window)) {
|
||||
g_SwapChainResizeWidth = (int)event.window.data1;
|
||||
g_SwapChainResizeHeight = (int)event.window.data2;
|
||||
g_SwapChainRebuild = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (g_SwapChainRebuild) {
|
||||
g_SwapChainRebuild = false;
|
||||
ImGui_ImplVulkan_SetMinImageCount(g_MinImageCount);
|
||||
ImGui_ImplVulkanH_CreateOrResizeWindow(
|
||||
g_Instance, g_PhysicalDevice, g_Device, &g_MainWindowData,
|
||||
g_QueueFamily, g_Allocator, g_SwapChainResizeWidth,
|
||||
g_SwapChainResizeHeight, g_MinImageCount);
|
||||
g_MainWindowData.FrameIndex = 0;
|
||||
}
|
||||
|
||||
// Start the Dear ImGui frame
|
||||
ImGui_ImplVulkan_NewFrame();
|
||||
ImGui_ImplSDL2_NewFrame(window);
|
||||
ImGui::NewFrame();
|
||||
|
||||
// Custom window.
|
||||
{
|
||||
ImGui::Begin("IREE Vulkan Integration Demo", &show_iree_window);
|
||||
|
||||
ImGui::Separator();
|
||||
|
||||
// ImGui Inputs for two input tensors.
|
||||
// Run computation whenever any of the values changes.
|
||||
static bool dirty = true;
|
||||
if (dirty) {
|
||||
|
||||
// Synchronously invoke the function.
|
||||
IREE_CHECK_OK(iree_vm_invoke(iree_context, main_function,
|
||||
IREE_VM_INVOCATION_FLAG_NONE,
|
||||
/*policy=*/nullptr, inputs.get(),
|
||||
outputs.get(), iree_allocator_system()));
|
||||
|
||||
|
||||
// We want to run continuously so we can use tools like RenderDoc, RGP, etc.
|
||||
dirty = true;
|
||||
}
|
||||
|
||||
// Framerate counter.
|
||||
ImGui::Text("Application average %.3f ms/frame (%.1f FPS)",
|
||||
1000.0f / ImGui::GetIO().Framerate, ImGui::GetIO().Framerate);
|
||||
|
||||
ImGui::End();
|
||||
}
|
||||
|
||||
// Rendering
|
||||
ImGui::Render();
|
||||
RenderFrame(wd, g_Device, g_Queue);
|
||||
|
||||
PresentFrame(wd, g_Queue);
|
||||
}
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Cleanup
|
||||
iree_vm_module_release(hal_module);
|
||||
iree_vm_module_release(bytecode_module);
|
||||
iree_vm_context_release(iree_context);
|
||||
iree_hal_device_release(iree_vk_device);
|
||||
iree_hal_allocator_release(allocator);
|
||||
iree_hal_driver_release(iree_vk_driver);
|
||||
iree_hal_vulkan_syms_release(iree_vk_syms);
|
||||
iree_vm_instance_release(iree_instance);
|
||||
|
||||
err = vkDeviceWaitIdle(g_Device);
|
||||
check_vk_result(err);
|
||||
ImGui_ImplVulkan_Shutdown();
|
||||
ImGui_ImplSDL2_Shutdown();
|
||||
ImGui::DestroyContext();
|
||||
|
||||
CleanupVulkanWindow();
|
||||
CleanupVulkan();
|
||||
|
||||
SDL_DestroyWindow(window);
|
||||
SDL_Quit();
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // namespace iree
|
||||
1160  cpp/vulkan_gui/vulkan_resnet_inference_gui.cc  Normal file (diff suppressed because it is too large)
27  dataset/README.md  Normal file
@@ -0,0 +1,27 @@
|
||||
# Dataset annotation tool
|
||||
|
||||
SHARK annotator for adding or modifying prompts of dataset images
|
||||
|
||||
## Set up
|
||||
|
||||
Activate SHARK Python virtual environment and install additional packages
|
||||
```shell
|
||||
source ../shark.venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Run annotator
|
||||
|
||||
```shell
|
||||
python annotation_tool.py
|
||||
```
|
||||
|
||||
<img width="1280" alt="annotator" src="https://user-images.githubusercontent.com/49575973/214521137-7ef6ae10-7cd8-46e6-b270-b6c0445157f1.png">
|
||||
|
||||
* Select a dataset from `Dataset` dropdown list
|
||||
* Select an image from `Image` dropdown list
|
||||
* Image and the existing prompt will be loaded
|
||||
* Select a prompt from `Prompt` dropdown list to modify or "Add new" to add a prompt
|
||||
* Click `Save` to save changes, click `Delete` to delete prompt
|
||||
* Click `Back` or `Next` to switch image, you could also select other images from `Image`
|
||||
* Click `Finish` when finishing annotation or before switching dataset
|
||||
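For reference, the prompts edited here are stored per dataset in a `metadata.jsonl` file with one JSON object per image, using the same `file_name`/`text` layout that `annotation_tool.py` reads and writes. A minimal sketch of producing such a file by hand is below; the image names and prompt text are purely illustrative.

```python
import json

# Illustrative records only: "text" is a plain string for a single prompt,
# or a list of strings when an image has several prompts.
records = [
    {"file_name": "images/0001.png", "text": "a photograph of an astronaut riding a horse"},
    {"file_name": "images/0002.png", "text": ["a bowl of tangerines", "an oil painting of tangerines"]},
]

# One JSON object per line, matching what the annotator writes back out.
with open("metadata.jsonl", "w") as f:
    for record in records:
        f.write(json.dumps(record))
        f.write("\n")
```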
233  dataset/annotation_tool.py  Normal file
@@ -0,0 +1,233 @@
|
||||
import gradio as gr
|
||||
import json
|
||||
import jsonlines
|
||||
import os
|
||||
from args import args
|
||||
from pathlib import Path
|
||||
from PIL import Image
|
||||
from utils import get_datasets
|
||||
|
||||
|
||||
shark_root = Path(__file__).parent.parent
|
||||
demo_css = shark_root.joinpath("web/demo.css").resolve()
|
||||
nodlogo_loc = shark_root.joinpath("web/models/stable_diffusion/logos/nod-logo.png")
|
||||
|
||||
|
||||
with gr.Blocks(title="Dataset Annotation Tool", css=demo_css) as shark_web:
|
||||
with gr.Row(elem_id="ui_title"):
|
||||
nod_logo = Image.open(nodlogo_loc)
|
||||
with gr.Column(scale=1, elem_id="demo_title_outer"):
|
||||
gr.Image(
|
||||
value=nod_logo,
|
||||
show_label=False,
|
||||
interactive=False,
|
||||
show_download_button=False,
|
||||
elem_id="top_logo",
|
||||
width=150,
|
||||
height=100,
|
||||
)
|
||||
|
||||
datasets, images, ds_w_prompts = get_datasets(args.gs_url)
|
||||
prompt_data = dict()
|
||||
|
||||
with gr.Row(elem_id="ui_body"):
|
||||
# TODO: add multiselect dataset, there is a gradio version conflict
|
||||
dataset = gr.Dropdown(label="Dataset", choices=datasets)
|
||||
image_name = gr.Dropdown(label="Image", choices=[])
|
||||
|
||||
with gr.Row(elem_id="ui_body"):
|
||||
# TODO: add ability to search image by typing
|
||||
with gr.Column(scale=1, min_width=600):
|
||||
image = gr.Image(type="filepath", height=512)
|
||||
|
||||
with gr.Column(scale=1, min_width=600):
|
||||
prompts = gr.Dropdown(
|
||||
label="Prompts",
|
||||
choices=[],
|
||||
)
|
||||
prompt = gr.Textbox(
|
||||
label="Editor",
|
||||
lines=3,
|
||||
)
|
||||
with gr.Row():
|
||||
save = gr.Button("Save")
|
||||
delete = gr.Button("Delete")
|
||||
with gr.Row():
|
||||
back_image = gr.Button("Back")
|
||||
next_image = gr.Button("Next")
|
||||
finish = gr.Button("Finish")
|
||||
|
||||
def filter_datasets(dataset):
|
||||
if dataset is None:
|
||||
return gr.Dropdown.update(value=None, choices=[])
|
||||
|
||||
# create the dataset dir if doesn't exist and download prompt file
|
||||
dataset_path = str(shark_root) + "/dataset/" + dataset
|
||||
if not os.path.exists(dataset_path):
|
||||
os.mkdir(dataset_path)
|
||||
|
||||
# read prompt jsonlines file
|
||||
prompt_data.clear()
|
||||
if dataset in ds_w_prompts:
|
||||
prompt_gs_path = args.gs_url + "/" + dataset + "/metadata.jsonl"
|
||||
os.system(f'gsutil cp "{prompt_gs_path}" "{dataset_path}"/')
|
||||
with jsonlines.open(dataset_path + "/metadata.jsonl") as reader:
|
||||
for line in reader.iter(type=dict, skip_invalid=True):
|
||||
prompt_data[line["file_name"]] = (
|
||||
[line["text"]] if type(line["text"]) is str else line["text"]
|
||||
)
|
||||
|
||||
return gr.Dropdown.update(choices=images[dataset])
|
||||
|
||||
dataset.change(fn=filter_datasets, inputs=dataset, outputs=image_name)
|
||||
|
||||
def display_image(dataset, image_name):
|
||||
if dataset is None or image_name is None:
|
||||
return gr.Image.update(value=None), gr.Dropdown.update(value=None)
|
||||
|
||||
# download and load the image
|
||||
img_gs_path = args.gs_url + "/" + dataset + "/" + image_name
|
||||
img_sub_path = "/".join(image_name.split("/")[:-1])
|
||||
img_dst_path = (
|
||||
str(shark_root) + "/dataset/" + dataset + "/" + img_sub_path + "/"
|
||||
)
|
||||
if not os.path.exists(img_dst_path):
|
||||
os.mkdir(img_dst_path)
|
||||
os.system(f'gsutil cp "{img_gs_path}" "{img_dst_path}"')
|
||||
img = Image.open(img_dst_path + image_name.split("/")[-1])
|
||||
|
||||
if image_name not in prompt_data.keys():
|
||||
prompt_data[image_name] = []
|
||||
prompt_choices = ["Add new"]
|
||||
prompt_choices += prompt_data[image_name]
|
||||
return gr.Image.update(value=img), gr.Dropdown.update(choices=prompt_choices)
|
||||
|
||||
image_name.change(
|
||||
fn=display_image,
|
||||
inputs=[dataset, image_name],
|
||||
outputs=[image, prompts],
|
||||
)
|
||||
|
||||
def edit_prompt(prompts):
|
||||
if prompts == "Add new":
|
||||
return gr.Textbox.update(value=None)
|
||||
|
||||
return gr.Textbox.update(value=prompts)
|
||||
|
||||
prompts.change(fn=edit_prompt, inputs=prompts, outputs=prompt)
|
||||
|
||||
def save_prompt(dataset, image_name, prompts, prompt):
|
||||
if dataset is None or image_name is None or prompts is None or prompt is None:
|
||||
return
|
||||
|
||||
if prompts == "Add new":
|
||||
prompt_data[image_name].append(prompt)
|
||||
else:
|
||||
idx = prompt_data[image_name].index(prompts)
|
||||
prompt_data[image_name][idx] = prompt
|
||||
|
||||
prompt_path = str(shark_root) + "/dataset/" + dataset + "/metadata.jsonl"
|
||||
# write prompt jsonlines file
|
||||
with open(prompt_path, "w") as f:
|
||||
for key, value in prompt_data.items():
|
||||
if not value:
|
||||
continue
|
||||
v = value if len(value) > 1 else value[0]
|
||||
f.write(json.dumps({"file_name": key, "text": v}))
|
||||
f.write("\n")
|
||||
|
||||
prompt_choices = ["Add new"]
|
||||
prompt_choices += prompt_data[image_name]
|
||||
return gr.Dropdown.update(choices=prompt_choices, value=None)
|
||||
|
||||
save.click(
|
||||
fn=save_prompt,
|
||||
inputs=[dataset, image_name, prompts, prompt],
|
||||
outputs=prompts,
|
||||
)
|
||||
|
||||
def delete_prompt(dataset, image_name, prompts):
|
||||
if dataset is None or image_name is None or prompts is None:
|
||||
return
|
||||
if prompts == "Add new":
|
||||
return
|
||||
|
||||
prompt_data[image_name].remove(prompts)
|
||||
prompt_path = str(shark_root) + "/dataset/" + dataset + "/metadata.jsonl"
|
||||
# write prompt jsonlines file
|
||||
with open(prompt_path, "w") as f:
|
||||
for key, value in prompt_data.items():
|
||||
if not value:
|
||||
continue
|
||||
v = value if len(value) > 1 else value[0]
|
||||
f.write(json.dumps({"file_name": key, "text": v}))
|
||||
f.write("\n")
|
||||
|
||||
prompt_choices = ["Add new"]
|
||||
prompt_choices += prompt_data[image_name]
|
||||
return gr.Dropdown.update(choices=prompt_choices, value=None)
|
||||
|
||||
delete.click(
|
||||
fn=delete_prompt,
|
||||
inputs=[dataset, image_name, prompts],
|
||||
outputs=prompts,
|
||||
)
|
||||
|
||||
def get_back_image(dataset, image_name):
|
||||
if dataset is None or image_name is None:
|
||||
return
|
||||
|
||||
# remove local image
|
||||
img_path = str(shark_root) + "/dataset/" + dataset + "/" + image_name
|
||||
os.system(f'rm "{img_path}"')
|
||||
# get the index for the back image
|
||||
idx = images[dataset].index(image_name)
|
||||
if idx == 0:
|
||||
return gr.Dropdown.update(value=None)
|
||||
|
||||
return gr.Dropdown.update(value=images[dataset][idx - 1])
|
||||
|
||||
back_image.click(
|
||||
fn=get_back_image, inputs=[dataset, image_name], outputs=image_name
|
||||
)
|
||||
|
||||
def get_next_image(dataset, image_name):
|
||||
if dataset is None or image_name is None:
|
||||
return
|
||||
|
||||
# remove local image
|
||||
img_path = str(shark_root) + "/dataset/" + dataset + "/" + image_name
|
||||
os.system(f'rm "{img_path}"')
|
||||
# get the index for the next image
|
||||
idx = images[dataset].index(image_name)
|
||||
if idx == len(images[dataset]) - 1:
|
||||
return gr.Dropdown.update(value=None)
|
||||
|
||||
return gr.Dropdown.update(value=images[dataset][idx + 1])
|
||||
|
||||
next_image.click(
|
||||
fn=get_next_image, inputs=[dataset, image_name], outputs=image_name
|
||||
)
|
||||
|
||||
def finish_annotation(dataset):
|
||||
if dataset is None:
|
||||
return
|
||||
|
||||
# upload prompt and remove local data
|
||||
dataset_path = str(shark_root) + "/dataset/" + dataset
|
||||
dataset_gs_path = args.gs_url + "/" + dataset + "/"
|
||||
os.system(f'gsutil cp "{dataset_path}/metadata.jsonl" "{dataset_gs_path}"')
|
||||
os.system(f'rm -rf "{dataset_path}"')
|
||||
|
||||
return gr.Dropdown.update(value=None)
|
||||
|
||||
finish.click(fn=finish_annotation, inputs=dataset, outputs=dataset)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
shark_web.launch(
|
||||
share=args.share,
|
||||
inbrowser=True,
|
||||
server_name="0.0.0.0",
|
||||
server_port=args.server_port,
|
||||
)
|
||||
34  dataset/args.py  Normal file
@@ -0,0 +1,34 @@
|
||||
import argparse
|
||||
|
||||
p = argparse.ArgumentParser(
|
||||
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
||||
)
|
||||
|
||||
##############################################################################
|
||||
### Dataset Annotator flags
|
||||
##############################################################################
|
||||
|
||||
p.add_argument(
|
||||
"--gs_url",
|
||||
type=str,
|
||||
required=True,
|
||||
help="URL to datasets in GS bucket",
|
||||
)
|
||||
|
||||
p.add_argument(
|
||||
"--share",
|
||||
default=False,
|
||||
action=argparse.BooleanOptionalAction,
|
||||
help="flag for generating a public URL",
|
||||
)
|
||||
|
||||
p.add_argument(
|
||||
"--server_port",
|
||||
type=int,
|
||||
default=8080,
|
||||
help="flag for setting server port",
|
||||
)
|
||||
|
||||
##############################################################################
|
||||
|
||||
args = p.parse_args()
|
||||
3  dataset/requirements.txt  Normal file
@@ -0,0 +1,3 @@
|
||||
# SHARK Annotator
|
||||
gradio==3.34.0
|
||||
jsonlines
|
||||
29  dataset/utils.py  Normal file
@@ -0,0 +1,29 @@
from google.cloud import storage


def get_datasets(gs_url):
    datasets = set()
    images = dict()
    ds_w_prompts = []

    storage_client = storage.Client()
    bucket_name = gs_url.split("/")[2]
    source_blob_name = "/".join(gs_url.split("/")[3:])
    blobs = storage_client.list_blobs(bucket_name, prefix=source_blob_name)

    for blob in blobs:
        dataset_name = blob.name.split("/")[1]
        if dataset_name == "":
            continue
        datasets.add(dataset_name)
        if dataset_name not in images.keys():
            images[dataset_name] = []

        # check if image or jsonl
        file_sub_path = "/".join(blob.name.split("/")[2:])
        if "/" in file_sub_path:
            images[dataset_name] += [file_sub_path]
        elif "metadata.jsonl" in file_sub_path:
            ds_w_prompts.append(dataset_name)

    return list(datasets), images, ds_w_prompts
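A quick way to sanity-check the expected bucket layout is to call `get_datasets` directly from a Python shell in the `dataset/` directory. This is only a sketch: it assumes you have Google Cloud credentials configured for the bucket, and the `gs://` URL below is made up.

```python
from utils import get_datasets

# Hypothetical bucket URL of the form gs://<bucket>/<prefix>, with one
# sub-directory per dataset underneath it.
datasets, images, ds_w_prompts = get_datasets("gs://my-bucket/datasets")

for name in datasets:
    has_prompts = name in ds_w_prompts
    print(f"{name}: {len(images[name])} images, metadata.jsonl present: {has_prompts}")
```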
118  docs/shark_iree_profiling.md  Normal file
@@ -0,0 +1,118 @@
# Overview

This document is intended to provide a starting point for profiling with SHARK/IREE. At its core, [SHARK](https://github.com/nod-ai/SHARK/tree/main/tank) is a Python API that links the MLIR lowerings from various frameworks and frontends (e.g. PyTorch -> Torch-MLIR) with the compiler and runtime offered by IREE. More information on model coverage and framework support can be found [here](https://github.com/nod-ai/SHARK/tree/main/tank). The intended use case for SHARK is compilation and deployment of performant, state-of-the-art AI models.



## Benchmarking with SHARK

TODO: Expand this section.

SHARK offers native benchmarking support, although because it is model focused, fine-grained profiling is somewhat hidden compared with the common "model benchmarking suite" use case SHARK is good at.

### SharkBenchmarkRunner

SharkBenchmarkRunner is a class designed for benchmarking models against other runtimes.
TODO: List supported runtimes for comparison + example on how to benchmark with it.

## Directly profiling IREE

A number of excellent developer resources on profiling with IREE can be found [here](https://github.com/iree-org/iree/tree/main/docs/developers/developing_iree). As a result, this section will focus on bridging the gap between the two.
- https://github.com/iree-org/iree/blob/main/docs/developers/developing_iree/profiling.md
- https://github.com/iree-org/iree/blob/main/docs/developers/developing_iree/profiling_with_tracy.md
- https://github.com/iree-org/iree/blob/main/docs/developers/developing_iree/profiling_vulkan_gpu.md
- https://github.com/iree-org/iree/blob/main/docs/developers/developing_iree/profiling_cpu_events.md

Internally, SHARK builds a pair of IREE commands to compile and run a model. At a high level, the flow starts with the model represented in a high-level dialect (commonly Linalg), which is compiled to a flatbuffer (.vmfb) that the runtime is capable of ingesting. At that point (with potentially a few runtime flags) the compiled model is run through the IREE runtime. This is all facilitated with the IREE Python bindings, which offer a convenient way to capture the compile command SHARK comes up with. This is done by setting the environment variable `IREE_SAVE_TEMPS` to point to a directory of choice, e.g. for Stable Diffusion:
```
# Linux
$ export IREE_SAVE_TEMPS=/path/to/some/directory
# Windows
$ $env:IREE_SAVE_TEMPS="C:\path\to\some\directory"
$ python apps/stable_diffusion/scripts/txt2img.py -p "a photograph of an astronaut riding a horse" --save_vmfb
```
NOTE: Currently this will only save the compile command + input MLIR for a single model when run in a pipeline. In the case of Stable Diffusion this should be UNet, so to get examples for the other models in the pipeline they need to be extracted and tested individually.

The save-temps directory should contain three files: `core-command-line.txt`, `core-input.mlir`, and `core-output.bin`. The command line for compilation will start something like this, where the `-` needs to be replaced with the path to `core-input.mlir`.
```
/home/quinn/nod/iree-build/compiler/bindings/python/iree/compiler/tools/../_mlir_libs/iree-compile - --iree-input-type=none ...
```
The `-o output_filename.vmfb` flag can be used to specify where to save the compiled vmfb. Note that a dump of the dispatches, which can then be compiled and run in isolation, can be generated by adding `--iree-hal-dump-executable-benchmarks-to=/some/directory`. Say they are in the `benchmarks` directory; then the following compile/run commands would work for Vulkan on RDNA3.
```
iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna3-unknown-linux benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.mlir -o benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb

iree-benchmark-module --module=benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb --function=forward --device=vulkan
```
Here `${NUM}` is the dispatch number that you want to benchmark/profile in isolation.
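To sweep every dumped dispatch instead of picking `${NUM}` by hand, a small driver script can loop over the dumped `.mlir` files and run the same two commands. This is only a sketch under the assumptions above (Vulkan backend, RDNA3 target triple, dispatches dumped into `benchmarks/`); adjust the flags and file pattern for your own backend and model.

```python
import glob
import subprocess

# Assumes the dispatches were dumped with
# --iree-hal-dump-executable-benchmarks-to=benchmarks as described above.
for mlir_path in sorted(glob.glob("benchmarks/module_forward_dispatch_*_vulkan_spirv_fb.mlir")):
    vmfb_path = mlir_path.replace(".mlir", ".vmfb")
    # Same compile command as above, applied to one dispatch at a time.
    subprocess.run(
        [
            "iree-compile",
            "--iree-input-type=none",
            "--iree-hal-target-backends=vulkan",
            "--iree-vulkan-target-triple=rdna3-unknown-linux",
            mlir_path,
            "-o",
            vmfb_path,
        ],
        check=True,
    )
    # Benchmark the isolated dispatch.
    subprocess.run(
        [
            "iree-benchmark-module",
            f"--module={vmfb_path}",
            "--function=forward",
            "--device=vulkan",
        ],
        check=True,
    )
```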

### Enabling Tracy for Vulkan profiling

To begin profiling with Tracy, a build of the IREE runtime with tracing enabled is needed. SHARK-Runtime (SRT) builds an instrumented version alongside the normal version nightly (.whls typically found [here](https://github.com/nod-ai/SRT/releases)), however this is only available for Linux. For Windows, tracing can be enabled by turning on the corresponding build flag:
```
$env:IREE_ENABLE_RUNTIME_TRACING="ON"
```
Getting a trace can then be done by setting the environment variable `TRACY_NO_EXIT=1` and running the program that is to be traced. Then, to actually capture the trace, use the `iree-tracy-capture` tool in a different terminal. Note that to get the capture and profiler tools, the `IREE_BUILD_TRACY=ON` CMake flag needs to be set.
```
TRACY_NO_EXIT=1 python apps/stable_diffusion/scripts/txt2img.py -p "a photograph of an astronaut riding a horse"

# (in another terminal, either on the same machine or through ssh with a tunnel through port 8086)
iree-tracy-capture -o trace_filename.tracy
```
To do it over ssh, the flow looks like this:
```
# From terminal 1 on local machine
ssh -L 8086:localhost:8086 <remote_server_name>
TRACY_NO_EXIT=1 python apps/stable_diffusion/scripts/txt2img.py -p "a photograph of an astronaut riding a horse"

# From terminal 2 on local machine. Requires having built IREE with the CMake flag `IREE_BUILD_TRACY=ON` to build the required tooling.
iree-tracy-capture -o /path/to/trace.tracy
```

The trace can then be viewed with:
```
iree-tracy-profiler /path/to/trace.tracy
```
Capturing a runtime trace will work with any IREE tooling that uses the runtime. For example, `iree-benchmark-module` can be used for benchmarking an individual module. Importantly, this means that any SHARK script can be profiled with Tracy.

NOTE: Not all backends have the same Tracy support. This writeup is focused on the CPU/Vulkan backends, but there is recently added support for tracing on CUDA (requires the `--cuda_tracing` flag).

## Experimental RGP support

TODO: This section is temporary until proper RGP support is added.

Currently, for Stable Diffusion there is a flag, `--enable_rgp`, for making UNet visible to RGP. To get a proper capture, though, the `DevModeSqttPrepareFrameCount=1` flag needs to be set for the driver (done with `VkPanel` on Windows). With these two settings, a single iteration of UNet can be captured.

(AMD only) To get a dump of the pipelines (the result of the compiled SPIR-V), the `EnablePipelineDump=1` driver flag can be set. The files will typically be dumped to a directory called `spvPipeline` (on Linux, `/var/tmp/spvPipeline`). The dumped files include header information that can be used to map back to the source dispatch/SPIR-V, e.g.
```
[Version]
version = 57

[CsSpvFile]
fileName = Shader_0x946C08DFD0C10D9A.spv

[CsInfo]
entryPoint = forward_dispatch_193_matmul_256x65536x2304
```
75  docs/shark_sd_blender.md  Normal file
@@ -0,0 +1,75 @@
# Overview

This document is intended to provide a starting point for using SHARK Stable Diffusion with Blender.

We currently make use of the [AI-Render Plugin](https://github.com/benrugg/AI-Render) to integrate with Blender.

## Setup SHARK and prerequisites:

* Download the latest SHARK SD webui .exe from [here](https://github.com/nod-ai/SHARK/releases) or follow the instructions in the [README](https://github.com/nod-ai/SHARK#readme)
* Once you have the .exe where you would like SHARK to install, run the .exe from terminal/PowerShell with the `--api` flag:
```
## Run the .exe in API mode:
.\shark_sd_<date>_<ver>.exe --api

## For example:
.\shark_sd_20230411_671.exe --api --server_port=8082

## From the base directory of a source clone of SHARK:
./setup_venv.ps1
python apps\stable_diffusion\web\index.py --api

```

Your local SD server should start and look something like this:


* Note: When running in API mode with `--api`, the .exe will not function as a webUI. Thus, the address in the terminal output will only be useful for API requests.
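Before wiring up Blender, it can be worth confirming that the endpoint responds at all. The sketch below mirrors the `img2img` request used by `rest_api_tests/api_test.py` in this repo (AI-Render drives the same A1111-style `/sdapi/v1/img2img` route); the port, image path, prompt, and other parameters are placeholders for your own setup.

```python
import base64

import requests

# Placeholders: match the port to your --server_port and point at any local PNG.
url = "http://127.0.0.1:8082/sdapi/v1/img2img"
with open("scene_render.png", "rb") as f:
    init_image = "data:image/png;base64," + base64.b64encode(f.read()).decode()

data = {
    "prompt": "a bowl of tangerines in front of rocks, oil on canvas",
    "negative_prompt": "blurry, watermark",
    "init_images": [init_image],
    "height": 512,
    "width": 512,
    "steps": 50,
    "denoising_strength": 0.75,
    "cfg_scale": 7,
    "seed": 2121991605,
}

res = requests.post(url=url, json=data, timeout=1000)
print(res.status_code, res.reason)
```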

### Install AI Render

- Get AI Render on [Blender Market](https://blendermarket.com/products/ai-render) or [Gumroad](https://airender.gumroad.com/l/ai-render)
- Open Blender, then go to Edit > Preferences > Add-ons > Install and then find the zip file
- We will be using the Automatic1111 SD backend for the AI-Render plugin. Follow the instructions [here](https://github.com/benrugg/AI-Render/wiki/Local-Installation) to set up the local SD backend.

Your AI-Render preferences should be configured as shown; the highlighted part should match your terminal output:


The [AI-Render README](https://github.com/benrugg/AI-Render/blob/main/README.md) has more details on installation and usage, as well as video tutorials.

## Using AI-Render + SHARK in your Blender project

- In the Render Properties tab, in the AI-Render dropdown, enable AI-Render.


- Select an image size (it's usually better to upscale later than to go high on the img2img resolution here).


- From here, you can enter a prompt and configure img2img Stable Diffusion parameters, and AI-Render will run SHARK SD img2img on the rendered scene.
- AI-Render has useful presets for aesthetic styles, so you should be able to keep your subject prompt simple and focus on creating a decent Blender scene to start from.


## Examples:
Scene (Input image):


Prompt:
"A bowl of tangerines in front of rocks, masterpiece, oil on canvas, by Georgia O'Keefe, trending on artstation, landscape painting by Caspar David Friedrich"

Negative Prompt (default):
"ugly, bad art, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, blurry, bad anatomy, blurred, watermark, grainy, tiling, signature, cut off, draft"

Example output:


140  docs/shark_sd_koboldcpp.md  Normal file
@@ -0,0 +1,140 @@
# Overview

In [1.47.2](https://github.com/LostRuins/koboldcpp/releases/tag/v1.47.2), [Koboldcpp](https://github.com/LostRuins/koboldcpp) added AUTOMATIC1111 integration for image generation. Since SHARK implements a small subset of the A1111 REST API, you can also use SHARK for this. This document gives a starting point for how to get this working.

## In Action


## Memory considerations

Since both Koboldcpp and SHARK will use VRAM on your graphics card(s), running both at the same time on the same card will impose extra limitations on the model size you can fully offload to the video card in Koboldcpp. For me, on an RX 7900 XTX on Windows with 24 GiB of VRAM, the limit was about a 13 billion parameter model with Q5_K_M quantisation.

## Performance Considerations

When using SHARK for image generation, especially with Koboldcpp, you need to be aware that it is currently designed to pay a large upfront cost in time compiling and tuning the model you select, to get an optimal individual image generation time. You need to be the judge as to whether this trade-off is going to be worth it for your OS and hardware combination.

It means that the first time you run a particular Stable Diffusion model for a particular combination of image size, LoRA, and VAE, SHARK will spend *many minutes*, even on a beefy machine with a very fast graphics card and lots of memory, building that model combination just so it can save it to disk. It may even have to go away and download the model if it doesn't already have it locally. Once it has done its build of a model combination for your hardware, it shouldn't need to do it again until you upgrade to a newer SHARK version, install different drivers, or change your graphics hardware. It will just upload the files it generated the first time to your graphics card and proceed from there.

This does mean, however, that on a brand new install of SHARK, or with a model you haven't selected before, the first image Koboldcpp requests may look like it is *never* going to finish and that the whole process has broken. Be forewarned, make yourself a cup of coffee, and expect a lot of messages about compilation and tuning from SHARK in the terminal you ran it from.

## Setup SHARK and prerequisites:

* Make sure you have suitable drivers for your graphics card installed. See the prerequisites section of the [README](https://github.com/nod-ai/SHARK#readme).
* Download the latest SHARK Studio .exe from [here](https://github.com/nod-ai/SHARK/releases) or follow the instructions in the [README](https://github.com/nod-ai/SHARK#readme) for an advanced, Linux, or Mac install.
* Run SHARK from terminal/PowerShell with the `--api` flag. Since Koboldcpp also expects both CORS support and the image generator to be running on port `7860` rather than SHARK's default of `8080`, also include both the `--api_accept_origin` flag with a suitable origin (use `="*"` to enable all origins) and `--server_port=7860` on the command line. (See [Connecting to SHARK on a different address or port](#connecting-to-shark-on-a-different-address-or-port) if you want to run SHARK on a different port.)

```powershell
## Run the .exe in API mode, with CORS support, on the A1111 endpoint port:
.\node_ai_shark_studio_<date>_<ver>.exe --api --api_accept_origin="*" --server_port=7860

## Run from the base directory of a source clone of SHARK on Windows:
.\setup_venv.ps1
python .\apps\stable_diffusion\web\index.py --api --api_accept_origin="*" --server_port=7860

## Run from the base directory of a source clone of SHARK on Linux:
./setup_venv.sh
source shark.venv/bin/activate
python ./apps/stable_diffusion/web/index.py --api --api_accept_origin="*" --server_port=7860

## An example giving improved performance on AMD cards using vulkan, that runs on the same port as A1111
.\node_ai_shark_studio_20320901_2525.exe --api --api_accept_origin="*" --device_allocator="caching" --server_port=7860

## Since the api respects most applicable SHARK command line arguments for options not specified,
## or currently unimplemented by the API, there might be some you want to set, as listed in `--help`
.\node_ai_shark_studio_20320901_2525.exe --help

## For instance, the example above, but with a custom VAE specified
.\node_ai_shark_studio_20320901_2525.exe --api --api_accept_origin="*" --device_allocator="caching" --server_port=7860 --custom_vae="clearvae_v23.safetensors"

## An example with multiple specific CORS origins
python apps/stable_diffusion/web/index.py --api --api_accept_origin="koboldcpp.example.com:7001" --api_accept_origin="koboldcpp.example.com:7002" --server_port=7860
```

SHARK should start in server mode, and you should see something like this:


* Note: When running in API mode with `--api`, the .exe will not function as a webUI. Thus, the address or port shown in the terminal output will only be useful for API requests.


## Configure Koboldcpp for local image generation:

* Get the latest [Koboldcpp](https://github.com/LostRuins/koboldcpp/releases) if you don't already have it. If you have a recent AMD card that has ROCm HIP [support for Windows](https://rocmdocs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus) or [support for Linux](https://rocmdocs.amd.com/en/latest/release/gpu_os_support.html#linux-supported-gpus), you'll likely prefer [YellowRosecx's ROCm fork](https://github.com/YellowRoseCx/koboldcpp-rocm).
* Start Koboldcpp in another terminal/PowerShell and set up your model configuration. Refer to the [Koboldcpp README](https://github.com/YellowRoseCx/koboldcpp-rocm) for more details on how to do this if this is your first time using Koboldcpp.
* Once the main UI has loaded into your browser, click the settings button, go to the advanced tab, and then choose *Local A1111* from the generate images dropdown:



*If you get an error here, see the next section [below](#connecting-to-shark-on-a-different-address-or-port).*

* A list of the Stable Diffusion models available to your SHARK instance should now be shown in the box below *generate images*. The default value will usually be set to `stabilityai/stable-diffusion-2-1-base`. Choose the model you want to use for image generation from the list (but see [performance considerations](#performance-considerations)).
* You should now be ready to generate images, either by clicking the 'Add Img' button above the text entry box:


...or by selecting the 'Autogenerate' option in the settings:


*I often find that even if I have selected autogenerate, I have to do an 'Add Img' to get things started.*

* There is one final piece of image generation configuration within Koboldcpp you might want to do. This is also in the generate images section of the advanced settings. Here there is, not very obviously, a 'style' button:


This will bring up a dialog box where you can enter a short text that will be sent as a prefix to the prompt sent to SHARK:



## Connecting to SHARK on a different address or port

If you didn't set the port to `--server_port=7860` when starting SHARK, or you are running it on a different machine on your network than the one running Koboldcpp, or than the one where you are running Koboldcpp's kdlite client frontend, then you very likely got the following error:


As long as SHARK is running correctly, this means you need to set the URL and port to the correct values in Koboldcpp. For instance, to point Koboldcpp at SHARK's default port of 8080:

* Select the cog icon in the Generate Images section of Advanced settings:


* Then edit the port number at the end of the URL in the 'A1111 Endpoint Selection' dialog box to read 8080:


* Similarly, when running SHARK on a different machine, change the host part of the endpoint URL to the hostname or IP address where SHARK is running:


## Examples

Here's how Koboldcpp shows an image being requested:


The generated image in context in story mode:


And the same image when clicked on:



## Where to find the images in SHARK

Even though Koboldcpp requests images at a size of 512x512, it resizes them to 256x256, converts them to `.jpeg`, and only shows them at 200x200 in the main text window. It does this so it can save them compactly, embedded in your story as a `data://` URI.
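If you are talking to the endpoint directly yourself (rather than through Koboldcpp), the full-size image comes back base64-encoded in the JSON response and can be decoded the same way the commented-out snippet in `rest_api_tests/api_test.py` does. A sketch, assuming `res` is the `requests` response from an `/sdapi/v1/img2img`-style call:

```python
import base64
from io import BytesIO

from PIL import Image

# `res` is assumed to be the requests.Response from an /sdapi/v1/... call,
# as in rest_api_tests/api_test.py; the key layout below mirrors that script.
response_obj = res.json()
img_b64 = response_obj.get("images", [False])[0] or response_obj.get("image")
img_bytes = base64.b64decode(img_b64.replace("data:image/png;base64,", ""))
full_size_img = Image.open(BytesIO(img_bytes))
full_size_img.save("full_size_output.png")
```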

However, the images at the original size are saved by SHARK in its `output_dir`, which is usually a folder named for the current date inside the `generated_imgs` folder in the SHARK installation directory.

You can browse these either using the Output Gallery tab from within the SHARK web UI:


...or by browsing to the `output_dir` in your operating system's file manager:


@@ -1,192 +0,0 @@
|
||||
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
cmake_minimum_required(VERSION 3.17)
|
||||
|
||||
project(sharkbackend LANGUAGES C CXX)
|
||||
|
||||
#
|
||||
# Options
|
||||
#
|
||||
|
||||
option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON)
|
||||
option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)
|
||||
|
||||
set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
|
||||
set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
|
||||
set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")
|
||||
|
||||
if(NOT CMAKE_BUILD_TYPE)
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
endif()
|
||||
|
||||
#
|
||||
# Dependencies
|
||||
#
|
||||
# FetchContent requires us to include the transitive closure of all
|
||||
# repos that we depend on so that we can override the tags.
|
||||
#
|
||||
include(FetchContent)
|
||||
|
||||
FetchContent_Declare(
|
||||
repo-common
|
||||
GIT_REPOSITORY https://github.com/triton-inference-server/common.git
|
||||
GIT_TAG ${TRITON_COMMON_REPO_TAG}
|
||||
GIT_SHALLOW ON
|
||||
)
|
||||
FetchContent_Declare(
|
||||
repo-core
|
||||
GIT_REPOSITORY https://github.com/triton-inference-server/core.git
|
||||
GIT_TAG ${TRITON_CORE_REPO_TAG}
|
||||
GIT_SHALLOW ON
|
||||
)
|
||||
FetchContent_Declare(
|
||||
repo-backend
|
||||
GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
|
||||
GIT_TAG ${TRITON_BACKEND_REPO_TAG}
|
||||
GIT_SHALLOW ON
|
||||
)
|
||||
FetchContent_MakeAvailable(repo-common repo-core repo-backend)
|
||||
|
||||
#
|
||||
# The backend must be built into a shared library. Use an ldscript to
|
||||
# hide all symbols except for the TRITONBACKEND API.
|
||||
#
|
||||
configure_file(src/libtriton_dshark.ldscript libtriton_dshark.ldscript COPYONLY)
|
||||
|
||||
add_library(
|
||||
triton-dshark-backend SHARED
|
||||
src/dshark.cc
|
||||
#src/dshark_driver_module.c
|
||||
)
|
||||
|
||||
add_library(
|
||||
SharkBackend::triton-dshark-backend ALIAS triton-dshark-backend
|
||||
)
|
||||
|
||||
target_include_directories(
|
||||
triton-dshark-backend
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/src
|
||||
)
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${PROJECT_BINARY_DIR}/lib/cmake/mlir")
|
||||
|
||||
add_subdirectory(thirdparty/shark-runtime EXCLUDE_FROM_ALL)
|
||||
|
||||
target_link_libraries(triton-dshark-backend PRIVATE iree_base_base
|
||||
iree_hal_hal
|
||||
iree_hal_cuda_cuda
|
||||
iree_hal_cuda_registration_registration
|
||||
iree_hal_vmvx_registration_registration
|
||||
iree_hal_dylib_registration_registration
|
||||
iree_modules_hal_hal
|
||||
iree_vm_vm
|
||||
iree_vm_bytecode_module
|
||||
iree_hal_local_loaders_system_library_loader
|
||||
iree_hal_local_loaders_vmvx_module_loader
|
||||
)
|
||||
|
||||
target_compile_features(triton-dshark-backend PRIVATE cxx_std_11)
|
||||
|
||||
|
||||
target_link_libraries(
|
||||
triton-dshark-backend
|
||||
PRIVATE
|
||||
triton-core-serverapi # from repo-core
|
||||
triton-core-backendapi # from repo-core
|
||||
triton-core-serverstub # from repo-core
|
||||
triton-backend-utils # from repo-backend
|
||||
)
|
||||
|
||||
if(WIN32)
|
||||
set_target_properties(
|
||||
triton-dshark-backend PROPERTIES
|
||||
POSITION_INDEPENDENT_CODE ON
|
||||
OUTPUT_NAME triton_dshark
|
||||
)
|
||||
else()
|
||||
set_target_properties(
|
||||
triton-dshark-backend PROPERTIES
|
||||
POSITION_INDEPENDENT_CODE ON
|
||||
OUTPUT_NAME triton_dshark
|
||||
LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_dshark.ldscript
|
||||
LINK_FLAGS "-Wl,--version-script libtriton_dshark.ldscript"
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Install
|
||||
#
|
||||
include(GNUInstallDirs)
|
||||
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/SharkBackend)
|
||||
|
||||
install(
|
||||
TARGETS
|
||||
triton-dshark-backend
|
||||
EXPORT
|
||||
triton-dshark-backend-targets
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/dshark
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/dshark
|
||||
)
|
||||
|
||||
install(
|
||||
EXPORT
|
||||
triton-dshark-backend-targets
|
||||
FILE
|
||||
SharkBackendTargets.cmake
|
||||
NAMESPACE
|
||||
SharkBackend::
|
||||
DESTINATION
|
||||
${INSTALL_CONFIGDIR}
|
||||
)
|
||||
|
||||
include(CMakePackageConfigHelpers)
|
||||
configure_package_config_file(
|
||||
${CMAKE_CURRENT_LIST_DIR}/cmake/SharkBackendConfig.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/SharkBackendConfig.cmake
|
||||
INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
|
||||
)
|
||||
|
||||
install(
|
||||
FILES
|
||||
${CMAKE_CURRENT_BINARY_DIR}/SharkBackendConfig.cmake
|
||||
DESTINATION ${INSTALL_CONFIGDIR}
|
||||
)
|
||||
|
||||
#
|
||||
# Export from build tree
|
||||
#
|
||||
export(
|
||||
EXPORT triton-dshark-backend-targets
|
||||
FILE ${CMAKE_CURRENT_BINARY_DIR}/SharkBackendTargets.cmake
|
||||
NAMESPACE SharkBackend::
|
||||
)
|
||||
|
||||
export(PACKAGE SharkBackend)
|
||||
|
||||
@@ -1,100 +0,0 @@
|
||||
# SHARK Triton Backend
|
||||
|
||||
The Triton backend for SHARK.
|
||||
|
||||
# Build
|
||||
|
||||
Install SHARK
|
||||
|
||||
```
|
||||
git clone https://github.com/nod-ai/SHARK.git
|
||||
# skip above step if dshark is already installed
|
||||
cd SHARK/inference
|
||||
```
|
||||
|
||||
Install the dependencies
|
||||
|
||||
```
|
||||
apt-get install patchelf rapidjson-dev python3-dev
|
||||
git submodule update --init
|
||||
```
|
||||
|
||||
Update the submodules of IREE
|
||||
|
||||
```
|
||||
cd thirdparty/shark-runtime
|
||||
git submodule update --init
|
||||
```
|
||||
|
||||
Next, make the backend and install it
|
||||
|
||||
```
|
||||
cd ../..
|
||||
mkdir build && cd build
|
||||
cmake -DTRITON_ENABLE_GPU=ON \
|
||||
-DIREE_HAL_DRIVER_CUDA=ON \
|
||||
-DIREE_TARGET_BACKEND_CUDA=ON \
|
||||
-DMLIR_ENABLE_CUDA_RUNNER=ON \
|
||||
-DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \
|
||||
-DTRITON_BACKEND_REPO_TAG=r22.02 \
|
||||
-DTRITON_CORE_REPO_TAG=r22.02 \
|
||||
-DTRITON_COMMON_REPO_TAG=r22.02 ..
|
||||
make install
|
||||
```
|
||||
|
||||
# Incorporating into Triton
|
||||
|
||||
There are much more in-depth explanations of the following steps in Triton's documentation:
|
||||
https://github.com/triton-inference-server/server/blob/main/docs/compose.md#triton-with-unsupported-and-custom-backends
|
||||
|
||||
There should be a file at `/build/install/backends/dshark/libtriton_dshark.so`. You will need to copy it into your Triton server image.
More documentation is in the link above, but to create the Docker image you need to run the `compose.py` command in the Triton server repo.
|
||||
|
||||
|
||||
To first build your image, clone the tritonserver repo.
|
||||
|
||||
```
|
||||
git clone https://github.com/triton-inference-server/server.git
|
||||
```
|
||||
|
||||
Then run `compose.py` to generate a `Dockerfile.compose`:
|
||||
```
|
||||
cd server
|
||||
python3 compose.py --repoagent checksum --dry-run
|
||||
```
|
||||
|
||||
Because dshark is a third-party backend, you will need to manually modify the generated `Dockerfile.compose` to include it. To do this, add the line below to the `Dockerfile.compose` that was produced.
The dshark backend will be located in the build folder from earlier, under `/build/install/backends`.
|
||||
|
||||
```
|
||||
COPY /path/to/build/install/backends/dshark /opt/tritonserver/backends/dshark
|
||||
```
|
||||
|
||||
Next run
|
||||
```
|
||||
docker build -t tritonserver_custom -f Dockerfile.compose .
|
||||
docker run -it --gpus=1 --net=host -v/path/to/model_repos:/models tritonserver_custom:latest tritonserver --model-repository=/models
|
||||
```
|
||||
|
||||
Here `/path/to/model_repos` is the directory where you are storing the models you want to run.
|
||||
|
||||
If you're not using GPUs, omit `--gpus=1`:
|
||||
|
||||
```
|
||||
docker run -it --net=host -v/path/to/model_repos:/models tritonserver_custom:latest tritonserver --model-repository=/models
|
||||
```
|
||||
|
||||
# Setting up a model
|
||||
|
||||
To include a model in your backend, add a directory with your model name to your model repository directory. Examples of models can be seen here: https://github.com/triton-inference-server/backend/tree/main/examples/model_repos/minimal_models
|
||||
|
||||
Make sure to adjust the inputs correctly in the `config.pbtxt` file, and save a compiled `.vmfb` under `1/model.vmfb`; a sketch of what such a config might look like is shown below.
|
||||
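For illustration only, a minimal `config.pbtxt` along the lines of the Triton minimal_models examples might look like the following. The model name, tensor names, shapes, and data types are placeholders that must match your compiled model, and `backend: "dshark"` assumes the backend library built above (`libtriton_dshark.so`) is installed under `backends/dshark`.

```
name: "mymodel"
backend: "dshark"
max_batch_size: 0
input [
  {
    name: "input0"
    data_type: TYPE_FP32
    dims: [ 1, 3, 224, 224 ]
  }
]
output [
  {
    name: "output0"
    data_type: TYPE_FP32
    dims: [ 1, 1000 ]
  }
]
```

The on-disk layout for this example would then be `/path/to/model_repos/mymodel/config.pbtxt` plus `/path/to/model_repos/mymodel/1/model.vmfb`.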
|
||||
# CUDA
|
||||
|
||||
If you're having issues with CUDA, make sure the correct drivers are installed, that `nvidia-smi` works, and that the `nvcc` compiler is on the path.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,39 +0,0 @@
|
||||
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
include(CMakeFindDependencyMacro)
|
||||
|
||||
get_filename_component(
|
||||
SHARKBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
|
||||
)
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH ${SHARKBACKEND_CMAKE_DIR})
|
||||
|
||||
if(NOT TARGET SharkBackend::triton-dshark-backend)
|
||||
include("${SHARKBACKEND_CMAKE_DIR}/SharkBackendTargets.cmake")
|
||||
endif()
|
||||
|
||||
set(SHARKBACKEND_LIBRARIES SharkBackend::triton-dshark-backend)
|
||||
File diff suppressed because it is too large
@@ -1,30 +0,0 @@
|
||||
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
{
|
||||
global:
|
||||
TRITONBACKEND_*;
|
||||
local: *;
|
||||
};
|
||||
1  inference/thirdparty/shark-runtime  vendored
Submodule inference/thirdparty/shark-runtime deleted from 7b82d90c72
66  process_skipfiles.py  Normal file
@@ -0,0 +1,66 @@
|
||||
# This script will toggle the comment/uncommenting aspect for dealing
|
||||
# with __file__ AttributeError arising in case of a few modules in
|
||||
# `torch/_dynamo/skipfiles.py` (within shark.venv)
|
||||
|
||||
from distutils.sysconfig import get_python_lib
|
||||
import fileinput
|
||||
from pathlib import Path
|
||||
|
||||
# Temporary workaround for transformers/__init__.py.
|
||||
path_to_transformers_hook = Path(
|
||||
get_python_lib() + "/_pyinstaller_hooks_contrib/hooks/stdhooks/hook-transformers.py"
|
||||
)
|
||||
if path_to_transformers_hook.is_file():
|
||||
pass
|
||||
else:
|
||||
with open(path_to_transformers_hook, "w") as f:
|
||||
f.write("module_collection_mode = 'pyz+py'")
|
||||
|
||||
path_to_skipfiles = Path(get_python_lib() + "/torch/_dynamo/skipfiles.py")
|
||||
|
||||
modules_to_comment = ["abc,", "os,", "posixpath,", "_collections_abc,"]
|
||||
startMonitoring = 0
|
||||
for line in fileinput.input(path_to_skipfiles, inplace=True):
|
||||
if "SKIP_DIRS = " in line:
|
||||
startMonitoring = 1
|
||||
print(line, end="")
|
||||
elif startMonitoring in [1, 2]:
|
||||
if "]" in line:
|
||||
startMonitoring += 1
|
||||
print(line, end="")
|
||||
else:
|
||||
flag = True
|
||||
for module in modules_to_comment:
|
||||
if module in line:
|
||||
if not line.startswith("#"):
|
||||
print(f"#{line}", end="")
|
||||
else:
|
||||
print(f"{line[1:]}", end="")
|
||||
flag = False
|
||||
break
|
||||
if flag:
|
||||
print(line, end="")
|
||||
else:
|
||||
print(line, end="")
|
||||
|
||||
# For getting around scikit-image's packaging: lazy_loader has had a patch merged but not yet released.
|
||||
# Refer: https://github.com/scientific-python/lazy_loader
|
||||
path_to_lazy_loader = Path(get_python_lib() + "/lazy_loader/__init__.py")
|
||||
|
||||
for line in fileinput.input(path_to_lazy_loader, inplace=True):
|
||||
if 'stubfile = filename if filename.endswith("i")' in line:
|
||||
print(
|
||||
' stubfile = (filename if filename.endswith("i") else f"{os.path.splitext(filename)[0]}.pyi")',
|
||||
end="",
|
||||
)
|
||||
else:
|
||||
print(line, end="")
|
||||
|
||||
# For getting around timm's packaging.
|
||||
# Refer: https://github.com/pyinstaller/pyinstaller/issues/5673#issuecomment-808731505
|
||||
path_to_timm_activations = Path(get_python_lib() + "/timm/layers/activations_jit.py")
|
||||
for line in fileinput.input(path_to_timm_activations, inplace=True):
|
||||
if "@torch.jit.script" in line:
|
||||
print("@torch.jit._script_if_tracing", end="\n")
|
||||
else:
|
||||
print(line, end="")
|
||||
@@ -4,9 +4,26 @@ requires = [
|
||||
"wheel",
|
||||
"packaging",
|
||||
|
||||
"numpy==1.22.4",
|
||||
"torch-mlir>=20220428.420",
|
||||
"iree-compiler>=20220427.13",
|
||||
"iree-runtime>=20220427.13",
|
||||
"numpy>=1.22.4",
|
||||
"iree-compiler>=20221022.190",
|
||||
"iree-runtime>=20221022.190",
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[tool.black]
|
||||
include = '\.pyi?$'
|
||||
exclude = '''
|
||||
(
|
||||
/(
|
||||
| apps/stable_diffusion
|
||||
| apps/language_models
|
||||
| shark
|
||||
| benchmarks
|
||||
| tank
|
||||
| build
|
||||
| generated_imgs
|
||||
| shark.venv
|
||||
)/
|
||||
| setup.py
|
||||
)
|
||||
'''
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
[pytest]
|
||||
addopts = --verbose -p no:warnings
|
||||
norecursedirs = inference tank/tflite
|
||||
addopts = --verbose -s -p no:warnings
|
||||
norecursedirs = inference tank/tflite examples benchmarks shark apps/shark_studio
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
|
||||
-f https://download.pytorch.org/whl/nightly/cpu/
|
||||
--pre
|
||||
|
||||
numpy
|
||||
@@ -8,23 +8,17 @@ torchvision
|
||||
tqdm
|
||||
|
||||
#iree-compiler | iree-runtime should already be installed
|
||||
# these don't work on OSX
|
||||
#iree-tools-tflite
|
||||
#iree-tools-xla
|
||||
#iree-tools-tf
|
||||
|
||||
# TensorFlow and JAX.
|
||||
gin-config
|
||||
tensorflow-macos
|
||||
tensorflow-metal
|
||||
#tf-models-nightly
|
||||
#tensorflow-text-nightly
|
||||
transformers==4.18.0
|
||||
transformers
|
||||
#jax[cpu]
|
||||
|
||||
# tflitehub dependencies.
|
||||
Pillow
|
||||
|
||||
# web dependencies.
|
||||
gradio
|
||||
altair
|
||||
|
||||
# Testing and support.
|
||||
#lit
|
||||
#pyyaml
|
||||
|
||||
@@ -1,39 +1,41 @@
|
||||
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
|
||||
--pre
|
||||
|
||||
numpy==1.22.4
|
||||
torch
|
||||
torchvision
|
||||
numpy>1.22.4
|
||||
pytorch-triton
|
||||
torchvision
|
||||
tabulate
|
||||
|
||||
tqdm
|
||||
|
||||
#iree-compiler | iree-runtime should already be installed
|
||||
iree-tools-tflite
|
||||
iree-tools-xla
|
||||
iree-tools-tf
|
||||
|
||||
# TensorFlow and JAX.
|
||||
# Modelling and JAX.
|
||||
gin-config
|
||||
tensorflow
|
||||
tf-models-nightly
|
||||
tensorflow-text-nightly
|
||||
transformers==4.18.0
|
||||
transformers
|
||||
diffusers
|
||||
#jax[cpu]
|
||||
|
||||
|
||||
# tflitehub dependencies.
|
||||
Pillow
|
||||
|
||||
# Testing and support.
|
||||
lit
|
||||
pyyaml
|
||||
python-dateutil
|
||||
sacremoses
|
||||
sentencepiece
|
||||
|
||||
# web dependencies.
|
||||
gradio==3.44.3
|
||||
altair
|
||||
scipy
|
||||
|
||||
#ONNX and ORT for benchmarking
|
||||
--extra-index-url https://test.pypi.org/simple/
|
||||
protobuf
|
||||
coloredlogs
|
||||
flatbuffers
|
||||
sympy
|
||||
psutil
|
||||
onnx-weekly
|
||||
ort-nightly
|
||||
#--extra-index-url https://test.pypi.org/simple/
|
||||
#protobuf
|
||||
#coloredlogs
|
||||
#flatbuffers
|
||||
#sympy
|
||||
#psutil
|
||||
#onnx-weekly
|
||||
#ort-nightly
|
||||
|
||||
@@ -1,9 +1,54 @@
|
||||
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
|
||||
-f https://openxla.github.io/iree/pip-release-links.html
|
||||
--pre
|
||||
|
||||
setuptools
|
||||
wheel
|
||||
|
||||
#SHARK Runner
|
||||
shark-turbine @ git+https://github.com/nod-ai/SHARK-Turbine.git@main
|
||||
turbine-models @ git+https://github.com/nod-ai/SHARK-Turbine#egg=turbine-models&subdirectory=python/turbine_models
|
||||
|
||||
# SHARK Runner
|
||||
tqdm
|
||||
|
||||
#Testing
|
||||
# SHARK Downloader
|
||||
google-cloud-storage
|
||||
|
||||
# Testing
|
||||
pytest
|
||||
pytest-xdist
|
||||
pytest-forked
|
||||
Pillow
|
||||
parameterized
|
||||
|
||||
# Add transformers, diffusers and scipy since it most commonly used
|
||||
#accelerate is now required for diffusers import from ckpt.
|
||||
accelerate
|
||||
scipy
|
||||
ftfy
|
||||
gradio==4.8.0
|
||||
altair
|
||||
omegaconf
|
||||
# 0.3.2 doesn't have binaries for arm64
|
||||
safetensors==0.3.1
|
||||
opencv-python
|
||||
scikit-image
|
||||
pytorch_lightning # for runwayml models
|
||||
tk
|
||||
pywebview
|
||||
sentencepiece
|
||||
py-cpuinfo
|
||||
tiktoken # for codegen
|
||||
joblib # for langchain
|
||||
timm # for MiniGPT4
|
||||
langchain
|
||||
einops # for zoedepth
|
||||
pydantic==2.4.1 # pin until pyinstaller-hooks-contrib works with beta versions
|
||||
|
||||
# Keep PyInstaller at the end. Sometimes Windows Defender flags it but most folks can continue even if it errors
|
||||
pefile
|
||||
pyinstaller
|
||||
|
||||
# For quantized GPTQ models
|
||||
optimum
|
||||
auto_gptq
|
||||
|
||||
348  rest_api_tests/api_test.py  Normal file
@@ -0,0 +1,348 @@
|
||||
import requests
|
||||
from PIL import Image
|
||||
import base64
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
def upscaler_test(verbose=False):
|
||||
# Define values here
|
||||
prompt = ""
|
||||
negative_prompt = ""
|
||||
seed = 2121991605
|
||||
height = 512
|
||||
width = 512
|
||||
steps = 50
|
||||
noise_level = 10
|
||||
cfg_scale = 7
|
||||
image_path = r"./rest_api_tests/dog.png"
|
||||
|
||||
# Converting Image to base64
|
||||
img_file = open(image_path, "rb")
|
||||
init_images = [
|
||||
"data:image/png;base64," + base64.b64encode(img_file.read()).decode()
|
||||
]
|
||||
|
||||
url = "http://127.0.0.1:8080/sdapi/v1/upscaler"
|
||||
|
||||
headers = {
|
||||
"User-Agent": "PythonTest",
|
||||
"Accept": "*/*",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
}
|
||||
|
||||
data = {
|
||||
"prompt": prompt,
|
||||
"negative_prompt": negative_prompt,
|
||||
"seed": seed,
|
||||
"height": height,
|
||||
"width": width,
|
||||
"steps": steps,
|
||||
"noise_level": noise_level,
|
||||
"cfg_scale": cfg_scale,
|
||||
"init_images": init_images,
|
||||
}
|
||||
|
||||
res = requests.post(url=url, json=data, headers=headers, timeout=1000)
|
||||
|
||||
print(f"[upscaler] response from server was : {res.status_code} {res.reason}")
|
||||
|
||||
if verbose or res.status_code != 200:
|
||||
print(f"\n{res.json()['info'] if res.status_code == 200 else res.content}\n")
|
||||
|
||||
|
||||
def img2img_test(verbose=False):
|
||||
# Define values here
|
||||
prompt = "Paint a rabbit riding on the dog"
|
||||
negative_prompt = "ugly, bad art, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, blurry, bad anatomy, blurred, watermark, grainy, tiling, signature, cut off, draft"
|
||||
seed = 2121991605
|
||||
height = 512
|
||||
width = 512
|
||||
steps = 50
|
||||
denoising_strength = 0.75
|
||||
cfg_scale = 7
|
||||
image_path = r"./rest_api_tests/dog.png"
|
||||
|
||||
# Converting Image to Base64
|
||||
img_file = open(image_path, "rb")
|
||||
init_images = [
|
||||
"data:image/png;base64," + base64.b64encode(img_file.read()).decode()
|
||||
]
|
||||
|
||||
url = "http://127.0.0.1:8080/sdapi/v1/img2img"
|
||||
|
||||
headers = {
|
||||
"User-Agent": "PythonTest",
|
||||
"Accept": "*/*",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
}
|
||||
|
||||
data = {
|
||||
"prompt": prompt,
|
||||
"negative_prompt": negative_prompt,
|
||||
"init_images": init_images,
|
||||
"height": height,
|
||||
"width": width,
|
||||
"steps": steps,
|
||||
"denoising_strength": denoising_strength,
|
||||
"cfg_scale": cfg_scale,
|
||||
"seed": seed,
|
||||
}
|
||||
|
||||
res = requests.post(url=url, json=data, headers=headers, timeout=1000)
|
||||
|
||||
print(f"[img2img] response from server was : {res.status_code} {res.reason}")
|
||||
|
||||
if verbose or res.status_code != 200:
|
||||
print(f"\n{res.json()['info'] if res.status_code == 200 else res.content}\n")
|
||||
|
||||
# NOTE Uncomment below to save the picture
|
||||
|
||||
# print("Extracting response object")
|
||||
# response_obj = res.json()
|
||||
# img_b64 = response_obj.get("images", [False])[0] or response_obj.get(
|
||||
# "image"
|
||||
# )
|
||||
# img_b2 = base64.b64decode(img_b64.replace("data:image/png;base64,", ""))
|
||||
# im_file = BytesIO(img_b2)
|
||||
# response_img = Image.open(im_file)
|
||||
# print("Saving Response Image to: response_img")
|
||||
# response_img.save(r"rest_api_tests/response_img.png")
|
||||
|
||||
|
||||
def inpainting_test(verbose=False):
|
||||
prompt = "Paint a rabbit riding on the dog"
|
||||
negative_prompt = "ugly, bad art, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, blurry, bad anatomy, blurred, watermark, grainy, tiling, signature, cut off, draft"
|
||||
seed = 2121991605
|
||||
height = 512
|
||||
width = 512
|
||||
steps = 50
|
||||
noise_level = 10
|
||||
cfg_scale = 7
|
||||
is_full_res = False
|
||||
full_res_padding = 32
|
||||
image_path = r"./rest_api_tests/dog.png"
|
||||
|
||||
img_file = open(image_path, "rb")
|
||||
image = "data:image/png;base64," + base64.b64encode(img_file.read()).decode()
|
||||
img_file = open(image_path, "rb")
|
||||
mask = "data:image/png;base64," + base64.b64encode(img_file.read()).decode()
|
||||
|
||||
url = "http://127.0.0.1:8080/sdapi/v1/inpaint"
|
||||
|
||||
headers = {
|
||||
"User-Agent": "PythonTest",
|
||||
"Accept": "*/*",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
}
|
||||
|
||||
data = {
|
||||
"prompt": prompt,
|
||||
"negative_prompt": negative_prompt,
|
||||
"image": image,
|
||||
"mask": mask,
|
||||
"height": height,
|
||||
"width": width,
|
||||
"steps": steps,
|
||||
"noise_level": noise_level,
|
||||
"cfg_scale": cfg_scale,
|
||||
"seed": seed,
|
||||
"is_full_res": is_full_res,
|
||||
"full_res_padding": full_res_padding,
|
||||
}
|
||||
|
||||
res = requests.post(url=url, json=data, headers=headers, timeout=1000)
|
||||
|
||||
print(f"[inpaint] response from server was : {res.status_code} {res.reason}")
|
||||
|
||||
if verbose or res.status_code != 200:
|
||||
print(f"\n{res.json()['info'] if res.status_code == 200 else res.content}\n")
|
||||
|
||||
|
||||
def outpainting_test(verbose=False):
|
||||
prompt = "Paint a rabbit riding on the dog"
|
||||
negative_prompt = "ugly, bad art, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, blurry, bad anatomy, blurred, watermark, grainy, tiling, signature, cut off, draft"
|
||||
seed = 2121991605
|
||||
height = 512
|
||||
width = 512
|
||||
steps = 50
|
||||
cfg_scale = 7
|
||||
color_variation = 0.2
|
||||
noise_q = 0.2
|
||||
directions = ["up", "down", "right", "left"]
|
||||
pixels = 32
|
||||
mask_blur = 64
|
||||
image_path = r"./rest_api_tests/dog.png"
|
||||
|
||||
# Converting Image to Base64
|
||||
img_file = open(image_path, "rb")
|
||||
init_images = [
|
||||
"data:image/png;base64," + base64.b64encode(img_file.read()).decode()
|
||||
]
|
||||
|
||||
url = "http://127.0.0.1:8080/sdapi/v1/outpaint"
|
||||
|
||||
headers = {
|
||||
"User-Agent": "PythonTest",
|
||||
"Accept": "*/*",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
}
|
||||
|
||||
data = {
|
||||
"prompt": prompt,
|
||||
"negative_prompt": negative_prompt,
|
||||
"seed": seed,
|
||||
"height": height,
|
||||
"width": width,
|
||||
"steps": steps,
|
||||
"cfg_scale": cfg_scale,
|
||||
"color_variation": color_variation,
|
||||
"noise_q": noise_q,
|
||||
"directions": directions,
|
||||
"pixels": pixels,
|
||||
"mask_blur": mask_blur,
|
||||
"init_images": init_images,
|
||||
}
|
||||
|
||||
res = requests.post(url=url, json=data, headers=headers, timeout=1000)
|
||||
|
||||
print(f"[outpaint] response from server was : {res.status_code} {res.reason}")
|
||||
|
||||
if verbose or res.status_code != 200:
|
||||
print(f"\n{res.json()['info'] if res.status_code == 200 else res.content}\n")
|
||||
|
||||
|
||||
def txt2img_test(verbose=False):
|
||||
prompt = "Paint a rabbit in a top hate"
|
||||
negative_prompt = "ugly, bad art, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, blurry, bad anatomy, blurred, watermark, grainy, tiling, signature, cut off, draft"
|
||||
seed = 2121991605
|
||||
height = 512
|
||||
width = 512
|
||||
steps = 50
|
||||
cfg_scale = 7
|
||||
|
||||
url = "http://127.0.0.1:8080/sdapi/v1/txt2img"
|
||||
|
||||
headers = {
|
||||
"User-Agent": "PythonTest",
|
||||
"Accept": "*/*",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
}
|
||||
|
||||
data = {
|
||||
"prompt": prompt,
|
||||
"negative_prompt": negative_prompt,
|
||||
"seed": seed,
|
||||
"height": height,
|
||||
"width": width,
|
||||
"steps": steps,
|
||||
"cfg_scale": cfg_scale,
|
||||
}
|
||||
|
||||
res = requests.post(url=url, json=data, headers=headers, timeout=1000)
|
||||
|
||||
print(f"[txt2img] response from server was : {res.status_code} {res.reason}")
|
||||
|
||||
if verbose or res.status_code != 200:
|
||||
print(f"\n{res.json()['info'] if res.status_code == 200 else res.content}\n")
|
||||
|
||||
|
||||
def sd_models_test(verbose=False):
|
||||
url = "http://127.0.0.1:8080/sdapi/v1/sd-models"
|
||||
|
||||
headers = {
|
||||
"User-Agent": "PythonTest",
|
||||
"Accept": "*/*",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
}
|
||||
|
||||
res = requests.get(url=url, headers=headers, timeout=1000)
|
||||
|
||||
print(f"[sd_models] response from server was : {res.status_code} {res.reason}")
|
||||
|
||||
if verbose or res.status_code != 200:
|
||||
print(f"\n{res.json() if res.status_code == 200 else res.content}\n")
|
||||
|
||||
|
||||
def sd_samplers_test(verbose=False):
|
||||
url = "http://127.0.0.1:8080/sdapi/v1/samplers"
|
||||
|
||||
headers = {
|
||||
"User-Agent": "PythonTest",
|
||||
"Accept": "*/*",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
}
|
||||
|
||||
res = requests.get(url=url, headers=headers, timeout=1000)
|
||||
|
||||
print(f"[sd_samplers] response from server was : {res.status_code} {res.reason}")
|
||||
|
||||
if verbose or res.status_code != 200:
|
||||
print(f"\n{res.json() if res.status_code == 200 else res.content}\n")
|
||||
|
||||
|
||||
def options_test(verbose=False):
|
||||
url = "http://127.0.0.1:8080/sdapi/v1/options"
|
||||
|
||||
headers = {
|
||||
"User-Agent": "PythonTest",
|
||||
"Accept": "*/*",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
}
|
||||
|
||||
res = requests.get(url=url, headers=headers, timeout=1000)
|
||||
|
||||
print(f"[options] response from server was : {res.status_code} {res.reason}")
|
||||
|
||||
if verbose or res.status_code != 200:
|
||||
print(f"\n{res.json() if res.status_code == 200 else res.content}\n")
|
||||
|
||||
|
||||
def cmd_flags_test(verbose=False):
|
||||
url = "http://127.0.0.1:8080/sdapi/v1/cmd-flags"
|
||||
|
||||
headers = {
|
||||
"User-Agent": "PythonTest",
|
||||
"Accept": "*/*",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
}
|
||||
|
||||
res = requests.get(url=url, headers=headers, timeout=1000)
|
||||
|
||||
print(f"[cmd-flags] response from server was : {res.status_code} {res.reason}")
|
||||
|
||||
if verbose or res.status_code != 200:
|
||||
print(f"\n{res.json() if res.status_code == 200 else res.content}\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description=(
|
||||
"Exercises the Stable Diffusion REST API of Shark. Make sure "
|
||||
"Shark is running in API mode on 127.0.0.1:8080 before running"
|
||||
"this script."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
help=(
|
||||
"also display selected info from the JSON response for "
|
||||
"successful requests"
|
||||
),
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
sd_models_test(args.verbose)
|
||||
sd_samplers_test(args.verbose)
|
||||
options_test(args.verbose)
|
||||
cmd_flags_test(args.verbose)
|
||||
txt2img_test(args.verbose)
|
||||
img2img_test(args.verbose)
|
||||
upscaler_test(args.verbose)
|
||||
inpainting_test(args.verbose)
|
||||
outpainting_test(args.verbose)
|
||||
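A minimal way to exercise the new test script, assuming a SHARK server is already running in API mode on 127.0.0.1:8080 (the script only sends requests; it does not start the server):

```
# Run from the repository root so the relative ./rest_api_tests/dog.png path resolves.
python rest_api_tests/api_test.py            # status-code checks only
python rest_api_tests/api_test.py --verbose  # also print each endpoint's JSON info
```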
BIN
rest_api_tests/dog.png
Normal file
Binary file not shown.
|
14
setup.py
@@ -2,11 +2,13 @@ from setuptools import find_packages
|
||||
from setuptools import setup
|
||||
|
||||
import os
|
||||
import glob
|
||||
|
||||
with open("README.md", "r", encoding="utf-8") as fh:
|
||||
long_description = fh.read()
|
||||
|
||||
PACKAGE_VERSION = os.environ.get("SHARK_PACKAGE_VERSION") or "0.0.4"
|
||||
PACKAGE_VERSION = os.environ.get("SHARK_PACKAGE_VERSION") or "0.0.5"
|
||||
backend_deps = []
|
||||
|
||||
setup(
|
||||
name="nodai-SHARK",
|
||||
@@ -26,13 +28,11 @@ setup(
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
],
|
||||
packages=find_packages(exclude=('examples')),
|
||||
python_requires=">=3.7",
|
||||
packages=find_packages(exclude=("examples")),
|
||||
python_requires=">=3.9",
|
||||
data_files=glob.glob("apps/stable_diffusion/resources/**"),
|
||||
install_requires=[
|
||||
"numpy",
|
||||
"PyYAML",
|
||||
"torch-mlir>=20220428.420",
|
||||
"iree-compiler>=20220427.13",
|
||||
"iree-runtime>=20220427.13",
|
||||
],
|
||||
]
|
||||
)
|
||||
|
||||
97
setup_venv.ps1
Normal file
@@ -0,0 +1,97 @@
|
||||
<#
|
||||
.SYNOPSIS
|
||||
A script to update and install the SHARK runtime and its dependencies.
|
||||
|
||||
.DESCRIPTION
|
||||
This script updates and installs the SHARK runtime and its dependencies.
|
||||
It checks the Python version installed and installs any required build
|
||||
dependencies into a Python virtual environment.
|
||||
If that environment does not exist, it creates it.
|
||||
|
||||
.PARAMETER update-src
|
||||
Pulls the latest source via git.
|
||||
|
||||
.PARAMETER force
|
||||
Removes and recreates the venv to force an update of all dependencies.
|
||||
|
||||
.EXAMPLE
|
||||
.\setup_venv.ps1 --force
|
||||
|
||||
.EXAMPLE
|
||||
.\setup_venv.ps1 --update-src
|
||||
|
||||
.INPUTS
|
||||
None
|
||||
|
||||
.OUTPUTS
|
||||
None
|
||||
|
||||
#>
|
||||
|
||||
param([string]$arguments)
|
||||
|
||||
if ($arguments -eq "--update-src"){
|
||||
git pull
|
||||
}
|
||||
|
||||
if ($arguments -eq "--force"){
|
||||
if (Test-Path env:VIRTUAL_ENV) {
|
||||
Write-Host "deactivating..."
|
||||
Deactivate
|
||||
}
|
||||
|
||||
if (Test-Path .\shark.venv\) {
|
||||
Write-Host "removing and recreating venv..."
|
||||
Remove-Item .\shark.venv -Force -Recurse
|
||||
if (Test-Path .\shark.venv\) {
|
||||
Write-Host 'could not remove .\shark.venv - please try running ".\setup_venv.ps1 --force" again!'
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# redirect stderr into stdout
|
||||
$p = &{python -V} 2>&1
|
||||
# check if an ErrorRecord was returned
|
||||
$version = if($p -is [System.Management.Automation.ErrorRecord])
|
||||
{
|
||||
# grab the version string from the error message
|
||||
$p.Exception.Message
|
||||
}
|
||||
else
|
||||
{
|
||||
# otherwise return complete Python list
|
||||
$ErrorActionPreference = 'SilentlyContinue'
|
||||
$PyVer = py --list
|
||||
}
|
||||
|
||||
# deactivate any activated venvs
|
||||
if ($PyVer -like "*venv*")
|
||||
{
|
||||
deactivate # make sure we don't update the wrong venv
|
||||
$PyVer = py --list # update list
|
||||
}
|
||||
|
||||
Write-Host "Python versions found are"
|
||||
Write-Host ($PyVer | Out-String) # formatted output with line breaks
|
||||
if (!($PyVer.length -ne 0)) {$p} # return Python --version String if py.exe is unavailable
|
||||
if (!($PyVer -like "*3.11*") -and !($p -like "*3.11*")) # if 3.11 is not in any list
|
||||
{
|
||||
Write-Host "Please install Python 3.11 and try again"
|
||||
exit 34
|
||||
}
|
||||
|
||||
Write-Host "Installing Build Dependencies"
|
||||
# make sure we really use 3.11 from list, even if it's not the default.
|
||||
if ($NULL -ne $PyVer) {py -3.11 -m venv .\shark.venv\}
|
||||
else {python -m venv .\shark.venv\}
|
||||
.\shark.venv\Scripts\activate
|
||||
python -m pip install --upgrade pip
|
||||
pip install wheel
|
||||
pip install -r requirements.txt
|
||||
pip install --pre torch-mlir torchvision torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/
|
||||
pip install --upgrade -f https://nod-ai.github.io/SRT/pip-release-links.html iree-compiler iree-runtime
|
||||
Write-Host "Building SHARK..."
|
||||
pip install -e . -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html
|
||||
Write-Host "Build and installation completed successfully"
|
||||
Write-Host "Source your venv with ./shark.venv/Scripts/activate"
|
||||
100
setup_venv.sh
@@ -2,11 +2,14 @@
|
||||
# Sets up a venv suitable for running samples.
|
||||
# e.g:
|
||||
# ./setup_venv.sh #setup a default $PYTHON3 shark.venv
|
||||
# Environment Variables by the script.
|
||||
# Environment variables used by the script.
|
||||
# PYTHON=$PYTHON3.10 ./setup_venv.sh #pass a version of $PYTHON to use
|
||||
# VENV_DIR=myshark.venv #create a venv called myshark.venv
|
||||
# SKIP_VENV=1 #Don't create and activate a Python venv. Use the current environment.
|
||||
# USE_IREE=1 #use stock IREE instead of Nod.ai's SHARK build
|
||||
# IMPORTER=1 #Install importer deps
|
||||
# BENCHMARK=1 #Install benchmark deps
|
||||
# NO_BACKEND=1 #Don't install iree or shark backend
|
||||
# if you run the script from a conda env it will install in your conda env
|
||||
|
||||
TD="$(cd $(dirname $0) && pwd)"
|
||||
@@ -24,15 +27,22 @@ PYTHON_VERSION_X_Y=`${PYTHON} -c 'import sys; version=sys.version_info[:2]; prin
|
||||
echo "Python: $PYTHON"
|
||||
echo "Python version: $PYTHON_VERSION_X_Y"
|
||||
|
||||
if [[ -z "${CONDA_PREFIX}" ]]; then
|
||||
# Not a conda env. So create a new VENV dir
|
||||
VENV_DIR=${VENV_DIR:-shark.venv}
|
||||
echo "Using pip venv.. Setting up venv dir: $VENV_DIR"
|
||||
$PYTHON -m venv "$VENV_DIR" || die "Could not create venv."
|
||||
source "$VENV_DIR/bin/activate" || die "Could not activate venv"
|
||||
PYTHON="$(which python3)"
|
||||
else
|
||||
echo "Found conda env $CONDA_DEFAULT_ENV. Running pip install inside the conda env"
|
||||
if [ "$PYTHON_VERSION_X_Y" != "3.11" ]; then
|
||||
echo "Error: Python version 3.11 is required."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ "$SKIP_VENV" != "1" ]]; then
|
||||
if [[ -z "${CONDA_PREFIX}" ]]; then
|
||||
# Not a conda env. So create a new VENV dir
|
||||
VENV_DIR=${VENV_DIR:-shark.venv}
|
||||
echo "Using pip venv.. Setting up venv dir: $VENV_DIR"
|
||||
$PYTHON -m venv "$VENV_DIR" || die "Could not create venv."
|
||||
source "$VENV_DIR/bin/activate" || die "Could not activate venv"
|
||||
PYTHON="$(which python3)"
|
||||
else
|
||||
echo "Found conda env $CONDA_DEFAULT_ENV. Running pip install inside the conda env"
|
||||
fi
|
||||
fi
|
||||
|
||||
Red=`tput setaf 1`
|
||||
@@ -40,7 +50,7 @@ Green=`tput setaf 2`
|
||||
Yellow=`tput setaf 3`
|
||||
|
||||
# Assume no binary torch-mlir.
|
||||
# Currently available for macOS m1&intel (3.10) and Linux(3.7,3.8,3.9,3.10)
|
||||
# Currently available for macOS m1&intel (3.11) and Linux(3.8,3.10,3.11)
|
||||
torch_mlir_bin=false
|
||||
if [[ $(uname -s) = 'Darwin' ]]; then
|
||||
echo "${Yellow}Apple macOS detected"
|
||||
@@ -58,12 +68,12 @@ if [[ $(uname -s) = 'Darwin' ]]; then
|
||||
fi
|
||||
echo "${Yellow}Run the following commands to setup your SSL certs for your Python version if you see SSL errors with tests"
|
||||
echo "${Yellow}/Applications/Python\ 3.XX/Install\ Certificates.command"
|
||||
if [ "$PYTHON_VERSION_X_Y" == "3.10" ]; then
|
||||
if [ "$PYTHON_VERSION_X_Y" == "3.11" ]; then
|
||||
torch_mlir_bin=true
|
||||
fi
|
||||
elif [[ $(uname -s) = 'Linux' ]]; then
|
||||
echo "${Yellow}Linux detected"
|
||||
if [ "$PYTHON_VERSION_X_Y" == "3.7" ] || [ "$PYTHON_VERSION_X_Y" == "3.8" ] || [ "$PYTHON_VERSION_X_Y" == "3.9" ] || [ "$PYTHON_VERSION_X_Y" == "3.10" ] ; then
|
||||
if [ "$PYTHON_VERSION_X_Y" == "3.8" ] || [ "$PYTHON_VERSION_X_Y" == "3.10" ] || [ "$PYTHON_VERSION_X_Y" == "3.11" ] ; then
|
||||
torch_mlir_bin=true
|
||||
fi
|
||||
else
|
||||
@@ -74,42 +84,78 @@ fi
|
||||
$PYTHON -m pip install --upgrade pip || die "Could not upgrade pip"
|
||||
$PYTHON -m pip install --upgrade -r "$TD/requirements.txt"
|
||||
if [ "$torch_mlir_bin" = true ]; then
|
||||
$PYTHON -m pip install --find-links https://github.com/llvm/torch-mlir/releases torch-mlir --extra-index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
if [ $? -eq 0 ];then
|
||||
echo "Successfully Installed torch-mlir"
|
||||
if [[ $(uname -s) = 'Darwin' ]]; then
|
||||
echo "MacOS detected. Installing torch-mlir from .whl, to avoid dependency problems with torch."
|
||||
$PYTHON -m pip uninstall -y timm #TEMP FIX FOR MAC
|
||||
$PYTHON -m pip install --pre --no-cache-dir torch-mlir -f https://llvm.github.io/torch-mlir/package-index/ -f https://download.pytorch.org/whl/nightly/torch/
|
||||
else
|
||||
echo "Could not install torch-mlir" >&2
|
||||
$PYTHON -m pip install --pre torch-mlir -f https://llvm.github.io/torch-mlir/package-index/
|
||||
if [ $? -eq 0 ];then
|
||||
echo "Successfully Installed torch-mlir"
|
||||
else
|
||||
echo "Could not install torch-mlir" >&2
|
||||
fi
|
||||
fi
|
||||
else
|
||||
echo "${Red}No binaries found for Python $PYTHON_VERSION_X_Y on $(uname -s)"
|
||||
echo "${Yello}Python 3.10 supported on macOS and 3.7,3.8,3.9 and 3.10 on Linux"
|
||||
echo "${Yello}Python 3.11 supported on macOS and 3.8,3.10 and 3.11 on Linux"
|
||||
echo "${Red}Please build torch-mlir from source in your environment"
|
||||
exit 1
|
||||
fi
|
||||
if [[ -z "${USE_IREE}" ]]; then
|
||||
RUNTIME="nod-ai/SHARK-Runtime"
|
||||
rm .use-iree
|
||||
RUNTIME="https://nod-ai.github.io/SRT/pip-release-links.html"
|
||||
else
|
||||
RUNTIME="google/iree"
|
||||
touch ./.use-iree
|
||||
RUNTIME="https://openxla.github.io/iree/pip-release-links.html"
|
||||
fi
|
||||
if [[ -z "${NO_BACKEND}" ]]; then
|
||||
echo "Installing ${RUNTIME}..."
|
||||
$PYTHON -m pip install --pre --upgrade --no-index --find-links ${RUNTIME} iree-compiler iree-runtime
|
||||
else
|
||||
echo "Not installing a backend, please make sure to add your backend to PYTHONPATH"
|
||||
fi
|
||||
echo "Installing ${RUNTIME}..."
|
||||
$PYTHON -m pip install --find-links https://github.com/${RUNTIME}/releases iree-compiler iree-runtime
|
||||
|
||||
if [[ ! -z "${IMPORTER}" ]]; then
|
||||
echo "${Yellow}Installing importer tools.."
|
||||
if [[ $(uname -s) = 'Linux' ]]; then
|
||||
echo "${Yellow}Linux detected.. installing Linux importer tools"
|
||||
$PYTHON -m pip install --upgrade -r "$TD/requirements-importer.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://test.pypi.org/simple/ --extra-index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
#Always get the importer tools from upstream IREE
|
||||
$PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer.txt" -f https://openxla.github.io/iree/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
elif [[ $(uname -s) = 'Darwin' ]]; then
|
||||
echo "${Yellow}macOS detected.. installing macOS importer tools"
|
||||
#Conda seems to have some problems installing these packages and hope they get resolved upstream.
|
||||
$PYTHON -m pip install --upgrade -r "$TD/requirements-importer-macos.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
$PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer-macos.txt" -f ${RUNTIME} --extra-index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
fi
|
||||
fi
|
||||
|
||||
$PYTHON -m pip install -e . --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://github.com/llvm/torch-mlir/releases -f https://github.com/${RUNTIME}/releases
|
||||
if [[ $(uname -s) = 'Darwin' ]]; then
|
||||
PYTORCH_URL=https://download.pytorch.org/whl/nightly/torch/
|
||||
else
|
||||
PYTORCH_URL=https://download.pytorch.org/whl/nightly/cpu/
|
||||
fi
|
||||
|
||||
if [[ -z "${CONDA_PREFIX}" ]]; then
|
||||
$PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f ${PYTORCH_URL}
|
||||
|
||||
if [[ $(uname -s) = 'Linux' && ! -z "${IMPORTER}" ]]; then
|
||||
T_VER=$($PYTHON -m pip show torch | grep Version)
|
||||
T_VER_MIN=${T_VER:14:12}
|
||||
TV_VER=$($PYTHON -m pip show torchvision | grep Version)
|
||||
TV_VER_MAJ=${TV_VER:9:6}
|
||||
$PYTHON -m pip uninstall -y torchvision
|
||||
$PYTHON -m pip install torchvision==${TV_VER_MAJ}${T_VER_MIN} --no-deps -f https://download.pytorch.org/whl/nightly/cpu/torchvision/
|
||||
if [ $? -eq 0 ];then
|
||||
echo "Successfully Installed torch + cu118."
|
||||
else
|
||||
echo "Could not install torch + cu118." >&2
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ -z "${NO_BREVITAS}" ]]; then
|
||||
$PYTHON -m pip install git+https://github.com/Xilinx/brevitas.git@dev
|
||||
fi
|
||||
|
||||
if [[ -z "${CONDA_PREFIX}" && "$SKIP_VENV" != "1" ]]; then
|
||||
echo "${Green}Before running examples activate venv with:"
|
||||
echo " ${Green}source $VENV_DIR/bin/activate"
|
||||
fi
|
||||
|
||||
|
||||
@@ -0,0 +1,28 @@
|
||||
import importlib
|
||||
import logging
|
||||
|
||||
from torch._dynamo import register_backend
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@register_backend
|
||||
def shark(model, inputs, *, options):
|
||||
try:
|
||||
from shark.dynamo_backend.utils import SharkBackend
|
||||
except ImportError:
|
||||
log.exception(
|
||||
"Unable to import SHARK - High Performance Machine Learning Distribution"
|
||||
"Please install the right version of SHARK that matches the PyTorch version being used. "
|
||||
"Refer to https://github.com/nod-ai/SHARK/ for details."
|
||||
)
|
||||
raise
|
||||
return SharkBackend(model, inputs, options)
|
||||
|
||||
|
||||
def has_shark():
|
||||
try:
|
||||
importlib.import_module("shark")
|
||||
return True
|
||||
except ImportError:
|
||||
return False
|
||||
|
||||
@@ -15,15 +15,13 @@
|
||||
import torch
|
||||
from torch._decomp import get_decompositions
|
||||
from torch.fx.experimental.proxy_tensor import make_fx
|
||||
from torch.nn.utils import _stateless
|
||||
from torch.nn.utils import stateless
|
||||
|
||||
from torch import fx
|
||||
import copy
|
||||
import tempfile
|
||||
|
||||
|
||||
class MakeFxModule:
|
||||
|
||||
def __init__(self, model, inputs, labels=None, custom_inference_fn=None):
|
||||
self.model = model
|
||||
self.inputs = inputs
|
||||
@@ -53,20 +51,28 @@ class MakeFxModule:
|
||||
return fx_g
|
||||
|
||||
def generate_graph(self):
|
||||
fx_g = make_fx(self.custom_inference_fn,
|
||||
decomposition_table=get_decompositions([
|
||||
torch.ops.aten.embedding_dense_backward,
|
||||
torch.ops.aten.native_layer_norm_backward,
|
||||
torch.ops.aten.slice_backward,
|
||||
torch.ops.aten.select_backward
|
||||
]))(dict(self.model.named_parameters()),
|
||||
dict(self.model.named_buffers()), self.inputs)
|
||||
fx_g = make_fx(
|
||||
self.custom_inference_fn,
|
||||
decomposition_table=get_decompositions(
|
||||
[
|
||||
torch.ops.aten.embedding_dense_backward,
|
||||
torch.ops.aten.native_layer_norm_backward,
|
||||
torch.ops.aten.slice_backward,
|
||||
torch.ops.aten.select_backward,
|
||||
]
|
||||
),
|
||||
)(
|
||||
dict(self.model.named_parameters()),
|
||||
dict(self.model.named_buffers()),
|
||||
self.inputs,
|
||||
)
|
||||
fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
|
||||
fx_g.recompile()
|
||||
fx_g = self.change_fx_graph_return_to_tuple(fx_g)
|
||||
ts_g = torch.jit.script(fx_g)
|
||||
temp = tempfile.NamedTemporaryFile(suffix='_shark_ts',
|
||||
prefix='temp_ts_')
|
||||
temp = tempfile.NamedTemporaryFile(
|
||||
suffix="_shark_ts", prefix="temp_ts_"
|
||||
)
|
||||
ts_g.save(temp.name)
|
||||
new_ts = torch.jit.load(temp.name)
|
||||
self.training_graph = new_ts
|
||||
|
||||
@@ -1,78 +0,0 @@
|
||||
# Copyright 2020 The Nod Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
import ctypes
|
||||
|
||||
#Some constants taken from cuda.h
|
||||
CUDA_SUCCESS = 0
|
||||
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
|
||||
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
|
||||
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
|
||||
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
|
||||
|
||||
|
||||
def get_cuda_sm_cc():
|
||||
libnames = ('libcuda.so', 'libcuda.dylib', 'cuda.dll')
|
||||
for libname in libnames:
|
||||
try:
|
||||
cuda = ctypes.CDLL(libname)
|
||||
except OSError:
|
||||
continue
|
||||
else:
|
||||
break
|
||||
else:
|
||||
raise OSError("could not load any of: " + ' '.join(libnames))
|
||||
|
||||
nGpus = ctypes.c_int()
|
||||
name = b' ' * 100
|
||||
cc_major = ctypes.c_int()
|
||||
cc_minor = ctypes.c_int()
|
||||
|
||||
result = ctypes.c_int()
|
||||
device = ctypes.c_int()
|
||||
context = ctypes.c_void_p()
|
||||
error_str = ctypes.c_char_p()
|
||||
|
||||
result = cuda.cuInit(0)
|
||||
if result != CUDA_SUCCESS:
|
||||
cuda.cuGetErrorString(result, ctypes.byref(error_str))
|
||||
print("cuInit failed with error code %d: %s" %
|
||||
(result, error_str.value.decode()))
|
||||
return 1
|
||||
result = cuda.cuDeviceGetCount(ctypes.byref(nGpus))
|
||||
if result != CUDA_SUCCESS:
|
||||
cuda.cuGetErrorString(result, ctypes.byref(error_str))
|
||||
print("cuDeviceGetCount failed with error code %d: %s" %
|
||||
(result, error_str.value.decode()))
|
||||
return 1
|
||||
print("Found %d device(s)." % nGpus.value)
|
||||
for i in range(nGpus.value):
|
||||
result = cuda.cuDeviceGet(ctypes.byref(device), i)
|
||||
if result != CUDA_SUCCESS:
|
||||
cuda.cuGetErrorString(result, ctypes.byref(error_str))
|
||||
print("cuDeviceGet failed with error code %d: %s" %
|
||||
(result, error_str.value.decode()))
|
||||
return 1
|
||||
print("Device: %d" % i)
|
||||
if cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name),
|
||||
device) == CUDA_SUCCESS:
|
||||
print(" Name: %s" % (name.split(b'\0', 1)[0].decode()))
|
||||
if cuda.cuDeviceComputeCapability(ctypes.byref(cc_major),
|
||||
ctypes.byref(cc_minor),
|
||||
device) == CUDA_SUCCESS:
|
||||
print(" Compute Capability: %d.%d" %
|
||||
(cc_major.value, cc_minor.value))
|
||||
sm = f"sm_{cc_major.value}{cc_minor.value}"
|
||||
return sm
|
||||
0
shark/dynamo_backend/__init__.py
Normal file
154
shark/dynamo_backend/utils.py
Normal file
@@ -0,0 +1,154 @@
|
||||
import functools
|
||||
from typing import List, Optional
|
||||
import torch
|
||||
from torch.fx.experimental.proxy_tensor import make_fx
|
||||
from torch._functorch.compile_utils import strip_overloads
|
||||
from shark.shark_inference import SharkInference
|
||||
from torch._decomp import get_decompositions
|
||||
from torch.func import functionalize
|
||||
import io
|
||||
import torch_mlir
|
||||
|
||||
|
||||
# TODO: Control decompositions.
|
||||
def default_decompositions():
|
||||
return get_decompositions(
|
||||
[
|
||||
torch.ops.aten.embedding_dense_backward,
|
||||
torch.ops.aten.native_layer_norm_backward,
|
||||
torch.ops.aten.slice_backward,
|
||||
torch.ops.aten.select_backward,
|
||||
torch.ops.aten.norm.ScalarOpt_dim,
|
||||
torch.ops.aten.native_group_norm,
|
||||
torch.ops.aten.upsample_bilinear2d.vec,
|
||||
torch.ops.aten.split.Tensor,
|
||||
torch.ops.aten.split_with_sizes,
|
||||
torch.ops.aten.native_layer_norm,
|
||||
torch.ops.aten.masked_fill.Tensor,
|
||||
torch.ops.aten.masked_fill.Scalar,
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def _remove_nones(fx_g: torch.fx.GraphModule) -> List[int]:
|
||||
removed_indexes = []
|
||||
for node in fx_g.graph.nodes:
|
||||
if node.op == "output":
|
||||
assert (
|
||||
len(node.args) == 1
|
||||
), "Output node must have a single argument"
|
||||
node_arg = node.args[0]
|
||||
if isinstance(node_arg, (list, tuple)):
|
||||
node_arg = list(node_arg)
|
||||
node_args_len = len(node_arg)
|
||||
for i in range(node_args_len):
|
||||
curr_index = node_args_len - (i + 1)
|
||||
if node_arg[curr_index] is None:
|
||||
removed_indexes.append(curr_index)
|
||||
node_arg.pop(curr_index)
|
||||
node.args = (tuple(node_arg),)
|
||||
break
|
||||
|
||||
if len(removed_indexes) > 0:
|
||||
fx_g.graph.lint()
|
||||
fx_g.graph.eliminate_dead_code()
|
||||
fx_g.recompile()
|
||||
removed_indexes.sort()
|
||||
return removed_indexes
|
||||
|
||||
|
||||
def _returns_nothing(fx_g: torch.fx.GraphModule) -> bool:
|
||||
for node in fx_g.graph.nodes:
|
||||
if node.op == "output":
|
||||
assert (
|
||||
len(node.args) == 1
|
||||
), "Output node must have a single argument"
|
||||
node_arg = node.args[0]
|
||||
if isinstance(node_arg, tuple):
|
||||
return len(node_arg) == 0
|
||||
return False
|
||||
|
||||
|
||||
def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule) -> bool:
|
||||
"""
|
||||
Replace tuple with tuple element in functions that return one-element tuples.
|
||||
Returns true if an unwrapping took place, and false otherwise.
|
||||
"""
|
||||
unwrapped_tuple = False
|
||||
for node in fx_g.graph.nodes:
|
||||
if node.op == "output":
|
||||
assert (
|
||||
len(node.args) == 1
|
||||
), "Output node must have a single argument"
|
||||
node_arg = node.args[0]
|
||||
if isinstance(node_arg, tuple):
|
||||
if len(node_arg) == 1:
|
||||
node.args = (node_arg[0],)
|
||||
unwrapped_tuple = True
|
||||
break
|
||||
|
||||
if unwrapped_tuple:
|
||||
fx_g.graph.lint()
|
||||
fx_g.recompile()
|
||||
return unwrapped_tuple
|
||||
|
||||
|
||||
class SharkBackend:
|
||||
def __init__(
|
||||
self, fx_g: torch.fx.GraphModule, inputs: tuple, options: dict
|
||||
):
|
||||
self.fx_g = fx_g
|
||||
self.inputs = inputs
|
||||
self.shark_module = None
|
||||
self.device: str = options.get("device", "cpu")
|
||||
self.was_unwrapped: bool = False
|
||||
self.none_indices: list = []
|
||||
self._modify_fx_g()
|
||||
self.compile()
|
||||
|
||||
def _modify_fx_g(self):
|
||||
self.none_indices = _remove_nones(self.fx_g)
|
||||
self.was_unwrapped = _unwrap_single_tuple_return(self.fx_g)
|
||||
|
||||
def compile(self):
|
||||
gm = make_fx(
|
||||
functionalize(self.fx_g),
|
||||
decomposition_table=default_decompositions(),
|
||||
)(*self.inputs)
|
||||
gm.graph.set_codegen(torch.fx.graph.CodeGen())
|
||||
gm.recompile()
|
||||
strip_overloads(gm)
|
||||
ts_g = torch.jit.script(gm)
|
||||
mlir_module = torch_mlir.compile(
|
||||
ts_g, self.inputs, output_type="linalg-on-tensors"
|
||||
)
|
||||
bytecode_stream = io.BytesIO()
|
||||
mlir_module.operation.write_bytecode(bytecode_stream)
|
||||
bytecode = bytecode_stream.getvalue()
|
||||
from shark.shark_inference import SharkInference
|
||||
|
||||
shark_module = SharkInference(
|
||||
mlir_module=bytecode,
|
||||
device=self.device,
|
||||
mlir_dialect="tm_tensor",
|
||||
)
|
||||
shark_module.compile(extra_args=[])
|
||||
self.shark_module = shark_module
|
||||
|
||||
def __call__(self, *inputs):
|
||||
np_inputs = [x.contiguous().detach().cpu().numpy() for x in inputs]
|
||||
np_outs = self.shark_module("forward", np_inputs)
|
||||
if self.was_unwrapped:
|
||||
np_outs = [
|
||||
np_outs,
|
||||
]
|
||||
|
||||
if not isinstance(np_outs, list):
|
||||
res = torch.from_numpy(np_outs)
|
||||
return res
|
||||
|
||||
result = [torch.from_numpy(x) for x in np_outs]
|
||||
for r_in in self.none_indices:
|
||||
result.insert(r_in, None)
|
||||
result = tuple(result)
|
||||
return result
|
||||
25
shark/examples/shark_dynamo/basic_examples.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import torch
|
||||
import shark
|
||||
|
||||
|
||||
def foo(x, a):
|
||||
if x.shape[0] > 3:
|
||||
return x + a
|
||||
else:
|
||||
return x + 3
|
||||
|
||||
|
||||
shark_options = {"device": "cpu"}
|
||||
compiled = torch.compile(foo, backend="shark", options=shark_options)
|
||||
|
||||
input = torch.ones(4)
|
||||
|
||||
x = compiled(input, input)
|
||||
|
||||
print(x)
|
||||
|
||||
input = torch.ones(3)
|
||||
|
||||
x = compiled(input, input)
|
||||
|
||||
print(x)
|
||||
@@ -36,7 +36,9 @@
|
||||
" from torchdynamo.optimizations.backends import create_backend\n",
|
||||
" from torchdynamo.optimizations.subgraph import SubGraph\n",
|
||||
"except ModuleNotFoundError:\n",
|
||||
" print(\"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo\")\n",
|
||||
" print(\n",
|
||||
" \"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo\"\n",
|
||||
" )\n",
|
||||
" exit()\n",
|
||||
"\n",
|
||||
"# torch-mlir imports for compiling\n",
|
||||
@@ -97,7 +99,9 @@
|
||||
"\n",
|
||||
" for node in fx_g.graph.nodes:\n",
|
||||
" if node.op == \"output\":\n",
|
||||
" assert len(node.args) == 1, \"Output node must have a single argument\"\n",
|
||||
" assert (\n",
|
||||
" len(node.args) == 1\n",
|
||||
" ), \"Output node must have a single argument\"\n",
|
||||
" node_arg = node.args[0]\n",
|
||||
" if isinstance(node_arg, tuple) and len(node_arg) == 1:\n",
|
||||
" node.args = (node_arg[0],)\n",
|
||||
@@ -116,8 +120,12 @@
|
||||
" if len(args) == 1 and isinstance(args[0], list):\n",
|
||||
" args = args[0]\n",
|
||||
"\n",
|
||||
" linalg_module = compile(ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS)\n",
|
||||
" callable, _ = get_iree_compiled_module(linalg_module, \"cuda\", func_name=\"forward\")\n",
|
||||
" linalg_module = compile(\n",
|
||||
" ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS\n",
|
||||
" )\n",
|
||||
" callable, _ = get_iree_compiled_module(\n",
|
||||
" linalg_module, \"cuda\", func_name=\"forward\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" def forward(*inputs):\n",
|
||||
" return callable(*inputs)\n",
|
||||
@@ -212,6 +220,7 @@
|
||||
" assert isinstance(subgraph, SubGraph), \"Model must be a dynamo SubGraph.\"\n",
|
||||
" return __torch_mlir(subgraph.model, *list(subgraph.example_inputs))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@torchdynamo.optimize(\"torch_mlir\")\n",
|
||||
"def toy_example2(*args):\n",
|
||||
" a, b = args\n",
|
||||
|
||||
@@ -8,7 +8,9 @@ try:
|
||||
from torchdynamo.optimizations.backends import create_backend
|
||||
from torchdynamo.optimizations.subgraph import SubGraph
|
||||
except ModuleNotFoundError:
|
||||
print("Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo")
|
||||
print(
|
||||
"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo"
|
||||
)
|
||||
exit()
|
||||
|
||||
NUM_ITERS = 10
|
||||
@@ -24,7 +26,9 @@ def __torch_mlir(fx_graph, *args, **kwargs):
|
||||
|
||||
for node in fx_g.graph.nodes:
|
||||
if node.op == "output":
|
||||
assert len(node.args) == 1, "Output node must have a single argument"
|
||||
assert (
|
||||
len(node.args) == 1
|
||||
), "Output node must have a single argument"
|
||||
node_arg = node.args[0]
|
||||
if isinstance(node_arg, tuple) and len(node_arg) == 1:
|
||||
node.args = (node_arg[0],)
|
||||
@@ -41,8 +45,12 @@ def __torch_mlir(fx_graph, *args, **kwargs):
|
||||
if len(args) == 1 and isinstance(args[0], list):
|
||||
args = args[0]
|
||||
|
||||
linalg_module = compile(ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS)
|
||||
callable, _ = get_iree_compiled_module(linalg_module, "cuda", func_name="forward")
|
||||
linalg_module = compile(
|
||||
ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS
|
||||
)
|
||||
callable, _ = get_iree_compiled_module(
|
||||
linalg_module, "cuda", func_name="forward"
|
||||
)
|
||||
|
||||
def forward(*inputs):
|
||||
return callable(*inputs)
|
||||
|
||||
73
shark/examples/shark_eager/squeezenet_lockstep.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
model = torch.hub.load(
|
||||
"pytorch/vision:v0.10.0", "squeezenet1_0", pretrained=True
|
||||
)
|
||||
model.eval()
|
||||
|
||||
# from PIL import Image
|
||||
# from torchvision import transforms
|
||||
# import urllib
|
||||
#
|
||||
# url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg")
|
||||
# try: urllib.URLopener().retrieve(url, filename)
|
||||
# except: urllib.request.urlretrieve(url, filename)
|
||||
#
|
||||
#
|
||||
# input_image = Image.open(filename)
|
||||
# preprocess = transforms.Compose([
|
||||
# transforms.Resize(256),
|
||||
# transforms.CenterCrop(224),
|
||||
# transforms.ToTensor(),
|
||||
# transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
|
||||
# ])
|
||||
# input_tensor = preprocess(input_image)
|
||||
# input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model
|
||||
# print(input_batch.shape) # size = [1, 3, 224, 224]
|
||||
|
||||
# The above is code for generating sample inputs from an image. We can just use
|
||||
# random values for accuracy testing though
|
||||
input_batch = torch.randn(1, 3, 224, 224)
|
||||
|
||||
|
||||
# Focus on CPU for now
|
||||
if False and torch.cuda.is_available():
|
||||
input_batch = input_batch.to("cuda")
|
||||
model.to("cuda")
|
||||
|
||||
with torch.no_grad():
|
||||
output = model(input_batch)
|
||||
# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes
|
||||
golden_confidences = output[0]
|
||||
# The output has unnormalized scores. To get probabilities, you can run a softmax on it.
|
||||
golden_probabilities = torch.nn.functional.softmax(
|
||||
golden_confidences, dim=0
|
||||
).numpy()
|
||||
|
||||
golden_confidences = golden_confidences.numpy()
|
||||
|
||||
from shark.torch_mlir_lockstep_tensor import TorchMLIRLockstepTensor
|
||||
|
||||
input_detached_clone = input_batch.clone()
|
||||
eager_input_batch = TorchMLIRLockstepTensor(input_detached_clone)
|
||||
|
||||
print("getting torch-mlir result")
|
||||
|
||||
output = model(eager_input_batch)
|
||||
|
||||
static_output = output.elem
|
||||
confidences = static_output[0]
|
||||
probabilities = torch.nn.functional.softmax(
|
||||
torch.from_numpy(confidences), dim=0
|
||||
).numpy()
|
||||
|
||||
print("The obtained result via shark is: ", confidences)
|
||||
print("The golden result is:", golden_confidences)
|
||||
|
||||
np.testing.assert_allclose(
|
||||
golden_confidences, confidences, rtol=1e-02, atol=1e-03
|
||||
)
|
||||
np.testing.assert_allclose(
|
||||
golden_probabilities, probabilities, rtol=1e-02, atol=1e-03
|
||||
)
|
||||
@@ -9,23 +9,24 @@ from shark.shark_inference import SharkInference
|
||||
clip_vit_inputs = [
|
||||
tf.TensorSpec(shape=[2, 7], dtype=tf.int32),
|
||||
tf.TensorSpec(shape=[2, 7], dtype=tf.int32),
|
||||
tf.TensorSpec(shape=[1, 3, 224, 224], dtype=tf.float32)
|
||||
tf.TensorSpec(shape=[1, 3, 224, 224], dtype=tf.float32),
|
||||
]
|
||||
|
||||
|
||||
class CLIPModule(tf.Module):
|
||||
|
||||
def __init__(self):
|
||||
super(CLIPModule, self).__init__()
|
||||
self.m = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
||||
|
||||
self.m.predict = lambda x, y, z: self.m(
|
||||
input_ids=x, attention_mask=y, pixel_values=z)
|
||||
input_ids=x, attention_mask=y, pixel_values=z
|
||||
)
|
||||
|
||||
@tf.function(input_signature=clip_vit_inputs)
|
||||
@tf.function(input_signature=clip_vit_inputs, jit_compile=True)
|
||||
def forward(self, input_ids, attention_mask, pixel_values):
|
||||
return self.m.predict(input_ids, attention_mask,
|
||||
pixel_values).logits_per_image
|
||||
return self.m.predict(
|
||||
input_ids, attention_mask, pixel_values
|
||||
).logits_per_image
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -35,17 +36,30 @@ if __name__ == "__main__":
|
||||
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
inputs = processor(text=["a photo of a cat", "a photo of a dog"],
|
||||
images=image,
|
||||
return_tensors="tf",
|
||||
padding=True)
|
||||
inputs = processor(
|
||||
text=["a photo of a cat", "a photo of a dog"],
|
||||
images=image,
|
||||
return_tensors="tf",
|
||||
padding=True,
|
||||
)
|
||||
|
||||
shark_module = SharkInference(
|
||||
CLIPModule(),
|
||||
(inputs["input_ids"], inputs["attention_mask"], inputs["pixel_values"]))
|
||||
(
|
||||
inputs["input_ids"],
|
||||
inputs["attention_mask"],
|
||||
inputs["pixel_values"],
|
||||
),
|
||||
)
|
||||
shark_module.set_frontend("tensorflow")
|
||||
shark_module.compile()
|
||||
|
||||
print(
|
||||
shark_module.forward((inputs["input_ids"], inputs["attention_mask"],
|
||||
inputs["pixel_values"])))
|
||||
shark_module.forward(
|
||||
(
|
||||
inputs["input_ids"],
|
||||
inputs["attention_mask"],
|
||||
inputs["pixel_values"],
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
15
shark/examples/shark_inference/ESRGAN/README.md
Normal file
@@ -0,0 +1,15 @@
|
||||
## Running ESRGAN
|
||||
|
||||
```
|
||||
1. pip install numpy opencv-python
|
||||
2. mkdir InputImages
|
||||
(this is where all the input images will reside)
|
||||
3. mkdir OutputImages
|
||||
(this is where the model will generate all the images)
|
||||
4. mkdir models
|
||||
(save the .pth checkpoint file here)
|
||||
5. python esrgan.py
|
||||
```
|
||||
|
||||
- Download [RRDB_ESRGAN_x4.pth](https://drive.google.com/drive/u/0/folders/17VYV_SoZZesU6mbxz2dMAIccSSlqLecY) and place it in the `models` directory as mentioned above in step 4.
|
||||
- Credits : [ESRGAN](https://github.com/xinntao/ESRGAN)
|
||||
239
shark/examples/shark_inference/ESRGAN/esrgan.py
Normal file
@@ -0,0 +1,239 @@
|
||||
from ast import arg
|
||||
import os.path as osp
|
||||
import glob
|
||||
import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from torch.fx.experimental.proxy_tensor import make_fx
|
||||
from torch._decomp import get_decompositions
|
||||
from shark.shark_inference import SharkInference
|
||||
import torch_mlir
|
||||
import tempfile
|
||||
import functools
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
def make_layer(block, n_layers):
|
||||
layers = []
|
||||
for _ in range(n_layers):
|
||||
layers.append(block())
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
|
||||
class ResidualDenseBlock_5C(nn.Module):
|
||||
def __init__(self, nf=64, gc=32, bias=True):
|
||||
super(ResidualDenseBlock_5C, self).__init__()
|
||||
# gc: growth channel, i.e. intermediate channels
|
||||
self.conv1 = nn.Conv2d(nf, gc, 3, 1, 1, bias=bias)
|
||||
self.conv2 = nn.Conv2d(nf + gc, gc, 3, 1, 1, bias=bias)
|
||||
self.conv3 = nn.Conv2d(nf + 2 * gc, gc, 3, 1, 1, bias=bias)
|
||||
self.conv4 = nn.Conv2d(nf + 3 * gc, gc, 3, 1, 1, bias=bias)
|
||||
self.conv5 = nn.Conv2d(nf + 4 * gc, nf, 3, 1, 1, bias=bias)
|
||||
self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
|
||||
|
||||
# initialization
|
||||
# mutil.initialize_weights([self.conv1, self.conv2, self.conv3, self.conv4, self.conv5], 0.1)
|
||||
|
||||
def forward(self, x):
|
||||
x1 = self.lrelu(self.conv1(x))
|
||||
x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
|
||||
x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
|
||||
x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
|
||||
x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
|
||||
return x5 * 0.2 + x
|
||||
|
||||
|
||||
class RRDB(nn.Module):
|
||||
"""Residual in Residual Dense Block"""
|
||||
|
||||
def __init__(self, nf, gc=32):
|
||||
super(RRDB, self).__init__()
|
||||
self.RDB1 = ResidualDenseBlock_5C(nf, gc)
|
||||
self.RDB2 = ResidualDenseBlock_5C(nf, gc)
|
||||
self.RDB3 = ResidualDenseBlock_5C(nf, gc)
|
||||
|
||||
def forward(self, x):
|
||||
out = self.RDB1(x)
|
||||
out = self.RDB2(out)
|
||||
out = self.RDB3(out)
|
||||
return out * 0.2 + x
|
||||
|
||||
|
||||
class RRDBNet(nn.Module):
|
||||
def __init__(self, in_nc, out_nc, nf, nb, gc=32):
|
||||
super(RRDBNet, self).__init__()
|
||||
RRDB_block_f = functools.partial(RRDB, nf=nf, gc=gc)
|
||||
|
||||
self.conv_first = nn.Conv2d(in_nc, nf, 3, 1, 1, bias=True)
|
||||
self.RRDB_trunk = make_layer(RRDB_block_f, nb)
|
||||
self.trunk_conv = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
|
||||
#### upsampling
|
||||
self.upconv1 = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
|
||||
self.upconv2 = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
|
||||
self.HRconv = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
|
||||
self.conv_last = nn.Conv2d(nf, out_nc, 3, 1, 1, bias=True)
|
||||
|
||||
self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
|
||||
|
||||
def forward(self, x):
|
||||
fea = self.conv_first(x)
|
||||
trunk = self.trunk_conv(self.RRDB_trunk(fea))
|
||||
fea = fea + trunk
|
||||
|
||||
fea = self.lrelu(
|
||||
self.upconv1(F.interpolate(fea, scale_factor=2, mode="nearest"))
|
||||
)
|
||||
fea = self.lrelu(
|
||||
self.upconv2(F.interpolate(fea, scale_factor=2, mode="nearest"))
|
||||
)
|
||||
out = self.conv_last(self.lrelu(self.HRconv(fea)))
|
||||
|
||||
return out
|
||||
|
||||
|
||||
############### Parsing args #####################
|
||||
import argparse
|
||||
|
||||
p = argparse.ArgumentParser(
|
||||
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
||||
)
|
||||
|
||||
p.add_argument("--device", type=str, default="cpu", help="the device to use")
|
||||
p.add_argument(
|
||||
"--mlir_loc",
|
||||
type=str,
|
||||
default=None,
|
||||
help="location of the model's mlir file",
|
||||
)
|
||||
args = p.parse_args()
|
||||
###################################################
|
||||
|
||||
|
||||
def inference(input_m):
|
||||
return model(input_m)
|
||||
|
||||
|
||||
def load_mlir(mlir_loc):
|
||||
import os
|
||||
|
||||
if mlir_loc == None:
|
||||
return None
|
||||
print(f"Trying to load the model from {mlir_loc}.")
|
||||
with open(os.path.join(mlir_loc)) as f:
|
||||
mlir_module = f.read()
|
||||
return mlir_module
|
||||
|
||||
|
||||
def compile_through_fx(model, inputs, mlir_loc=None):
|
||||
module = load_mlir(mlir_loc)
|
||||
if module == None:
|
||||
fx_g = make_fx(
|
||||
model,
|
||||
decomposition_table=get_decompositions(
|
||||
[
|
||||
torch.ops.aten.embedding_dense_backward,
|
||||
torch.ops.aten.native_layer_norm_backward,
|
||||
torch.ops.aten.slice_backward,
|
||||
torch.ops.aten.select_backward,
|
||||
torch.ops.aten.norm.ScalarOpt_dim,
|
||||
torch.ops.aten.native_group_norm,
|
||||
torch.ops.aten.upsample_bilinear2d.vec,
|
||||
torch.ops.aten.split.Tensor,
|
||||
torch.ops.aten.split_with_sizes,
|
||||
]
|
||||
),
|
||||
)(inputs)
|
||||
|
||||
fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
|
||||
fx_g.recompile()
|
||||
|
||||
def strip_overloads(gm):
|
||||
"""
|
||||
Modifies the target of graph nodes in :attr:`gm` to strip overloads.
|
||||
Args:
|
||||
gm(fx.GraphModule): The input Fx graph module to be modified
|
||||
"""
|
||||
for node in gm.graph.nodes:
|
||||
if isinstance(node.target, torch._ops.OpOverload):
|
||||
node.target = node.target.overloadpacket
|
||||
gm.recompile()
|
||||
|
||||
strip_overloads(fx_g)
|
||||
|
||||
ts_g = torch.jit.script(fx_g)
|
||||
|
||||
print("Torchscript graph generated successfully")
|
||||
module = torch_mlir.compile(
|
||||
ts_g,
|
||||
inputs,
|
||||
torch_mlir.OutputType.LINALG_ON_TENSORS,
|
||||
use_tracing=False,
|
||||
verbose=False,
|
||||
)
|
||||
|
||||
mlir_model = str(module)
|
||||
func_name = "forward"
|
||||
shark_module = SharkInference(
|
||||
mlir_model, device=args.device, mlir_dialect="linalg"
|
||||
)
|
||||
shark_module.compile()
|
||||
|
||||
return shark_module
|
||||
|
||||
|
||||
model_path = "models/RRDB_ESRGAN_x4.pth" # models/RRDB_ESRGAN_x4.pth OR models/RRDB_PSNR_x4.pth
|
||||
# device = torch.device('cuda') # if you want to run on CPU, change 'cuda' -> cpu
|
||||
device = torch.device("cpu")
|
||||
|
||||
test_img_folder = "InputImages/*"
|
||||
|
||||
model = RRDBNet(3, 3, 64, 23, gc=32)
|
||||
model.load_state_dict(torch.load(model_path), strict=True)
|
||||
model.eval()
|
||||
model = model.to(device)
|
||||
|
||||
print("Model path {:s}. \nTesting...".format(model_path))
|
||||
|
||||
if __name__ == "__main__":
|
||||
idx = 0
|
||||
for path in glob.glob(test_img_folder):
|
||||
idx += 1
|
||||
base = osp.splitext(osp.basename(path))[0]
|
||||
print(idx, base)
|
||||
# read images
|
||||
img = cv2.imread(path, cv2.IMREAD_COLOR)
|
||||
img = img * 1.0 / 255
|
||||
img = torch.from_numpy(
|
||||
np.transpose(img[:, :, [2, 1, 0]], (2, 0, 1))
|
||||
).float()
|
||||
img_LR = img.unsqueeze(0)
|
||||
img_LR = img_LR.to(device)
|
||||
|
||||
with torch.no_grad():
|
||||
shark_module = compile_through_fx(inference, img_LR)
|
||||
shark_output = shark_module.forward((img_LR,))
|
||||
shark_output = torch.from_numpy(shark_output)
|
||||
shark_output = (
|
||||
shark_output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
|
||||
)
|
||||
esrgan_output = (
|
||||
model(img_LR).data.squeeze().float().cpu().clamp_(0, 1).numpy()
|
||||
)
|
||||
# SHARK OUTPUT
|
||||
shark_output = np.transpose(shark_output[[2, 1, 0], :, :], (1, 2, 0))
|
||||
shark_output = (shark_output * 255.0).round()
|
||||
cv2.imwrite(
|
||||
"OutputImages/{:s}_rlt_shark_output.png".format(base), shark_output
|
||||
)
|
||||
print("Generated SHARK's output")
|
||||
# ESRGAN OUTPUT
|
||||
esrgan_output = np.transpose(esrgan_output[[2, 1, 0], :, :], (1, 2, 0))
|
||||
esrgan_output = (esrgan_output * 255.0).round()
|
||||
cv2.imwrite(
|
||||
"OutputImages/{:s}_rlt_esrgan_output.png".format(base),
|
||||
esrgan_output,
|
||||
)
|
||||
print("Generated ESRGAN's output")
|
||||
86
shark/examples/shark_inference/albert_maskfill_pt.py
Normal file
@@ -0,0 +1,86 @@
|
||||
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
||||
import torch
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.shark_importer import SharkImporter
|
||||
from iree.compiler import compile_str
|
||||
from iree import runtime as ireert
|
||||
import os
|
||||
import numpy as np
|
||||
|
||||
MAX_SEQUENCE_LENGTH = 512
|
||||
BATCH_SIZE = 1
|
||||
|
||||
|
||||
class AlbertModule(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.model = AutoModelForMaskedLM.from_pretrained("albert-base-v2")
|
||||
self.model.eval()
|
||||
|
||||
def forward(self, input_ids, attention_mask):
|
||||
return self.model(
|
||||
input_ids=input_ids, attention_mask=attention_mask
|
||||
).logits
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Prepping Data
|
||||
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
|
||||
text = "This [MASK] is very tasty."
|
||||
encoded_inputs = tokenizer(
|
||||
text,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
max_length=MAX_SEQUENCE_LENGTH,
|
||||
return_tensors="pt",
|
||||
)
|
||||
inputs = (encoded_inputs["input_ids"], encoded_inputs["attention_mask"])
|
||||
mlir_importer = SharkImporter(
|
||||
AlbertModule(),
|
||||
inputs,
|
||||
frontend="torch",
|
||||
)
|
||||
minilm_mlir, func_name = mlir_importer.import_mlir(
|
||||
is_dynamic=False, tracing_required=True
|
||||
)
|
||||
shark_module = SharkInference(minilm_mlir)
|
||||
shark_module.compile()
|
||||
token_logits = torch.tensor(shark_module.forward(inputs))
|
||||
mask_id = torch.where(
|
||||
encoded_inputs["input_ids"] == tokenizer.mask_token_id
|
||||
)[1]
|
||||
mask_token_logits = token_logits[0, mask_id, :]
|
||||
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
|
||||
for token in top_5_tokens:
|
||||
print(
|
||||
f"'>>> Sample/Warmup output: {text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
|
||||
)
|
||||
while True:
|
||||
try:
|
||||
new_text = input("Give me a sentence with [MASK] to fill: ")
|
||||
encoded_inputs = tokenizer(
|
||||
new_text,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
max_length=MAX_SEQUENCE_LENGTH,
|
||||
return_tensors="pt",
|
||||
)
|
||||
inputs = (
|
||||
encoded_inputs["input_ids"],
|
||||
encoded_inputs["attention_mask"],
|
||||
)
|
||||
token_logits = torch.tensor(shark_module.forward(inputs))
|
||||
mask_id = torch.where(
|
||||
encoded_inputs["input_ids"] == tokenizer.mask_token_id
|
||||
)[1]
|
||||
mask_token_logits = token_logits[0, mask_id, :]
|
||||
top_5_tokens = (
|
||||
torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
|
||||
)
|
||||
for token in top_5_tokens:
|
||||
print(
|
||||
f"'>>> {new_text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
|
||||
)
|
||||
except KeyboardInterrupt:
|
||||
print("Exiting program.")
|
||||
break
|
||||
100
shark/examples/shark_inference/albert_maskfill_tf.py
Normal file
@@ -0,0 +1,100 @@
|
||||
from PIL import Image
|
||||
import requests
|
||||
|
||||
from transformers import TFAutoModelForMaskedLM, AutoTokenizer
|
||||
import tensorflow as tf
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.shark_importer import SharkImporter
|
||||
from iree.compiler import tf as tfc
|
||||
from iree.compiler import compile_str
|
||||
from iree import runtime as ireert
|
||||
import os
|
||||
import numpy as np
|
||||
import sys
|
||||
|
||||
MAX_SEQUENCE_LENGTH = 512
|
||||
BATCH_SIZE = 1
|
||||
|
||||
# Create a set of inputs
|
||||
t5_inputs = [
|
||||
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
|
||||
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
|
||||
]
|
||||
|
||||
|
||||
class AlbertModule(tf.Module):
|
||||
def __init__(self):
|
||||
super(AlbertModule, self).__init__()
|
||||
self.m = TFAutoModelForMaskedLM.from_pretrained("albert-base-v2")
|
||||
self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)
|
||||
|
||||
@tf.function(input_signature=t5_inputs, jit_compile=True)
|
||||
def forward(self, input_ids, attention_mask):
|
||||
return self.m.predict(input_ids, attention_mask)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Prepping Data
|
||||
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
|
||||
# text = "This is a great [MASK]."
|
||||
text = "This [MASK] is very tasty."
|
||||
encoded_inputs = tokenizer(
|
||||
text,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
max_length=MAX_SEQUENCE_LENGTH,
|
||||
return_tensors="tf",
|
||||
)
|
||||
inputs = (encoded_inputs["input_ids"], encoded_inputs["attention_mask"])
|
||||
mlir_importer = SharkImporter(
|
||||
AlbertModule(),
|
||||
inputs,
|
||||
frontend="tf",
|
||||
)
|
||||
minilm_mlir, func_name = mlir_importer.import_mlir(
|
||||
is_dynamic=False, tracing_required=False
|
||||
)
|
||||
shark_module = SharkInference(minilm_mlir, mlir_dialect="mhlo")
|
||||
shark_module.compile()
|
||||
output_idx = 0
|
||||
data_idx = 1
|
||||
token_logits = shark_module.forward(inputs)[output_idx][data_idx]
|
||||
mask_id = np.where(
|
||||
tf.squeeze(encoded_inputs["input_ids"]) == tokenizer.mask_token_id
|
||||
)
|
||||
mask_token_logits = token_logits[0, mask_id, :]
|
||||
top_5_tokens = np.flip(np.argsort(mask_token_logits)).squeeze()[0:5]
|
||||
for token in top_5_tokens:
|
||||
print(
|
||||
f"'>>> Sample/Warmup output: {text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
|
||||
)
|
||||
while True:
|
||||
try:
|
||||
new_text = input("Give me a sentence with [MASK] to fill: ")
|
||||
encoded_inputs = tokenizer(
|
||||
new_text,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
max_length=MAX_SEQUENCE_LENGTH,
|
||||
return_tensors="tf",
|
||||
)
|
||||
inputs = (
|
||||
encoded_inputs["input_ids"],
|
||||
encoded_inputs["attention_mask"],
|
||||
)
|
||||
token_logits = shark_module.forward(inputs)[output_idx][data_idx]
|
||||
mask_id = np.where(
|
||||
tf.squeeze(encoded_inputs["input_ids"])
|
||||
== tokenizer.mask_token_id
|
||||
)
|
||||
mask_token_logits = token_logits[0, mask_id, :]
|
||||
top_5_tokens = np.flip(np.argsort(mask_token_logits)).squeeze()[
|
||||
0:5
|
||||
]
|
||||
for token in top_5_tokens:
|
||||
print(
|
||||
f"'>>> {new_text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
|
||||
)
|
||||
except KeyboardInterrupt:
|
||||
print("Exiting program.")
|
||||
sys.exit()
|
||||
14 shark/examples/shark_inference/bloom_tank.py Normal file
@@ -0,0 +1,14 @@
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_model

mlir_model, func_name, inputs, golden_out = download_model(
    "bloom", frontend="torch"
)

shark_module = SharkInference(
    mlir_model, device="cpu", mlir_dialect="tm_tensor"
)
shark_module.compile()
result = shark_module.forward(inputs)
print("The obtained result via shark is: ", result)
print("The golden result is:", golden_out)
@@ -13,14 +13,13 @@ gpt2_inputs = [
 
 
 class GPT2Module(tf.Module):
-
     def __init__(self):
         super(GPT2Module, self).__init__()
         self.m = TFGPT2Model.from_pretrained("distilgpt2")
 
         self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)
 
-    @tf.function(input_signature=gpt2_inputs)
+    @tf.function(input_signature=gpt2_inputs, jit_compile=True)
     def forward(self, input_ids, attention_mask):
         return self.m.predict(input_ids, attention_mask)
 
@@ -30,9 +29,12 @@ if __name__ == "__main__":
     tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
     text = "I love the distilled version of models."
 
-    inputs = tokenizer(text, return_tensors='tf')
+    inputs = tokenizer(text, return_tensors="tf")
     shark_module = SharkInference(
-        GPT2Module(), (inputs["input_ids"], inputs["attention_mask"]))
+        GPT2Module(), (inputs["input_ids"], inputs["attention_mask"])
+    )
     shark_module.set_frontend("tensorflow")
     shark_module.compile()
-    print(shark_module.forward((inputs["input_ids"], inputs["attention_mask"])))
+    print(
+        shark_module.forward((inputs["input_ids"], inputs["attention_mask"]))
+    )
18 shark/examples/shark_inference/llama/README.md Normal file
@@ -0,0 +1,18 @@
# SHARK LLaMA

## TORCH-MLIR Version

```
https://github.com/nod-ai/torch-mlir.git
```
Clone the torch-mlir fork above, check out the `complex` branch, run `git submodule update --init`, and then build with `.\build_tools\python_deploy\build_windows.ps1`.
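A minimal command sequence for that step might look like the following (the `torch-mlir` directory name is an assumption based on the clone URL; adjust it to wherever you checked out the fork):
```
git clone https://github.com/nod-ai/torch-mlir.git
cd torch-mlir
git checkout complex
git submodule update --init
.\build_tools\python_deploy\build_windows.ps1
```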

### Setup & Run
```
git clone https://github.com/nod-ai/llama.git
```
Then, inside the cloned `llama` repository:
```
pip install -e .
python llama/shark_model.py
```
72 shark/examples/shark_inference/mega_test.py Normal file
@@ -0,0 +1,72 @@
import torch
import torch_mlir
from shark.shark_inference import SharkInference
from shark.shark_compile import shark_compile_through_fx
from MEGABYTE_pytorch import MEGABYTE

import os


class MegaModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = MEGABYTE(
            num_tokens=16000,  # number of tokens
            dim=(
                512,
                256,
            ),  # transformer model dimension (512 for coarsest, 256 for fine in this example)
            max_seq_len=(
                1024,
                4,
            ),  # sequence length for global and then local. this can be more than 2
            depth=(
                6,
                4,
            ),  # number of layers for global and then local. this can be more than 2, but length must match the max_seq_len's
            dim_head=64,  # dimension per head
            heads=8,  # number of attention heads
            flash_attn=True,  # use flash attention
        )

    def forward(self, input):
        return self.model(input)


megaModel = MegaModel()
inputs = [torch.randint(0, 16000, (1, 1024, 4))]

# CURRENTLY IT BAILS OUT HERE BECAUSE OF MISSING OP LOWERINGS :-
# 1. aten.alias
shark_module, _ = shark_compile_through_fx(
    model=megaModel,
    inputs=inputs,
    extended_model_name="mega_shark",
    is_f16=False,
    f16_input_mask=None,
    save_dir=os.getcwd(),
    debug=False,
    generate_or_load_vmfb=True,
    extra_args=[],
    device="cuda",
    mlir_dialect="tm_tensor",
)
# logits = model(x)


def print_output_info(output, msg):
    print("\n", msg)
    print("\n\t", output.shape)


ans = shark_module("forward", inputs)
print_output_info(torch.from_numpy(ans), "SHARK's output")

ans = megaModel.forward(*inputs)
print_output_info(ans, "ORIGINAL Model's output")

# and sample from the logits accordingly
# or you can use the generate function

# NEED TO LOOK AT THIS LATER IF REQUIRED IN SHARK.
# sampled = model.generate(temperature = 0.9, filter_thres = 0.9) # (1, 1024, 4)
@@ -12,7 +12,20 @@ mhlo_ir = r"""builtin.module {
 arg0 = np.ones((1, 4)).astype(np.float32)
 arg1 = np.ones((4, 1)).astype(np.float32)
 
-shark_module = SharkInference(mhlo_ir, (arg0, arg1))
-shark_module.set_frontend("mhlo")
+print("Running shark on cpu backend")
+shark_module = SharkInference(mhlo_ir, device="cpu", mlir_dialect="mhlo")
+
+# Generate the random inputs and feed into the graph.
+x = shark_module.generate_random_inputs()
 shark_module.compile()
-print(shark_module.forward((arg0, arg1)))
+print(shark_module.forward(x))
+
+print("Running shark on cuda backend")
+shark_module = SharkInference(mhlo_ir, device="cuda", mlir_dialect="mhlo")
+shark_module.compile()
+print(shark_module.forward(x))
+
+print("Running shark on vulkan backend")
+shark_module = SharkInference(mhlo_ir, device="vulkan", mlir_dialect="mhlo")
+shark_module.compile()
+print(shark_module.forward(x))
@@ -7,17 +7,13 @@ tokenizer = AutoTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
 
 
 class MiniLMSequenceClassification(torch.nn.Module):
-
     def __init__(self):
         super().__init__()
         self.model = AutoModelForSequenceClassification.from_pretrained(
             "microsoft/MiniLM-L12-H384-uncased",  # The pretrained model.
-            num_labels=
-            2,  # The number of output labels--2 for binary classification.
-            output_attentions=
-            False,  # Whether the model returns attentions weights.
-            output_hidden_states=
-            False,  # Whether the model returns all hidden-states.
+            num_labels=2,  # The number of output labels--2 for binary classification.
+            output_attentions=False,  # Whether the model returns attentions weights.
+            output_hidden_states=False,  # Whether the model returns all hidden-states.
             torchscript=True,
         )
 
@@ -27,9 +23,12 @@ class MiniLMSequenceClassification(torch.nn.Module):
 
 test_input = torch.randint(2, (1, 128))
 
-shark_module = SharkInference(MiniLMSequenceClassification(), (test_input,),
-                              jit_trace=True,
-                              benchmark_mode=True)
+shark_module = SharkInference(
+    MiniLMSequenceClassification(),
+    (test_input,),
+    jit_trace=True,
+    benchmark_mode=True,
+)
 
 shark_module.compile()
 shark_module.forward((test_input,))
@@ -2,10 +2,6 @@ import tensorflow as tf
 from transformers import BertModel, BertTokenizer, TFBertModel
 from shark.shark_inference import SharkInference
 
-gpus = tf.config.experimental.list_physical_devices('GPU')
-for gpu in gpus:
-    tf.config.experimental.set_memory_growth(gpu, True)
-
 MAX_SEQUENCE_LENGTH = 512
 BATCH_SIZE = 1
 
@@ -13,23 +9,24 @@ BATCH_SIZE = 1
 bert_input = [
     tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
     tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
-    tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32)
+    tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
 ]
 
 
 class BertModule(tf.Module):
-
     def __init__(self):
         super(BertModule, self).__init__()
         # Create a BERT trainer with the created network.
         self.m = TFBertModel.from_pretrained(
-            "microsoft/MiniLM-L12-H384-uncased", from_pt=True)
+            "microsoft/MiniLM-L12-H384-uncased", from_pt=True
+        )
 
         # Invoke the trainer model on the inputs. This causes the layer to be built.
         self.m.predict = lambda x, y, z: self.m.call(
-            input_ids=x, attention_mask=y, token_type_ids=z, training=False)
+            input_ids=x, attention_mask=y, token_type_ids=z, training=False
+        )
 
-    @tf.function(input_signature=bert_input)
+    @tf.function(input_signature=bert_input, jit_compile=True)
     def forward(self, input_ids, attention_mask, token_type_ids):
         return self.m.predict(input_ids, attention_mask, token_type_ids)
 
@@ -37,22 +34,28 @@ class BertModule(tf.Module):
 if __name__ == "__main__":
     # Prepping Data
     tokenizer = BertTokenizer.from_pretrained(
-        "microsoft/MiniLM-L12-H384-uncased")
+        "microsoft/MiniLM-L12-H384-uncased"
+    )
     text = "Replace me by any text you'd like."
-    encoded_input = tokenizer(text,
-                              padding='max_length',
-                              truncation=True,
-                              max_length=MAX_SEQUENCE_LENGTH)
+    encoded_input = tokenizer(
+        text,
+        padding="max_length",
+        truncation=True,
+        max_length=MAX_SEQUENCE_LENGTH,
+    )
     for key in encoded_input:
         encoded_input[key] = tf.expand_dims(
-            tf.convert_to_tensor(encoded_input[key]), 0)
+            tf.convert_to_tensor(encoded_input[key]), 0
+        )
 
-    test_input = (encoded_input["input_ids"], encoded_input["attention_mask"],
-                  encoded_input["token_type_ids"])
+    test_input = (
+        encoded_input["input_ids"],
+        encoded_input["attention_mask"],
+        encoded_input["token_type_ids"],
+    )
     shark_module = SharkInference(
-        BertModule(),
-        test_input,
-        benchmark_mode=True)
+        BertModule(), test_input, benchmark_mode=True
+    )
     shark_module.set_frontend("tensorflow")
     shark_module.compile()
     shark_module.benchmark_all(test_input)
73 shark/examples/shark_inference/minilm_jax.py Normal file
@@ -0,0 +1,73 @@
from transformers import AutoTokenizer, FlaxAutoModel
import torch
import jax
from typing import Union, Dict, List, Any
import numpy as np
from shark.shark_inference import SharkInference
import io

NumpyTree = Union[np.ndarray, Dict[str, np.ndarray], List[np.ndarray]]


def convert_torch_tensor_tree_to_numpy(
    tree: Union[torch.tensor, Dict[str, torch.tensor], List[torch.tensor]]
) -> NumpyTree:
    return jax.tree_util.tree_map(
        lambda torch_tensor: torch_tensor.cpu().detach().numpy(), tree
    )


def convert_int64_to_int32(tree: NumpyTree) -> NumpyTree:
    return jax.tree_util.tree_map(
        lambda tensor: np.array(tensor, dtype=np.int32)
        if tensor.dtype == np.int64
        else tensor,
        tree,
    )


def get_sample_input():
    tokenizer = AutoTokenizer.from_pretrained(
        "microsoft/MiniLM-L12-H384-uncased"
    )
    inputs_torch = tokenizer("Hello, World!", return_tensors="pt")
    return convert_int64_to_int32(
        convert_torch_tensor_tree_to_numpy(inputs_torch.data)
    )


def get_jax_model():
    return FlaxAutoModel.from_pretrained("microsoft/MiniLM-L12-H384-uncased")


def export_jax_to_mlir(jax_model: Any, sample_input: NumpyTree):
    model_mlir = jax.jit(jax_model).lower(**sample_input).compiler_ir()
    byte_stream = io.BytesIO()
    model_mlir.operation.write_bytecode(file=byte_stream)
    return byte_stream.getvalue()


def assert_array_list_allclose(x, y, *args, **kwargs):
    assert len(x) == len(y)
    for a, b in zip(x, y):
        np.testing.assert_allclose(
            np.asarray(a), np.asarray(b), *args, **kwargs
        )


sample_input = get_sample_input()
jax_model = get_jax_model()
mlir = export_jax_to_mlir(jax_model, sample_input)

# Compile and load module.
shark_inference = SharkInference(mlir_module=mlir, mlir_dialect="mhlo")
shark_inference.compile()

# Run main function.
result = shark_inference("main", jax.tree_util.tree_flatten(sample_input)[0])

# Run JAX model.
reference_result = jax.tree_util.tree_flatten(jax_model(**sample_input))[0]

# Verify result.
assert_array_list_allclose(result, reference_result, atol=1e-5)
@@ -0,0 +1,6 @@
flax
jax[cpu]
nodai-SHARK
orbax
transformers
torch
@@ -1,35 +1,23 @@
-import torch
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from shark.shark_inference import SharkInference
-
-torch.manual_seed(0)
-tokenizer = AutoTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
+from shark.shark_downloader import download_model
 
 
-class MiniLMSequenceClassification(torch.nn.Module):
-
-    def __init__(self):
-        super().__init__()
-        self.model = AutoModelForSequenceClassification.from_pretrained(
-            "microsoft/MiniLM-L12-H384-uncased",  # The pretrained model.
-            num_labels=
-            2,  # The number of output labels--2 for binary classification.
-            output_attentions=
-            False,  # Whether the model returns attentions weights.
-            output_hidden_states=
-            False,  # Whether the model returns all hidden-states.
-            torchscript=True,
-        )
-
-    def forward(self, tokens):
-        return self.model.forward(tokens)[0]
+mlir_model, func_name, inputs, golden_out = download_model(
+    "microsoft/MiniLM-L12-H384-uncased",
+    frontend="torch",
+)
 
 
-test_input = torch.randint(2, (1, 128))
-
-shark_module = SharkInference(MiniLMSequenceClassification(), (test_input,),
-                              jit_trace=True)
-
+shark_module = SharkInference(mlir_model, device="cpu", mlir_dialect="linalg")
 shark_module.compile()
-result = shark_module.forward((test_input,))
-print("Obtained result", result)
+result = shark_module.forward(inputs)
+print("The obtained result via shark is: ", result)
+print("The golden result is:", golden_out)
+
+
+# Let's generate random inputs, currently supported
+# for static models.
+rand_inputs = shark_module.generate_random_inputs()
+rand_results = shark_module.forward(rand_inputs)
+
+print("Running shark_module with random_inputs is: ", rand_results)
@@ -1,41 +0,0 @@
import tensorflow as tf
from transformers import BertModel, BertTokenizer, TFBertModel
from shark.shark_inference import SharkInference
from shark.shark_importer import shark_load
from shark.parser import parser
import os

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

parser.add_argument(
    "--download_mlir_path",
    type=str,
    default="minilm_tf_inference.mlir",
    help="Specifies path to target mlir file that will be loaded.")
load_args, unknown = parser.parse_known_args()

MAX_SEQUENCE_LENGTH = 512

if __name__ == "__main__":
    # Prepping Data
    tokenizer = BertTokenizer.from_pretrained(
        "microsoft/MiniLM-L12-H384-uncased")
    text = "Replace me by any text you'd like."
    encoded_input = tokenizer(text,
                              padding='max_length',
                              truncation=True,
                              max_length=MAX_SEQUENCE_LENGTH)
    for key in encoded_input:
        encoded_input[key] = tf.expand_dims(
            tf.convert_to_tensor(encoded_input[key]), 0)
    model_name = "minilm_tf_inference"
    minilm_mlir = shark_load(model_name, load_args.download_mlir_path)
    test_input = (encoded_input["input_ids"], encoded_input["attention_mask"],
                  encoded_input["token_type_ids"])
    shark_module = SharkInference(
        minilm_mlir, test_input, benchmark_mode=True)
    shark_module.set_frontend("mhlo")
    shark_module.compile()
    shark_module.benchmark_all(test_input)
@@ -9,23 +9,24 @@ BATCH_SIZE = 1
 bert_input = [
     tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
     tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
-    tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32)
+    tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
 ]
 
 
 class BertModule(tf.Module):
-
     def __init__(self):
         super(BertModule, self).__init__()
         # Create a BERT trainer with the created network.
         self.m = TFBertModel.from_pretrained(
-            "microsoft/MiniLM-L12-H384-uncased", from_pt=True)
+            "microsoft/MiniLM-L12-H384-uncased", from_pt=True
+        )
 
         # Invoke the trainer model on the inputs. This causes the layer to be built.
         self.m.predict = lambda x, y, z: self.m.call(
-            input_ids=x, attention_mask=y, token_type_ids=z, training=False)
+            input_ids=x, attention_mask=y, token_type_ids=z, training=False
+        )
 
-    @tf.function(input_signature=bert_input)
+    @tf.function(input_signature=bert_input, jit_compile=True)
     def forward(self, input_ids, attention_mask, token_type_ids):
         return self.m.predict(input_ids, attention_mask, token_type_ids)
 
@@ -33,24 +34,37 @@ class BertModule(tf.Module):
 if __name__ == "__main__":
     # Prepping Data
     tokenizer = BertTokenizer.from_pretrained(
-        "microsoft/MiniLM-L12-H384-uncased")
+        "microsoft/MiniLM-L12-H384-uncased"
+    )
     text = "Replace me by any text you'd like."
-    encoded_input = tokenizer(text,
-                              padding='max_length',
-                              truncation=True,
-                              max_length=MAX_SEQUENCE_LENGTH)
+    encoded_input = tokenizer(
+        text,
+        padding="max_length",
+        truncation=True,
+        max_length=MAX_SEQUENCE_LENGTH,
+    )
     for key in encoded_input:
         encoded_input[key] = tf.expand_dims(
-            tf.convert_to_tensor(encoded_input[key]), 0)
+            tf.convert_to_tensor(encoded_input[key]), 0
+        )
 
     shark_module = SharkInference(
         BertModule(),
-        (encoded_input["input_ids"], encoded_input["attention_mask"],
-         encoded_input["token_type_ids"]))
+        (
+            encoded_input["input_ids"],
+            encoded_input["attention_mask"],
+            encoded_input["token_type_ids"],
+        ),
+    )
     shark_module.set_frontend("tensorflow")
     shark_module.compile()
 
     print(
         shark_module.forward(
-            (encoded_input["input_ids"], encoded_input["attention_mask"],
-             encoded_input["token_type_ids"])))
+            (
+                encoded_input["input_ids"],
+                encoded_input["attention_mask"],
+                encoded_input["token_type_ids"],
+            )
+        )
+    )
39 shark/examples/shark_inference/resnest.py Normal file
@@ -0,0 +1,39 @@
import torch
import torchvision.models as models
from shark.shark_inference import SharkInference
from shark.shark_importer import SharkImporter

torch.hub.list("zhanghang1989/ResNeSt", force_reload=True)


class ResnestModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = torch.hub.load(
            "zhanghang1989/ResNeSt", "resnest50", pretrained=True
        )
        self.model.eval()

    def forward(self, input):
        return self.model.forward(input)


input = torch.randn(1, 3, 224, 224)


mlir_importer = SharkImporter(
    ResnestModule(),
    (input,),
    frontend="torch",
)

(vision_mlir, func_name), inputs, golden_out = mlir_importer.import_debug(
    tracing_required=True
)

print(golden_out)

shark_module = SharkInference(vision_mlir, mlir_dialect="linalg")
shark_module.compile()
result = shark_module.forward((input,))
print("Obtained result", result)
74 shark/examples/shark_inference/resnet50_fp16.py Normal file
@@ -0,0 +1,74 @@
from shark.shark_inference import SharkInference
from shark.parser import shark_args

import torch
import numpy as np
import sys
import torchvision.models as models
import torch_mlir

torch.manual_seed(0)


class VisionModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = models.resnet50(pretrained=True)
        self.train(False)

    def forward(self, input):
        return self.model.forward(input)


model = VisionModule()
test_input = torch.randn(1, 3, 224, 224)
actual_out = model(test_input)

test_input_fp16 = test_input.to(device=torch.device("cuda"), dtype=torch.half)
model_fp16 = model.half()
model_fp16.eval()
model_fp16.to("cuda")
actual_out_fp16 = model_fp16(test_input_fp16)

ts_g = torch.jit.trace(model_fp16, [test_input_fp16])

module = torch_mlir.compile(
    ts_g,
    (test_input_fp16),
    torch_mlir.OutputType.LINALG_ON_TENSORS,
    use_tracing=True,
    verbose=False,
)

# from contextlib import redirect_stdout

# with open('resnet50_fp16_linalg_ir.mlir', 'w') as f:
#     with redirect_stdout(f):
#         print(module.operation.get_asm())

mlir_model = module
func_name = "forward"

shark_module = SharkInference(mlir_model, device="cuda", mlir_dialect="linalg")
shark_module.compile()


def shark_result(x):
    x_ny = x.cpu().detach().numpy()
    inputs = (x_ny,)
    result = shark_module.forward(inputs)
    return torch.from_numpy(result)


observed_out = shark_result(test_input_fp16)

print("Golden result:", actual_out_fp16)
print("SHARK result:", observed_out)

actual_out_fp16 = actual_out_fp16.to(device=torch.device("cpu"))

print(
    torch.testing.assert_allclose(
        actual_out_fp16, observed_out, rtol=1e-2, atol=1e-2
    )
)
@@ -5,24 +5,28 @@ import torchvision.models as models
 from torchvision import transforms
 import sys
 from shark.shark_inference import SharkInference
+from shark.shark_downloader import download_model
 
 
 ################################## Preprocessing inputs and model ############
 def load_and_preprocess_image(url: str):
     headers = {
-        "User-Agent":
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
     }
-    img = Image.open(requests.get(url, headers=headers,
-                                  stream=True).raw).convert("RGB")
+    img = Image.open(
+        requests.get(url, headers=headers, stream=True).raw
+    ).convert("RGB")
     # preprocessing pipeline
-    preprocess = transforms.Compose([
-        transforms.Resize(256),
-        transforms.CenterCrop(224),
-        transforms.ToTensor(),
-        transforms.Normalize(mean=[0.485, 0.456, 0.406],
-                             std=[0.229, 0.224, 0.225]),
-    ])
+    preprocess = transforms.Compose(
+        [
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            transforms.Normalize(
+                mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+            ),
+        ]
+    )
     img_preprocessed = preprocess(img)
     return torch.unsqueeze(img_preprocessed, 0)
 
@@ -44,7 +48,6 @@ def top3_possibilities(res):
 
 
 class Resnet50Module(torch.nn.Module):
-
     def __init__(self):
         super().__init__()
         self.resnet = models.resnet50(pretrained=True)
@@ -61,18 +64,20 @@ labels = load_labels()
 
 ##############################################################################
 
 input = torch.randn(1, 3, 224, 224)
 print(input.shape)
 
 ## The img is passed to determine the input shape.
-shark_module = SharkInference(Resnet50Module(), (img,))
-shark_module.compile()
-
-## Can pass any img or input to the forward module.
-results = shark_module.forward((img,))
+mlir_model, func_name, inputs, golden_out = download_model(
+    "resnet50", frontend="torch"
+)
+
+shark_module = SharkInference(mlir_model, mlir_dialect="linalg")
+shark_module.compile()
+path = shark_module.save_module()
+shark_module.load_module(path)
+result = shark_module("forward", (img.detach().numpy(),))
 
 print("The top 3 results obtained via shark_runner is:")
-print(top3_possibilities(torch.from_numpy(results)))
+print(top3_possibilities(torch.from_numpy(result)))
 
 print()
Some files were not shown because too many files have changed in this diff.