Compare commits


92 Commits

Author SHA1 Message Date
Prashant Kumar
326827198b Update vulkan_utils.py 2022-10-11 20:53:41 +05:30
powderluv
a63755bc24 Correct spelling 2022-10-11 01:53:55 -07:00
Quinn Dawkins
d93d0783a8 Add script for tensorflow stable diffusion (#391) 2022-10-10 12:01:49 -04:00
Daniel Garvey
d38e37bd99 separate importer and benchmark deps (#393) 2022-10-08 23:31:20 -05:00
Ean Garvey
3618fb3ada Move old test scripts out of base tank directory and add xfails. (#389) 2022-10-07 16:02:46 -07:00
Vivek Khandelwal
70a29b03e0 Add FP16 Resnet50 script 2022-10-06 21:56:43 +05:30
Ean Garvey
006adf8746 Fix issue with FASTAPI pip install. (#382) 2022-10-01 14:55:24 -05:00
Quinn Dawkins
33b53e7caf Add flag for specifying the vae mlir file location in stable diffusion (#381) 2022-09-30 00:37:58 -04:00
Daniel Garvey
c54815de17 edit assets path (#376) 2022-09-28 16:42:36 -05:00
Gaurav Shukla
0013fb0753 [WEB] Add shark-web logging
1. This commit adds support to display logs in the shark-web.
2. It also adds the nod logo on the home page.
3. Stable-diffusion outputs are now saved.

Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2022-09-29 01:20:42 +05:30
Ean Garvey
56f8a0d85a Update torch-mlir releases page in setup_venv.sh (#374)
* Update README.md

* Update setup_venv.sh
2022-09-28 11:07:44 -07:00
Ean Garvey
9035a2eed3 Add --local_tank_cache flag and update requirements. (#368)
* Add --local_tank_cache flag and update requirements.

* Update requirements-importer.txt
2022-09-28 03:02:59 -05:00
Vivek Khandelwal
28daf410b6 Add instructions to use locally built Torch-MLIR with SHARK 2022-09-28 10:16:38 +05:30
Ean Garvey
cbf3f784aa Add pytest option to specify a URL for shark tank artifacts. (#363)
* Xfail updates.

* Generalize tank SHA option to bucket address and add pytest option.
2022-09-27 02:40:40 -05:00
Anush Elangovan
ef4b306c7b Add diffusers and scipy 2022-09-26 13:35:23 -07:00
powderluv
5316c1e0bf Use latest transformers (#346) 2022-09-26 13:11:41 -07:00
Gaurav Shukla
0228973eef [WEB] Fix the mlir location of stable-diffusion model (#367)
Update the location of the stable-diffusion mlir file since there is a
problem with iree-compile.

Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>

Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2022-09-26 10:56:36 -07:00
Gaurav Shukla
d4eeff0a5d [WEB] Add Stable-Diffusion in the SHARK web (#366)
1. This commit adds stable-diffusion as part of the shark web.
2. The V-diffusion model has been disabled for now as it's not
   working (a separate patch with a fix will follow).
3. Add standard output in the web UI.
4. Add instructions to launch the shark-web.

Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>

Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2022-09-26 10:42:02 -07:00
Prashant Kumar
c7b2d39ab2 Update stable_diff to contain vae. 2022-09-26 20:11:43 +05:30
Gaurav Shukla
21958cc02a [WEB] Remove unused parameters in the v-diffusion model (#314)
This commit removes unused parameters in the v-diffusion model. It also
updates the server parameters so that multiple requests are handled
sequentially.

Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>

Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2022-09-25 10:57:06 -07:00
Ean Garvey
de23e5d9d7 update xfails for PyTorch DistilBERT (#355) 2022-09-24 14:53:20 -05:00
Quinn Dawkins
6438bce023 Add a script to convert a jpg to the correct input for resnet50 with the vulkan gui (#362) 2022-09-23 16:32:52 -07:00
yzhang93
587d74b449 Update model annotation tool (#361)
Usage:
with create_context() as ctx:
  module = model_annotation(ctx, input_contents=..., config_path=..., search_op=...)

Example:
The example below annotates the MiniLM model with GPU config files.
python model_annotation.py /nodclouddata/vivian/minilm_model/model.mlir /nodclouddata/vivian/minilm_model/model_config.json
2022-09-23 15:44:51 -07:00
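As a hedged illustration of the usage quoted in the commit message above, the sketch below fills in placeholder arguments; the file paths, the `search_op` value, and the import location are assumptions, not values from the commit.

```python
# Hedged sketch only: the import path, file paths, and search_op are assumptions.
from shark.model_annotation import create_context, model_annotation  # assumed module path

with create_context() as ctx:
    annotated = model_annotation(
        ctx,
        input_contents="/path/to/model.mlir",       # hypothetical MLIR file
        config_path="/path/to/model_config.json",   # hypothetical GPU config
        search_op="matmul",                          # hypothetical op to annotate
    )
    print(annotated)
```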
Prashant Kumar
b9c8985047 Add sharkdynamo which combines shark with torchdynamo.
-- Adds graph breaks when necessary.
-- Even for-loops are supported.
2022-09-23 22:40:02 +05:30
Vivek Khandelwal
93ebe07d2b Add bert_tosa script 2022-09-23 10:52:06 +05:30
Ean Garvey
d82b305781 Fix issues with loading .vmfb into SharkInference 2022-09-23 09:53:13 +05:30
Quinn Dawkins
1df20fac95 [Lockstep] Hack to avoid aten._reshape_alias (#332)
This forces the decomposition of aten._reshape_alias used in AOTAutograd, avoiding stride-related problems when running in eager mode.
2022-09-22 18:02:09 -04:00
Prashant Kumar
991e7043d1 Add stable diffusion model. 2022-09-22 13:40:51 +05:30
powderluv
1c4d6c23fa Update CMakeLists.txt 2022-09-21 22:48:56 -07:00
Anush Elangovan
87895446a5 Roll SHARK-Runtime 2022-09-22 00:09:04 -07:00
Ean Garvey
c0f3a09a40 Include SHA in path to failure reproducers. Add --save_fails option. (#352) 2022-09-21 17:55:06 -05:00
Anush Elangovan
e9ad4b9fc4 Update SHARK Runtime 2022-09-21 06:31:48 -07:00
Ean Garvey
c061a8897d Add pytest options to save reproducers. (#350)
* Add pytest options to save and/or upload reproducers.

* pass shark_module to benchmark method.
2022-09-20 20:29:46 -05:00
Ean Garvey
4253551b67 Update README with new testing instructions and filter test cases. (#349) 2022-09-20 15:55:46 -05:00
Vivek Khandelwal
e4991c049e Add Readme file for the bloom model 2022-09-20 20:27:52 +05:30
Daniel Garvey
5df582e7e8 creates abstract test case class (#333) 2022-09-20 07:06:38 -07:00
Ean Garvey
814a6f8295 Modify vulkan target triple substring searches. (#318) 2022-09-20 01:20:20 -05:00
Vivek Khandelwal
7013c3cd4a Add bloom e2e script 2022-09-20 10:56:04 +05:30
powderluv
0ddd65b6f1 Create LICENSE 2022-09-19 15:07:59 -07:00
powderluv
44d8f08bfc Fix Torch-MLIR release page 2022-09-17 00:50:39 -07:00
erman-gurses
fc8aa6ae63 Add ROCM parameters (#335) 2022-09-16 09:12:19 -07:00
Quinn Dawkins
9bd951b083 Clean up the v-diffusion install pipeline (#327) 2022-09-16 11:47:07 -04:00
Vivek Khandelwal
c43448a826 Update compile_utils.py 2022-09-15 18:28:10 +05:30
Vivek Khandelwal
864723a473 add bloom model example 2022-09-15 18:23:09 +05:30
Anush Elangovan
3b0ec8ce4e Update resnet paths 2022-09-14 16:56:20 -07:00
Anush Elangovan
174b171913 Clean up SDL linking 2022-09-14 13:18:55 -07:00
powderluv
cfd9733c2b Delete shark_web directory 2022-09-14 06:38:30 -07:00
Anush Elangovan
8d4d543a49 Update shark runtime 2022-09-14 06:14:02 -07:00
Anush Elangovan
1b9c88a052 Update vulkan gui readme 2022-09-13 19:35:47 -07:00
Anush Elangovan
e212ff2071 Fix resnet50 vulkan_gui to work with tank models 2022-09-13 19:22:41 -07:00
Quinn Dawkins
8d21292d34 Fix input tensors with non-floating point dtype in the lockstep tracer (#328) 2022-09-13 21:14:38 -04:00
Anush Elangovan
e304041574 Remove redundant {} 2022-09-13 16:12:35 -07:00
Anush Elangovan
1776c55e73 Fix torch-mlir download URL 2022-09-13 16:07:25 -07:00
Anush Elangovan
4e4c34c717 fix release downloads 2022-09-13 15:00:47 -07:00
Anush Elangovan
23378b6be8 Add resnet to vulkan-gui 2022-09-13 07:06:47 -07:00
Ean Garvey
6cf5564c84 Remove "gpu" device alias and migrate to using "cuda" for NVIDIA GPU. (#325)
* Replace instances of "gpu" alias for devices with "cuda"
2022-09-13 01:16:56 -05:00
Ean Garvey
7143902a90 Update test-models.yml (#323) 2022-09-12 22:47:40 -05:00
Anush Elangovan
15186db73f Hardcode SDL2 for now (works on linux) 2022-09-12 10:17:41 -07:00
powderluv
ccd7a01ce2 Update README.md 2022-09-12 07:12:57 -07:00
Anush Elangovan
1d7035117d Add cpp inference examples and vulkan_gui 2022-09-12 07:07:33 -07:00
Ean Garvey
1710abd366 Update mobilenet_v3_small_torch_test.py (#322) 2022-09-10 15:22:57 -05:00
Ean Garvey
6aeda3670f Split nightly workflow by backend (IREE / SHARK) (#313)
* Fix validation for nightly builds.

* Add option to generate shark_tank inside SHARK project
Add shark_arg for updating tank on mismatched hash (downloader)

* Fixup CI tank dir option.

* Fixup work directory variable
2022-09-09 22:51:30 +05:30
Prashant Kumar
bb52b224d0 Add sparse architecture and test with torchrec SparseArch.
Features that don't work with the current implementation:
    -- embeddingbag config with multiple features.
2022-09-09 21:49:30 +05:30
Stanley Winata
95ec3d7216 [tank][v-diffusion] Polish up v-diffusion UX (#315) 2022-09-08 12:55:51 -07:00
powderluv
18872222d3 Update README.md 2022-09-07 01:14:30 -07:00
Ean Garvey
d453f2e49d Enable CPU benchmarks on test-models workflows. (#299)
* Update test-models.yml

* Update README.md
2022-09-07 01:22:58 -05:00
Ean Garvey
3824d37d27 Add metadata to benchmark results. (#297) 2022-09-06 13:03:48 -05:00
Ean Garvey
d946287723 Update xfails for torchvision models. (#310) 2022-09-01 13:06:12 -05:00
Prashant Kumar
885b0969f5 [WEB] Cache the compiled module.
-- Avoid recompiling the module on every request.
2022-09-01 23:08:08 +05:30
Gaurav Shukla
a886cba655 [WEB] Add v_diffusion model in the shark web (#306)
This commit adds `v_diffusion` model web visualization as part of the
shark web.

Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>

Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2022-09-01 06:34:51 -07:00
Vivek Khandelwal
4afe2e3adb Add func to save intermediate images in v_diffusion_pytorch 2022-09-01 18:36:58 +05:30
Gaurav Shukla
fe080eaee6 [WEB] Introduce web interface for the SHARK models (#298)
This commit introduces a web application for SHARK using the Gradio platform.
It adds web visualization of the `Resnet50` and `Albert_Maskfill` models
as a start.

Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>

Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2022-08-31 23:17:52 -07:00
Quinn Dawkins
3703f014d9 Add scripts for generating images on ats-m (#305) 2022-08-31 23:07:02 -07:00
Daniel Garvey
d45a496030 adds a flag to enable directory choice (#303)
Individual tests will require implementation of the flag.
Alternatively, simply passing shark_default_sha in your individual app's
download function will allow for this behavior.
2022-08-31 22:17:40 -07:00
powderluv
4ee164c66f remove a100 cpu 2022-08-31 12:59:47 -07:00
powderluv
bf84c033bb add icelake 2022-08-31 12:58:40 -07:00
Prashant Kumar
5105f62551 Add the dlrm_model in shark example. (#301)
-- The DLRM model is added to the shark examples.
-- The model is verified on cpu, gpu, and vulkan.

Co-authored-by: powderluv <powderluv@users.noreply.github.com>
2022-08-31 12:54:21 -07:00
Quinn Dawkins
99be837d84 Add lockstep tracer based on TorchMLIR eager mode + examples (#243) 2022-08-31 15:50:24 -04:00
Quinn Dawkins
b7766898ee Add cfg sampling from tank model for v-diffusion and move compilation outside of the sampling loop (#302) 2022-08-31 11:35:04 -07:00
powderluv
57f73dfbc9 Update nightly.yml 2022-08-28 23:59:03 -07:00
powderluv
50b2b9638d Update nightly.yml 2022-08-28 23:43:32 -07:00
Daniel Garvey
1bfd00e2f8 fixes an install issue (#295) 2022-08-25 18:52:00 -05:00
Daniel Garvey
64424877ac No iree install (#294)
* adds support to default to tuned model

currently set up for tf bert/resnet50
going to refactor test class to avoid having to
add an argument to 50+ files

* adds an option to avoid installing iree

useful when building iree from source
specify env variable NO_BACKEND=1
2022-08-25 15:02:28 -05:00
Phaneesh Barwaria
02d857260c Update ReadMe
-Add gsutil installation for resnet50 example
2022-08-25 20:28:50 +05:30
Phaneesh Barwaria
1322ec5935 Simplified Testing Interface (#289) 2022-08-24 23:54:56 -05:00
Daniel Garvey
48e9818f7e adds support to default to tuned model (#287)
currently set up for tf bert/resnet50
going to refactor test class to avoid having to
add an argument to 50+ files
2022-08-24 16:30:02 -05:00
Ean Garvey
14857770dc Fix local artifact recognition and usage by SHARK downloader. (#286)
* Fix local artifact recognition and usage by SHARK downloader.

* Update generate_sharktank.py

* Update generate_sharktank.py
2022-08-24 14:37:16 -05:00
Vivek Khandelwal
f79a6bf5aa Update setup_v_diffusion_pytorch.sh (#291)
Fix minor issue with v-diffusion PyTorch version
2022-08-24 22:00:02 +05:30
Prashant Kumar
7dc27a7477 Don't remove the latest .whl package from CI. (#290)
Previously, the CI was removing the latest package and pointing to the
stale package.
2022-08-24 09:03:48 -07:00
Chi_Liu
17dba601c8 Add huggingface top5 image classification automodel (#268) 2022-08-22 15:05:38 -07:00
Chi_Liu
064aa3b1f4 Fix tmp dir bug (#285) 2022-08-22 15:00:35 -07:00
Ean Garvey
4960efc686 Update requirements-importer.txt (#284) 2022-08-19 23:21:41 -05:00
160 changed files with 15362 additions and 3817 deletions


@@ -16,6 +16,7 @@ jobs:
fail-fast: false
matrix:
python-version: ["3.10"]
backend: [IREE, SHARK]
steps:
- uses: actions/checkout@v3
@@ -49,12 +50,18 @@ jobs:
body: |
Automatic snapshot release of nod.ai SHARK.
draft: true
prerelease: false
prerelease: false
- name: Find Torch-MLIR Release
run: |
TM_HTML_URL="$(python3 -c "import urllib.request, json, sys; u=json.loads(urllib.request.urlopen('https://api.github.com/repos/llvm/torch-mlir/releases/latest').read().decode()).get('html_url', False); print(u) if u else sys.exit(1);")"
TM_RELEASE_DIR=${TM_HTML_URL/"tag"/"expanded_assets"}
echo "TM_RELEASE_DIR=${TM_RELEASE_DIR}" >> $GITHUB_ENV
- name: Install dependencies
run: |
echo "Torch-MLIR Release DIR is ${{ env.TM_RELEASE_DIR }}"
python -m pip install --upgrade pip
python -m pip install flake8 pytest toml
if [ -f requirements.txt ]; then pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/SHARK-Runtime/releases; fi
if [ -f requirements.txt ]; then pip install -r requirements.txt -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/nod-ai/SHARK-Runtime/releases; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
@@ -62,46 +69,19 @@ jobs:
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude shark.venv,lit.cfg.py
- name: Build and validate the IREE package
if: ${{ matrix.backend == 'IREE' }}
run: |
cd $GITHUB_WORKSPACE
USE_IREE=1 VENV_DIR=iree.venv ./setup_venv.sh
source iree.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://github.com/llvm/torch-mlir/releases -f https://github.com/iree-org/iree/releases
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/iree-org/iree/releases
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models
/bin/bash "$GITHUB_WORKSPACE/build_tools/populate_sharktank_ci.sh"
pytest -k 'cpu' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
tail -n 1 |
tee -a pytest_results.txt
pytest -k 'gpu' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
tail -n 1 |
tee -a pytest_results.txt
pytest -k 'vulkan' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
tail -n 1 |
tee -a pytest_results.txt
rm -rf ./wheelhouse/nodai*
- name: Build and validate the SHARK Runtime package
run: |
cd $GITHUB_WORKSPACE
./setup_venv.sh
source shark.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/SHARK-Runtime/releases
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models
pytest -k 'cpu' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
tail -n 1 |
tee -a pytest_results.txt
pytest -k 'gpu' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
tail -n 1 |
tee -a pytest_results.txt
pytest -k 'vulkan' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
pytest tank/test_models.py |
tail -n 1 |
tee -a pytest_results.txt
if !(grep -Fxq " failed" pytest_results.txt)
@@ -110,20 +90,39 @@ jobs:
gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/$SHA
gsutil -m cp -r gs://shark_tank/$SHA/* gs://shark_tank/latest/
fi
rm pytest_results.txt
rm -rf ./wheelhouse/nodai*
- name: Build and validate the SHARK Runtime package
if: ${{ matrix.backend == 'SHARK' }}
run: |
cd $GITHUB_WORKSPACE
./setup_venv.sh
source shark.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/nod-ai/SHARK-Runtime/releases
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models
pytest tank/test_models.py |
tail -n 1 |
tee -a pytest_results.txt
publish:
runs-on: a100
needs: build
steps:
- name: Upload Release Assets
if: ${{ matrix.backend == 'SHARK' }}
id: upload-release-assets
uses: dwenegar/upload-release-assets@v1
env:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:
release_id: ${{ steps.create_release.outputs.id }}
assets_path: ./wheelhouse/nodai_*.whl
assets_path: ${GITHUB_WORKSPACE}/wheelhouse/nodai_*.whl
- name: Publish Release
if: ${{ matrix.backend == 'SHARK' }}
id: publish_release
uses: eregon/publish-release@v1
env:
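For readability, the sketch below expands the inline Python used in the "Find Torch-MLIR Release" step of the workflow diff above; it illustrates the same logic and is not part of the workflow itself.

```python
# Expanded form of the one-liner that resolves the latest Torch-MLIR release page.
import json
import sys
import urllib.request

API_URL = "https://api.github.com/repos/llvm/torch-mlir/releases/latest"

with urllib.request.urlopen(API_URL) as resp:
    release = json.loads(resp.read().decode())

html_url = release.get("html_url")
if not html_url:
    sys.exit(1)  # fail the step if the release metadata has no html_url

# The workflow then swaps "tag" for "expanded_assets" so pip can use the page with -f.
print(html_url.replace("tag", "expanded_assets", 1))
```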


@@ -15,8 +15,8 @@ jobs:
strategy:
fail-fast: true
matrix:
os: [a100, MacStudio, ubuntu-latest]
suite: [cpu,gpu,vulkan]
os: [icelake, a100, MacStudio, ubuntu-latest]
suite: [cpu,cuda,vulkan]
python-version: ["3.10"]
include:
- os: ubuntu-latest
@@ -25,15 +25,21 @@ jobs:
- os: ubuntu-latest
suite: vulkan
- os: ubuntu-latest
suite: gpu
suite: cuda
- os: ubuntu-latest
suite: cpu
- os: MacStudio
suite: gpu
suite: cuda
- os: MacStudio
suite: cpu
- os: MacStudio
suite: vulkan
- os: icelake
suite: vulkan
- os: icelake
suite: cuda
- os: a100
suite: cpu
runs-on: ${{ matrix.os }}
@@ -46,13 +52,13 @@ jobs:
echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
- name: Set up Python Version File ${{ matrix.python-version }}
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest'
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest' || matrix.os == 'icelake'
run: |
# See https://github.com/actions/setup-python/issues/433
echo ${{ matrix.python-version }} >> $GITHUB_WORKSPACE/.python-version
- name: Set up Python ${{ matrix.python-version }}
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest'
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest' || matrix.os == 'icelake'
uses: actions/setup-python@v4
with:
python-version: '${{ matrix.python-version }}'
@@ -78,27 +84,30 @@ jobs:
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude lit.cfg.py
- name: Validate CPU Models
- name: Validate Models on CPU
if: matrix.suite == 'cpu'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest -k 'cpu' --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py
pytest --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k cpu
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv
- name: Validate GPU Models
if: matrix.suite == 'gpu'
- name: Validate Models on NVIDIA GPU
if: matrix.suite == 'cuda'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --benchmark -k "gpu" --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_gpu_${SHORT_SHA}.csv
pytest --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k cuda
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
- name: Validate Vulkan Models
if: matrix.suite == 'vulkan'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest -k 'vulkan' --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k vulkan

LICENSE (new file, 218 lines)

@@ -0,0 +1,218 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
---- LLVM Exceptions to the Apache 2.0 License ----
As an exception, if, as a result of your compiling your source code, portions
of this Software are embedded into an Object form of such source code, you
may redistribute such embedded portions in such Object form without complying
with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
In addition, if you combine or link compiled forms of this Software with
software that is licensed under the GPLv2 ("Combined Software") and if a
court of competent jurisdiction determines that the patent provision (Section
3), the indemnity provision (Section 9) or other Section of the License
conflicts with the conditions of the GPLv2, you may retroactively and
prospectively choose to deem waived or otherwise exclude such Section(s) of
the License, but only in their entirety and only with respect to the Combined
Software.


@@ -47,7 +47,7 @@ If you are on an Intel macOS machine you need this [workaround](https://github.c
```shell
curl -O https://raw.githubusercontent.com/nod-ai/SHARK/main/shark/examples/shark_inference/resnet50_script.py
#Install deps for test script
pip install --pre torch torchvision torchaudio tqdm pillow --extra-index-url https://download.pytorch.org/whl/nightly/cpu
pip install --pre torch torchvision torchaudio tqdm pillow gsutil --extra-index-url https://download.pytorch.org/whl/nightly/cpu
python ./resnet50_script.py --device="cpu" #use cuda or vulkan or metal
```
@@ -81,54 +81,74 @@ For example if you want to use Python3.10 and upstream IREE with TF Import tools
# PYTHON=python3.10 VENV_DIR=0617_venv IMPORTER=1 USE_IREE=1 ./setup_venv.sh
```
If you are a Torch-mlir developer or an IREE developer and want to test local changes you can uninstall
If you are a *Torch-mlir developer or an IREE developer* and want to test local changes you can uninstall
the provided packages with `pip uninstall torch-mlir` and / or `pip uninstall iree-compiler iree-runtime` and build locally
with Python bindings and set your PYTHONPATH as mentioned [here](https://google.github.io/iree/bindings/python/)
for IREE and [here](https://github.com/llvm/torch-mlir/blob/main/development.md#setup-python-environment-to-export-the-built-python-packages)
for Torch-MLIR.
### How to use your locally built Torch-MLIR with SHARK
```shell
1.) Run `./setup_venv.sh` in SHARK and activate the `shark.venv` virtual env.
2.) Run `pip uninstall torch-mlir`.
3.) Go to your local Torch-MLIR directory.
4.) Activate the mlir_venv virtual environment.
5.) Run `pip uninstall -r requirements.txt`.
6.) Run `pip install -r requirements.txt`.
7.) Build Torch-MLIR.
8.) Activate the shark.venv virtual environment from the Torch-MLIR directory.
9.) Run `export PYTHONPATH=`pwd`/build/tools/torch-mlir/python_packages/torch_mlir:`pwd`/examples` in the Torch-MLIR directory.
10.) Go to the SHARK directory.
```
Now SHARK will use your locally built Torch-MLIR repo.
### Run a demo script
```shell
python -m shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
# Or a pytest
pytest tank/tf/hf_masked_lm/albert-base-v2_test.py::AlbertBaseModuleTest::test_module_static_cpu
pytest tank/test_models.py -k "MiniLM"
```
</details>
<details>
<summary>Testing</summary>
<summary>Testing and Benchmarks</summary>
### Run all model tests on CPU/GPU/VULKAN/Metal
```shell
pytest tank
pytest tank/test_models.py
# If on Linux for multithreading on CPU (faster results):
pytest tank -n auto
pytest tank/test_models.py -n auto
```
### Running specific tests
```shell
# Run tests for a specific model:
pytest tank/<MODEL_NAME> #i.e., pytest tank/bert-base-uncased
# Run tests for a specific case:
pytest tank/<MODEL_NAME> -k "keyword"
# i.e., pytest tank/bert-base-uncased/bert-base-uncased_test.py -k "static_gpu"
# Search for test cases by including a keyword that matches all or part of the test case's name;
pytest tank/test_models.py -k "keyword"
```
# Test cases are named uniformly by format test_module_<model_name_underscores_only>_<torch/tf>_<static/dynamic>_<device>.
# Example: Test all models on nvidia gpu:
pytest tank/test_models.py -k "cuda"
# Example: Test all tensorflow resnet models on Vulkan backend:
pytest tank/test_models.py -k "resnet and tf and vulkan"
# Exclude a test case:
pytest tank/test_models.py -k "not ..."
### Run benchmarks on SHARK tank pytests and generate bench_results.csv with results.
(requires source installation with `IMPORTER=1 ./setup_venv.sh`)
(the following requires source installation with `IMPORTER=1 ./setup_venv.sh`)
```shell
pytest --benchmark tank
pytest --benchmark tank/test_models.py
# Just do static GPU benchmarks for PyTorch tests:
pytest --benchmark tank --ignore-glob="_tf*" -k "static_gpu"
pytest --benchmark tank/test_models.py -k "pytorch and static and cuda"
```
### Benchmark Resnet50, MiniLM on CPU
@@ -142,10 +162,10 @@ cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | awk -F, '{print
echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
# Benchmark canonical Resnet50 on CPU via pytest
pytest --benchmark tank/resnet50/ -k "cpu"
pytest --benchmark tank/test_models -k "resnet50 and tf_static_cpu"
# Benchmark canonical MiniLM on CPU via pytest
pytest --benchmark tank/MiniLM-L12-H384-uncased/ -k "cpu"
pytest --benchmark tank/test_models -k "MiniLM and cpu"
# Benchmark MiniLM on CPU via transformer-benchmarks:
git clone --recursive https://github.com/nod-ai/transformer-benchmarks.git


@@ -2,4 +2,4 @@
IMPORTER=1 ./setup_venv.sh
source $GITHUB_WORKSPACE/shark.venv/bin/activate
python generate_sharktank.py --upload=False
python generate_sharktank.py --upload=False --ci_tank_dir=True


@@ -12,22 +12,45 @@ def pytest_addoption(parser):
default="False",
help="Add ONNX benchmark results to pytest benchmarks.",
)
# The following options are deprecated and pending removal.
parser.addoption(
"--save_mlir",
"--tf32",
action="store_true",
default="False",
help="Pass option to save input MLIR",
help="Use TensorFloat-32 calculations.",
)
parser.addoption(
"--save_vmfb",
"--save_repro",
action="store_true",
default="False",
help="Pass option to save IREE output .vmfb",
help="Pass option to save reproduction artifacts to SHARK/shark_tmp/test_case/",
)
parser.addoption(
"--save_temps",
"--save_fails",
action="store_true",
default="False",
help="Saves IREE reproduction artifacts for filing upstream issues.",
help="Save reproduction artifacts for a test case only if it fails. Default is False.",
)
parser.addoption(
"--ci",
action="store_true",
default="False",
help="Enables uploading of reproduction artifacts upon test case failure during iree-compile or validation. Must be passed with --ci_sha option ",
)
parser.addoption(
"--ci_sha",
action="store",
default="None",
help="Passes the github SHA of the CI workflow to include in google storage directory for reproduction artifacts.",
)
parser.addoption(
"--local_tank_cache",
action="store",
default="",
help="Specify the directory in which all downloaded shark_tank artifacts will be cached.",
)
parser.addoption(
"--tank_url",
type=str,
default="gs://shark_tank/latest",
help="URL to bucket from which to download SHARK tank artifacts. Default is gs://shark_tank/latest",
)
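A minimal sketch of how a test might consume the options registered above, using pytest's standard `request.config.getoption` mechanism; the fixture name and grouping are illustrative and not part of this diff.

```python
# Hedged sketch: the fixture is an assumption, not code from the diff.
import pytest


@pytest.fixture
def tank_settings(request):
    """Bundle the shark_tank-related command line options for a test to use."""
    return {
        "tank_url": request.config.getoption("--tank_url"),
        "local_tank_cache": request.config.getoption("--local_tank_cache"),
        "save_fails": request.config.getoption("--save_fails"),
        "ci_sha": request.config.getoption("--ci_sha"),
    }
```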

cpp/CMakeLists.txt (new file, 52 lines)

@@ -0,0 +1,52 @@
# Copyright 2022 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
cmake_minimum_required(VERSION 3.21...3.23)
#-------------------------------------------------------------------------------
# Project configuration
#-------------------------------------------------------------------------------
project(iree-samples C CXX)
set(CMAKE_C_STANDARD 11)
set(CMAKE_CXX_STANDARD 17)
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
#-------------------------------------------------------------------------------
# Core project dependency
#-------------------------------------------------------------------------------
message(STATUS "Fetching core IREE repo (this may take a few minutes)...")
# Note: for log output, set -DFETCHCONTENT_QUIET=OFF,
# see https://gitlab.kitware.com/cmake/cmake/-/issues/18238#note_440475
include(FetchContent)
FetchContent_Declare(
iree
GIT_REPOSITORY https://github.com/nod-ai/shark-runtime.git
GIT_TAG shark
GIT_SUBMODULES_RECURSE OFF
GIT_SHALLOW OFF
GIT_PROGRESS ON
USES_TERMINAL_DOWNLOAD ON
)
# Extend module path to find MLIR CMake modules.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_BINARY_DIR}/lib/cmake/mlir")
# Disable core project features not needed for these out of tree samples.
set(IREE_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(IREE_BUILD_SAMPLES OFF CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(iree)
FetchContent_GetProperties(iree SOURCE_DIR IREE_SOURCE_DIR)
#-------------------------------------------------------------------------------
# Individual samples
#-------------------------------------------------------------------------------
add_subdirectory(vulkan_gui)

cpp/README.md (new file, 58 lines)

@@ -0,0 +1,58 @@
# SHARK C/C++ Samples
These C/C++ samples can be built using CMake. The samples depend on the main
SHARK-Runtime project's C/C++ sources, including both the runtime and the compiler.
Individual samples may require additional dependencies. Watch CMake's output
for information about which you are missing for individual samples.
On Windows we recommend using https://github.com/microsoft/vcpkg to download packages for
your system. The general setup flow looks like
*Install and activate SHARK*
```bash
source shark.venv/bin/activate #follow main repo instructions to setup your venv
```
*Install Dependencies*
```bash
vcpkg install [library] --triplet [your platform]
vcpkg integrate install
# Then pass `-DCMAKE_TOOLCHAIN_FILE=[check logs for path]` when configuring CMake
```
On Ubuntu Linux you can install:
```bash
sudo apt install libsdl2-dev
```
*Build*
```bash
cd cpp
cmake -GNinja -B build/
cmake --build build/
```
*Prepare the model*
```bash
wget https://storage.googleapis.com/shark_tank/latest/resnet50_tf/resnet50_tf.mlir
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvm-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 resnet50_tf.mlir -o resnet50_tf.vmfb
```
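The embedded `python3 -c '...'` fragment in the command above only locates the `iree-lld` linker shipped inside the installed compiler package. Expanded for readability (a sketch, not part of the documented command):

```python
# Locate the iree-lld binary bundled with the installed iree-compiler wheel.
import os
import sysconfig

purelib = sysconfig.get_paths()["purelib"]  # site-packages of the active venv
iree_lld = os.path.join(purelib, "iree", "compiler", "_mlir_libs", "iree-lld")
print(iree_lld)
```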
*Prepare the input*
```bash
python save_img.py
```
Note that this requires tensorflow, e.g.
```bash
python -m pip install tensorflow
```
*Run the vulkan_gui*
```bash
./build/vulkan_gui/iree-samples-vulkan-gui
```

cpp/dog_imagenet.jpg (new binary file, 26 KiB; not shown)

cpp/save_img.py (new file, 19 lines)

@@ -0,0 +1,19 @@
import numpy as np
import tensorflow as tf
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
def load_and_preprocess_image(fname: str):
image = tf.io.read_file(fname)
image = tf.image.decode_image(image, channels=3)
image = tf.image.resize(image, (224, 224))
image = image[tf.newaxis, :]
# preprocessing pipeline
input_tensor = tf.keras.applications.resnet50.preprocess_input(image)
return input_tensor
data = load_and_preprocess_image("dog_imagenet.jpg").numpy()
data.tofile("dog.bin")
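As a hedged sanity check (not part of the script above), the raw `dog.bin` it writes can be read back and reshaped to the 1x224x224x3 float32 layout the resnet50 sample expects:

```python
# Read the raw buffer back and confirm its shape/dtype; assumes float32, NHWC.
import numpy as np

data = np.fromfile("dog.bin", dtype=np.float32)
image = data.reshape(1, 224, 224, 3)
print(image.shape, image.dtype)  # (1, 224, 224, 3) float32
```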


@@ -0,0 +1,84 @@
# Copyright 2022 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
if(NOT IREE_TARGET_BACKEND_LLVM_CPU OR
NOT IREE_HAL_EXECUTABLE_LOADER_EMBEDDED_ELF)
message(STATUS "Missing LLVM backend and/or embeddded elf loader, skipping vision_inference sample")
return()
endif()
# vcpkg install stb
# tested with version 2021-09-10
find_package(Stb)
if(NOT Stb_FOUND)
message(STATUS "Could not find Stb, skipping vision inference sample")
return()
endif()
# Compile mnist.mlir to mnist.vmfb.
set(_COMPILE_TOOL_EXECUTABLE $<TARGET_FILE:iree-compile>)
set(_COMPILE_ARGS)
list(APPEND _COMPILE_ARGS "--iree-input-type=mhlo")
list(APPEND _COMPILE_ARGS "--iree-hal-target-backends=llvm-cpu")
list(APPEND _COMPILE_ARGS "${IREE_SOURCE_DIR}/samples/models/mnist.mlir")
list(APPEND _COMPILE_ARGS "-o")
list(APPEND _COMPILE_ARGS "mnist.vmfb")
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/mnist.vmfb
COMMAND ${_COMPILE_TOOL_EXECUTABLE} ${_COMPILE_ARGS}
DEPENDS ${_COMPILE_TOOL_EXECUTABLE} "${IREE_SOURCE_DIR}/samples/models/mnist.mlir"
)
# Embed mnist.vmfb into a C file as mnist_bytecode_module_c.[h/c]
set(_EMBED_DATA_EXECUTABLE $<TARGET_FILE:generate_embed_data>)
set(_EMBED_ARGS)
list(APPEND _EMBED_ARGS "--output_header=mnist_bytecode_module_c.h")
list(APPEND _EMBED_ARGS "--output_impl=mnist_bytecode_module_c.c")
list(APPEND _EMBED_ARGS "--identifier=iree_samples_vision_inference_mnist_bytecode_module")
list(APPEND _EMBED_ARGS "--flatten")
list(APPEND _EMBED_ARGS "${CMAKE_CURRENT_BINARY_DIR}/mnist.vmfb")
add_custom_command(
OUTPUT "mnist_bytecode_module_c.h" "mnist_bytecode_module_c.c"
COMMAND ${_EMBED_DATA_EXECUTABLE} ${_EMBED_ARGS}
DEPENDS ${_EMBED_DATA_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/mnist.vmfb
)
# Define a library target for mnist_bytecode_module_c.
add_library(iree_samples_vision_inference_mnist_bytecode_module_c OBJECT)
target_sources(iree_samples_vision_inference_mnist_bytecode_module_c
PRIVATE
mnist_bytecode_module_c.h
mnist_bytecode_module_c.c
)
# Define the sample executable.
set(_NAME "iree-run-mnist-module")
add_executable(${_NAME} "")
target_sources(${_NAME}
PRIVATE
"image_util.h"
"image_util.c"
"iree-run-mnist-module.c"
)
set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "iree-run-mnist-module")
target_include_directories(${_NAME} PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
)
target_include_directories(${_NAME} PRIVATE
${Stb_INCLUDE_DIR}
)
target_link_libraries(${_NAME}
iree_base_base
iree_base_tracing
iree_hal_hal
iree_runtime_runtime
iree_samples_vision_inference_mnist_bytecode_module_c
)
# Define a target that copies the test image into the build directory.
add_custom_target(iree_samples_vision_inference_test_image
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/mnist_test.png" "${CMAKE_CURRENT_BINARY_DIR}/mnist_test.png")
add_dependencies(${_NAME} iree_samples_vision_inference_test_image)
message(STATUS "Configured vision_inference sample successfully")


@@ -0,0 +1,8 @@
# Vision Inference Sample (C code)
This sample demonstrates how to run a MNIST handwritten digit detection vision
model on an image using IREE's C API.
A similar sample is implemented using a Python script and IREE's command line
tools over in the primary iree repository at
https://github.com/iree-org/iree/tree/main/samples/vision_inference


@@ -0,0 +1,224 @@
// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "image_util.h"
#include <math.h>
#include "iree/base/internal/flags.h"
#include "iree/base/tracing.h"
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
iree_status_t iree_tools_utils_pixel_rescaled_to_buffer(
const uint8_t* pixel_data, iree_host_size_t buffer_length,
const float* input_range, iree_host_size_t range_length,
float* out_buffer) {
IREE_TRACE_ZONE_BEGIN(z0);
if (range_length != 2) {
IREE_TRACE_ZONE_END(z0);
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"range defined as 2-element [min, max] array.");
}
float input_scale = fabsf(input_range[1] - input_range[0]) / 2.0f;
float input_offset = (input_range[0] + input_range[1]) / 2.0f;
const float kUint8Mean = 127.5f;
for (int i = 0; i < buffer_length; ++i) {
out_buffer[i] =
(((float)(pixel_data[i])) - kUint8Mean) / kUint8Mean * input_scale +
input_offset;
}
IREE_TRACE_ZONE_END(z0);
return iree_ok_status();
}
iree_status_t iree_tools_utils_load_pixel_data_impl(
const iree_string_view_t filename, const iree_hal_dim_t* shape,
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
uint8_t** out_pixel_data, iree_host_size_t* out_buffer_length) {
int img_dims[3];
if (stbi_info(filename.data, img_dims, &(img_dims[1]), &(img_dims[2])) == 0) {
return iree_make_status(IREE_STATUS_NOT_FOUND, "can't load image %.*s",
(int)filename.size, filename.data);
}
if (!(element_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32 ||
element_type == IREE_HAL_ELEMENT_TYPE_SINT_8 ||
element_type == IREE_HAL_ELEMENT_TYPE_UINT_8)) {
char element_type_str[16];
IREE_RETURN_IF_ERROR(iree_hal_format_element_type(
element_type, sizeof(element_type_str), element_type_str, NULL));
return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
"element type %s not supported", element_type_str);
}
switch (shape_rank) {
case 2: { // Assume tensor <height x width>
if (img_dims[2] != 1 || (shape[0] != img_dims[1]) ||
(shape[1] != img_dims[0])) {
return iree_make_status(
IREE_STATUS_INVALID_ARGUMENT,
"image size: %dx%dx%d, expected: %" PRIdim "x%" PRIdim, img_dims[0],
img_dims[1], img_dims[2], shape[1], shape[0]);
}
break;
}
case 3: { // Assume tensor <height x width x channel>
if (shape[0] != img_dims[1] || shape[1] != img_dims[0] ||
shape[2] != img_dims[2]) {
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"image size: %dx%dx%d, expected: %" PRIdim
"x%" PRIdim "x%" PRIdim,
img_dims[0], img_dims[1], img_dims[2], shape[1],
shape[0], shape[2]);
}
break;
}
case 4: { // Assume tensor <batch x height x width x channel>
if (shape[1] != img_dims[1] || shape[2] != img_dims[0] ||
shape[3] != img_dims[2]) {
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"image size: %dx%dx%d, expected: %" PRIdim
"x%" PRIdim "x%" PRIdim,
img_dims[0], img_dims[1], img_dims[2], shape[2],
shape[1], shape[3]);
}
break;
}
default:
return iree_make_status(
IREE_STATUS_INVALID_ARGUMENT,
"Input buffer shape rank %" PRIhsz " not supported", shape_rank);
}
// Drop the alpha channel if present.
int req_ch = (img_dims[2] >= 3) ? 3 : 0;
*out_pixel_data = stbi_load(filename.data, img_dims, &(img_dims[1]),
&(img_dims[2]), req_ch);
if (*out_pixel_data == NULL) {
return iree_make_status(IREE_STATUS_NOT_FOUND, "can't load image %.*s",
(int)filename.size, filename.data);
}
*out_buffer_length =
img_dims[0] * img_dims[1] * (img_dims[2] > 3 ? 3 : img_dims[2]);
return iree_ok_status();
}
iree_status_t iree_tools_utils_load_pixel_data(
const iree_string_view_t filename, const iree_hal_dim_t* shape,
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
uint8_t** out_pixel_data, iree_host_size_t* out_buffer_length) {
IREE_TRACE_ZONE_BEGIN(z0);
iree_status_t result = iree_tools_utils_load_pixel_data_impl(
filename, shape, shape_rank, element_type, out_pixel_data,
out_buffer_length);
IREE_TRACE_ZONE_END(z0);
return result;
}
iree_status_t iree_tools_utils_buffer_view_from_image(
const iree_string_view_t filename, const iree_hal_dim_t* shape,
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
iree_hal_allocator_t* allocator, iree_hal_buffer_view_t** out_buffer_view) {
IREE_TRACE_ZONE_BEGIN(z0);
*out_buffer_view = NULL;
if (element_type != IREE_HAL_ELEMENT_TYPE_SINT_8 &&
element_type != IREE_HAL_ELEMENT_TYPE_UINT_8) {
IREE_TRACE_ZONE_END(z0);
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"element type should be i8 or u8");
}
iree_status_t result;
uint8_t* pixel_data = NULL;
iree_host_size_t buffer_length;
result = iree_tools_utils_load_pixel_data(
filename, shape, shape_rank, element_type, &pixel_data, &buffer_length);
if (iree_status_is_ok(result)) {
iree_host_size_t element_byte =
iree_hal_element_dense_byte_count(element_type);
// SINT_8 and UINT_8 perform direct buffer wrap.
result = iree_hal_buffer_view_allocate_buffer(
allocator, shape_rank, shape, element_type,
IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR,
(iree_hal_buffer_params_t){
.type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
.access = IREE_HAL_MEMORY_ACCESS_READ,
.usage = IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE |
IREE_HAL_BUFFER_USAGE_TRANSFER,
},
iree_make_const_byte_span(pixel_data, element_byte * buffer_length),
out_buffer_view);
}
stbi_image_free(pixel_data);
IREE_TRACE_ZONE_END(z0);
return result;
}
typedef struct iree_tools_utils_buffer_view_load_params_t {
const uint8_t* pixel_data;
iree_host_size_t pixel_data_length;
const float* input_range;
iree_host_size_t input_range_length;
} iree_tools_utils_buffer_view_load_params_t;
static iree_status_t iree_tools_utils_buffer_view_load_image_rescaled(
iree_hal_buffer_mapping_t* mapping, void* user_data) {
iree_tools_utils_buffer_view_load_params_t* params =
(iree_tools_utils_buffer_view_load_params_t*)user_data;
return iree_tools_utils_pixel_rescaled_to_buffer(
params->pixel_data, params->pixel_data_length, params->input_range,
params->input_range_length, (float*)mapping->contents.data);
}
iree_status_t iree_tools_utils_buffer_view_from_image_rescaled(
const iree_string_view_t filename, const iree_hal_dim_t* shape,
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
iree_hal_allocator_t* allocator, const float* input_range,
iree_host_size_t input_range_length,
iree_hal_buffer_view_t** out_buffer_view) {
IREE_TRACE_ZONE_BEGIN(z0);
*out_buffer_view = NULL;
if (element_type != IREE_HAL_ELEMENT_TYPE_FLOAT_32) {
IREE_TRACE_ZONE_END(z0);
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"element type should be f32");
}
// Classic row-major image layout.
iree_hal_encoding_type_t encoding_type =
IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR;
// Load pixel data from the file into a new host memory allocation (the only
// interface stb_image provides). A real application would want to use the
// generation callback to directly decode the image into the target mapped
// device buffer.
uint8_t* pixel_data = NULL;
iree_host_size_t buffer_length = 0;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_tools_utils_load_pixel_data(filename, shape, shape_rank,
element_type, &pixel_data,
&buffer_length));
iree_tools_utils_buffer_view_load_params_t params = {
.pixel_data = pixel_data,
.pixel_data_length = buffer_length,
.input_range = input_range,
.input_range_length = input_range_length,
};
iree_status_t status = iree_hal_buffer_view_generate_buffer(
allocator, shape_rank, shape, element_type, encoding_type,
(iree_hal_buffer_params_t){
.type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL |
IREE_HAL_MEMORY_TYPE_HOST_VISIBLE,
.usage = IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE |
IREE_HAL_BUFFER_USAGE_TRANSFER |
IREE_HAL_BUFFER_USAGE_MAPPING,
},
iree_tools_utils_buffer_view_load_image_rescaled, &params,
out_buffer_view);
stbi_image_free(pixel_data);
IREE_TRACE_ZONE_END(z0);
return status;
}


@@ -0,0 +1,77 @@
// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#ifndef IREE_SAMPLES_VISION_INFERENCE_IMAGE_UTIL_H_
#define IREE_SAMPLES_VISION_INFERENCE_IMAGE_UTIL_H_
#include "iree/base/api.h"
#include "iree/hal/api.h"
#include "iree/hal/buffer_view.h"
#if __cplusplus
extern "C" {
#endif // __cplusplus
// Loads the image at |filename| into |out_pixel_data| and sets
// |out_buffer_length| to its length.
//
// The image dimension must match the width, height, and channel in |shape|,
// while 2 <= |shape_rank| <= 4 to match the image tensor format.
//
// The file must be in a format supported by stb_image.h.
// The returned |out_pixel_data| buffer must be released by the caller.
iree_status_t iree_tools_utils_load_pixel_data(
const iree_string_view_t filename, const iree_hal_dim_t* shape,
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
uint8_t** out_pixel_data, iree_host_size_t* out_buffer_length);
// Parse the content in an image file in |filename| into a HAL buffer view
// |out_buffer_view|. |out_buffer_view| properties are defined by |shape|,
// |shape_rank|, and |element_type|, while being allocated by |allocator|.
//
// The |element_type| has to be SINT_8 or UINT_8. For FLOAT_32, use
// |iree_tools_utils_buffer_view_from_image_rescaled| instead.
//
// The returned |out_buffer_view| must be released by the caller.
iree_status_t iree_tools_utils_buffer_view_from_image(
const iree_string_view_t filename, const iree_hal_dim_t* shape,
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
iree_hal_allocator_t* allocator, iree_hal_buffer_view_t** out_buffer_view);
// Parses the image file at |filename| into a HAL buffer view
// |out_buffer_view|. |out_buffer_view| properties are defined by |shape|,
// |shape_rank|, and |element_type|, while being allocated by |allocator|.
// The value in |out_buffer_view| is rescaled with |input_range|.
//
// The |element_type| has to be FLOAT_32. For SINT_8 or UINT_8, use
// |iree_tools_utils_buffer_view_from_image| instead.
//
// The returned |out_buffer_view| must be released by the caller.
iree_status_t iree_tools_utils_buffer_view_from_image_rescaled(
const iree_string_view_t filename, const iree_hal_dim_t* shape,
iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
iree_hal_allocator_t* allocator, const float* input_range,
iree_host_size_t input_range_length,
iree_hal_buffer_view_t** out_buffer_view);
// Normalizes the uint8_t |pixel_data| of |pixel_count| elements into the float
// buffer |out_buffer|, rescaled to the range |input_range|:
//
// float32_x = (uint8_x - 127.5) / 127.5 * input_scale + input_offset, where
// input_scale = abs(input_range[0] - input_range[1]) / 2
// input_offset = (input_range[0] + input_range[1]) / 2
//
// |out_buffer| needs to be allocated before the call.
iree_status_t iree_tools_utils_pixel_rescaled_to_buffer(
const uint8_t* pixel_data, iree_host_size_t pixel_count,
const float* input_range, iree_host_size_t input_range_length,
float* out_buffer);
#if __cplusplus
}
#endif // __cplusplus
#endif // IREE_SAMPLES_VISION_INFERENCE_IMAGE_UTIL_H_
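
For reference, here is a minimal Python sketch of the rescaling formula documented above. It is illustrative only and not part of the sample; the helper name rescale_pixels is made up, and the [0.0, 1.0] range reduces the formula to a plain division by 255.

import numpy as np

def rescale_pixels(pixel_data, input_range):
    # Mirrors the formula documented for iree_tools_utils_pixel_rescaled_to_buffer.
    lo, hi = input_range
    input_scale = abs(lo - hi) / 2.0
    input_offset = (lo + hi) / 2.0
    pixels = np.asarray(pixel_data, dtype=np.float32)
    return (pixels - 127.5) / 127.5 * input_scale + input_offset

# With input_range = [0.0, 1.0], scale = offset = 0.5, so the result is pixel / 255.
print(rescale_pixels([0, 51, 255], [0.0, 1.0]))  # [0.  0.2 1. ]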


@@ -0,0 +1,121 @@
// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This sample uses image_util to load a hand-written image as an
// iree_hal_buffer_view_t then passes it to the bytecode module built from
// mnist.mlir on the CPU backend with the local-task driver.
#include <float.h>
#include "image_util.h"
#include "iree/runtime/api.h"
#include "mnist_bytecode_module_c.h"
iree_status_t Run(const iree_string_view_t image_path) {
iree_runtime_instance_options_t instance_options;
iree_runtime_instance_options_initialize(IREE_API_VERSION_LATEST,
&instance_options);
iree_runtime_instance_options_use_all_available_drivers(&instance_options);
iree_runtime_instance_t* instance = NULL;
IREE_RETURN_IF_ERROR(iree_runtime_instance_create(
&instance_options, iree_allocator_system(), &instance));
// TODO(#5724): move device selection into the compiled modules.
iree_hal_device_t* device = NULL;
IREE_RETURN_IF_ERROR(iree_runtime_instance_try_create_default_device(
instance, iree_make_cstring_view("local-task"), &device));
// Create one session per loaded module to hold the module state.
iree_runtime_session_options_t session_options;
iree_runtime_session_options_initialize(&session_options);
iree_runtime_session_t* session = NULL;
IREE_RETURN_IF_ERROR(iree_runtime_session_create_with_device(
instance, &session_options, device,
iree_runtime_instance_host_allocator(instance), &session));
iree_hal_device_release(device);
const struct iree_file_toc_t* module_file =
iree_samples_vision_inference_mnist_bytecode_module_create();
IREE_RETURN_IF_ERROR(iree_runtime_session_append_bytecode_module_from_memory(
session, iree_make_const_byte_span(module_file->data, module_file->size),
iree_allocator_null()));
iree_runtime_call_t call;
IREE_RETURN_IF_ERROR(iree_runtime_call_initialize_by_name(
session, iree_make_cstring_view("module.predict"), &call));
// Prepare the input hal buffer view with image_util library.
// The input of the mnist model is a single 28x28 pixel image as a
// tensor<1x28x28x1xf32>, with pixels in [0.0, 1.0].
iree_hal_buffer_view_t* buffer_view = NULL;
iree_hal_dim_t buffer_shape[] = {1, 28, 28, 1};
iree_hal_element_type_t hal_element_type = IREE_HAL_ELEMENT_TYPE_FLOAT_32;
float input_range[2] = {0.0f, 1.0f};
IREE_RETURN_IF_ERROR(
iree_tools_utils_buffer_view_from_image_rescaled(
image_path, buffer_shape, IREE_ARRAYSIZE(buffer_shape),
hal_element_type, iree_hal_device_allocator(device), input_range,
IREE_ARRAYSIZE(input_range), &buffer_view),
"load image");
IREE_RETURN_IF_ERROR(
iree_runtime_call_inputs_push_back_buffer_view(&call, buffer_view));
iree_hal_buffer_view_release(buffer_view);
IREE_RETURN_IF_ERROR(iree_runtime_call_invoke(&call, /*flags=*/0));
// Get the result buffers from the invocation.
iree_hal_buffer_view_t* ret_buffer_view = NULL;
IREE_RETURN_IF_ERROR(
iree_runtime_call_outputs_pop_front_buffer_view(&call, &ret_buffer_view));
// Read back the results. The output of the mnist model is a 1x10 tensor of
// prediction confidence values, one for each digit in [0, 9].
float predictions[1 * 10] = {0.0f};
IREE_RETURN_IF_ERROR(iree_hal_device_transfer_d2h(
iree_runtime_session_device(session),
iree_hal_buffer_view_buffer(ret_buffer_view), 0, predictions,
sizeof(predictions), IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT,
iree_infinite_timeout()));
iree_hal_buffer_view_release(ret_buffer_view);
// Get the highest index from the output.
// Use -FLT_MAX rather than FLT_MIN (the smallest positive float) so that
// negative confidence values compare correctly.
float result_val = -FLT_MAX;
int result_idx = 0;
for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(predictions); ++i) {
if (predictions[i] > result_val) {
result_val = predictions[i];
result_idx = i;
}
}
fprintf(stdout, "Detected number: %d\n", result_idx);
iree_runtime_call_deinitialize(&call);
iree_runtime_session_release(session);
iree_runtime_instance_release(instance);
return iree_ok_status();
}
int main(int argc, char** argv) {
if (argc > 2) {
fprintf(stderr, "Usage: iree-run-mnist-module <image file>\n");
return -1;
}
iree_string_view_t image_path;
if (argc == 1) {
image_path = iree_make_cstring_view("mnist_test.png");
} else {
image_path = iree_make_cstring_view(argv[1]);
}
iree_status_t result = Run(image_path);
if (!iree_status_is_ok(result)) {
iree_status_fprint(stderr, result);
iree_status_ignore(result);
return -1;
}
iree_status_ignore(result);
return 0;
}
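
For comparison, a rough Python equivalent of the preprocessing that image_util performs for this model: a 28x28 single-channel image rescaled into [0.0, 1.0] and shaped as tensor<1x28x28x1xf32>. This is only a sketch; Pillow is used as a stand-in loader, the file name is the sample's default, and the C utility checks the dimensions rather than resizing.

import numpy as np
from PIL import Image

# Load the image as grayscale; the C sample requires it to already be 28x28.
img = Image.open("mnist_test.png").convert("L")
batch = (np.asarray(img, dtype=np.float32) / 255.0).reshape(1, 28, 28, 1)
print(batch.shape, float(batch.min()), float(batch.max()))  # (1, 28, 28, 1), values in [0.0, 1.0]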

[Binary image file added: 261 B]

@@ -0,0 +1,84 @@
# Copyright 2022 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
if(NOT IREE_TARGET_BACKEND_VULKAN_SPIRV OR
NOT IREE_HAL_DRIVER_VULKAN)
message(STATUS "Missing Vulkan backend and/or driver, skipping vulkan_gui sample")
return()
endif()
# This target statically links against Vulkan.
# One way to achieve this is by installing the Vulkan SDK from
# https://vulkan.lunarg.com/.
include(FindVulkan)
if(NOT Vulkan_FOUND)
message(STATUS "Could not find Vulkan, skipping vulkan_gui sample")
return()
endif()
# vcpkg install sdl2[vulkan]
# tested with versions 2.0.14#4 - 2.0.22#1
find_package(SDL2)
if(NOT SDL2_FOUND)
message(STATUS "Could not find SDL2, skipping vulkan_gui sample")
return()
endif()
FetchContent_Declare(
imgui
GIT_REPOSITORY https://github.com/ocornut/imgui
GIT_TAG master
)
FetchContent_MakeAvailable(imgui)
# Dear ImGui
set(IMGUI_DIR ${CMAKE_BINARY_DIR}/_deps/imgui-src)
message("Looking for Imgui in ${IMGUI_DIR}")
include_directories(${IMGUI_DIR} ${IMGUI_DIR}/backends ..)
# Define the sample executable.
set(_NAME "iree-samples-vulkan-gui")
add_executable(${_NAME} "")
target_sources(${_NAME}
PRIVATE
vulkan_inference_gui.cc
"${IMGUI_DIR}/backends/imgui_impl_sdl.cpp"
"${IMGUI_DIR}/backends/imgui_impl_vulkan.cpp"
"${IMGUI_DIR}/imgui.cpp"
"${IMGUI_DIR}/imgui_draw.cpp"
"${IMGUI_DIR}/imgui_demo.cpp"
"${IMGUI_DIR}/imgui_tables.cpp"
"${IMGUI_DIR}/imgui_widgets.cpp"
)
set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "iree-samples-vulkan-gui")
target_include_directories(${_NAME} PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
)
target_link_libraries(${_NAME}
SDL2::SDL2
Vulkan::Vulkan
iree_runtime_runtime
iree_base_internal_main
iree_hal_drivers_vulkan_registration_registration
iree_modules_hal_hal
iree_vm_vm
iree_vm_bytecode_module
iree_vm_cc
)
if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(_GUI_LINKOPTS "-SUBSYSTEM:CONSOLE")
else()
set(_GUI_LINKOPTS "")
endif()
target_link_options(${_NAME}
PRIVATE
${_GUI_LINKOPTS}
)
message(STATUS "Configured vulkan_gui sample successfully")


@@ -0,0 +1,4 @@
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = "arith.mulf"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
return %0 : tensor<4xf32>
}
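
As a hypothetical aside, a module like simple_mul above can also be exercised from Python with the SharkInference pattern used throughout this diff. The sketch below is an assumption, not part of the sample: whether SharkInference accepts this arith-level module under mlir_dialect="linalg" has not been verified here.

import numpy as np
from shark.shark_inference import SharkInference

# Hypothetical sketch: load the textual module and run it through SHARK.
with open("simple_mul.mlir") as f:
    mlir_model = f.read()

shark_module = SharkInference(
    mlir_model, "simple_mul", device="cpu", mlir_dialect="linalg"
)
shark_module.compile()
arg0 = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
arg1 = np.array([10.0, 20.0, 30.0, 40.0], dtype=np.float32)
# Element-wise product: expected [10., 40., 90., 160.].
print(shark_module.forward((arg0, arg1)))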

[Binary image file added: 14 KiB]

cpp/vulkan_gui/stb_image.h (new file, 7897 lines): diff suppressed because it is too large

[Another file diff suppressed because it is too large]

@@ -2,20 +2,23 @@
"""SHARK Tank"""
# python generate_sharktank.py: pass a csv file with [model_name, model_download_url];
# it will generate a local shark tank folder like this:
# /SHARK
# /gen_shark_tank
# /albert_lite_base
# /...model_name...
# HOME
# /.local
# /shark_tank
# /albert_lite_base
# /...model_name...
#
import os
import csv
import argparse
from shark.shark_importer import SharkImporter
from shark.parser import shark_args
import tensorflow as tf
import subprocess as sp
import hashlib
import numpy as np
from pathlib import Path
visible_default = tf.config.list_physical_devices("GPU")
try:
@@ -27,9 +30,6 @@ except:
# Invalid device or cannot modify virtual devices once initialized.
pass
# All generated models and metadata will be saved under this directory.
WORKDIR = os.path.join(os.path.dirname(__file__), "gen_shark_tank")
def create_hash(file_name):
with open(file_name, "rb") as f:
@@ -43,6 +43,7 @@ def create_hash(file_name):
def save_torch_model(torch_model_list):
from tank.model_utils import get_hf_model
from tank.model_utils import get_vision_model
from tank.model_utils import get_hf_img_cls_model
with open(torch_model_list) as csvfile:
torch_reader = csv.reader(csvfile, delimiter=",")
@@ -51,8 +52,10 @@ def save_torch_model(torch_model_list):
torch_model_name = row[0]
tracing_required = row[1]
model_type = row[2]
is_dynamic = row[3]
tracing_required = False if tracing_required == "False" else True
is_dynamic = False if is_dynamic == "False" else True
model = None
input = None
@@ -60,6 +63,8 @@ def save_torch_model(torch_model_list):
model, input, _ = get_vision_model(torch_model_name)
elif model_type == "hf":
model, input, _ = get_hf_model(torch_model_name)
elif model_type == "hf_img_cls":
model, input, _ = get_hf_img_cls_model(torch_model_name)
torch_model_name = torch_model_name.replace("/", "_")
torch_model_dir = os.path.join(
@@ -85,12 +90,13 @@ def save_torch_model(torch_model_list):
)
np.save(os.path.join(torch_model_dir, "hash"), np.array(mlir_hash))
# Generate torch dynamic models.
mlir_importer.import_debug(
is_dynamic=True,
tracing_required=tracing_required,
dir=torch_model_dir,
model_name=torch_model_name + "_dynamic",
)
if is_dynamic:
mlir_importer.import_debug(
is_dynamic=True,
tracing_required=tracing_required,
dir=torch_model_dir,
model_name=torch_model_name + "_dynamic",
)
def save_tf_model(tf_model_list):
@@ -215,9 +221,21 @@ if __name__ == "__main__":
default="./tank/tflite/tflite_model_list.csv",
help="Contains the file with tf model name and args.",
)
parser.add_argument(
"--ci_tank_dir",
type=bool,
default=False,
)
parser.add_argument("--upload", type=bool, default=False)
args = parser.parse_args()
home = str(Path.home())
if args.ci_tank_dir == True:
WORKDIR = os.path.join(os.path.dirname(__file__), "gen_shark_tank")
else:
WORKDIR = os.path.join(home, ".local/shark_tank/")
if args.torch_model_csv:
save_torch_model(args.torch_model_csv)
@@ -230,6 +248,4 @@ if __name__ == "__main__":
if args.upload:
git_hash = sp.getoutput("git log -1 --format='%h'") + "/"
print("uploading files to gs://shark_tank/" + git_hash)
os.system(
"gsutil cp -r ./gen_shark_tank/* gs://shark_tank/" + git_hash
)
os.system(f"gsutil cp -r {WORKDIR}* gs://shark_tank/" + git_hash)


@@ -1,109 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.shark_downloader import download_torch_model
from shark.parser import shark_args
import torch
import unittest
import numpy as np
import pytest
class BertBaseUncasedModuleTester:
def __init__(
self,
benchmark=False,
onnx_bench=False,
):
self.benchmark = benchmark
self.onnx_bench = onnx_bench
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"bert-base-uncased", dynamic
)
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert True == compare_tensors(act_out, results)
if self.benchmark == True:
shark_args.onnx_bench = self.onnx_bench
shark_module.shark_runner.benchmark_all_csv(
(input),
"bert-base-uncased",
dynamic,
device,
"torch",
)
class BertBaseUncasedModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = BertBaseUncasedModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
self.module_tester.onnx_bench = pytestconfig.getoption("onnx_bench")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,71 +0,0 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class DistilBertModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"distilbert-base-uncased"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class DistilBertModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = DistilBertModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
@pytest.mark.xfail(reason="shark_tank hash issues -- awaiting triage")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="shark_tank hash issues -- awaiting triage")
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="shark_tank hash issues -- awaiting triage")
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,95 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.parser import shark_args
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class DistilBertModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"distilbert-base-uncased", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert True == compare_tensors(act_out, results)
if self.benchmark == True:
shark_module.shark_runner.benchmark_all_csv(
(input),
"distilbert-base-uncased",
dynamic,
device,
"torch",
)
class DistilBertModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = DistilBertModuleTester(self)
self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,114 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class MobileNetV3ModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"mobilenet_v3_small", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
np.testing.assert_allclose(act_out, results, rtol=1e-02, atol=1e-03)
if self.benchmark == True:
shark_module.shark_runner.benchmark_all_csv(
(input),
"alexnet",
dynamic,
device,
"torch",
)
class MobileNetV3ModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = MobileNetV3ModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="golden results don't match.")
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="golden results don't match.")
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="stuck in the pipeline.")
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,114 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class Resnet101ModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"resnet101", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert True == compare_tensors(act_out, results)
if self.benchmark == True:
shark_module.shark_runner.benchmark_all_csv(
(input),
"resnet101",
dynamic,
device,
"torch",
)
class Resnet101ModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = Resnet101ModuleTester(self)
self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,114 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import get_vision_model, compare_tensors
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class Resnet50ModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"resnet50", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert True == compare_tensors(act_out, results)
if self.benchmark == True:
shark_module.shark_runner.benchmark_all_csv(
(input),
"resnet50",
dynamic,
device,
"torch",
)
class Resnet50ModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = Resnet50ModuleTester(self)
self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,91 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class UnetModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"unet", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
np.testing.assert_allclose(act_out, results, rtol=1e-02, atol=1e-03)
if self.benchmark == True:
shark_module.shark_runner.benchmark_all_csv(
(input),
"unet",
dynamic,
device,
"torch",
)
class UnetModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = UnetModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -19,13 +19,16 @@ tensorflow-macos
tensorflow-metal
#tf-models-nightly
#tensorflow-text-nightly
transformers==4.18.0
transformers
tensorflow-probability
#jax[cpu]
# tflitehub dependencies.
Pillow
# web dependencies.
gradio
# Testing and support.
#lit
#pyyaml


@@ -17,7 +17,8 @@ gin-config
tensorflow
#tf-models-nightly
#tensorflow-text-nightly
transformers==4.18.0
transformers
diffusers
#tensorflow-probability
#jax[cpu]
@@ -28,6 +29,12 @@ Pillow
# Testing and support.
lit
pyyaml
python-dateutil
sacremoses
# web dependencies.
gradio
scipy
#ONNX and ORT for benchmarking
#--extra-index-url https://test.pypi.org/simple/


@@ -11,3 +11,4 @@ gsutil
pytest
pytest-xdist
Pillow
parameterized


@@ -7,6 +7,12 @@ with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()
PACKAGE_VERSION = os.environ.get("SHARK_PACKAGE_VERSION") or "0.0.4"
backend_deps = []
if "NO_BACKEND" in os.environ.keys():
backend_deps = [
"iree-compiler>=20220427.13",
"iree-runtime>=20220427.13",
]
setup(
name="nodai-SHARK",
@@ -32,7 +38,6 @@ setup(
"numpy",
"PyYAML",
"torch-mlir>=20220428.420",
"iree-compiler>=20220427.13",
"iree-runtime>=20220427.13",
],
]
+ backend_deps,
)


@@ -7,6 +7,8 @@
# VENV_DIR=myshark.venv #create a venv called myshark.venv
# USE_IREE=1 #use stock IREE instead of Nod.ai's SHARK build
# IMPORTER=1 #Install importer deps
# BENCHMARK=1 #Install benchmark deps
# NO_BACKEND=1 #Don't install iree or shark backend
# if you run the script from a conda env it will install in your conda env
TD="$(cd $(dirname $0) && pwd)"
@@ -74,7 +76,7 @@ fi
$PYTHON -m pip install --upgrade pip || die "Could not upgrade pip"
$PYTHON -m pip install --upgrade -r "$TD/requirements.txt"
if [ "$torch_mlir_bin" = true ]; then
$PYTHON -m pip install --find-links https://github.com/llvm/torch-mlir/releases torch-mlir --extra-index-url https://download.pytorch.org/whl/nightly/cpu
$PYTHON -m pip install --pre torch-mlir -f https://llvm.github.io/torch-mlir/package-index/
if [ $? -eq 0 ];then
echo "Successfully Installed torch-mlir"
else
@@ -91,14 +93,17 @@ if [[ -z "${USE_IREE}" ]]; then
else
RUNTIME="google/iree"
fi
echo "Installing ${RUNTIME}..."
$PYTHON -m pip install --find-links https://github.com/${RUNTIME}/releases iree-compiler iree-runtime
if [[ -z "${NO_BACKEND}" ]]; then
echo "Installing ${RUNTIME}..."
$PYTHON -m pip install --find-links https://github.com/${RUNTIME}/releases iree-compiler iree-runtime
else
echo "Not installing a backend, please make sure to add your backend to PYTHONPATH"
fi
if [[ ! -z "${IMPORTER}" ]]; then
echo "${Yellow}Installing importer tools.."
if [[ $(uname -s) = 'Linux' ]]; then
echo "${Yellow}Linux detected.. installing Linux importer tools"
$PYTHON -m pip install --upgrade -r "$TD/requirements-importer.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://test.pypi.org/simple/ --extra-index-url https://download.pytorch.org/whl/nightly/cu116
$PYTHON -m pip install --upgrade -r "$TD/requirements-importer.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
elif [[ $(uname -s) = 'Darwin' ]]; then
echo "${Yellow}macOS detected.. installing macOS importer tools"
#Conda seems to have some problems installing these packages and hope they get resolved upstream.
@@ -106,9 +111,9 @@ if [[ ! -z "${IMPORTER}" ]]; then
fi
fi
$PYTHON -m pip install -e . --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://github.com/llvm/torch-mlir/releases -f https://github.com/${RUNTIME}/releases
$PYTHON -m pip install -e . -f https://llvm.github.io/torch-mlir/package-index/ -f https://github.com/${RUNTIME}/releases
if [[ $(uname -s) = 'Linux' && ! -z "${IMPORTER}" ]]; then
if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
$PYTHON -m pip uninstall -y torch torchvision
$PYTHON -m pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116
if [ $? -eq 0 ];then


@@ -0,0 +1,70 @@
import torchdynamo
import torch
import torch_mlir
from shark.sharkdynamo.utils import make_shark_compiler
import warnings, logging
warnings.simplefilter("ignore")
torchdynamo.config.log_level = logging.ERROR
torchdynamo.reset()
@torchdynamo.optimize(
make_shark_compiler(use_tracing=False, device="cuda", verbose=False)
)
def foo(t):
return 2 * t
example_input = torch.rand((2, 3))
x = foo(example_input)
print(x)
torchdynamo.reset()
@torchdynamo.optimize(
make_shark_compiler(use_tracing=False, device="cuda", verbose=False)
)
def foo(a, b):
x = a / (a + 1)
if b.sum() < 0:
b = b * -1
return x * b
print(foo(torch.rand((2, 3)), -torch.rand((2, 3))))
torchdynamo.reset()
@torchdynamo.optimize(
make_shark_compiler(use_tracing=False, device="cuda", verbose=True)
)
def foo(a):
for i in range(10):
a += 1.0
return a
print(foo(torch.rand((1, 2))))
torchdynamo.reset()
@torchdynamo.optimize(
make_shark_compiler(use_tracing=False, device="cuda", verbose=True)
)
def test_unsupported_types(t, y):
return t, 2 * y
str_input = "hello"
tensor_input = torch.randn(2)
print(test_unsupported_types(str_input, tensor_input))


@@ -0,0 +1,73 @@
import torch
import numpy as np
model = torch.hub.load(
"pytorch/vision:v0.10.0", "squeezenet1_0", pretrained=True
)
model.eval()
# from PIL import Image
# from torchvision import transforms
# import urllib
#
# url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg")
# try: urllib.URLopener().retrieve(url, filename)
# except: urllib.request.urlretrieve(url, filename)
#
#
# input_image = Image.open(filename)
# preprocess = transforms.Compose([
# transforms.Resize(256),
# transforms.CenterCrop(224),
# transforms.ToTensor(),
# transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
# ])
# input_tensor = preprocess(input_image)
# input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model
# print(input_batch.shape) # size = [1, 3, 224, 224]
# The above is code for generating sample inputs from an image. We can just use
# random values for accuracy testing though
input_batch = torch.randn(1, 3, 224, 224)
# Focus on CPU for now
if False and torch.cuda.is_available():
input_batch = input_batch.to("cuda")
model.to("cuda")
with torch.no_grad():
output = model(input_batch)
# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes
golden_confidences = output[0]
# The output has unnormalized scores. To get probabilities, you can run a softmax on it.
golden_probabilities = torch.nn.functional.softmax(
golden_confidences, dim=0
).numpy()
golden_confidences = golden_confidences.numpy()
from shark.torch_mlir_lockstep_tensor import TorchMLIRLockstepTensor
input_detached_clone = input_batch.clone()
eager_input_batch = TorchMLIRLockstepTensor(input_detached_clone)
print("getting torch-mlir result")
output = model(eager_input_batch)
static_output = output.elem
confidences = static_output[0]
probabilities = torch.nn.functional.softmax(
torch.from_numpy(confidences), dim=0
).numpy()
print("The obtained result via shark is: ", confidences)
print("The golden result is:", golden_confidences)
np.testing.assert_allclose(
golden_confidences, confidences, rtol=1e-02, atol=1e-03
)
np.testing.assert_allclose(
golden_probabilities, probabilities, rtol=1e-02, atol=1e-03
)


@@ -0,0 +1,12 @@
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_torch_model
mlir_model, func_name, inputs, golden_out = download_torch_model("bloom")
shark_module = SharkInference(
mlir_model, func_name, device="cpu", mlir_dialect="tm_tensor"
)
shark_module.compile()
result = shark_module.forward(inputs)
print("The obtained result via shark is: ", result)
print("The golden result is:", golden_out)


@@ -0,0 +1,76 @@
from shark.shark_inference import SharkInference
from shark.parser import shark_args
import torch
import numpy as np
import sys
import torchvision.models as models
import torch_mlir
torch.manual_seed(0)
class VisionModule(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = models.resnet50(pretrained=True)
self.train(False)
def forward(self, input):
return self.model.forward(input)
model = VisionModule()
test_input = torch.randn(1, 3, 224, 224)
actual_out = model(test_input)
test_input_fp16 = test_input.to(device=torch.device("cuda"), dtype=torch.half)
model_fp16 = model.half()
model_fp16.eval()
model_fp16.to("cuda")
actual_out_fp16 = model_fp16(test_input_fp16)
ts_g = torch.jit.trace(model_fp16, [test_input_fp16])
module = torch_mlir.compile(
ts_g,
(test_input_fp16),
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=True,
verbose=False,
)
# from contextlib import redirect_stdout
# with open('resnet50_fp16_linalg_ir.mlir', 'w') as f:
# with redirect_stdout(f):
# print(module.operation.get_asm())
mlir_model = module
func_name = "forward"
shark_module = SharkInference(
mlir_model, func_name, device="cuda", mlir_dialect="linalg"
)
shark_module.compile()
def shark_result(x):
x_ny = x.cpu().detach().numpy()
inputs = (x_ny,)
result = shark_module.forward(inputs)
return torch.from_numpy(result)
observed_out = shark_result(test_input_fp16)
print("Golden result:", actual_out_fp16)
print("SHARK result:", observed_out)
actual_out_fp16 = actual_out_fp16.to(device=torch.device("cpu"))
print(
torch.testing.assert_allclose(
actual_out_fp16, observed_out, rtol=1e-2, atol=1e-2
)
)


@@ -69,7 +69,9 @@ labels = load_labels()
mlir_model, func_name, inputs, golden_out = download_torch_model("resnet50")
shark_module = SharkInference(mlir_model, func_name, mlir_dialect="linalg")
shark_module.compile()
# shark_module.compile()
path = shark_module.save_module()
shark_module.load_module(path)
result = shark_module.forward((img.detach().numpy(),))
print("The top 3 results obtained via shark_runner is:")


@@ -0,0 +1,392 @@
# Description: an implementation of a deep learning recommendation model (DLRM)
# The model input consists of dense and sparse features. The former is a vector
# of floating point values. The latter is a list of sparse indices into
# embedding tables, which consist of vectors of floating point values.
# The selected vectors are passed to mlp networks denoted by triangles,
# in some cases the vectors are combined through interaction operators (Ops).
#
# output:
# vector of values
# model: |
# /\
# /__\
# |
# _____________________> Op <___________________
# / | \
# /\ /\ /\
# /__\ /__\ ... /__\
# | | |
# | Op Op
# | ____/__\_____ ____/__\____
# | |_Emb_|____|__| ... |_Emb_|__|___|
# input:
# [ dense features ] [sparse indices] , ..., [sparse indices]
#
# More precise definition of model layers:
# 1) fully connected layers of an mlp
# z = f(y)
# y = Wx + b
#
# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk])
# z = Op(e1,...,ek)
# obtain vectors e1=E[:,p1], ..., ek=E[:,pk]
#
# 3) Operator Op can be one of the following
# Sum(e1,...,ek) = e1 + ... + ek
# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek]
# Cat(e1,...,ek) = [e1', ..., ek']'
# where ' denotes transpose operation
#
# References:
# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang,
# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu,
# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii,
# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko,
# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong,
# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and
# Recommendation Systems", CoRR, arXiv:1906.00091, 2019
import argparse
import sys
import numpy as np
import torch
import torch.nn as nn
from shark.shark_inference import SharkInference
from shark.shark_importer import SharkImporter
torch.manual_seed(0)
np.random.seed(0)
### define dlrm in PyTorch ###
class DLRM_Net(nn.Module):
def create_mlp(self, ln, sigmoid_layer):
# build MLP layer by layer
layers = nn.ModuleList()
for i in range(0, ln.size - 1):
n = ln[i]
m = ln[i + 1]
# construct fully connected operator
LL = nn.Linear(int(n), int(m), bias=True)
# initialize the weights
# with torch.no_grad():
# custom Xavier input, output or two-sided fill
mean = 0.0 # std_dev = np.sqrt(variance)
std_dev = np.sqrt(2 / (m + n)) # np.sqrt(1 / m) # np.sqrt(1 / n)
W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32)
std_dev = np.sqrt(1 / m) # np.sqrt(2 / (m + 1))
bt = np.random.normal(mean, std_dev, size=m).astype(np.float32)
LL.weight.data = torch.tensor(W, requires_grad=True)
LL.bias.data = torch.tensor(bt, requires_grad=True)
# approach 2
# LL.weight.data.copy_(torch.tensor(W))
# LL.bias.data.copy_(torch.tensor(bt))
# approach 3
# LL.weight = Parameter(torch.tensor(W),requires_grad=True)
# LL.bias = Parameter(torch.tensor(bt),requires_grad=True)
layers.append(LL)
# construct sigmoid or relu operator
if i == sigmoid_layer:
layers.append(nn.Sigmoid())
else:
layers.append(nn.ReLU())
# approach 1: use ModuleList
# return layers
# approach 2: use Sequential container to wrap all layers
return torch.nn.Sequential(*layers)
def create_emb(self, m, ln, weighted_pooling=None):
emb_l = nn.ModuleList()
v_W_l = []
for i in range(0, ln.size):
n = ln[i]
# construct embedding operator
EE = nn.EmbeddingBag(n, m, mode="sum")
# initialize embeddings
# nn.init.uniform_(EE.weight, a=-np.sqrt(1 / n), b=np.sqrt(1 / n))
W = np.random.uniform(
low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m)
).astype(np.float32)
# approach 1
print(W)
EE.weight.data = torch.tensor(W, requires_grad=True)
# approach 2
# EE.weight.data.copy_(torch.tensor(W))
# approach 3
# EE.weight = Parameter(torch.tensor(W),requires_grad=True)
if weighted_pooling is None:
v_W_l.append(None)
else:
v_W_l.append(torch.ones(n, dtype=torch.float32))
emb_l.append(EE)
return emb_l, v_W_l
def __init__(
self,
m_spa=None,
ln_emb=None,
ln_bot=None,
ln_top=None,
arch_interaction_op=None,
arch_interaction_itself=False,
sigmoid_bot=-1,
sigmoid_top=-1,
weighted_pooling=None,
):
super(DLRM_Net, self).__init__()
if (
(m_spa is not None)
and (ln_emb is not None)
and (ln_bot is not None)
and (ln_top is not None)
and (arch_interaction_op is not None)
):
# save arguments
self.output_d = 0
self.arch_interaction_op = arch_interaction_op
self.arch_interaction_itself = arch_interaction_itself
if weighted_pooling is not None and weighted_pooling != "fixed":
self.weighted_pooling = "learned"
else:
self.weighted_pooling = weighted_pooling
# create operators
self.emb_l, w_list = self.create_emb(
m_spa, ln_emb, weighted_pooling
)
if self.weighted_pooling == "learned":
self.v_W_l = nn.ParameterList()
for w in w_list:
self.v_W_l.append(nn.Parameter(w))
else:
self.v_W_l = w_list
self.bot_l = self.create_mlp(ln_bot, sigmoid_bot)
self.top_l = self.create_mlp(ln_top, sigmoid_top)
def apply_mlp(self, x, layers):
return layers(x)
def apply_emb(self, lS_o, lS_i, emb_l, v_W_l):
# WARNING: notice that we are processing the batch at once. We implicitly
# assume that the data is laid out such that:
# 1. each embedding is indexed with a group of sparse indices,
# corresponding to a single lookup
# 2. for each embedding the lookups are further organized into a batch
# 3. for a list of embedding tables there is a list of batched lookups
# TORCH-MLIR
# We are passing all the embeddings as arguments for easy parsing.
ly = []
for k, sparse_index_group_batch in enumerate(lS_i):
sparse_offset_group_batch = lS_o[k]
# embedding lookup
# We are using EmbeddingBag, which implicitly uses sum operator.
# The embeddings are represented as tall matrices, with sum
# happening vertically across 0 axis, resulting in a row vector
# E = emb_l[k]
if v_W_l[k] is not None:
per_sample_weights = v_W_l[k].gather(
0, sparse_index_group_batch
)
else:
per_sample_weights = None
E = emb_l[k]
V = E(
sparse_index_group_batch,
sparse_offset_group_batch,
per_sample_weights=per_sample_weights,
)
ly.append(V)
return ly
def interact_features(self, x, ly):
if self.arch_interaction_op == "dot":
# concatenate dense and sparse features
(batch_size, d) = x.shape
T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d))
# perform a dot product
Z = torch.bmm(T, torch.transpose(T, 1, 2))
# append dense feature with the interactions (into a row vector)
# approach 1: all
# Zflat = Z.view((batch_size, -1))
# approach 2: unique
_, ni, nj = Z.shape
# approach 1: tril_indices
# offset = 0 if self.arch_interaction_itself else -1
# li, lj = torch.tril_indices(ni, nj, offset=offset)
# approach 2: custom
offset = 1 if self.arch_interaction_itself else 0
li = torch.tensor(
[i for i in range(ni) for j in range(i + offset)]
)
lj = torch.tensor(
[j for i in range(nj) for j in range(i + offset)]
)
Zflat = Z[:, li, lj]
# concatenate dense features and interactions
R = torch.cat([x] + [Zflat], dim=1)
elif self.arch_interaction_op == "cat":
# concatenation features (into a row vector)
R = torch.cat([x] + ly, dim=1)
else:
sys.exit(
"ERROR: --arch-interaction-op="
+ self.arch_interaction_op
+ " is not supported"
)
return R
def forward(self, dense_x, lS_o, *lS_i):
return self.sequential_forward(dense_x, lS_o, lS_i)
def sequential_forward(self, dense_x, lS_o, lS_i):
# process dense features (using bottom mlp), resulting in a row vector
x = self.apply_mlp(dense_x, self.bot_l)
# debug prints
# print("intermediate")
# print(x.detach().cpu().numpy())
# process sparse features(using embeddings), resulting in a list of row vectors
ly = self.apply_emb(lS_o, lS_i, self.emb_l, self.v_W_l)
# for y in ly:
# print(y.detach().cpu().numpy())
# interact features (dense and sparse)
z = self.interact_features(x, ly)
# print(z.detach().cpu().numpy())
# obtain probability of a click (using top mlp)
p = self.apply_mlp(z, self.top_l)
# # clamp output if needed
# if 0.0 < self.loss_threshold and self.loss_threshold < 1.0:
# z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold))
# else:
# z = p
return p
def dash_separated_ints(value):
vals = value.split("-")
for val in vals:
try:
int(val)
except ValueError:
raise argparse.ArgumentTypeError(
"%s is not a valid dash separated list of ints" % value
)
return value
# model related parameters
parser = argparse.ArgumentParser(
description="Train Deep Learning Recommendation Model (DLRM)"
)
parser.add_argument("--arch-sparse-feature-size", type=int, default=2)
parser.add_argument(
"--arch-embedding-size", type=dash_separated_ints, default="4-3-2"
)
# j will be replaced with the table number
parser.add_argument(
"--arch-mlp-bot", type=dash_separated_ints, default="4-3-2"
)
parser.add_argument(
"--arch-mlp-top", type=dash_separated_ints, default="8-2-1"
)
parser.add_argument(
"--arch-interaction-op", type=str, choices=["dot", "cat"], default="dot"
)
parser.add_argument(
"--arch-interaction-itself", action="store_true", default=False
)
parser.add_argument("--weighted-pooling", type=str, default=None)
args = parser.parse_args()
ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")
ln_top = np.fromstring(args.arch_mlp_top, dtype=int, sep="-")
m_den = ln_bot[0]
ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
m_spa = args.arch_sparse_feature_size
ln_emb = np.asarray(ln_emb)
num_fea = ln_emb.size + 1 # num sparse + num dense features
# Initialize the model.
dlrm_model = DLRM_Net(
m_spa=m_spa,
ln_emb=ln_emb,
ln_bot=ln_bot,
ln_top=ln_top,
arch_interaction_op=args.arch_interaction_op,
)
# Inputs to the model.
dense_inp = torch.tensor([[0.6965, 0.2861, 0.2269, 0.5513]])
vs0 = torch.tensor([[0], [0], [0]], dtype=torch.int64)
vsi = torch.tensor([1, 2, 3]), torch.tensor([1]), torch.tensor([1])
input_dlrm = (dense_inp, vs0, *vsi)
golden_output = dlrm_model(dense_inp, vs0, *vsi)
mlir_importer = SharkImporter(
dlrm_model,
input_dlrm,
frontend="torch",
)
(dlrm_mlir, func_name), inputs, golden_out = mlir_importer.import_debug(
tracing_required=True
)
shark_module = SharkInference(
dlrm_mlir, func_name, device="vulkan", mlir_dialect="linalg"
)
shark_module.compile()
result = shark_module.forward(input_dlrm)
np.testing.assert_allclose(
golden_output.detach().numpy(), result, rtol=1e-02, atol=1e-03
)
# Verified via torch-mlir.
# import torch_mlir
# from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
# module = torch_mlir.compile(
# dlrm_model, inputs, use_tracing=True, output_type="linalg-on-tensors"
# )
# backend = refbackend.RefBackendLinalgOnTensorsBackend()
# compiled = backend.compile(module)
# jit_module = backend.load(compiled)
# dense_numpy = dense_inp.numpy()
# vs0_numpy = vs0.numpy()
# vsi_numpy = [inp.numpy() for inp in vsi]
# numpy_inp = (dense_numpy, vs0_numpy, *vsi_numpy)
# print(jit_module.forward(*numpy_inp))
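
As a side note, the "dot" interaction described in the header comment of this script can be checked in isolation. The sketch below is illustrative only and independent of DLRM_Net; it stacks the dense output with the embedding lookups, takes all pairwise dot products, and keeps the strictly lower-triangular entries.

import torch

torch.manual_seed(0)
batch_size, d = 2, 4
x = torch.rand(batch_size, d)                        # bottom MLP output
ly = [torch.rand(batch_size, d) for _ in range(3)]   # embedding lookups e1..e3

T = torch.cat([x] + ly, dim=1).view(batch_size, -1, d)
Z = torch.bmm(T, T.transpose(1, 2))                  # pairwise dot products
ni = T.shape[1]
li, lj = torch.tril_indices(ni, ni, offset=-1)       # unique pairs, no self-interaction
R = torch.cat([x, Z[:, li, lj]], dim=1)
print(R.shape)  # torch.Size([2, 10]) = (batch, d + C(4, 2))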


@@ -0,0 +1,314 @@
import torch
from torch import nn
from torchrec.datasets.utils import Batch
from torchrec.modules.crossnet import LowRankCrossNet
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor, KeyedTensor
from torchrec.modules.embedding_configs import EmbeddingBagConfig
from torchrec.modules.embedding_modules import EmbeddingBagCollection
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
from typing import Dict, List, Optional, Tuple
from torchrec.models.dlrm import (
choose,
DenseArch,
DLRM,
InteractionArch,
SparseArch,
OverArch,
)
from shark.shark_inference import SharkInference
from shark.shark_importer import SharkImporter
import numpy as np
torch.manual_seed(0)
np.random.seed(0)
def calculate_offsets(tensor_list, prev_values, prev_offsets):
offset_init = 0
offset_list = []
values_list = []
if prev_offsets != None:
offset_init = prev_values.shape[-1]
for tensor in tensor_list:
offset_list.append(offset_init)
offset_init += tensor.shape[0]
concatenated_tensor_list = torch.cat(tensor_list)
if prev_values != None:
concatenated_tensor_list = torch.cat(
[prev_values, concatenated_tensor_list]
)
concatenated_offsets = torch.tensor(offset_list)
if prev_offsets != None:
concatenated_offsets = torch.cat([prev_offsets, concatenated_offsets])
return concatenated_tensor_list, concatenated_offsets
# combined_keys must be a dict mapping each feature key to the embedding bag
# it points to, e.g. {f1: 0, f3: 0, f2: 1}.
# The result is a flat list with a (values, offsets, bag-pointer) triple per key.
def to_list(key_jagged, combined_keys):
key_jagged_dict = key_jagged.to_dict()
combined_list = []
for key in combined_keys:
prev_values, prev_offsets = calculate_offsets(
key_jagged_dict[key].to_dense(), None, None
)
print(prev_values)
print(prev_offsets)
combined_list.append(prev_values)
combined_list.append(prev_offsets)
combined_list.append(torch.tensor(combined_keys[key]))
return combined_list
class SparseArchShark(nn.Module):
def create_emb(self, embedding_dim, num_embeddings_list):
embedding_list = nn.ModuleList()
for i in range(0, num_embeddings_list.size):
num_embeddings = num_embeddings_list[i]
EE = nn.EmbeddingBag(num_embeddings, embedding_dim, mode="sum")
W = np.random.uniform(
low=-np.sqrt(1 / num_embeddings),
high=np.sqrt(1 / num_embeddings),
size=(num_embeddings, embedding_dim),
).astype(np.float32)
EE.weight.data = torch.tensor(W, requires_grad=True)
embedding_list.append(EE)
return embedding_list
def __init__(
self,
embedding_dim,
total_features,
num_embeddings_list,
):
super(SparseArchShark, self).__init__()
self.embedding_dim = embedding_dim
self.num_features = total_features
self.embedding_list = self.create_emb(
embedding_dim, num_embeddings_list
)
def forward(self, *batched_inputs):
concatenated_list = []
input_enum, embedding_enum = 0, 0
for k in range(len(batched_inputs) // 3):
values = batched_inputs[input_enum]
input_enum += 1
offsets = batched_inputs[input_enum]
input_enum += 1
embedding_pointer = int(batched_inputs[input_enum])
input_enum += 1
E = self.embedding_list[embedding_pointer]
V = E(values, offsets)
concatenated_list.append(V)
return torch.cat(concatenated_list, dim=1).reshape(
-1, self.num_features, self.embedding_dim
)
def test_sparse_arch() -> None:
D = 3
eb1_config = EmbeddingBagConfig(
name="t1",
embedding_dim=D,
num_embeddings=10,
feature_names=["f1", "f3"],
)
eb2_config = EmbeddingBagConfig(
name="t2",
embedding_dim=D,
num_embeddings=10,
feature_names=["f2"],
)
ebc = EmbeddingBagCollection(tables=[eb1_config, eb2_config])
w1 = ebc.embedding_bags["t1"].weight
w2 = ebc.embedding_bags["t2"].weight
sparse_arch = SparseArch(ebc)
keys = ["f1", "f2", "f3", "f4", "f5"]
offsets = torch.tensor([0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 19])
features = KeyedJaggedTensor.from_offsets_sync(
keys=keys,
values=torch.tensor(
[1, 2, 4, 5, 4, 3, 2, 9, 1, 2, 4, 5, 4, 3, 2, 9, 1, 2, 3]
),
offsets=offsets,
)
sparse_archi = SparseArchShark(D, 3, np.array([10, 10]))
sparse_archi.embedding_list[0].weight = w1
sparse_archi.embedding_list[1].weight = w2
inputs = to_list(features, {"f1": 0, "f3": 0, "f2": 1})
test_results = sparse_archi(*inputs)
sparse_features = sparse_arch(features)
torch.allclose(
sparse_features,
test_results,
rtol=1e-4,
atol=1e-4,
)
test_sparse_arch()
class DLRMShark(nn.Module):
def __init__(
self,
embedding_dim,
total_features,
num_embeddings_list,
dense_in_features: int,
dense_arch_layer_sizes: List[int],
over_arch_layer_sizes: List[int],
) -> None:
super().__init__()
self.sparse_arch: SparseArchShark = SparseArchShark(
embedding_dim, total_features, num_embeddings_list
)
num_sparse_features: int = total_features
self.dense_arch = DenseArch(
in_features=dense_in_features,
layer_sizes=dense_arch_layer_sizes,
)
self.inter_arch = InteractionArch(
num_sparse_features=num_sparse_features,
)
over_in_features: int = (
embedding_dim
+ choose(num_sparse_features, 2)
+ num_sparse_features
)
self.over_arch = OverArch(
in_features=over_in_features,
layer_sizes=over_arch_layer_sizes,
)
def forward(
self, dense_features: torch.Tensor, *sparse_features
) -> torch.Tensor:
embedded_dense = self.dense_arch(dense_features)
embedded_sparse = self.sparse_arch(*sparse_features)
concatenated_dense = self.inter_arch(
dense_features=embedded_dense, sparse_features=embedded_sparse
)
logits = self.over_arch(concatenated_dense)
return logits
def test_dlrm() -> None:
B = 2
D = 8
dense_in_features = 100
eb1_config = EmbeddingBagConfig(
name="t1",
embedding_dim=D,
num_embeddings=100,
feature_names=["f1", "f3"],
)
eb2_config = EmbeddingBagConfig(
name="t2",
embedding_dim=D,
num_embeddings=100,
feature_names=["f2"],
)
ebc = EmbeddingBagCollection(tables=[eb1_config, eb2_config])
sparse_features = KeyedJaggedTensor.from_offsets_sync(
keys=["f1", "f3", "f2"],
values=torch.tensor([1, 2, 4, 5, 4, 3, 2, 9, 1, 2, 3]),
offsets=torch.tensor([0, 2, 4, 6, 8, 10, 11]),
)
ebc = EmbeddingBagCollection(tables=[eb1_config, eb2_config])
sparse_nn = DLRM(
embedding_bag_collection=ebc,
dense_in_features=dense_in_features,
dense_arch_layer_sizes=[20, D],
over_arch_layer_sizes=[5, 1],
)
sparse_nn_nod = DLRMShark(
embedding_dim=8,
total_features=3,
num_embeddings_list=np.array([100, 100]),
dense_in_features=dense_in_features,
dense_arch_layer_sizes=[20, D],
over_arch_layer_sizes=[5, 1],
)
dense_features = torch.rand((B, dense_in_features))
x = to_list(sparse_features, {"f1": 0, "f3": 0, "f2": 1})
w1 = ebc.embedding_bags["t1"].weight
w2 = ebc.embedding_bags["t2"].weight
sparse_nn_nod.sparse_arch.embedding_list[0].weight = w1
sparse_nn_nod.sparse_arch.embedding_list[1].weight = w2
sparse_nn_nod.dense_arch.load_state_dict(sparse_nn.dense_arch.state_dict())
sparse_nn_nod.inter_arch.load_state_dict(sparse_nn.inter_arch.state_dict())
sparse_nn_nod.over_arch.load_state_dict(sparse_nn.over_arch.state_dict())
logits = sparse_nn(
dense_features=dense_features,
sparse_features=sparse_features,
)
logits_nod = sparse_nn_nod(dense_features, *x)
# print(logits)
# print(logits_nod)
# Import the module and print.
mlir_importer = SharkImporter(
sparse_nn_nod,
(dense_features, *x),
frontend="torch",
)
(dlrm_mlir, func_name), inputs, golden_out = mlir_importer.import_debug(
tracing_required=True
)
shark_module = SharkInference(
dlrm_mlir, func_name, device="cpu", mlir_dialect="linalg"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
assert torch.allclose(
logits,
logits_nod,
rtol=1e-4,
atol=1e-4,
), "DLRMShark logits do not match torchrec DLRM"
test_dlrm()

View File

@@ -0,0 +1,268 @@
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
import torch
from PIL import Image
from diffusers import LMSDiscreteScheduler
from tqdm.auto import tqdm
from shark.shark_inference import SharkInference
from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions
import torch_mlir
import tempfile
import numpy as np
# pip install diffusers
# pip install scipy
############### Parsing args #####################
import argparse
p = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
p.add_argument(
"--prompt",
type=str,
default="a photograph of an astronaut riding a horse",
help="the text prompt to use",
)
p.add_argument("--device", type=str, default="cpu", help="the device to use")
p.add_argument("--steps", type=int, default=10, help="the device to use")
p.add_argument("--mlir_loc", type=str, default=None, help="the device to use")
p.add_argument("--vae_loc", type=str, default=None, help="the device to use")
args = p.parse_args()
#####################################################
def load_mlir(mlir_loc):
import os
if mlir_loc == None:
return None
print(f"Trying to load the model from {mlir_loc}.")
with open(os.path.join(mlir_loc)) as f:
mlir_module = f.read()
return mlir_module
def compile_through_fx(model, inputs, mlir_loc=None):
module = load_mlir(mlir_loc)
if mlir_loc == None:
fx_g = make_fx(
model,
decomposition_table=get_decompositions(
[
torch.ops.aten.embedding_dense_backward,
torch.ops.aten.native_layer_norm_backward,
torch.ops.aten.slice_backward,
torch.ops.aten.select_backward,
torch.ops.aten.norm.ScalarOpt_dim,
torch.ops.aten.native_group_norm,
torch.ops.aten.upsample_bilinear2d.vec,
torch.ops.aten.split.Tensor,
torch.ops.aten.split_with_sizes,
]
),
)(*inputs)
fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
fx_g.recompile()
def strip_overloads(gm):
"""
Modifies the target of graph nodes in :attr:`gm` to strip overloads.
Args:
gm(fx.GraphModule): The input Fx graph module to be modified
"""
for node in gm.graph.nodes:
if isinstance(node.target, torch._ops.OpOverload):
node.target = node.target.overloadpacket
gm.recompile()
strip_overloads(fx_g)
ts_g = torch.jit.script(fx_g)
module = torch_mlir.compile(
ts_g,
inputs,
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
mlir_model = module
func_name = "forward"
shark_module = SharkInference(
mlir_model, func_name, device=args.device, mlir_dialect="tm_tensor"
)
shark_module.compile()
return shark_module
if __name__ == "__main__":
YOUR_TOKEN = "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"
# 1. Load the autoencoder model which will be used to decode the latents into image space.
vae = AutoencoderKL.from_pretrained(
"CompVis/stable-diffusion-v1-4",
subfolder="vae",
use_auth_token=YOUR_TOKEN,
)
# 2. Load the tokenizer and text encoder to tokenize and encode the text.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
)
class VaeModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.vae = AutoencoderKL.from_pretrained(
"CompVis/stable-diffusion-v1-4",
subfolder="vae",
use_auth_token=YOUR_TOKEN,
)
def forward(self, input):
return self.vae.decode(input, return_dict=False)[0]
vae = VaeModel()
vae_input = torch.rand(1, 4, 64, 64)
shark_vae = compile_through_fx(vae, (vae_input,), args.vae_loc)
# Wrap the unet model to return tuples.
class UnetModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.unet = UNet2DConditionModel.from_pretrained(
"CompVis/stable-diffusion-v1-4",
subfolder="unet",
use_auth_token=YOUR_TOKEN,
)
self.in_channels = self.unet.in_channels
self.train(False)
def forward(self, x, y, z):
return self.unet.forward(x, y, z, return_dict=False)[0]
# 3. The UNet model for generating the latents.
unet = UnetModel()
latent_model_input = torch.rand([2, 4, 64, 64])
text_embeddings = torch.rand([2, 77, 768])
shark_unet = compile_through_fx(
unet,
(latent_model_input, torch.tensor([1.0]), text_embeddings),
args.mlir_loc,
)
# torch.jit.script(unet)
scheduler = LMSDiscreteScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
num_train_timesteps=1000,
)
prompt = [args.prompt]
height = 512 # default height of Stable Diffusion
width = 512 # default width of Stable Diffusion
num_inference_steps = args.steps # Number of denoising steps
guidance_scale = 7.5 # Scale for classifier-free guidance
generator = torch.manual_seed(
42
) # Seed generator to create the initial latent noise
batch_size = len(prompt)
text_input = tokenizer(
prompt,
padding="max_length",
max_length=tokenizer.model_max_length,
truncation=True,
return_tensors="pt",
)
text_embeddings = text_encoder(text_input.input_ids)[0]
max_length = text_input.input_ids.shape[-1]
uncond_input = tokenizer(
[""] * batch_size,
padding="max_length",
max_length=max_length,
return_tensors="pt",
)
uncond_embeddings = text_encoder(uncond_input.input_ids)[0]
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
latents = torch.randn(
(batch_size, unet.in_channels, height // 8, width // 8),
generator=generator,
)
# latents = latents.to(torch_device)
scheduler.set_timesteps(num_inference_steps)
latents = latents * scheduler.sigmas[0]
# print(latents, latents.shape)
for i, t in tqdm(enumerate(scheduler.timesteps)):
print(f"i = {i} t = {t}")
# expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
latent_model_input = torch.cat([latents] * 2)
sigma = scheduler.sigmas[i]
latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
# predict the noise residual
# with torch.no_grad():
# noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)
latent_model_input_numpy = latent_model_input.detach().numpy()
text_embeddings_numpy = text_embeddings.detach().numpy()
noise_pred = shark_unet.forward(
(
latent_model_input_numpy,
np.array([t]).astype(np.float32),
text_embeddings_numpy,
)
)
noise_pred = torch.from_numpy(noise_pred)
# perform guidance
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * (
noise_pred_text - noise_pred_uncond
)
# compute the previous noisy sample x_t -> x_t-1
latents = scheduler.step(noise_pred, i, latents)["prev_sample"]
# print("Latents shape : ", latents.shape)
# scale and decode the image latents with vae
latents = 1 / 0.18215 * latents
latents_numpy = latents.detach().numpy()
image = shark_vae.forward((latents_numpy,))
image = torch.from_numpy(image)
image = (image / 2 + 0.5).clamp(0, 1)
image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
images = (image * 255).round().astype("uint8")
pil_images = [Image.fromarray(image) for image in images]
pil_images[0].save("astro.jpg")

View File

@@ -0,0 +1,313 @@
import math
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras_cv.models.generative.stable_diffusion.clip_tokenizer import (
SimpleTokenizer,
)
from keras_cv.models.generative.stable_diffusion.constants import (
_ALPHAS_CUMPROD,
)
from keras_cv.models.generative.stable_diffusion.constants import (
_UNCONDITIONAL_TOKENS,
)
from keras_cv.models.generative.stable_diffusion.decoder import Decoder
from keras_cv.models.generative.stable_diffusion.text_encoder import (
TextEncoder,
)
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
from PIL import Image
# pip install "git+https://github.com/keras-team/keras-cv.git"
# pip install tensorflow_dataset
############### Parsing args #####################
import argparse
p = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
p.add_argument(
"--prompt",
type=str,
default="a photograph of an astronaut riding a horse",
help="the text prompt to use",
)
p.add_argument("--device", type=str, default="cpu", help="the device to use")
p.add_argument(
"--steps", type=int, default=10, help="the number of steps to use"
)
p.add_argument(
"--save_path",
type=str,
default=None,
help="the file to save the resulting image to. (default to <input prompt>.jpg)",
)
args = p.parse_args()
#####################################################
MAX_PROMPT_LENGTH = 77
class SharkStableDiffusion:
"""Shark implementation of Stable Diffusion based on model from keras_cv.
Stable Diffusion is a powerful image generation model that can be used,
among other things, to generate pictures according to a short text description
(called a "prompt").
Arguments:
device: Device to use with SHARK. Default: cpu
jit_compile: Whether to compile the text encoder and decoder to XLA.
This can lead to a significant speedup on some systems. Default: True.
References:
- [About Stable Diffusion](https://stability.ai/blog/stable-diffusion-announcement)
- [Original implementation](https://github.com/CompVis/stable-diffusion)
"""
def __init__(self, device="cpu", jit_compile=True):
self.img_height = 512
self.img_width = 512
self.tokenizer = SimpleTokenizer()
# Create models
self.text_encoder = TextEncoder(MAX_PROMPT_LENGTH)
mlir_model, func_name, inputs, golden_out = download_tf_model(
"stable_diff", tank_url="gs://shark_tank/quinn"
)
shark_module = SharkInference(
mlir_model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
self.diffusion_model = shark_module
self.decoder = Decoder(self.img_height, self.img_width)
if jit_compile:
self.text_encoder.compile(jit_compile=True)
self.decoder.compile(jit_compile=True)
print(
"By using this model checkpoint, you acknowledge that its usage is "
"subject to the terms of the CreativeML Open RAIL-M license at "
"https://raw.githubusercontent.com/CompVis/stable-diffusion/main/LICENSE"
)
# Load weights
text_encoder_weights_fpath = keras.utils.get_file(
origin="https://huggingface.co/fchollet/stable-diffusion/resolve/main/kcv_encoder.h5",
file_hash="4789e63e07c0e54d6a34a29b45ce81ece27060c499a709d556c7755b42bb0dc4",
)
decoder_weights_fpath = keras.utils.get_file(
origin="https://huggingface.co/fchollet/stable-diffusion/resolve/main/kcv_decoder.h5",
file_hash="ad350a65cc8bc4a80c8103367e039a3329b4231c2469a1093869a345f55b1962",
)
self.text_encoder.load_weights(text_encoder_weights_fpath)
self.decoder.load_weights(decoder_weights_fpath)
def text_to_image(
self,
prompt,
batch_size=1,
num_steps=25,
unconditional_guidance_scale=7.5,
seed=None,
):
encoded_text = self.encode_text(prompt)
return self.generate_image(
encoded_text,
batch_size=batch_size,
num_steps=num_steps,
unconditional_guidance_scale=unconditional_guidance_scale,
seed=seed,
)
def encode_text(self, prompt):
"""Encodes a prompt into a latent text encoding.
The encoding produced by this method should be used as the
`encoded_text` parameter of `StableDiffusion.generate_image`. Encoding
text separately from generating an image can be used to arbitrarily
modify the text encoding prior to image generation, e.g. for walking
between two prompts.
Args:
prompt: a string to encode, must be 77 tokens or shorter.
Example:
```python
from keras_cv.models import StableDiffusion
model = StableDiffusion(img_height=512, img_width=512, jit_compile=True)
encoded_text = model.encode_text("Tacos at dawn")
img = model.generate_image(encoded_text)
```
"""
# Tokenize prompt (i.e. starting context)
inputs = self.tokenizer.encode(prompt)
if len(inputs) > MAX_PROMPT_LENGTH:
raise ValueError(
f"Prompt is too long (should be <= {MAX_PROMPT_LENGTH} tokens)"
)
phrase = inputs + [49407] * (MAX_PROMPT_LENGTH - len(inputs))
phrase = tf.convert_to_tensor([phrase], dtype=tf.int32)
context = self.text_encoder.predict_on_batch(
[phrase, self._get_pos_ids()]
)
return context
def generate_image(
self,
encoded_text,
batch_size=1,
num_steps=25,
unconditional_guidance_scale=7.5,
diffusion_noise=None,
seed=None,
):
"""Generates an image based on encoded text.
The encoding passed to this method should be derived from
`StableDiffusion.encode_text`.
Args:
encoded_text: Tensor of shape (`batch_size`, 77, 768), or a Tensor
of shape (77, 768). When the batch axis is omitted, the same encoded
text will be used to produce every generated image.
batch_size: number of images to generate. Default: 1.
num_steps: number of diffusion steps (controls image quality).
Default: 25.
unconditional_guidance_scale: float controlling how closely the image
should adhere to the prompt. Larger values produce images that follow
the prompt more closely, but may be noisier.
Default: 7.5.
diffusion_noise: Tensor of shape (`batch_size`, img_height // 8,
img_width // 8, 4), or a Tensor of shape (img_height // 8,
img_width // 8, 4). Optional custom noise to seed the diffusion
process. When the batch axis is omitted, the same noise will be
used to seed diffusion for every generated image.
seed: integer which is used to seed the random generation of
diffusion noise, only to be specified if `diffusion_noise` is
None.
Example:
```python
from keras_cv.models import StableDiffusion
batch_size = 8
model = StableDiffusion(img_height=512, img_width=512, jit_compile=True)
e_tacos = model.encode_text("Tacos at dawn")
e_watermelons = model.encode_text("Watermelons at dusk")
e_interpolated = tf.linspace(e_tacos, e_watermelons, batch_size)
images = model.generate_image(e_interpolated, batch_size=batch_size)
```
"""
if diffusion_noise is not None and seed is not None:
raise ValueError(
"`diffusion_noise` and `seed` should not both be passed to "
"`generate_image`. `seed` is only used to generate diffusion "
"noise when it's not already user-specified."
)
encoded_text = tf.squeeze(encoded_text)
if encoded_text.shape.rank == 2:
encoded_text = tf.repeat(
tf.expand_dims(encoded_text, axis=0), batch_size, axis=0
)
context = encoded_text
unconditional_context = tf.repeat(
self._get_unconditional_context(), batch_size, axis=0
)
context = tf.concat([context, unconditional_context], 0)
if diffusion_noise is not None:
diffusion_noise = tf.squeeze(diffusion_noise)
if diffusion_noise.shape.rank == 3:
diffusion_noise = tf.repeat(
tf.expand_dims(diffusion_noise, axis=0), batch_size, axis=0
)
latent = diffusion_noise
else:
latent = self._get_initial_diffusion_noise(batch_size, seed)
# Iterative reverse diffusion stage
timesteps = tf.range(1, 1000, 1000 // num_steps)
alphas, alphas_prev = self._get_initial_alphas(timesteps)
progbar = keras.utils.Progbar(len(timesteps))
iteration = 0
for index, timestep in list(enumerate(timesteps))[::-1]:
latent_prev = latent # Set aside the previous latent vector
t_emb = self._get_timestep_embedding(timestep, batch_size)
# Prepare the latent and unconditional latent to be run with a single forward call
latent = tf.concat([latent, latent], 0)
t_emb = tf.concat([t_emb, t_emb], 0)
latent_numpy = self.diffusion_model.forward(
[latent.numpy(), t_emb.numpy(), context.numpy()]
)
latent = tf.convert_to_tensor(latent_numpy, dtype=tf.float32)
latent, unconditional_latent = tf.split(latent, 2)
latent = unconditional_latent + unconditional_guidance_scale * (
latent - unconditional_latent
)
a_t, a_prev = alphas[index], alphas_prev[index]
pred_x0 = (latent_prev - math.sqrt(1 - a_t) * latent) / math.sqrt(
a_t
)
latent = (
latent * math.sqrt(1.0 - a_prev) + math.sqrt(a_prev) * pred_x0
)
iteration += 1
progbar.update(iteration)
# Decoding stage
decoded = self.decoder.predict_on_batch(latent)
decoded = ((decoded + 1) / 2) * 255
return np.clip(decoded, 0, 255).astype("uint8")
def _get_unconditional_context(self):
unconditional_tokens = tf.convert_to_tensor(
[_UNCONDITIONAL_TOKENS], dtype=tf.int32
)
unconditional_context = self.text_encoder.predict_on_batch(
[unconditional_tokens, self._get_pos_ids()]
)
return unconditional_context
def _get_timestep_embedding(
self, timestep, batch_size, dim=320, max_period=10000
):
half = dim // 2
freqs = tf.math.exp(
-math.log(max_period) * tf.range(0, half, dtype=tf.float32) / half
)
args = tf.convert_to_tensor([timestep], dtype=tf.float32) * freqs
embedding = tf.concat([tf.math.cos(args), tf.math.sin(args)], 0)
embedding = tf.reshape(embedding, [1, -1])
return tf.repeat(embedding, batch_size, axis=0)
def _get_initial_alphas(self, timesteps):
alphas = [_ALPHAS_CUMPROD[t] for t in timesteps]
alphas_prev = [1.0] + alphas[:-1]
return alphas, alphas_prev
def _get_initial_diffusion_noise(self, batch_size, seed):
return tf.random.normal(
(batch_size, self.img_height // 8, self.img_width // 8, 4),
seed=seed,
)
@staticmethod
def _get_pos_ids():
return tf.convert_to_tensor(
[list(range(MAX_PROMPT_LENGTH))], dtype=tf.int32
)
if __name__ == "__main__":
SD = SharkStableDiffusion(device=args.device)
images = SD.text_to_image(args.prompt, num_steps=args.steps)
pil_images = [Image.fromarray(image) for image in images]
save_fname = args.prompt + ".jpg"
if args.save_path is not None:
save_fname = args.save_path
pil_images[0].save(save_fname)
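# Illustrative invocation (the script filename is hypothetical; flags as defined above):
#   python tf_stable_diffusion.py --prompt "a photograph of an astronaut riding a horse" \
#       --device cpu --steps 25 --save_path astronaut.jpg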

View File

@@ -39,7 +39,6 @@ def run_cmd(cmd):
IREE_DEVICE_MAP = {
"cpu": "local-task",
"gpu": "cuda",
"cuda": "cuda",
"vulkan": "vulkan",
"metal": "vulkan",
@@ -49,7 +48,6 @@ IREE_DEVICE_MAP = {
IREE_TARGET_MAP = {
"cpu": "llvm-cpu",
"gpu": "cuda",
"cuda": "cuda",
"vulkan": "vulkan",
"metal": "vulkan",
@@ -60,7 +58,7 @@ IREE_TARGET_MAP = {
# Finds whether the required drivers are installed for the given device.
def check_device_drivers(device):
"""Checks necessary drivers present for gpu and vulkan devices"""
if device in ["gpu", "cuda"]:
if device == "cuda":
try:
subprocess.check_output("nvidia-smi")
except Exception:
@@ -78,6 +76,11 @@ def check_device_drivers(device):
return True
elif device == "cpu":
return False
elif device == "rocm":
try:
subprocess.check_output("rocminfo")
except Exception:
return True
# Unknown device.
else:
return True
@@ -87,9 +90,11 @@ def check_device_drivers(device):
# Installation info for the missing device drivers.
def device_driver_info(device):
if device in ["gpu", "cuda"]:
if device == "cuda":
return "nvidia-smi not found, please install the required drivers from https://www.nvidia.in/Download/index.aspx?lang=en-in"
elif device in ["metal", "vulkan"]:
return "vulkaninfo not found, Install from https://vulkan.lunarg.com/sdk/home or your distribution"
elif device == "rocm":
return "rocm info not found. Please install rocm"
else:
return f"{device} is not supported."

View File

@@ -23,7 +23,7 @@ def get_iree_device_args(device):
from shark.iree_utils.cpu_utils import get_iree_cpu_args
return get_iree_cpu_args()
if device in ["gpu", "cuda"]:
if device == "cuda":
from shark.iree_utils.gpu_utils import get_iree_gpu_args
return get_iree_gpu_args()
@@ -31,6 +31,10 @@ def get_iree_device_args(device):
from shark.iree_utils.vulkan_utils import get_iree_vulkan_args
return get_iree_vulkan_args()
if device == "rocm":
from shark.iree_utils.gpu_utils import get_iree_rocm_args
return get_iree_rocm_args()
return []
@@ -72,6 +76,8 @@ def compile_module_to_flatbuffer(
input_type = frontend
elif frontend in ["tflite", "tflite-tosa"]:
input_type = "tosa"
elif frontend in ["tm_tensor"]:
input_type = frontend
# TODO: make it simpler.
# Compile according to the input type, else just try compiling.
@@ -122,6 +128,16 @@ def get_iree_compiled_module(
return get_iree_module(flatbuffer_blob, device, func_name)
def load_flatbuffer(
flatbuffer_path: str, device: str, func_name: str = "forward"
):
with open(os.path.join(flatbuffer_path), "rb") as f:
flatbuffer_blob = f.read()
return get_iree_module(flatbuffer_blob, device, func_name)
def export_iree_module_to_vmfb(
module,
device: str,

View File

@@ -35,6 +35,18 @@ def get_iree_gpu_args():
return ["--iree-hal-cuda-disable-loop-nounroll-wa"]
# Get the default gpu args given the architecture.
def get_iree_rocm_args():
ireert.flags.FUNCTION_INPUT_VALIDATION = False
# TODO: find a way to get arch from code.
rocm_arch = "gfx908"
return [
f"--iree-rocm-target-chip={rocm_arch}",
"--iree-rocm-link-bc=true",
"--iree-rocm-bc-dir=/opt/rocm/amdgcn/bitcode",
]
# Some constants taken from cuda.h
CUDA_SUCCESS = 0
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16

View File

@@ -18,29 +18,30 @@ from shark.iree_utils._common import run_cmd
def get_vulkan_triple_flag():
vulkan_device_cmd = "vulkaninfo | grep deviceName | awk 'END{{print $NF}}'"
vulkan_device_cmd = "vulkaninfo | grep deviceName"
vulkan_device = run_cmd(vulkan_device_cmd).strip()
if vulkan_device == "Ultra":
print("Found MacStudio M1 Device. Using m1-moltenvk-macos")
if all(x in vulkan_device for x in ("Apple", "M1")):
print(f"Found {vulkan_device} Device. Using m1-moltenvk-macos")
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
elif vulkan_device == "M2":
elif all(x in vulkan_device for x in ("Apple", "M2")):
print("Found Apple M2 Device. Using m1-moltenvk-macos")
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
elif vulkan_device == "Max":
print("Found Apple M1 Max Device. Using m1-moltenvk-macos")
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
elif vulkan_device == "Pro":
print("Found Apple M1 Pro Device. Using m1-moltenvk-macos")
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
elif vulkan_device == "M1":
print("Found Apple M1 Device. Using m1-moltenvk-macos")
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
elif vulkan_device == "A100-SXM4-40GB":
print("Found Nvidia Device. Using ampere-rtx3080-linux")
elif all(x in vulkan_device for x in ("A100", "SXM4")):
print(f"Found {vulkan_device} Device. Using ampere-rtx3080-linux")
return "-iree-vulkan-target-triple=ampere-rtx3080-linux"
elif vulkan_device == "3090":
print("Found Nvidia Device. Using ampere-rtx3090-linux")
elif all(x in vulkan_device for x in ("RTX", "3090")):
print(f"Found {vulkan_device} Device. Using ampere-rtx3090-linux")
return "-iree-vulkan-target-triple=ampere-rtx3090-linux"
elif all(x in vulkan_device for x in ("Radeon", "RX 5")):
print(
"Found AMD Radeon RX 5000 series device. Using rdna1-5700xt-linux"
)
return "-iree-vulkan-target-triple=rdna1-5700xt-linux"
elif all(x in vulkan_device for x in ("Radeon", "RX 6")):
print(
"Found AMD Radeon RX 6000 series device. Using rdna2-unknown-linux"
)
return "-iree-vulkan-target-triple=rdna2-unknown-linux"
else:
print(
"""Optimized kernel for your target device is not added yet.

View File

@@ -12,22 +12,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import json
import os
from typing import List, Dict
import sys
from typing import Dict, List
from iree.compiler import ir
from iree.compiler.transforms import ireec as ireec_trans
MATMUL_OP_NAMES = set(
["linalg.matmul", "linalg.batch_matmul", "mhlo.dot", "mhlo.dot_general"]
)
idx = 0
def model_annotation(
ctx: ir.Context, *, input_contents: str, config_path: str
ctx: ir.Context,
*,
input_contents: str,
config_path: str,
search_op: str = "matmul",
):
if os.path.isfile(input_contents):
with open(input_contents, "rb") as f:
@@ -41,21 +40,35 @@ def model_annotation(
# The Python API does not expose a general walk() function, so we just
# do it ourselves.
walk_children(module.operation, configs)
walk_children(module.operation, configs, 0, search_op)
if not module.operation.verify():
raise RuntimeError("Modified program does not verify!")
# More efficient than: print(module)
# - Disables verification (already done above)
# - Writes as binary, avoiding costly unicode conversions
sys.stdout.buffer.write(
module.operation.get_asm(assume_verified=True, binary=True)
)
return module
def walk_children(op: ir.Operation, configs: List[Dict]):
def walk_children(
op: ir.Operation, configs: List[Dict], idx: int, search_op: str
):
if search_op == "matmul":
op_names = ["linalg.matmul", "mhlo.dot"]
elif search_op == "bmm":
op_names = ["linalg.batch_matmul", "mhlo.dot_general"]
elif search_op == "conv":
op_names = ["mhlo.convolution", "linalg.conv_2d_nhwc_hwcf"]
elif search_op == "all":
op_names = [
"mhlo.dot",
"mhlo.dot_general",
"mhlo.convolution",
"linalg.matmul",
"linalg.batch_matmul",
"linalg.conv_2d_nhwc_hwcf",
]
else:
raise ValueError(f"{search_op} op is not tunable.")
for region in op.regions:
for block in region.blocks:
for child_op in block.operations:
@@ -63,30 +76,32 @@ def walk_children(op: ir.Operation, configs: List[Dict]):
# 'operation' and 'name' attributes.
if isinstance(child_op, ir.OpView):
child_op = child_op.operation
if child_op.name in MATMUL_OP_NAMES:
global idx
(
tile_sizes,
pipeline,
workgroup_size,
split_k,
pipeline_depth,
) = parse_config(configs[idx])
add_compilation_info(
child_op,
tile_sizes=tile_sizes,
pipeline=pipeline,
workgroup_size=workgroup_size,
pipeline_depth=pipeline_depth,
)
if split_k:
add_split_k(child_op, split_k)
if child_op.name in op_names and idx < len(configs):
add_attributes(child_op, configs[idx])
idx = idx + 1
print(f"Updated op {child_op}", file=sys.stderr)
walk_children(child_op, configs)
walk_children(child_op, configs, idx, search_op)
def add_attributes(op: ir.Operation, config: Dict):
(
tile_sizes,
pipeline,
workgroup_size,
split_k,
pipeline_depth,
) = parse_config(config)
add_compilation_info(
op,
tile_sizes=tile_sizes,
pipeline=pipeline,
workgroup_size=workgroup_size,
pipeline_depth=pipeline_depth,
)
if split_k:
add_attribute_by_name(op, "iree_flow_split_k", split_k)
def parse_config(config: Dict):
@@ -145,9 +160,9 @@ def add_compilation_info(
op.attributes["compilation_info"] = attr
def add_split_k(op: ir.Operation, k: int):
attr = ir.IntegerAttr.get(ir.IntegerType.get_signless(64), k)
op.attributes["iree_flow_split_k"] = attr
def add_attribute_by_name(op: ir.Operation, name: str, val: int):
attr = ir.IntegerAttr.get(ir.IntegerType.get_signless(64), val)
op.attributes[name] = attr
def create_context() -> ir.Context:
@@ -159,6 +174,14 @@ def create_context() -> ir.Context:
if __name__ == "__main__":
with create_context() as ctx:
model_annotation(
ctx, input_contents=sys.argv[1], config_path=sys.argv[2]
module = model_annotation(
ctx,
input_contents=sys.argv[1],
config_path=sys.argv[2],
search_op="all",
)
mlir_str = str(module)
filename = "tuned_model.mlir"
with open(filename, "w") as f:
f.write(mlir_str)
print(f"Saved mlir in {filename}.")

View File

@@ -38,7 +38,7 @@ parser.add_argument(
"--device",
type=str,
default="cpu",
help="Device on which shark_runner runs. options are cpu, gpu, and vulkan",
help="Device on which shark_runner runs. options are cpu, cuda, and vulkan",
)
parser.add_argument(
"--repro_dir",
@@ -76,5 +76,21 @@ parser.add_argument(
action="store_true",
help="When enabled, pytest bench results will include ONNX benchmark results.",
)
parser.add_argument(
"--shark_prefix",
default="latest",
help="gs://shark_tank/<this_flag>/model_directories",
)
parser.add_argument(
"--update_tank",
default=False,
action="store_true",
help="When enabled, SHARK downloader will update local shark_tank if local hash is different from latest upstream hash.",
)
parser.add_argument(
"--local_tank_cache",
default="",
help="Specify where to save downloaded shark_tank artifacts. If this is not set, the default is ~/.local/shark_tank/.",
)
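# Illustrative usage (hypothetical test path; these flags are typically passed
# through pytest alongside the test selection):
#   pytest <your_test_file> --update_tank --local_tank_cache=/tmp/shark_tank --shark_prefix=latest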
shark_args, unknown = parser.parse_known_args()

View File

@@ -71,7 +71,7 @@ class SharkBenchmarkRunner(SharkRunner):
input_tensors,
mlir_dialect=self.mlir_dialect,
)
# print(self.benchmark_cl)
print(self.benchmark_cl)
def benchmark_frontend(self, modelname):
if self.mlir_dialect in ["linalg", "torch"]:
@@ -83,12 +83,12 @@ class SharkBenchmarkRunner(SharkRunner):
import torch
from tank.model_utils import get_torch_model
if self.device == "gpu":
if self.device == "cuda":
torch.set_default_tensor_type(torch.cuda.FloatTensor)
else:
torch.set_default_tensor_type(torch.FloatTensor)
torch_device = torch.device(
"cuda:0" if self.device == "gpu" else "cpu"
"cuda:0" if self.device == "cuda" else "cpu"
)
HFmodel, input = get_torch_model(modelname)[:2]
frontend_model = HFmodel.model
@@ -163,7 +163,7 @@ class SharkBenchmarkRunner(SharkRunner):
]
def benchmark_onnx(self, modelname, inputs):
if self.device == "gpu":
if self.device == "cuda":
print(
"Currently GPU benchmarking on ONNX is not supported in SHARK."
)
@@ -186,7 +186,7 @@ https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/tr
for currently supported models. Exiting benchmark ONNX."
)
return ["N/A", "N/A"]
use_gpu = self.device == "gpu"
use_gpu = self.device == "cuda"
num_threads = psutil.cpu_count(logical=False)
batch_sizes = [1]
sequence_lengths = [128]
@@ -236,6 +236,34 @@ for currently supported models. Exiting benchmark ONNX."
result[0]["average_latency_ms"],
]
def get_metadata(self, modelname):
with open("./tank/model_metadata.csv", mode="r") as csvfile:
torch_reader = csv.reader(csvfile, delimiter=",")
fields = next(torch_reader)
for row in torch_reader:
torch_model_name = row[0]
if torch_model_name == modelname:
param_count = row[3]
model_tags = row[4]
model_notes = row[5]
return [param_count, model_tags, model_notes]
def compare_bench_results(self, baseline: str, result: str):
# Takes two numbers represented as strings and returns "<n>x slower/faster", as in "result is <n>x slower than baseline".
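# Worked examples (illustrative): compare_bench_results("10.0", "25.0")
# returns "1.5x slower" since (25 - 10) / 10 == 1.5, while
# compare_bench_results("20.0", "10.0") returns "2.0x faster" since 20 / 10 == 2.0.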
a = float(baseline)
b = float(result)
if a < b:
# result slower than baseline
comparison = (b - a) / a
comp_str = f"{round(comparison, 2)}x slower"
elif a > b:
# result faster than baseline
comparison = a / b
comp_str = f"{round(comparison, 2)}x faster"
else:
comp_str = "equal"
return comp_str
def benchmark_all_csv(
self, inputs: tuple, modelname, dynamic, device_str, frontend
):
@@ -243,12 +271,17 @@ for currently supported models. Exiting benchmark ONNX."
field_names = [
"model",
"engine",
"dynamic",
"dialect",
"device",
"shape_type",
"data_type",
"iter/sec",
"ms/iter",
"vs. PyTorch/TF",
"iterations",
"param_count",
"tags",
"notes",
"datetime",
]
engines = ["frontend", "shark_python", "shark_iree_c"]
@@ -265,29 +298,57 @@ for currently supported models. Exiting benchmark ONNX."
bench_result = {}
bench_result["model"] = modelname
if dynamic == True:
bench_result["dynamic"] = "True"
bench_result["shape_type"] = "dynamic"
else:
bench_result["dynamic"] = "False"
bench_result["shape_type"] = "static"
bench_result["device"] = device_str
bench_result["data_type"] = inputs[0].dtype
for e in engines:
(
bench_result["param_count"],
bench_result["tags"],
bench_result["notes"],
) = ["", "", ""]
if e == "frontend":
bench_result["engine"] = frontend
(
bench_result["iter/sec"],
bench_result["ms/iter"],
) = self.benchmark_frontend(modelname)
self.frontend_result = bench_result["ms/iter"]
bench_result["vs. PyTorch/TF"] = "="
(
bench_result["param_count"],
bench_result["tags"],
bench_result["notes"],
) = self.get_metadata(modelname)
elif e == "shark_python":
bench_result["engine"] = "shark_python"
(
bench_result["iter/sec"],
bench_result["ms/iter"],
) = self.benchmark_python(inputs)
bench_result[
"vs. PyTorch/TF"
] = self.compare_bench_results(
self.frontend_result, bench_result["ms/iter"]
)
elif e == "shark_iree_c":
bench_result["engine"] = "shark_iree_c"
(
bench_result["iter/sec"],
bench_result["ms/iter"],
) = self.benchmark_c()
bench_result[
"vs. PyTorch/TF"
] = self.compare_bench_results(
self.frontend_result, bench_result["ms/iter"]
)
elif e == "onnxruntime":
bench_result["engine"] = "onnxruntime"
(

View File

@@ -18,6 +18,7 @@ import urllib.request
import json
import hashlib
from pathlib import Path
from shark.parser import shark_args
input_type_to_np_dtype = {
"float32": np.float32,
@@ -29,14 +30,29 @@ input_type_to_np_dtype = {
"int8": np.int8,
}
# default hash is updated when nightly populate_sharktank_ci is successful
shark_default_sha = "latest"
# Save the model in the user's local home directory so it needn't be fetched every time in the CI.
home = str(Path.home())
WORKDIR = os.path.join(home, ".local/shark_tank/")
print(WORKDIR)
alt_path = os.path.join(os.path.dirname(__file__), "../gen_shark_tank/")
custom_path = shark_args.local_tank_cache
if os.path.exists(alt_path):
WORKDIR = alt_path
print(
f"Using {WORKDIR} as shark_tank directory. Delete this directory if you aren't working from locally generated shark_tank."
)
if custom_path:
if not os.path.exists(custom_path):
os.mkdir(custom_path)
WORKDIR = custom_path
print(f"Using {WORKDIR} as local shark_tank cache directory.")
else:
WORKDIR = os.path.join(home, ".local/shark_tank/")
print(
f"shark_tank local cache is located at {WORKDIR} . You may change this by setting the --local_tank_cache="
" pytest flag"
)
# Checks whether the directory and files exists.
def check_dir_exists(model_name, frontend="torch", dynamic=""):
@@ -72,7 +88,9 @@ def check_dir_exists(model_name, frontend="torch", dynamic=""):
# Downloads the torch model from gs://shark_tank dir.
def download_torch_model(model_name, dynamic=False):
def download_torch_model(
model_name, dynamic=False, tank_url="gs://shark_tank/latest"
):
model_name = model_name.replace("/", "_")
dyn_str = "_dynamic" if dynamic else ""
os.makedirs(WORKDIR, exist_ok=True)
@@ -80,8 +98,8 @@ def download_torch_model(model_name, dynamic=False):
def gs_download_model():
gs_command = (
'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank/'
+ shark_default_sha
'gsutil -o "GSUtil:parallel_process_count=1" cp -r '
+ tank_url
+ "/"
+ model_dir_name
+ " "
@@ -96,8 +114,8 @@ def download_torch_model(model_name, dynamic=False):
model_dir = os.path.join(WORKDIR, model_dir_name)
local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
gs_hash = (
'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank/'
+ shark_default_sha
'gsutil -o "GSUtil:parallel_process_count=1" cp '
+ tank_url
+ "/"
+ model_dir_name
+ "/hash.npy"
@@ -110,7 +128,12 @@ def download_torch_model(model_name, dynamic=False):
np.load(os.path.join(model_dir, "upstream_hash.npy"))
)
if local_hash != upstream_hash:
gs_download_model()
if shark_args.update_tank == True:
gs_download_model()
else:
print(
"Hash does not match upstream in gs://shark_tank/. If you are using SHARK Downloader with locally generated artifacts, this is working as intended."
)
model_dir = os.path.join(WORKDIR, model_dir_name)
with open(
@@ -128,15 +151,17 @@ def download_torch_model(model_name, dynamic=False):
# Downloads the tflite model from gs://shark_tank dir.
def download_tflite_model(model_name, dynamic=False):
def download_tflite_model(
model_name, dynamic=False, tank_url="gs://shark_tank/latest"
):
dyn_str = "_dynamic" if dynamic else ""
os.makedirs(WORKDIR, exist_ok=True)
model_dir_name = model_name + "_tflite"
def gs_download_model():
gs_command = (
'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank/'
+ shark_default_sha
'gsutil -o "GSUtil:parallel_process_count=1" cp -r '
+ tank_url
+ "/"
+ model_dir_name
+ " "
@@ -153,8 +178,8 @@ def download_tflite_model(model_name, dynamic=False):
model_dir = os.path.join(WORKDIR, model_dir_name)
local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
gs_hash = (
'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank/'
+ shark_default_sha
'gsutil -o "GSUtil:parallel_process_count=1" cp '
+ tank_url
+ "/"
+ model_dir_name
+ "/hash.npy"
@@ -167,7 +192,12 @@ def download_tflite_model(model_name, dynamic=False):
np.load(os.path.join(model_dir, "upstream_hash.npy"))
)
if local_hash != upstream_hash:
gs_download_model()
if shark_args.update_tank == True:
gs_download_model()
else:
print(
"Hash does not match upstream in gs://shark_tank/. If you are using SHARK Downloader with locally generated artifacts, this is working as intended."
)
model_dir = os.path.join(WORKDIR, model_dir_name)
with open(
@@ -184,15 +214,17 @@ def download_tflite_model(model_name, dynamic=False):
return mlir_file, function_name, inputs_tuple, golden_out_tuple
def download_tf_model(model_name):
def download_tf_model(
model_name, tuned=None, tank_url="gs://shark_tank/latest"
):
model_name = model_name.replace("/", "_")
os.makedirs(WORKDIR, exist_ok=True)
model_dir_name = model_name + "_tf"
def gs_download_model():
gs_command = (
'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank/'
+ shark_default_sha
'gsutil -o "GSUtil:parallel_process_count=1" cp -r '
+ tank_url
+ "/"
+ model_dir_name
+ " "
@@ -207,8 +239,8 @@ def download_tf_model(model_name):
model_dir = os.path.join(WORKDIR, model_dir_name)
local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
gs_hash = (
'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank/'
+ shark_default_sha
'gsutil -o "GSUtil:parallel_process_count=1" cp '
+ tank_url
+ "/"
+ model_dir_name
+ "/hash.npy"
@@ -221,10 +253,20 @@ def download_tf_model(model_name):
np.load(os.path.join(model_dir, "upstream_hash.npy"))
)
if local_hash != upstream_hash:
gs_download_model()
if shark_args.update_tank == True:
gs_download_model()
else:
print(
"Hash does not match upstream in gs://shark_tank/. If you are using SHARK Downloader with locally generated artifacts, this is working as intended."
)
model_dir = os.path.join(WORKDIR, model_dir_name)
with open(os.path.join(model_dir, model_name + "_tf.mlir")) as f:
suffix = "_tf.mlir" if tuned is None else "_tf_" + tuned + ".mlir"
filename = os.path.join(model_dir, model_name + suffix)
if not os.path.isfile(filename):
filename = os.path.join(model_dir, model_name + "_tf.mlir")
with open(filename) as f:
mlir_file = f.read()
function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))

View File

@@ -9,7 +9,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from shark.iree_utils.compile_utils import (
export_iree_module_to_vmfb,
load_flatbuffer,
)
import os
from shark.shark_runner import SharkRunner
from shark.parser import shark_args
import numpy as np
@@ -65,7 +71,7 @@ class SharkInference:
):
self.mlir_module = mlir_module
self.function_name = function_name
self.device = device
self.device = shark_args.device if device == "none" else device
self.mlir_dialect = mlir_dialect
self.is_benchmark = is_benchmark
@@ -135,3 +141,31 @@ class SharkInference:
)
)
return tuple(inputs)
# TODO: Instead of passing a directory and having names decided by the module,
# the user may want to save the module with manually chosen names.
def save_module(self, dir=os.getcwd()):
return export_iree_module_to_vmfb(
self.mlir_module,
self.device,
dir,
self.mlir_dialect,
self.function_name,
)
# load and return the module.
def load_module(self, path):
self.shark_runner = SharkRunner(
function_name=self.function_name,
device=self.device,
compile_vmfb=False,
)
(
self.shark_runner.iree_compilation_module,
self.shark_runner.iree_config,
) = load_flatbuffer(
path,
self.device,
self.function_name,
)
return
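# Illustrative sketch (the directory and .vmfb filename below are hypothetical;
# save_module picks the output name, load_module skips recompilation):
#   shark_module.save_module("/tmp/shark_vmfbs")
#   shark_module.load_module("/tmp/shark_vmfbs/<module_name>.vmfb")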

View File

@@ -16,6 +16,7 @@ from shark.iree_utils.compile_utils import (
get_iree_compiled_module,
get_results,
export_iree_module_to_vmfb,
load_flatbuffer,
)
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.parser import shark_args
@@ -60,10 +61,11 @@ class SharkRunner:
def __init__(
self,
mlir_module: str,
mlir_module: str = "none",
function_name: str = "forward",
device: str = "none",
mlir_dialect: str = "linalg",
compile_vmfb: bool = True,
):
self.mlir_module = mlir_module
self.function_name = function_name
@@ -74,16 +76,17 @@ class SharkRunner:
device_driver_info(self.device)
sys.exit(1)
# Compile the module to get the .vmfb.
(
self.iree_compilation_module,
self.iree_config,
) = get_iree_compiled_module(
self.mlir_module,
self.device,
self.mlir_dialect,
func_name=self.function_name,
)
if compile_vmfb == True:
# Compile the module to get the .vmfb.
(
self.iree_compilation_module,
self.iree_config,
) = get_iree_compiled_module(
self.mlir_module,
self.device,
self.mlir_dialect,
func_name=self.function_name,
)
def run(self, inputs: tuple):
return get_results(
@@ -92,10 +95,3 @@ class SharkRunner:
self.iree_config,
self.mlir_dialect,
)
# TODO: Instead of passing directory and having names decided by the module
# , user may want to save the module with manual names.
def save_module(self, dir=os.getcwd()):
return export_iree_module_to_vmfb(
self.model, self.device, dir, self.mlir_dialect
)

View File

@@ -0,0 +1,11 @@
1. Install torchdynamo
- `git clone https://github.com/pytorch/torchdynamo.git`
- `cd torchdynamo`
- `python -m pip install -r requirements.txt`
- `python setup.py develop`
2. Install functorch
- `python -m pip install -v "git+https://github.com/pytorch/pytorch.git@$(python -c "import torch.version; print(torch.version.git_version)")#subdirectory=functorch"`
3. Run examples (or try the minimal sketch below).
- `python shark/examples/shark_dynamo/basic_examples.py`
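
A minimal sketch of driving the SHARK backend directly is shown below. It assumes the utilities in `shark/sharkdynamo/utils.py` are importable as `shark.sharkdynamo.utils`, and the toy function is purely illustrative:

```python
import torch
import torchdynamo

from shark.sharkdynamo.utils import make_shark_compiler


@torchdynamo.optimize(make_shark_compiler(use_tracing=False, device="cpu"))
def toy_fn(x, y):
    # Each FX graph captured by TorchDynamo is lowered through torch-mlir
    # and executed via SHARK/IREE.
    return torch.tanh(x) + y


print(toy_fn(torch.randn(4), torch.randn(4)))
```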

View File

shark/sharkdynamo/utils.py Normal file
View File

@@ -0,0 +1,157 @@
import functools
import time
from typing import List, Optional
import torch
from torch.fx.experimental.proxy_tensor import make_fx
from functorch._src.compile_utils import strip_overloads
from shark.shark_inference import SharkInference
from torch._decomp import get_decompositions
import torch_mlir
# TODO: Control decompositions.
def default_decompositions():
return get_decompositions(
[
torch.ops.aten.embedding_dense_backward,
torch.ops.aten.native_layer_norm_backward,
torch.ops.aten.slice_backward,
torch.ops.aten.select_backward,
torch.ops.aten.norm.ScalarOpt_dim,
torch.ops.aten.native_group_norm,
torch.ops.aten.upsample_bilinear2d.vec,
torch.ops.aten.split.Tensor,
torch.ops.aten.split_with_sizes,
]
)
def timeit(*, append_time_to: Optional[List] = None):
def decorator(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
start_time = time.time_ns()
result = func(*args, **kwargs)
end_time = time.time_ns()
if append_time_to is not None:
append_time_to.append(end_time - start_time)
return result
return wrapper
return decorator
def _returns_nothing(fx_g: torch.fx.GraphModule) -> bool:
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, tuple):
return len(node_arg) == 0
return False
def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule) -> bool:
"""
Replace tuple with tuple element in functions that return one-element tuples.
Returns true if an unwrapping took place, and false otherwise.
"""
unwrapped_tuple = False
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, tuple):
if len(node_arg) == 1:
node.args = (node_arg[0],)
unwrapped_tuple = True
break
if unwrapped_tuple:
fx_g.graph.lint()
fx_g.recompile()
return unwrapped_tuple
def make_shark_compiler(use_tracing: bool, device: str, verbose=False):
def compiler(
fx_graph: torch.fx.GraphModule,
example_inputs: List[torch.Tensor],
):
"""Compile GraphModule using torch-mlir + SHARK."""
if verbose:
print("Compiling graph...")
if _returns_nothing(fx_graph):
return fx_graph
was_unwrapped = _unwrap_single_tuple_return(fx_graph)
fx_graph = make_fx(
fx_graph, decomposition_table=default_decompositions()
)(*example_inputs)
strip_overloads(fx_graph)
if verbose:
print("torch.fx graph:")
print(fx_graph.graph)
ts_compiler = torch.jit.trace if use_tracing else torch.jit.script
ts_graph = ts_compiler(fx_graph, example_inputs)
if verbose:
torch_mlir_module = torch_mlir.compile(
ts_graph,
example_inputs,
output_type=torch_mlir.OutputType.TORCH,
)
print("\n\ntorch-mlir backend contract graph:")
print(torch_mlir_module)
linalg_module = torch_mlir.compile(
ts_graph,
example_inputs,
output_type=torch_mlir.OutputType.LINALG_ON_TENSORS,
)
shark_module = SharkInference(
linalg_module, "forward", mlir_dialect="linalg", device=device
)
shark_module.compile()
def forward(*inputs):
result = shark_module.forward(inputs)
result = tuple() if result is None else result
return (result,) if was_unwrapped else result
return forward
return compiler
def check_results(compiled_results, eager_results):
for compiled_result, eager_result in zip(compiled_results, eager_results):
if not torch.allclose(
compiled_result.to("cpu"), eager_result.to("cpu"), atol=1e-5
):
print("Compiled result does not match eager result")
return
print("Compiled result matches eager result!")
def print_time_stats(times):
times_tensor = torch.tensor(times)
def quantile_ms(q):
return torch.quantile(times_tensor.to(float), q).item() / 1e6
print(f"Median: {quantile_ms(0.5)} ms")
print(f"10%ile: {quantile_ms(0.1)} ms")
print(f"90%ile: {quantile_ms(0.9)} ms")
print(f"Total: {torch.sum(times_tensor) / 1e6} ms")
print()

View File

@@ -0,0 +1,220 @@
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Also available under a BSD-style license. See LICENSE.
import contextlib
import re
import traceback
import warnings
from typing import Any
import numpy as np
import torch
from torch.utils._pytree import tree_map
from torch_mlir.eager_mode.ir_building import build_mlir_module
from torch_mlir.eager_mode.torch_mlir_dispatch import (
UnsupportedByTorchMlirEagerMode,
normalize_args_kwargs,
check_get_aliased_arg,
)
from torch_mlir.eager_mode import EAGER_MODE_DEBUG
from torch_mlir.eager_mode.torch_mlir_tensor import (
TorchMLIRTensor,
check_requires_grad,
make_wrapper_subclass_from_torch_tensor,
make_bare_wrapper_subclass,
UNSUPPORTED_OPS,
no_dispatch,
)
from torch_mlir.eager_mode import torch_mlir_tensor
from shark.iree_eager_backend import EagerModeIREELinalgOnTensorsBackend
backend = EagerModeIREELinalgOnTensorsBackend("cpu")
torch_mlir_tensor.backend = backend
rtol = 1e-04
atol = 1e-05
class TorchMLIRLockstepTensor(TorchMLIRTensor):
"""This class overrides the dispatching for TorchMLIRTensor to allow for an op-by-op numerical comparison between PyTorch and the Torch-MLIR -> IREE backend compilation pipeline. This only supports the IREE backend and focuses on op-by-op level verification.
TODO: Extend this to do a cumulative trace with summary statistics at the end. Possibly requires a wrapper environment to store full trace info.
"""
def __new__(cls, elem, **kwargs):
if kwargs.get("constructing_from_device_tensor", False):
tensor_meta_data = backend.get_torch_metadata(elem, kwargs)
r = make_bare_wrapper_subclass(
cls=cls,
size=tensor_meta_data.size,
strides=tensor_meta_data.strides,
storage_offset=tensor_meta_data.storage_offset,
dtype=tensor_meta_data.dtype,
layout=tensor_meta_data.layout,
device=tensor_meta_data.device,
requires_grad=tensor_meta_data.requires_grad,
)
r.elem = elem
elif isinstance(elem, torch.nn.Parameter):
r = make_wrapper_subclass_from_torch_tensor(
cls, elem.data, **kwargs
)
# This is a hack to handle non-contiguous data through IREE-backend
nt = elem.detach().data.numpy()
if not nt.flags["C_CONTIGUOUS"]:
nt = np.ascontiguousarray(nt, dtype=nt.dtype)
r.elem = backend.transfer_from_torch_to_device(
torch.from_numpy(nt)
)
elif isinstance(elem, torch.Tensor):
r = make_wrapper_subclass_from_torch_tensor(cls, elem, **kwargs)
# Ditto TODO: Find a better way to handle this
nt = elem.numpy()
if not nt.flags["C_CONTIGUOUS"]:
nt = np.ascontiguousarray(nt, dtype=nt.dtype)
r.elem = backend.transfer_from_torch_to_device(
torch.from_numpy(nt)
)
# This branch handles the case when a python scalar is passed to some op
# or is returned from some aten op, such as _local_scalar_dense.
elif isinstance(elem, (int, float, bool)):
return elem
else:
raise ValueError(f"Unknown element type: {type(elem)}")
return r
def __repr__(self):
if self.grad_fn:
return f"TorchMLIRLockstepTensor({self.elem}, backend={backend.__class__.__name__}, grad_fn={self.grad_fn})"
else:
return f"TorchMLIRLockstepTensor({self.elem}, backend={backend.__class__.__name__})"
"""This does essentially the same dispatch as TorchMLIRTensor but operates as if debug mode is enabled. The numeric verification happens after the Torch-MLIR result is obtained by comparing against the
"""
@classmethod
def __torch_dispatch__(cls, func, _types, args=(), kwargs=None):
requires_grad = check_requires_grad(*args, **kwargs)
try:
with no_dispatch():
if hasattr(func, "op_name"):
op_name = func.op_name
elif hasattr(func, "__name__"):
# Handle builtin_function_or_method.
op_name = func.__name__
else:
raise RuntimeError(f"op {func} has no name")
if UNSUPPORTED_OPS.match(op_name):
raise UnsupportedByTorchMlirEagerMode(op_name)
if not hasattr(func, "_schema"):
raise RuntimeError(f"op {func} has no schema.")
normalized_kwargs = normalize_args_kwargs(func, args, kwargs)
if "layout" in normalized_kwargs and normalized_kwargs[
"layout"
] not in {0, None}:
raise UnsupportedByTorchMlirEagerMode(
f"{normalized_kwargs['layout']} layout not supported."
)
if "memory_format" in normalized_kwargs and normalized_kwargs[
"memory_format"
] not in {0, None}:
raise UnsupportedByTorchMlirEagerMode(
f"{normalized_kwargs['memory_format']} memory format not supported."
)
eager_module = build_mlir_module(func, normalized_kwargs)
device_tensor_args = [
kwarg.elem
for _, kwarg in normalized_kwargs.items()
if isinstance(kwarg, cls)
]
assert len(eager_module.body.operations[0].arguments) == len(
device_tensor_args
), "Number of parameters and number of arguments differs."
op_mlir_backend_callable = backend.compile(eager_module)
out = op_mlir_backend_callable(*device_tensor_args)
out = tree_map(
lambda x: cls(
x,
requires_grad=requires_grad,
constructing_from_device_tensor=True,
),
out,
)
# Numeric verification; Value for comparison comes from PyTorch eager
with no_dispatch():
unwrapped_args = tree_map(cls.unwrap, args)
unwrapped_kwargs = tree_map(cls.unwrap, kwargs)
if "_reshape_alias" in op_name:
native_out = torch.ops.aten.view(
unwrapped_args[0], unwrapped_args[1]
)
else:
native_out = func(*unwrapped_args, **unwrapped_kwargs)
native_out = tree_map(
lambda x: cls(x, requires_grad=requires_grad), native_out
).elem
tmp_out = out.elem
try:
np.testing.assert_allclose(
native_out.to_host(),
tmp_out.to_host(),
rtol=rtol,
atol=atol,
)
except Exception as e:
shaped_args = [
arg.shape if torch.is_tensor(arg) else arg
for arg in unwrapped_args
]
shaped_kwargs = [
kwarg.shape if torch.is_tensor(kwarg) else kwarg
for kwarg in unwrapped_kwargs
]
warnings.warn(
f"Lockstep accuracy verification failed with error: *{str(e)}*; "
f"Dispatched function name: *{str(func)}*; "
f"Dispatched function args: *{str(shaped_args)}*; "
f"Dispatched function kwargs: *{str(shaped_kwargs)}*; "
)
except Exception as e:
warnings.warn(traceback.format_exc())
if isinstance(e, UnsupportedByTorchMlirEagerMode):
warnings.warn(
f"Couldn't use TorchMLIR eager because current incompatibility: *{str(e)}*; running through PyTorch eager."
)
else:
warnings.warn(
f"Couldn't use TorchMLIR eager because of error: *{str(e)}*; "
f"Running through PyTorch eager"
)
with no_dispatch():
unwrapped_args = tree_map(cls.unwrap, args)
unwrapped_kwargs = tree_map(cls.unwrap, kwargs)
if "_reshape_alias" in op_name:
out = torch.ops.aten.view(
unwrapped_args[0], unwrapped_args[1]
)
else:
out = func(*unwrapped_args, **unwrapped_kwargs)
out = tree_map(lambda x: cls(x, requires_grad=requires_grad), out)
maybe_aliased_arg_name = check_get_aliased_arg(func)
if maybe_aliased_arg_name is not None:
warnings.warn(
f"Found aliased arg, but didn't copy tensor contents. This could lead to incorrect results for E2E model execution but doesn't affect the validity of the lockstep op verification."
)
# TODO: Find a way to handle argument aliasing for IREE backend
# backend.copy_into(normalized_kwargs[maybe_aliased_arg_name].elem, out.elem)
return out
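if __name__ == "__main__":
    # Minimal usage sketch (illustrative): wrap plain tensors and run an op.
    # The addition below is compiled through torch-mlir/IREE and its result is
    # checked against PyTorch eager; mismatches are reported as warnings.
    a = TorchMLIRLockstepTensor(torch.randn(2, 3))
    b = TorchMLIRLockstepTensor(torch.randn(2, 3))
    c = a + b
    print(c)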

View File

@@ -15,6 +15,8 @@
from torch_mlir.ir import StringAttr
import torch_mlir
from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
import tempfile
from shark.parser import shark_args
def get_module_name_for_asm_dump(module):
@@ -62,6 +64,8 @@ def get_torch_mlir_module(
if jit_trace:
ignore_traced_shapes = True
tempfile.tempdir = shark_args.repro_dir
module = torch_mlir.compile(
module,
input,

View File

@@ -1,101 +0,0 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
from shark.parser import shark_args
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class MiniLMModuleTester:
def __init__(
self,
benchmark=False,
onnx_bench=False,
):
self.benchmark = benchmark
self.onnx_bench = onnx_bench
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"microsoft/MiniLM-L12-H384-uncased"
)
shark_module = SharkInference(
model,
func_name,
device=device,
mlir_dialect="mhlo",
is_benchmark=self.benchmark,
)
if self.benchmark == True:
shark_args.enable_tf32 = True
shark_module.compile()
shark_args.onnx_bench = self.onnx_bench
shark_module.shark_runner.benchmark_all_csv(
(inputs),
"microsoft/MiniLM-L12-H384-uncased",
dynamic,
device,
"tensorflow",
)
shark_args.enable_tf32 = False
rtol = 1e-01
atol = 1e-02
else:
shark_module.compile()
rtol = 1e-02
atol = 1e-03
# TODO: Remove catch once new MiniLM stable
try:
result = shark_module.forward(inputs)[0][1].to_host()
except:
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=rtol, atol=atol)
class MiniLMModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = MiniLMModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
self.module_tester.onnx_bench = pytestconfig.getoption("onnx_bench")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()

View File

@@ -1,114 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.shark_downloader import download_torch_model
from shark.parser import shark_args
import unittest
import numpy as np
import pytest
class MiniLMModuleTester:
def __init__(
self,
benchmark=False,
onnx_bench=False,
):
self.benchmark = benchmark
self.onnx_bench = onnx_bench
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"microsoft/MiniLM-L12-H384-uncased", dynamic
)
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
if self.benchmark == True:
shark_args.enable_tf32 = True
shark_module.compile()
shark_args.onnx_bench = self.onnx_bench
shark_module.shark_runner.benchmark_all_csv(
(input),
"microsoft/MiniLM-L12-H384-uncased",
dynamic,
device,
"torch",
)
shark_args.enable_tf32 = False
rtol = 1e-01
atol = 1e-02
else:
shark_module.compile()
rtol = 1e-02
atol = 1e-03
results = shark_module.forward(input)
assert True == compare_tensors(act_out, results, rtol, atol)
class MiniLMModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = MiniLMModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
self.module_tester.onnx_bench = pytestconfig.getoption("onnx_bench")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,69 +0,0 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class AlbertBaseModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"albert-base-v2"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class AlbertBaseModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = AlbertBaseModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,113 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class AlbertModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"albert-base-v2", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert compare_tensors(act_out, results)
if self.benchmark:
shark_module.shark_runner.benchmark_all_csv(
(input),
"albert-base-v2",
dynamic,
device,
"torch",
)
class AlbertModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = AlbertModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,115 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class AlexnetModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"alexnet", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert compare_tensors(act_out, results)
if self.benchmark:
shark_module.shark_runner.benchmark_all_csv(
(input),
"alexnet",
dynamic,
device,
"torch",
)
class AlexnetModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = AlexnetModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
@pytest.mark.xfail(
reason="Issue known, WIP",
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()

tank/all_models.csv (new file, 34 lines)

@@ -0,0 +1,34 @@
resnet50,mhlo,tf,1e-02,1e-3,default
albert-base-v2,mhlo,tf,1e-02,1e-3,default
roberta-base,mhlo,tf,1e-02,1e-3,default
bert-base-uncased,mhlo,tf,1e-2,1e-3,default
camembert-base,mhlo,tf,1e-2,1e-3,default
dbmdz/convbert-base-turkish-cased,mhlo,tf,1e-2,1e-3,default
distilbert-base-uncased,mhlo,tf,1e-2,1e-3,default
facebook/convnext-tiny-224,mhlo,tf,1e-2,1e-3,tf_vit
funnel-transformer/small,mhlo,tf,1e-2,1e-3,default
google/electra-small-discriminator,mhlo,tf,1e-2,1e-3,default
google/mobilebert-uncased,mhlo,tf,1e-2,1e-3,default
google/vit-base-patch16-224,mhlo,tf,1e-2,1e-3,tf_vit
hf-internal-testing/tiny-random-flaubert,mhlo,tf,1e-2,1e-3,default
microsoft/MiniLM-L12-H384-uncased,mhlo,tf,1e-2,1e-3,tf_hf
microsoft/layoutlm-base-uncased,mhlo,tf,1e-2,1e-3,default
microsoft/mpnet-base,mhlo,tf,1e-2,1e-3,default
albert-base-v2,linalg,torch,1e-2,1e-3,default
alexnet,linalg,torch,1e-2,1e-3,default
bert-base-cased,linalg,torch,1e-2,1e-3,default
bert-base-uncased,linalg,torch,1e-2,1e-3,default
distilbert-base-uncased,linalg,torch,1e-2,1e-3,default
facebook/deit-small-distilled-patch16-224,linalg,torch,1e-2,1e-3,default
google/vit-base-patch16-224,linalg,torch,1e-2,1e-3,default
microsoft/beit-base-patch16-224-pt22k-ft22k,linalg,torch,1e-2,1e-3,default
microsoft/MiniLM-L12-H384-uncased,linalg,torch,1e-2,1e-3,default
microsoft/resnet-50,linalg,torch,1e-2,1e-3,default
google/mobilebert-uncased,linalg,torch,1e-2,1e-3,default
mobilenet_v3_small,linalg,torch,1e-2,1e-3,default
nvidia/mit-b0,linalg,torch,1e-2,1e-3,default
resnet101,linalg,torch,1e-2,1e-3,default
resnet18,linalg,torch,1e-2,1e-3,default
resnet50,linalg,torch,1e-2,1e-3,default
squeezenet1_0,linalg,torch,1e-2,1e-3,default
wide_resnet50_2,linalg,torch,1e-2,1e-3,default
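
Each row in this CSV pairs a model with the MLIR dialect it is imported through, the source frontend, the rtol/atol used for numerical comparison, and a config flavor (default, tf_vit, tf_hf). A minimal sketch of how such a file could drive parametrized tests follows; the column interpretation, the helper name read_test_configs, and the hard-coded CSV path are illustrative assumptions rather than part of this change, and the extra output post-processing needed by the image-classification models is omitted.

import csv
from collections import namedtuple

import numpy as np
import pytest

from shark.shark_downloader import download_tf_model, download_torch_model
from shark.shark_inference import SharkInference

TestConfig = namedtuple(
    "TestConfig", ["model", "dialect", "frontend", "rtol", "atol", "flavor"]
)

def read_test_configs(path="tank/all_models.csv"):
    # One row per model: name, mlir dialect, frontend, rtol, atol, flavor.
    with open(path, newline="") as f:
        return [
            TestConfig(r[0], r[1], r[2], float(r[3]), float(r[4]), r[5])
            for r in csv.reader(f)
        ]

@pytest.mark.parametrize("cfg", read_test_configs(), ids=lambda c: c.model)
def test_model_static_cpu(cfg):
    # Mirrors the per-model test bodies above, but driven by one CSV row.
    if cfg.frontend == "tf":
        model, func_name, inputs, golden_out = download_tf_model(cfg.model)
    else:
        model, func_name, inputs, golden_out = download_torch_model(cfg.model, False)
    shark_module = SharkInference(
        model, func_name, device="cpu", mlir_dialect=cfg.dialect
    )
    shark_module.compile()
    result = shark_module.forward(inputs)
    np.testing.assert_allclose(golden_out, result, rtol=cfg.rtol, atol=cfg.atol)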


@@ -1,117 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.shark_downloader import download_torch_model
import torch
import unittest
import numpy as np
import pytest
class BertBaseCasedModuleTester:
def __init__(
self,
save_mlir=False,
save_vmfb=False,
benchmark=False,
):
self.save_mlir = save_mlir
self.save_vmfb = save_vmfb
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"bert-base-cased", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert compare_tensors(act_out, results)
if self.benchmark:
shark_module.shark_runner.benchmark_all_csv(
(input),
"bert-base-cased",
dynamic,
device,
"torch",
)
class BertBaseCasedModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = BertBaseCasedModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,71 +0,0 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
from shark.parser import shark_args
import unittest
import pytest
import numpy as np
class BertBaseUncasedModuleTester:
def __init__(
self,
benchmark=False,
onnx_bench=False,
):
self.benchmark = benchmark
self.onnx_bench = onnx_bench
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"bert-base-uncased"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class BertBaseUncasedModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = BertBaseUncasedModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
self.module_tester.onnx_bench = pytestconfig.getoption("onnx_bench")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,108 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.shark_downloader import download_torch_model
from shark.parser import shark_args
import torch
import unittest
import numpy as np
import pytest
class BertBaseUncasedModuleTester:
def __init__(
self,
benchmark=False,
onnx_bench=False,
):
self.benchmark = benchmark
self.onnx_bench = onnx_bench
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"bert-base-uncased", dynamic
)
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert compare_tensors(act_out, results)
if self.benchmark:
shark_args.onnx_bench = self.onnx_bench
shark_module.shark_runner.benchmark_all_csv(
(input),
"bert-base-uncased",
dynamic,
device,
"torch",
)
class BertBaseUncasedModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = BertBaseUncasedModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
self.module_tester.onnx_bench = pytestconfig.getoption("onnx_bench")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,24 @@
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_torch_model
mlir_model, func_name, inputs, golden_out = download_torch_model(
"bert-base-uncased_tosa"
)
shark_module = SharkInference(
mlir_model, func_name, device="cpu", mlir_dialect="tosa"
)
shark_module.compile()
result = shark_module.forward(inputs)
print("The obtained result via shark is: ", result)
print("The golden result is:", golden_out)
import numpy as np
result_unsqueeze = np.expand_dims(result, axis=0)
np.testing.assert_allclose(
result_unsqueeze, golden_out, rtol=1e-3, atol=1e-3
)


@@ -1,68 +0,0 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class CamemBertModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"camembert-base"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class CamemBertModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = CamemBertModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,71 +0,0 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class ConvBertModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"dbmdz/convbert-base-turkish-cased"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class ConvBertModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = ConvBertModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
@pytest.mark.xfail(
reason="Issue: https://github.com/iree-org/iree/issues/9971",
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,71 +0,0 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class DistilBertModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"distilbert-base-uncased"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class DistilBertModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = DistilBertModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
@pytest.mark.xfail(reason="shark_tank hash issues -- awaiting triage")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="shark_tank hash issues -- awaiting triage")
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="shark_tank hash issues -- awaiting triage")
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,128 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.parser import shark_args
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class DistilBertModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"distilbert-base-uncased", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert compare_tensors(act_out, results)
if self.benchmark:
shark_module.shark_runner.benchmark_all_csv(
(input),
"distilbert-base-uncased",
dynamic,
device,
"torch",
)
class DistilBertModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = DistilBertModuleTester(self)
self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
@pytest.mark.skip(
reason="Fails to lower in torch-mlir. See https://github.com/nod-ai/SHARK/issues/222"
)
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skip(
reason="Fails to lower in torch-mlir. See https://github.com/nod-ai/SHARK/issues/222"
)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skip(
reason="Fails to lower in torch-mlir. See https://github.com/nod-ai/SHARK/issues/222"
)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skip(reason="DistilBert needs to be uploaded to cloud.")
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skip(reason="DistilBert needs to be uploaded to cloud.")
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skip(reason="DistilBert needs to be uploaded to cloud.")
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
# @pytest.mark.skip(reason="DistilBert needs to be uploaded to cloud.")
# @pytest.mark.skipif(
# check_device_drivers("intel-gpu"),
# reason=device_driver_info("intel-gpu"),
# )
# def test_module_static_intel_gpu(self):
# dynamic = False
# device = "intel-gpu"
# self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,68 +0,0 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class ElectraModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"google/electra-small-discriminator"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class ElectraModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = ElectraModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,81 +0,0 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import unittest
import pytest
import numpy as np
class ConvNextTinyModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"facebook/convnext-tiny-224"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
# result: array([['logits',
# <IREE DeviceArray: shape=[1, 1000], dtype=<class 'numpy.float32'>>]],
# dtype=object)
# post process of img output
ir_device_array = result[0][1]
logits = ir_device_array.astype(ir_device_array.dtype)
logits = np.squeeze(logits, axis=0)
print("logits: ", logits.shape)
print("golden_out: ", golden_out[0].shape)
print(np.allclose(golden_out[0], logits, rtol=1e-02, atol=1e-03))
class ConvNextTinyModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = ConvNextTinyModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
# @pytest.mark.skipif(
# check_device_drivers("intel-gpu"),
# reason=device_driver_info("intel-gpu"),
# )
# def test_module_static_intel_gpu(self):
# dynamic = False
# device = "intel-gpu"
# self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
# dynamic = False
# device = "cpu"
# module_tester = ConvNextTinyModuleTester()
# module_tester.create_and_check_module(dynamic, device)
unittest.main()


@@ -1,74 +0,0 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class FunnelModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"funnel-transformer/small"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class FunnelModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = FunnelModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(
reason="failing in the iree-compiler passes, see https://github.com/nod-ai/SHARK/issues/201"
)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(
reason="failing in the iree-compiler passes, see https://github.com/nod-ai/SHARK/issues/201"
)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
# @pytest.mark.skipif(
# check_device_drivers("intel-gpu"),
# reason=device_driver_info("intel-gpu"),
# )
# def test_module_static_intel_gpu(self):
# dynamic = False
# device = "intel-gpu"
# self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,78 +0,0 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import unittest
import pytest
import numpy as np
class VitBaseModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"google/vit-base-patch16-224"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
# post process of img output
ir_device_array = result[0][1]
logits = ir_device_array.astype(ir_device_array.dtype)
logits = np.squeeze(logits, axis=0)
print("logits: ", logits.shape)
print("golden_out: ", golden_out[0].shape)
print(np.allclose(golden_out[0], logits, rtol=1e-02, atol=1e-03))
class VitBaseModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = VitBaseModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
# @pytest.mark.skipif(
# check_device_drivers("intel-gpu"),
# reason=device_driver_info("intel-gpu"),
# )
# def test_module_static_intel_gpu(self):
# dynamic = False
# device = "intel-gpu"
# self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
dynamic = False
device = "cpu"
module_tester = VitBaseModuleTester()
module_tester.create_and_check_module(dynamic, device)
# unittest.main()


@@ -1,68 +0,0 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class LayoutLMModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"microsoft/layoutlm-base-uncased"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class LayoutLMModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = LayoutLMModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,69 +0,0 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class LongformerModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"allenai/longformer-base-4096"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class LongformerModuleTest(unittest.TestCase):
@pytest.skip(reason="Model can't be imported.", allow_module_level=True)
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = LongformerModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,60 +0,0 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class MobileBertModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"google/mobilebert-uncased"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class MobileBertModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = MobileBertModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,108 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.shark_downloader import download_torch_model
import torch
import unittest
import numpy as np
import pytest
class MobileBertModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"google/mobilebert-uncased", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert compare_tensors(act_out, results)
if self.benchmark:
shark_module.shark_runner.benchmark_all_csv(
(input),
"google/mobilebert-uncased",
dynamic,
device,
"torch",
)
class MobileBertModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = MobileBertModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
@pytest.mark.xfail(
reason="Issue known, WIP",
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -1,114 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class MobileNetV3ModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"mobilenet_v3_small", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
np.testing.assert_allclose(act_out, results, rtol=1e-02, atol=1e-03)
if self.benchmark:
shark_module.shark_runner.benchmark_all_csv(
(input),
"mobilenet_v3_small",
dynamic,
device,
"torch",
)
class MobileNetV3ModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = MobileNetV3ModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="golden results don't match.")
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="golden results don't match.")
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="stuck in the pipeline.")
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()

tank/model_metadata.csv (new file, 29 lines)

@@ -0,0 +1,29 @@
model_name,use_tracing,dynamic,param_count,tags,notes
microsoft/MiniLM-L12-H384-uncased,True,True,66M,"nlp;bert-variant;transformer-encoder","Large version has 12 layers; 384 hidden size; Smaller than BERTbase (66M params vs 109M params)"
albert-base-v2,True,True,11M,"nlp;bert-variant;transformer-encoder","12 layers; 128 embedding dim; 768 hidden dim; 12 attention heads; Smaller than BERTbase (11M params vs 109M params); Uses weight sharing to reduce # params but computational cost is similar to BERT."
bert-base-uncased,True,True,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
bert-base-cased,True,True,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
distilbert-base-uncased,True,True,66M,"nlp;bert-variant;transformer-encoder","Smaller and faster than BERT with 97percent retained accuracy."
google/mobilebert-uncased,True,True,25M,"nlp,bert-variant,transformer-encoder,mobile","24 layers, 512 hidden size, 128 embedding"
alexnet,False,True,61M,"cnn,parallel-layers","The CNN that revolutionized computer vision (move away from hand-crafted features to neural networks),10 years old now and probably no longer used in prod."
resnet18,False,True,11M,"cnn,image-classification,residuals,resnet-variant","1 7x7 conv2d and the rest are 3x3 conv2d"
resnet50,False,True,23M,"cnn,image-classification,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
resnet101,False,True,29M,"cnn,image-classification,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
squeezenet1_0,False,True,1.25M,"cnn,image-classification,mobile,parallel-layers","Parallel conv2d (1x1 conv to compress -> (3x3 expand | 1x1 expand) -> concat)"
wide_resnet50_2,False,True,69M,"cnn,image-classification,residuals,resnet-variant","Resnet variant where model depth is decreased and width is increased."
mobilenet_v3_small,False,True,2.5M,"image-classification,cnn,mobile",N/A
google/vit-base-patch16-224,True,False,86M,"image-classification,vision-transformer,transformer-encoder",N/A
microsoft/resnet-50,True,False,23M,"image-classification,cnn,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
facebook/deit-small-distilled-patch16-224,True,False,22M,"image-classification,vision-transformer,cnn",N/A
microsoft/beit-base-patch16-224-pt22k-ft22k,True,False,86M,"image-classification,transformer-encoder,bert-variant,vision-transformer",N/A
nvidia/mit-b0,True,False,3.7M,"image-classification,transformer-encoder",SegFormer
camembert-base,False,False,-,-,-
dbmdz/convbert-base-turkish-cased,False,False,-,-,-
google/electra-small-discriminator,False,False,-,-,-
hf-internal-testing/tiny-random-flaubert,False,False,-,-,-
funnel-transformer/small,False,False,-,-,-
microsoft/layoutlm-base-uncased,False,False,-,-,-
microsoft/mpnet-base,False,False,-,-,-
roberta-base,False,False,-,-,-
xlm-roberta-base,False,False,-,-,-
facebook/convnext-tiny-224,False,False,-,-,-
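
The metadata file is descriptive rather than test-driving: it records whether a model needs torch tracing, whether a dynamic-shape variant is exercised, its parameter count, and free-form tags and notes. A small sketch of querying it is below; the path and the helper name models_with_tag are illustrative assumptions.

import csv

def models_with_tag(tag, path="tank/model_metadata.csv"):
    # Return model names whose tags column mentions `tag`; tags are
    # ';'- or ','-separated in the CSV, and placeholder rows use "-".
    with open(path, newline="") as f:
        reader = csv.DictReader(f, skipinitialspace=True)
        return [
            row["model_name"]
            for row in reader
            if tag in row["tags"].replace(";", ",").split(",")
        ]

# e.g. models_with_tag("resnet-variant")
#   -> ['resnet18', 'resnet50', 'resnet101', 'wide_resnet50_2', 'microsoft/resnet-50']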


@@ -16,15 +16,77 @@ vision_models = [
"wide_resnet50_2",
"mobilenet_v3_small",
]
hf_img_cls_models = [
"google/vit-base-patch16-224",
"microsoft/resnet-50",
"facebook/deit-small-distilled-patch16-224",
"microsoft/beit-base-patch16-224-pt22k-ft22k",
"nvidia/mit-b0",
]
def get_torch_model(modelname):
if modelname in vision_models:
return get_vision_model(modelname)
elif modelname in hf_img_cls_models:
return get_hf_img_cls_model(modelname)
else:
return get_hf_model(modelname)
##################### Hugging Face Image Classification Models ###################################
from transformers import AutoModelForImageClassification
from transformers import AutoFeatureExtractor
from PIL import Image
import requests
def preprocess_input_image(model_name):
# from datasets import load_dataset
# dataset = load_dataset("huggingface/cats-image")
# image1 = dataset["test"]["image"][0]
# # print("image1: ", image1) # <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480 at 0x7FA0B86BB6D0>
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480 at 0x7FA0B86BB6D0>
image = Image.open(requests.get(url, stream=True).raw)
# feature_extractor = img_models_fe_dict[model_name].from_pretrained(
# model_name
# )
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
inputs = feature_extractor(images=image, return_tensors="pt")
# inputs = {'pixel_values': tensor([[[[ 0.1137..., -0.2000, -0.4275, -0.5294]]]])}
# torch.Size([1, 3, 224, 224]), torch.FloatTensor
return inputs[str(*inputs)]
class HuggingFaceImageClassification(torch.nn.Module):
def __init__(self, hf_model_name):
super().__init__()
self.model = AutoModelForImageClassification.from_pretrained(
hf_model_name, # The pretrained model.
output_attentions=False, # Whether the model returns attention weights.
return_dict=False, # https://github.com/huggingface/transformers/issues/9095
torchscript=True,
)
def forward(self, inputs):
return self.model.forward(inputs)[0]
def get_hf_img_cls_model(name):
model = HuggingFaceImageClassification(name)
# You can use preprocess_input_image to get the test_input, or just pass a random tensor.
test_input = preprocess_input_image(name)
# test_input = torch.FloatTensor(1, 3, 224, 224).uniform_(-1, 1)
print("test_input.shape: ", test_input.shape)
# test_input.shape: torch.Size([1, 3, 224, 224])
actual_out = model(test_input)
print("actual_out.shape ", actual_out.shape)
# actual_out.shape torch.Size([1, 1000])
return model, test_input, actual_out
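A minimal usage sketch of the dispatch above (illustrative only; the import path follows the `tank.model_utils` imports used elsewhere in this diff):
```python
# Illustrative smoke test: "microsoft/resnet-50" is in hf_img_cls_models,
# so get_torch_model routes it through get_hf_img_cls_model.
from tank.model_utils import get_torch_model

model, test_input, golden_out = get_torch_model("microsoft/resnet-50")
print(test_input.shape)   # torch.Size([1, 3, 224, 224])
print(golden_out.shape)   # torch.Size([1, 1000])
```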
##################### Hugging Face LM Models ###################################

View File

@@ -28,24 +28,28 @@ maskedlm_models = [
"albert-base-v2",
"bert-base-uncased",
"camembert-base",
"convbert-base-turkish-cased",
"dbmdz/convbert-base-turkish-cased",
"deberta-base",
"distilbert-base-uncased",
"electra-small-discriminator",
"funnel-transformer",
"layoutlm-base-uncased",
"google/electra-small-discriminator",
"funnel-transformer/small",
"microsoft/layoutlm-base-uncased",
"longformer-base-4096",
"mobilebert-uncased",
"mpnet-base",
"rembert",
"google/mobilebert-uncased",
"microsoft/mpnet-base",
"google/rembert",
"roberta-base",
"tapas-base",
"tiny-random-flaubert",
"hf-internal-testing/tiny-random-flaubert",
"xlm-roberta",
]
tfhf_models = [
"microsoft/MiniLM-L12-H384-uncased",
]
img_models = [
"google/vit-base-patch16-224",
"facebook/convnext-tiny-224",
]
def get_tf_model(name):
@@ -55,8 +59,12 @@ def get_tf_model(name):
return get_causal_lm_model(name)
elif name in tfhf_models:
return get_TFhf_model(name)
else:
elif name in img_models:
return get_causal_image_model(name)
else:
raise Exception(
"TF model not found! Please check that the modelname has been input correctly."
)
##################### Tensorflow Hugging Face LM Models ###################################

View File

@@ -1,71 +0,0 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class MpNetModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"microsoft/mpnet-base"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class MpNetModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = MpNetModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
@pytest.mark.xfail(reason="https://github.com/nod-ai/SHARK/issues/203")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="https://github.com/nod-ai/SHARK/issues/203")
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="https://github.com/nod-ai/SHARK/issues/203")
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
# @pytest.mark.skipif(
# check_device_drivers("intel-gpu"),
# reason=device_driver_info("intel-gpu"),
# )
# def test_module_static_intel_gpu(self):
# dynamic = False
# device = "intel-gpu"
# self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()

View File

@@ -1 +0,0 @@
platform,model,dynamic,device,iter/sec,ms/iter,datetime

View File

@@ -0,0 +1,36 @@
# Bloom model
## Installation
<details>
<summary>Installation (Linux)</summary>
### Activate shark.venv Virtual Environment
```shell
source shark.venv/bin/activate
# Some older pip installs may not be able to handle the recent PyTorch deps
python -m pip install --upgrade pip
```
### Install dependencies
```shell
pip install transformers==4.21.2
```
Use this branch of Torch-MLIR for running the model: https://github.com/vivekkhandelwal1/torch-mlir/tree/bloom-ops
### Run bloom model
```shell
python bloom_model.py
```
The runtime device, model config, and text prompt can be specified with `--device <device string>`, `--config <config string>`, and `--prompt <prompt string>`, respectively.
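For example, to run the default 560M configuration on CPU with a custom prompt (the prompt text here is only a placeholder):
```shell
python bloom_model.py --device "cpu" --config "bloom-560m" --prompt "The weather today is"
```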
To run the complete 176B-parameter Bloom model, run the following command:
```shell
python bloom_model.py --config "bloom"
```

View File

@@ -0,0 +1,122 @@
### Run `pip install transformers==4.21.2` before running this script.
### To run the complete Bloom model, pass "--config bloom" as an argument.
import argparse
import torch
import torch_mlir
from transformers import BloomTokenizerFast, BloomForSequenceClassification
from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions
from shark.shark_inference import SharkInference
p = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
p.add_argument(
"--prompt",
type=str,
default="Hello, my dog is cute",
help="the text prompt to use",
)
p.add_argument("--device", type=str, default="cpu", help="the device to use")
p.add_argument("--seed", type=int, default=0, help="the random seed")
p.add_argument(
"--config",
type=str,
default="bloom-560m",
help="the configuration of model to use",
)
args = p.parse_args()
torch.manual_seed(args.seed)
model_config = "bigscience/" + args.config
tokenizer = BloomTokenizerFast.from_pretrained(model_config)
test_input = tokenizer(args.prompt, return_tensors="pt")["input_ids"]
class HuggingFaceLanguage(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = BloomForSequenceClassification.from_pretrained(
model_config
)
def forward(self, tokens):
return self.model.forward(tokens)[0]
model = HuggingFaceLanguage()
actual_out = model(test_input)
# import numpy as np
# test_input_ny = test_input.detach().numpy()
# input_tuple = (test_input_ny,)
# np.savez('inputs.npz', *input_tuple)
# output_ny = actual_out.detach().numpy()
# output_tuple = (output_ny,)
# np.savez('golden_out.npz', *output_tuple)
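# Trace the model into an FX graph with make_fx, decomposing aten.split ops into simpler primitives.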
fx_g = make_fx(
model,
decomposition_table=get_decompositions(
[
torch.ops.aten.split.Tensor,
torch.ops.aten.split_with_sizes,
]
),
)(test_input)
# # print(fx_g.graph)
fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
fx_g.recompile()
def strip_overloads(gm):
"""
Modifies the target of graph nodes in :attr:`gm` to strip overloads.
Args:
gm(fx.GraphModule): The input Fx graph module to be modified
"""
for node in gm.graph.nodes:
if isinstance(node.target, torch._ops.OpOverload):
node.target = node.target.overloadpacket
gm.recompile()
strip_overloads(fx_g)
ts_g = torch.jit.script(fx_g)
module = torch_mlir.compile(
ts_g,
[test_input],
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=True,
verbose=False,
)
# # module.dump()
mlir_model = module
func_name = "forward"
shark_module = SharkInference(
mlir_model, func_name, device=args.device, mlir_dialect="tm_tensor"
)
shark_module.compile()
def shark_result(x):
x_ny = x.detach().numpy()
inputs = (x_ny,)
result = shark_module.forward(inputs)
return torch.from_numpy(result)
observed_out = shark_result(test_input)
print("Golden result:", actual_out)
print("SHARK result:", observed_out)

View File

@@ -1,13 +1,18 @@
model_name, use_tracing, model_type
microsoft/MiniLM-L12-H384-uncased,True,hf
albert-base-v2,True,hf
bert-base-uncased,True,hf
bert-base-cased,True,hf
google/mobilebert-uncased,True,hf
alexnet,False,vision
resnet18,False,vision
resnet50,False,vision
resnet101,False,vision
squeezenet1_0,False,vision
wide_resnet50_2,False,vision
mobilenet_v3_small,False,vision
model_name, use_tracing, model_type, dynamic, param_count, tags, notes
microsoft/MiniLM-L12-H384-uncased,True,hf,True,66M,"nlp;bert-variant;transformer-encoder","Large version has 12 layers; 384 hidden size; Smaller than BERTbase (66M params vs 109M params)"
albert-base-v2,True,hf,True,11M,"nlp;bert-variant;transformer-encoder","12 layers; 128 embedding dim; 768 hidden dim; 12 attention heads; Smaller than BERTbase (11M params vs 109M params); Uses weight sharing to reduce # params but computational cost is similar to BERT."
bert-base-uncased,True,hf,True,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
bert-base-cased,True,hf,True,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
google/mobilebert-uncased,True,hf,True,25M,"nlp,bert-variant,transformer-encoder,mobile","24 layers, 512 hidden size, 128 embedding"
alexnet,False,vision,True,61M,"cnn,parallel-layers","The CNN that revolutionized computer vision (move away from hand-crafted features to neural networks), 10 years old now and probably no longer used in prod."
resnet18,False,vision,True,11M,"cnn,image-classification,residuals,resnet-variant","1 7x7 conv2d and the rest are 3x3 conv2d"
resnet50,False,vision,True,23M,"cnn,image-classification,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
resnet101,False,vision,True,29M,"cnn,image-classification,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
squeezenet1_0,False,vision,True,1.25M,"cnn,image-classification,mobile,parallel-layers","Parallel conv2d (1x1 conv to compress -> (3x3 expand | 1x1 expand) -> concat)"
wide_resnet50_2,False,vision,True,69M,"cnn,image-classification,residuals,resnet-variant","Resnet variant where model depth is decreased and width is increased."
mobilenet_v3_small,False,vision,True,2.5M,"image-classification,cnn,mobile",N/A
google/vit-base-patch16-224,True,hf_img_cls,False,86M,"image-classification,vision-transformer,transformer-encoder",N/A
microsoft/resnet-50,True,hf_img_cls,False,23M,"image-classification,cnn,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
facebook/deit-small-distilled-patch16-224,True,hf_img_cls,False,22M,"image-classification,vision-transformer,cnn",N/A
microsoft/beit-base-patch16-224-pt22k-ft22k,True,hf_img_cls,False,86M,"image-classification,transformer-encoder,bert-variant,vision-transformer",N/A
nvidia/mit-b0,True,hf_img_cls,False,3.7M,"image-classification,transformer-encoder",SegFormer

View File

@@ -0,0 +1,4 @@
*.png
*.pth
checkpoints/
v-diffusion-pytorch/

View File

@@ -27,7 +27,25 @@ Run the script setup_v_diffusion_pytorch.sh
./v-diffusion-pytorch/cfg_sample.py "New York City, oil on canvas":5 -n 5 -bs 5
```
The runtime device can be specified with `--runtime_device=<device string>`.
### Run the v-diffusion model via torch-mlir
```shell
./cfg_sample.py "New York City, oil on canvas":5 -n 1 -bs 1 --steps 2
```
### Run the model stored in the tank
```shell
./cfg_sample_from_mlir.py "New York City, oil on canvas":5 -n 1 -bs 1 --steps 2
```
Note that the model currently stored in the tank is compiled for a static batch size of 1.
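For example, to run the tank model with the `--runtime_device` flag described above (the Vulkan device string is illustrative; `cpu` is the default):
```shell
./cfg_sample_from_mlir.py "New York City, oil on canvas":5 -n 1 -bs 1 --steps 2 --runtime_device=vulkan
```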
### Run the model with preprocessing elements taken out
To run the model with the preprocessing factored out, copy `cc12m_1.py` over the version shipped in `v-diffusion-pytorch`:
```shell
cp cc12m_1.py v-diffusion-pytorch/diffusion/models
```
Then run:
```shell
./cfg_sample_preprocess.py "New York City, oil on canvas":5 -n 1 -bs 1 --steps 2
```

View File

@@ -0,0 +1,385 @@
from functools import partial
import math
import torch
from torch import nn
from torch.nn import functional as F
class ResidualBlock(nn.Module):
def __init__(self, main, skip=None):
super().__init__()
self.main = nn.Sequential(*main)
self.skip = skip if skip else nn.Identity()
def forward(self, input):
return self.main(input) + self.skip(input)
class ResLinearBlock(ResidualBlock):
def __init__(self, f_in, f_mid, f_out, is_last=False):
skip = None if f_in == f_out else nn.Linear(f_in, f_out, bias=False)
super().__init__(
[
nn.Linear(f_in, f_mid),
nn.ReLU(inplace=True),
nn.Linear(f_mid, f_out),
nn.ReLU(inplace=True) if not is_last else nn.Identity(),
],
skip,
)
class Modulation2d(nn.Module):
def __init__(self, state, feats_in, c_out):
super().__init__()
self.state = state
self.layer = nn.Linear(feats_in, c_out * 2, bias=False)
def forward(self, input):
scales, shifts = self.layer(self.state["cond"]).chunk(2, dim=-1)
return torch.addcmul(
shifts[..., None, None], input, scales[..., None, None] + 1
)
class ResModConvBlock(ResidualBlock):
def __init__(self, state, feats_in, c_in, c_mid, c_out, is_last=False):
skip = None if c_in == c_out else nn.Conv2d(c_in, c_out, 1, bias=False)
super().__init__(
[
nn.Conv2d(c_in, c_mid, 3, padding=1),
nn.GroupNorm(1, c_mid, affine=False),
Modulation2d(state, feats_in, c_mid),
nn.ReLU(inplace=True),
nn.Conv2d(c_mid, c_out, 3, padding=1),
nn.GroupNorm(1, c_out, affine=False)
if not is_last
else nn.Identity(),
Modulation2d(state, feats_in, c_out)
if not is_last
else nn.Identity(),
nn.ReLU(inplace=True) if not is_last else nn.Identity(),
],
skip,
)
class SkipBlock(nn.Module):
def __init__(self, main, skip=None):
super().__init__()
self.main = nn.Sequential(*main)
self.skip = skip if skip else nn.Identity()
def forward(self, input):
return torch.cat([self.main(input), self.skip(input)], dim=1)
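# FourierFeatures projects scalar inputs (e.g. timesteps) through fixed random frequencies and returns concatenated [cos, sin] features; the projection weight is frozen.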
class FourierFeatures(nn.Module):
def __init__(self, in_features, out_features, std=1.0):
super().__init__()
assert out_features % 2 == 0
self.weight = nn.Parameter(
torch.randn([out_features // 2, in_features]) * std
)
self.weight.requires_grad_(False)
# self.register_buffer('weight', torch.randn([out_features // 2, in_features]) * std)
def forward(self, input):
f = 2 * math.pi * input @ self.weight.T
return torch.cat([f.cos(), f.sin()], dim=-1)
class SelfAttention2d(nn.Module):
def __init__(self, c_in, n_head=1, dropout_rate=0.1):
super().__init__()
assert c_in % n_head == 0
self.norm = nn.GroupNorm(1, c_in)
self.n_head = n_head
self.qkv_proj = nn.Conv2d(c_in, c_in * 3, 1)
self.out_proj = nn.Conv2d(c_in, c_in, 1)
self.dropout = (
nn.Identity()
) # nn.Dropout2d(dropout_rate, inplace=True)
def forward(self, input):
n, c, h, w = input.shape
qkv = self.qkv_proj(self.norm(input))
qkv = qkv.view(
[n, self.n_head * 3, c // self.n_head, h * w]
).transpose(2, 3)
q, k, v = qkv.chunk(3, dim=1)
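# Scale q and k by d**-0.25 each so their product carries the usual 1/sqrt(d) attention scaling.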
scale = k.shape[3] ** -0.25
att = ((q * scale) @ (k.transpose(2, 3) * scale)).softmax(3)
y = (att @ v).transpose(2, 3).contiguous().view([n, c, h, w])
return input + self.dropout(self.out_proj(y))
def expand_to_planes(input, shape):
return input[..., None, None].repeat([1, 1, shape[2], shape[3]])
class CC12M1Model(nn.Module):
def __init__(self):
super().__init__()
self.shape = (3, 256, 256)
self.clip_model = "ViT-B/16"
self.min_t = 0.0
self.max_t = 1.0
c = 128 # The base channel count
cs = [c, c * 2, c * 2, c * 4, c * 4, c * 8, c * 8]
self.mapping_timestep_embed = FourierFeatures(1, 128)
self.mapping = nn.Sequential(
ResLinearBlock(512 + 128, 1024, 1024),
ResLinearBlock(1024, 1024, 1024, is_last=True),
)
with torch.no_grad():
for param in self.mapping.parameters():
param *= 0.5**0.5
self.state = {}
conv_block = partial(ResModConvBlock, self.state, 1024)
self.timestep_embed = FourierFeatures(1, 16)
self.down = nn.AvgPool2d(2)
self.up = nn.Upsample(
scale_factor=2, mode="bilinear", align_corners=False
)
self.net = nn.Sequential( # 256x256
conv_block(3 + 16, cs[0], cs[0]),
conv_block(cs[0], cs[0], cs[0]),
conv_block(cs[0], cs[0], cs[0]),
conv_block(cs[0], cs[0], cs[0]),
SkipBlock(
[
self.down, # 128x128
conv_block(cs[0], cs[1], cs[1]),
conv_block(cs[1], cs[1], cs[1]),
conv_block(cs[1], cs[1], cs[1]),
conv_block(cs[1], cs[1], cs[1]),
SkipBlock(
[
self.down, # 64x64
conv_block(cs[1], cs[2], cs[2]),
conv_block(cs[2], cs[2], cs[2]),
conv_block(cs[2], cs[2], cs[2]),
conv_block(cs[2], cs[2], cs[2]),
SkipBlock(
[
self.down, # 32x32
conv_block(cs[2], cs[3], cs[3]),
conv_block(cs[3], cs[3], cs[3]),
conv_block(cs[3], cs[3], cs[3]),
conv_block(cs[3], cs[3], cs[3]),
SkipBlock(
[
self.down, # 16x16
conv_block(cs[3], cs[4], cs[4]),
SelfAttention2d(
cs[4], cs[4] // 64
),
conv_block(cs[4], cs[4], cs[4]),
SelfAttention2d(
cs[4], cs[4] // 64
),
conv_block(cs[4], cs[4], cs[4]),
SelfAttention2d(
cs[4], cs[4] // 64
),
conv_block(cs[4], cs[4], cs[4]),
SelfAttention2d(
cs[4], cs[4] // 64
),
SkipBlock(
[
self.down, # 8x8
conv_block(
cs[4], cs[5], cs[5]
),
SelfAttention2d(
cs[5], cs[5] // 64
),
conv_block(
cs[5], cs[5], cs[5]
),
SelfAttention2d(
cs[5], cs[5] // 64
),
conv_block(
cs[5], cs[5], cs[5]
),
SelfAttention2d(
cs[5], cs[5] // 64
),
conv_block(
cs[5], cs[5], cs[5]
),
SelfAttention2d(
cs[5], cs[5] // 64
),
SkipBlock(
[
self.down, # 4x4
conv_block(
cs[5],
cs[6],
cs[6],
),
SelfAttention2d(
cs[6],
cs[6] // 64,
),
conv_block(
cs[6],
cs[6],
cs[6],
),
SelfAttention2d(
cs[6],
cs[6] // 64,
),
conv_block(
cs[6],
cs[6],
cs[6],
),
SelfAttention2d(
cs[6],
cs[6] // 64,
),
conv_block(
cs[6],
cs[6],
cs[6],
),
SelfAttention2d(
cs[6],
cs[6] // 64,
),
conv_block(
cs[6],
cs[6],
cs[6],
),
SelfAttention2d(
cs[6],
cs[6] // 64,
),
conv_block(
cs[6],
cs[6],
cs[6],
),
SelfAttention2d(
cs[6],
cs[6] // 64,
),
conv_block(
cs[6],
cs[6],
cs[6],
),
SelfAttention2d(
cs[6],
cs[6] // 64,
),
conv_block(
cs[6],
cs[6],
cs[5],
),
SelfAttention2d(
cs[5],
cs[5] // 64,
),
self.up,
]
),
conv_block(
cs[5] * 2, cs[5], cs[5]
),
SelfAttention2d(
cs[5], cs[5] // 64
),
conv_block(
cs[5], cs[5], cs[5]
),
SelfAttention2d(
cs[5], cs[5] // 64
),
conv_block(
cs[5], cs[5], cs[5]
),
SelfAttention2d(
cs[5], cs[5] // 64
),
conv_block(
cs[5], cs[5], cs[4]
),
SelfAttention2d(
cs[4], cs[4] // 64
),
self.up,
]
),
conv_block(
cs[4] * 2, cs[4], cs[4]
),
SelfAttention2d(
cs[4], cs[4] // 64
),
conv_block(cs[4], cs[4], cs[4]),
SelfAttention2d(
cs[4], cs[4] // 64
),
conv_block(cs[4], cs[4], cs[4]),
SelfAttention2d(
cs[4], cs[4] // 64
),
conv_block(cs[4], cs[4], cs[3]),
SelfAttention2d(
cs[3], cs[3] // 64
),
self.up,
]
),
conv_block(cs[3] * 2, cs[3], cs[3]),
conv_block(cs[3], cs[3], cs[3]),
conv_block(cs[3], cs[3], cs[3]),
conv_block(cs[3], cs[3], cs[2]),
self.up,
]
),
conv_block(cs[2] * 2, cs[2], cs[2]),
conv_block(cs[2], cs[2], cs[2]),
conv_block(cs[2], cs[2], cs[2]),
conv_block(cs[2], cs[2], cs[1]),
self.up,
]
),
conv_block(cs[1] * 2, cs[1], cs[1]),
conv_block(cs[1], cs[1], cs[1]),
conv_block(cs[1], cs[1], cs[1]),
conv_block(cs[1], cs[1], cs[0]),
self.up,
]
),
conv_block(cs[0] * 2, cs[0], cs[0]),
conv_block(cs[0], cs[0], cs[0]),
conv_block(cs[0], cs[0], cs[0]),
conv_block(cs[0], cs[0], 3, is_last=True),
)
with torch.no_grad():
for param in self.net.parameters():
param *= 0.5**0.5
def forward(self, input, timestep_embed, selfcond):
self.state["cond"] = selfcond
out = self.net(torch.cat([input, timestep_embed], dim=1))
self.state.clear()
return out

View File

@@ -67,6 +67,12 @@ p.add_argument(
)
p.add_argument("--checkpoint", type=str, help="the checkpoint to use")
p.add_argument("--device", type=str, help="the device to use")
p.add_argument(
"--runtime_device",
type=str,
help="the device to use with SHARK",
default="cpu",
)
p.add_argument(
"--eta",
type=float,
@@ -234,19 +240,31 @@ module = torch_mlir.compile(
mlir_model = module
func_name = "forward"
shark_module = SharkInference(
mlir_model, func_name, device=args.runtime_device, mlir_dialect="linalg"
)
shark_module.compile()
def compiled_cfg_model_fn(x, t):
x_ny = x.detach().numpy()
t_ny = t.detach().numpy()
inputs = (x_ny, t_ny)
shark_module = SharkInference(
mlir_model, func_name, device="gpu", mlir_dialect="linalg"
)
shark_module.compile()
result = shark_module.forward(inputs)
return torch.from_numpy(result)
from typing import Dict
def save_intermediate_images(args: Dict):
x = args["x"]
num_iter = args["i"]
for j, out in enumerate(x):
utils.to_pil_image(out).save(f"out_iter_" + str(num_iter) + ".png")
return
def run(x, steps):
if args.method == "ddpm":
return sampling.sample(compiled_cfg_model_fn, x, steps, 1.0, {})
@@ -255,7 +273,13 @@ def run(x, steps):
if args.method == "prk":
return sampling.prk_sample(compiled_cfg_model_fn, x, steps, {})
if args.method == "plms":
return sampling.plms_sample(compiled_cfg_model_fn, x, steps, {})
return sampling.plms_sample(
compiled_cfg_model_fn,
x,
steps,
{},
callback=save_intermediate_images,
)
if args.method == "pie":
return sampling.pie_sample(compiled_cfg_model_fn, x, steps, {})
if args.method == "plms2":

View File

@@ -0,0 +1,302 @@
#!/usr/bin/env python3
"""Classifier-free guidance sampling from a diffusion model."""
import argparse
from functools import partial
from pathlib import Path
from PIL import Image
import torch
from torch import nn
from torch.nn import functional as F
from torchvision import transforms
from torchvision.transforms import functional as TF
from tqdm import trange
from shark.shark_inference import SharkInference
from shark.torch_mlir_lockstep_tensor import TorchMLIRLockstepTensor
import sys
sys.path.append("v-diffusion-pytorch")
from CLIP import clip
from diffusion import get_model, get_models, sampling, utils
MODULE_DIR = Path(__file__).resolve().parent
def parse_prompt(prompt, default_weight=3.0):
if prompt.startswith("http://") or prompt.startswith("https://"):
vals = prompt.rsplit(":", 2)
vals = [vals[0] + ":" + vals[1], *vals[2:]]
else:
vals = prompt.rsplit(":", 1)
vals = vals + ["", default_weight][len(vals) :]
return vals[0], float(vals[1])
def resize_and_center_crop(image, size):
fac = max(size[0] / image.size[0], size[1] / image.size[1])
image = image.resize(
(int(fac * image.size[0]), int(fac * image.size[1])), Image.LANCZOS
)
return TF.center_crop(image, size[::-1])
# def main():
p = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
p.add_argument(
"prompts", type=str, default=[], nargs="*", help="the text prompts to use"
)
p.add_argument(
"--images",
type=str,
default=[],
nargs="*",
metavar="IMAGE",
help="the image prompts",
)
p.add_argument(
"--batch-size",
"-bs",
type=int,
default=1,
help="the number of images per batch",
)
p.add_argument("--checkpoint", type=str, help="the checkpoint to use")
p.add_argument("--device", type=str, help="the device to use")
p.add_argument(
"--eta",
type=float,
default=0.0,
help="the amount of noise to add during sampling (0-1)",
)
p.add_argument("--init", type=str, help="the init image")
p.add_argument(
"--method",
type=str,
default="plms",
choices=["ddpm", "ddim", "prk", "plms", "pie", "plms2", "iplms"],
help="the sampling method to use",
)
p.add_argument(
"--model",
type=str,
default="cc12m_1_cfg",
choices=["cc12m_1_cfg"],
help="the model to use",
)
p.add_argument(
"-n", type=int, default=1, help="the number of images to sample"
)
p.add_argument("--seed", type=int, default=0, help="the random seed")
p.add_argument("--size", type=int, nargs=2, help="the output image size")
p.add_argument(
"--starting-timestep",
"-st",
type=float,
default=0.9,
help="the timestep to start at (used with init images)",
)
p.add_argument("--steps", type=int, default=50, help="the number of timesteps")
args = p.parse_args()
if args.device:
device = torch.device(args.device)
else:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
model = get_model(args.model)()
_, side_y, side_x = model.shape
if args.size:
side_x, side_y = args.size
checkpoint = args.checkpoint
if not checkpoint:
checkpoint = MODULE_DIR / f"checkpoints/{args.model}.pth"
model.load_state_dict(torch.load(checkpoint, map_location="cpu"))
if device.type == "cuda":
model = model.half()
model = model.to(device).eval().requires_grad_(False)
clip_model_name = (
model.clip_model if hasattr(model, "clip_model") else "ViT-B/16"
)
clip_model = clip.load(clip_model_name, jit=False, device=device)[0]
clip_model.eval().requires_grad_(False)
normalize = transforms.Normalize(
mean=[0.48145466, 0.4578275, 0.40821073],
std=[0.26862954, 0.26130258, 0.27577711],
)
if args.init:
init = Image.open(utils.fetch(args.init)).convert("RGB")
init = resize_and_center_crop(init, (side_x, side_y))
init = (
utils.from_pil_image(init).to(device)[None].repeat([args.n, 1, 1, 1])
)
zero_embed = torch.zeros([1, clip_model.visual.output_dim], device=device)
target_embeds, weights = [zero_embed], []
for prompt in args.prompts:
txt, weight = parse_prompt(prompt)
target_embeds.append(
clip_model.encode_text(clip.tokenize(txt).to(device)).float()
)
weights.append(weight)
for prompt in args.images:
path, weight = parse_prompt(prompt)
img = Image.open(utils.fetch(path)).convert("RGB")
clip_size = clip_model.visual.input_resolution
img = resize_and_center_crop(img, (clip_size, clip_size))
batch = TF.to_tensor(img)[None].to(device)
embed = F.normalize(
clip_model.encode_image(normalize(batch)).float(), dim=-1
)
target_embeds.append(embed)
weights.append(weight)
weights = torch.tensor([1 - sum(weights), *weights], device=device)
torch.manual_seed(args.seed)
def cfg_model_fn(x, t):
n = x.shape[0]
n_conds = len(target_embeds)
x_in = x.repeat([n_conds, 1, 1, 1])
t_in = t.repeat([n_conds])
clip_embed_in = torch.cat([*target_embeds]).repeat([n, 1])
vs = model(x_in, t_in, clip_embed_in).view([n_conds, n, *x.shape[1:]])
v = vs.mul(weights[:, None, None, None, None]).sum(0)
return v
x = torch.randn([args.n, 3, side_y, side_x], device=device)
t = torch.linspace(1, 0, args.steps + 1, device=device)[:-1]
steps = utils.get_spliced_ddpm_cosine_schedule(t)
min_batch_size = min(args.n, args.batch_size)
x_in = x[0:min_batch_size, :, :, :]
ts = x_in.new_ones([x_in.shape[0]])
t_in = t[0] * ts
from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions
import torch_mlir
fx_g = make_fx(
cfg_model_fn,
decomposition_table=get_decompositions(
[
torch.ops.aten.embedding_dense_backward,
torch.ops.aten.native_layer_norm_backward,
torch.ops.aten.slice_backward,
torch.ops.aten.select_backward,
torch.ops.aten.norm.ScalarOpt_dim,
torch.ops.aten.native_group_norm,
torch.ops.aten.upsample_bilinear2d.vec,
torch.ops.aten.split.Tensor,
torch.ops.aten.split_with_sizes,
]
),
)(x_in, t_in)
fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
fx_g.recompile()
def strip_overloads(gm):
"""
Modifies the target of graph nodes in :attr:`gm` to strip overloads.
Args:
gm(fx.GraphModule): The input Fx graph module to be modified
"""
for node in gm.graph.nodes:
if isinstance(node.target, torch._ops.OpOverload):
node.target = node.target.overloadpacket
gm.recompile()
strip_overloads(fx_g)
ts_g = torch.jit.script(fx_g)
# module = torch_mlir.compile(
# ts_g,
# [x_in, t_in],
# torch_mlir.OutputType.LINALG_ON_TENSORS,
# use_tracing=False,
# )
#
# mlir_model = module
# func_name = "forward"
#
# shark_module = SharkInference(
# mlir_model, func_name, device="cuda", mlir_dialect="linalg"
# )
# shark_module.compile()
def compiled_cfg_model_fn(x, t):
x_in_eager = TorchMLIRLockstepTensor(x.clone())
t_in_eager = TorchMLIRLockstepTensor(t.clone())
return ts_g(x_in_eager, t_in_eager)
from typing import Dict
def save_intermediate_images(args: Dict):
x = args["x"]
num_iter = args["i"]
for j, out in enumerate(x):
utils.to_pil_image(out).save(f"out_iter_" + str(num_iter) + ".png")
return
def run(x, steps):
if args.method == "ddpm":
return sampling.sample(compiled_cfg_model_fn, x, steps, 1.0, {})
if args.method == "ddim":
return sampling.sample(compiled_cfg_model_fn, x, steps, args.eta, {})
if args.method == "prk":
return sampling.prk_sample(compiled_cfg_model_fn, x, steps, {})
if args.method == "plms":
return sampling.plms_sample(
compiled_cfg_model_fn,
x,
steps,
{},
callback=save_intermediate_images,
)
if args.method == "pie":
return sampling.pie_sample(compiled_cfg_model_fn, x, steps, {})
if args.method == "plms2":
return sampling.plms2_sample(compiled_cfg_model_fn, x, steps, {})
if args.method == "iplms":
return sampling.iplms_sample(compiled_cfg_model_fn, x, steps, {})
assert False
def run_all(x, t, steps, n, batch_size):
x = torch.randn([n, 3, side_y, side_x], device=device)
t = torch.linspace(1, 0, args.steps + 1, device=device)[:-1]
steps = utils.get_spliced_ddpm_cosine_schedule(t)
if args.init:
steps = steps[steps < args.starting_timestep]
alpha, sigma = utils.t_to_alpha_sigma(steps[0])
x = init * alpha + x * sigma
for i in trange(0, n, batch_size):
cur_batch_size = min(n - i, batch_size)
outs = run(x[i : i + cur_batch_size], steps)
for j, out in enumerate(outs):
utils.to_pil_image(out).save(f"out_{i + j:05}.png")
steps = 1
run_all(x, t, steps, args.n, args.batch_size)

View File

@@ -0,0 +1,260 @@
#!/usr/bin/env python3
"""Classifier-free guidance sampling from a diffusion model."""
import argparse
from functools import partial
from pathlib import Path
from PIL import Image
import torch
from torch import nn
from torch.nn import functional as F
from torchvision import transforms
from torchvision.transforms import functional as TF
from tqdm import trange
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_torch_model
import numpy as np
import sys
sys.path.append("v-diffusion-pytorch")
from CLIP import clip
from diffusion import get_model, get_models, sampling, utils
MODULE_DIR = Path(__file__).resolve().parent
def parse_prompt(prompt, default_weight=3.0):
if prompt.startswith("http://") or prompt.startswith("https://"):
vals = prompt.rsplit(":", 2)
vals = [vals[0] + ":" + vals[1], *vals[2:]]
else:
vals = prompt.rsplit(":", 1)
vals = vals + ["", default_weight][len(vals) :]
return vals[0], float(vals[1])
def resize_and_center_crop(image, size):
fac = max(size[0] / image.size[0], size[1] / image.size[1])
image = image.resize(
(int(fac * image.size[0]), int(fac * image.size[1])), Image.LANCZOS
)
return TF.center_crop(image, size[::-1])
# def main():
p = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
p.add_argument(
"prompts", type=str, default=[], nargs="*", help="the text prompts to use"
)
p.add_argument(
"--images",
type=str,
default=[],
nargs="*",
metavar="IMAGE",
help="the image prompts",
)
p.add_argument(
"--batch-size",
"-bs",
type=int,
default=1,
help="the number of images per batch",
)
p.add_argument("--checkpoint", type=str, help="the checkpoint to use")
p.add_argument("--device", type=str, help="the device to use")
p.add_argument(
"--runtime_device",
type=str,
help="the device to use with SHARK",
default="cpu",
)
p.add_argument(
"--eta",
type=float,
default=0.0,
help="the amount of noise to add during sampling (0-1)",
)
p.add_argument("--init", type=str, help="the init image")
p.add_argument(
"--method",
type=str,
default="plms",
choices=["ddpm", "ddim", "prk", "plms", "pie", "plms2", "iplms"],
help="the sampling method to use",
)
p.add_argument(
"--model",
type=str,
default="cc12m_1_cfg",
choices=["cc12m_1_cfg"],
help="the model to use",
)
p.add_argument(
"-n", type=int, default=1, help="the number of images to sample"
)
p.add_argument("--seed", type=int, default=0, help="the random seed")
p.add_argument("--size", type=int, nargs=2, help="the output image size")
p.add_argument(
"--starting-timestep",
"-st",
type=float,
default=0.9,
help="the timestep to start at (used with init images)",
)
p.add_argument("--steps", type=int, default=50, help="the number of timesteps")
args = p.parse_args()
if args.device:
device = torch.device(args.device)
else:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
model = get_model(args.model)()
_, side_y, side_x = model.shape
if args.size:
side_x, side_y = args.size
checkpoint = args.checkpoint
if not checkpoint:
checkpoint = MODULE_DIR / f"checkpoints/{args.model}.pth"
model.load_state_dict(torch.load(checkpoint, map_location="cpu"))
if device.type == "cuda":
model = model.half()
model = model.to(device).eval().requires_grad_(False)
clip_model_name = (
model.clip_model if hasattr(model, "clip_model") else "ViT-B/16"
)
clip_model = clip.load(clip_model_name, jit=False, device=device)[0]
clip_model.eval().requires_grad_(False)
normalize = transforms.Normalize(
mean=[0.48145466, 0.4578275, 0.40821073],
std=[0.26862954, 0.26130258, 0.27577711],
)
if args.init:
init = Image.open(utils.fetch(args.init)).convert("RGB")
init = resize_and_center_crop(init, (side_x, side_y))
init = (
utils.from_pil_image(init).to(device)[None].repeat([args.n, 1, 1, 1])
)
zero_embed = torch.zeros([1, clip_model.visual.output_dim], device=device)
target_embeds, weights = [zero_embed], []
for prompt in args.prompts:
txt, weight = parse_prompt(prompt)
target_embeds.append(
clip_model.encode_text(clip.tokenize(txt).to(device)).float()
)
weights.append(weight)
for prompt in args.images:
path, weight = parse_prompt(prompt)
img = Image.open(utils.fetch(path)).convert("RGB")
clip_size = clip_model.visual.input_resolution
img = resize_and_center_crop(img, (clip_size, clip_size))
batch = TF.to_tensor(img)[None].to(device)
embed = F.normalize(
clip_model.encode_image(normalize(batch)).float(), dim=-1
)
target_embeds.append(embed)
weights.append(weight)
weights = torch.tensor([1 - sum(weights), *weights], device=device)
torch.manual_seed(args.seed)
def cfg_model_fn(x, t):
n = x.shape[0]
n_conds = len(target_embeds)
x_in = x.repeat([n_conds, 1, 1, 1])
t_in = t.repeat([n_conds])
clip_embed_in = torch.cat([*target_embeds]).repeat([n, 1])
vs = model(x_in, t_in, clip_embed_in).view([n_conds, n, *x.shape[1:]])
v = vs.mul(weights[:, None, None, None, None]).sum(0)
return v
x = torch.randn([args.n, 3, side_y, side_x], device=device)
t = torch.linspace(1, 0, args.steps + 1, device=device)[:-1]
steps = utils.get_spliced_ddpm_cosine_schedule(t)
min_batch_size = min(args.n, args.batch_size)
x_in = x[0:min_batch_size, :, :, :]
ts = x_in.new_ones([x_in.shape[0]])
t_in = t[0] * ts
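# Fetch the pre-imported v-diffusion MLIR (plus sample inputs and a golden output) from the SHARK tank and compile it for the requested runtime device.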
mlir_model, func_name, inputs, golden_out = download_torch_model("v_diffusion")
shark_module = SharkInference(
mlir_model, func_name, device=args.runtime_device, mlir_dialect="linalg"
)
shark_module.compile()
def compiled_cfg_model_fn(x, t):
x_ny = x.detach().numpy()
t_ny = t.detach().numpy()
inputs = (x_ny, t_ny)
result = shark_module.forward(inputs)
return torch.from_numpy(result)
from typing import Dict
def save_intermediate_images(args: Dict):
x = args["x"]
num_iter = args["i"]
for j, out in enumerate(x):
utils.to_pil_image(out).save(f"out_iter_" + str(num_iter) + ".png")
return
def run(x, steps):
if args.method == "ddpm":
return sampling.sample(compiled_cfg_model_fn, x, steps, 1.0, {})
if args.method == "ddim":
return sampling.sample(compiled_cfg_model_fn, x, steps, args.eta, {})
if args.method == "prk":
return sampling.prk_sample(compiled_cfg_model_fn, x, steps, {})
if args.method == "plms":
return sampling.plms_sample(
compiled_cfg_model_fn,
x,
steps,
{},
callback=save_intermediate_images,
)
if args.method == "pie":
return sampling.pie_sample(compiled_cfg_model_fn, x, steps, {})
if args.method == "plms2":
return sampling.plms2_sample(compiled_cfg_model_fn, x, steps, {})
if args.method == "iplms":
return sampling.iplms_sample(compiled_cfg_model_fn, x, steps, {})
assert False
def run_all(x, t, steps, n, batch_size):
x = torch.randn([n, 3, side_y, side_x], device=device)
t = torch.linspace(1, 0, args.steps + 1, device=device)[:-1]
steps = utils.get_spliced_ddpm_cosine_schedule(t)
if args.init:
steps = steps[steps < args.starting_timestep]
alpha, sigma = utils.t_to_alpha_sigma(steps[0])
x = init * alpha + x * sigma
for i in trange(0, n, batch_size):
cur_batch_size = min(n - i, batch_size)
outs = run(x[i : i + cur_batch_size], steps)
for j, out in enumerate(outs):
utils.to_pil_image(out).save(f"out_{i + j:05}.png")
run_all(x, t, steps, args.n, args.batch_size)

View File

@@ -0,0 +1,355 @@
#!/usr/bin/env python3
"""Classifier-free guidance sampling from a diffusion model."""
import argparse
from functools import partial
from pathlib import Path
from PIL import Image
import torch
from torch import nn
from torch.nn import functional as F
from torchvision import transforms
from torchvision.transforms import functional as TF
from tqdm import trange
import numpy as np
from shark.shark_inference import SharkInference
import sys
sys.path.append("v-diffusion-pytorch")
from CLIP import clip
from diffusion import get_model, get_models, sampling, utils
from torch.nn import functional as F
MODULE_DIR = Path(__file__).resolve().parent
def parse_prompt(prompt, default_weight=3.0):
if prompt.startswith("http://") or prompt.startswith("https://"):
vals = prompt.rsplit(":", 2)
vals = [vals[0] + ":" + vals[1], *vals[2:]]
else:
vals = prompt.rsplit(":", 1)
vals = vals + ["", default_weight][len(vals) :]
return vals[0], float(vals[1])
def resize_and_center_crop(image, size):
fac = max(size[0] / image.size[0], size[1] / image.size[1])
image = image.resize(
(int(fac * image.size[0]), int(fac * image.size[1])), Image.LANCZOS
)
return TF.center_crop(image, size[::-1])
# def main():
p = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
p.add_argument(
"prompts", type=str, default=[], nargs="*", help="the text prompts to use"
)
p.add_argument(
"--images",
type=str,
default=[],
nargs="*",
metavar="IMAGE",
help="the image prompts",
)
p.add_argument(
"--batch-size",
"-bs",
type=int,
default=1,
help="the number of images per batch",
)
p.add_argument("--checkpoint", type=str, help="the checkpoint to use")
p.add_argument("--device", type=str, help="the device to use")
p.add_argument(
"--runtime_device",
type=str,
help="the device to use with SHARK",
default="intel-gpu",
)
p.add_argument(
"--eta",
type=float,
default=0.0,
help="the amount of noise to add during sampling (0-1)",
)
p.add_argument("--init", type=str, help="the init image")
p.add_argument(
"--method",
type=str,
default="plms",
choices=["ddpm", "ddim", "prk", "plms", "pie", "plms2", "iplms"],
help="the sampling method to use",
)
p.add_argument(
"--model",
type=str,
default="cc12m_1_cfg",
choices=["cc12m_1_cfg"],
help="the model to use",
)
p.add_argument(
"-n", type=int, default=1, help="the number of images to sample"
)
p.add_argument("--seed", type=int, default=0, help="the random seed")
p.add_argument("--size", type=int, nargs=2, help="the output image size")
p.add_argument(
"--starting-timestep",
"-st",
type=float,
default=0.9,
help="the timestep to start at (used with init images)",
)
p.add_argument("--steps", type=int, default=50, help="the number of timesteps")
args = p.parse_args()
if args.device:
device = torch.device(args.device)
else:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
model = get_model(args.model)()
_, side_y, side_x = model.shape
if args.size:
side_x, side_y = args.size
checkpoint = args.checkpoint
if not checkpoint:
checkpoint = MODULE_DIR / f"checkpoints/{args.model}.pth"
model.load_state_dict(torch.load(checkpoint, map_location="cpu"))
if device.type == "cuda":
model = model.half()
model = model.to(device).eval().requires_grad_(False)
clip_model_name = (
model.clip_model if hasattr(model, "clip_model") else "ViT-B/16"
)
clip_model = clip.load(clip_model_name, jit=False, device=device)[0]
clip_model.eval().requires_grad_(False)
normalize = transforms.Normalize(
mean=[0.48145466, 0.4578275, 0.40821073],
std=[0.26862954, 0.26130258, 0.27577711],
)
if args.init:
init = Image.open(utils.fetch(args.init)).convert("RGB")
init = resize_and_center_crop(init, (side_x, side_y))
init = (
utils.from_pil_image(init).to(device)[None].repeat([args.n, 1, 1, 1])
)
zero_embed = torch.zeros([1, clip_model.visual.output_dim], device=device)
target_embeds, weights = [zero_embed], []
for prompt in args.prompts:
txt, weight = parse_prompt(prompt)
target_embeds.append(
clip_model.encode_text(clip.tokenize(txt).to(device)).float()
)
weights.append(weight)
for prompt in args.images:
path, weight = parse_prompt(prompt)
img = Image.open(utils.fetch(path)).convert("RGB")
clip_size = clip_model.visual.input_resolution
img = resize_and_center_crop(img, (clip_size, clip_size))
batch = TF.to_tensor(img)[None].to(device)
embed = F.normalize(
clip_model.encode_image(normalize(batch)).float(), dim=-1
)
target_embeds.append(embed)
weights.append(weight)
weights = torch.tensor([1 - sum(weights), *weights], device=device)
torch.manual_seed(args.seed)
def cfg_model_fn(x, timestep_embed, selfcond):
vs = model(x, timestep_embed, selfcond)
return vs
def expand_to_planes(input, shape):
return input[..., None, None].repeat([1, 1, shape[2], shape[3]])
x = torch.randn([args.n, 3, side_y, side_x], device=device)
t = torch.linspace(1, 0, args.steps + 1, device=device)[:-1]
steps = utils.get_spliced_ddpm_cosine_schedule(t)
min_batch_size = min(args.n, args.batch_size)
x_in = x[0:min_batch_size, :, :, :]
ts = x_in.new_ones([x_in.shape[0]])
t_in = t[0] * ts
n_conds = len(target_embeds)
x_in = x.repeat([n_conds, 1, 1, 1])
t_in = t.repeat([n_conds])
clip_embed_in = torch.cat([*target_embeds]).repeat([args.n, 1])
x_in = torch.randn(2, 3, 256, 256)
t_in = torch.randn(2)
clip_embed_in = torch.randn(2, 512)
clip_embed = (
F.normalize(clip_embed_in, dim=-1) * clip_embed_in.shape[-1] ** 0.5
)
mapping_timestep_embed = model.mapping_timestep_embed(t_in[:, None])
selfcond = model.mapping(
torch.cat([clip_embed, mapping_timestep_embed], dim=1)
)
timestep_embed = expand_to_planes(
model.timestep_embed(t_in[:, None]), x_in.shape
)
# x_in = torch.randn(2, 3, 256, 256)
# selfcond = torch.randn(2, 1024)
# timestep_embed = torch.randn(2, 512)
from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions
import torch_mlir
fx_g = make_fx(
cfg_model_fn,
decomposition_table=get_decompositions(
[
torch.ops.aten.embedding_dense_backward,
torch.ops.aten.native_layer_norm_backward,
torch.ops.aten.slice_backward,
torch.ops.aten.select_backward,
torch.ops.aten.norm.ScalarOpt_dim,
torch.ops.aten.native_group_norm,
torch.ops.aten.upsample_bilinear2d.vec,
torch.ops.aten.split.Tensor,
torch.ops.aten.split_with_sizes,
]
),
)(x_in, timestep_embed, selfcond)
fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
fx_g.recompile()
def strip_overloads(gm):
"""
Modifies the target of graph nodes in :attr:`gm` to strip overloads.
Args:
gm(fx.GraphModule): The input Fx graph module to be modified
"""
for node in gm.graph.nodes:
if isinstance(node.target, torch._ops.OpOverload):
node.target = node.target.overloadpacket
gm.recompile()
strip_overloads(fx_g)
ts_g = torch.jit.script(fx_g)
module = torch_mlir.compile(
ts_g,
[x_in, timestep_embed, selfcond],
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
)
mlir_model = module
func_name = "forward"
shark_module = SharkInference(
mlir_model, func_name, device=args.runtime_device, mlir_dialect="linalg"
)
shark_module.compile()
def compiled_cfg_model_fn(x, t):
# Preprocessing previously found in cfg_model_fn
n = x.shape[0]
n_conds = len(target_embeds)
x_in = x.repeat([n_conds, 1, 1, 1])
t_in = t.repeat([n_conds])
clip_embed_in = torch.cat([*target_embeds]).repeat([n, 1])
# Initial setup found in base v-diffusion
clip_embed = (
F.normalize(clip_embed_in, dim=-1) * clip_embed_in.shape[-1] ** 0.5
)
mapping_timestep_embed = model.mapping_timestep_embed(t_in[:, None])
selfcond = model.mapping(
torch.cat([clip_embed, mapping_timestep_embed], dim=1)
)
timestep_embed = expand_to_planes(
model.timestep_embed(t_in[:, None]), x_in.shape
)
x_ny = x_in.detach().numpy()
timestep_embed_ny = timestep_embed.detach().numpy()
selfcond_ny = selfcond.detach().numpy()
inputs = (x_ny, timestep_embed_ny, selfcond_ny)
result = shark_module.forward(inputs)
vs = torch.from_numpy(result).view([n_conds, n, *x.shape[1:]])
v = vs.mul(weights[:, None, None, None, None]).sum(0)
return v
from typing import Dict
def save_intermediate_images(args: Dict):
x = args["x"]
num_iter = args["i"]
for j, out in enumerate(x):
utils.to_pil_image(out).save(f"out_iter_" + str(num_iter) + ".png")
return
def run(x, steps):
if args.method == "ddpm":
return sampling.sample(compiled_cfg_model_fn, x, steps, 1.0, {})
if args.method == "ddim":
return sampling.sample(compiled_cfg_model_fn, x, steps, args.eta, {})
if args.method == "prk":
return sampling.prk_sample(compiled_cfg_model_fn, x, steps, {})
if args.method == "plms":
return sampling.plms_sample(
compiled_cfg_model_fn,
x,
steps,
{},
callback=save_intermediate_images,
)
if args.method == "pie":
return sampling.pie_sample(compiled_cfg_model_fn, x, steps, {})
if args.method == "plms2":
return sampling.plms2_sample(compiled_cfg_model_fn, x, steps, {})
if args.method == "iplms":
return sampling.iplms_sample(compiled_cfg_model_fn, x, steps, {})
assert False
def run_all(x, t, steps, n, batch_size):
x = torch.randn([n, 3, side_y, side_x], device=device)
t = torch.linspace(1, 0, args.steps + 1, device=device)[:-1]
steps = utils.get_spliced_ddpm_cosine_schedule(t)
if args.init:
steps = steps[steps < args.starting_timestep]
alpha, sigma = utils.t_to_alpha_sigma(steps[0])
x = init * alpha + x * sigma
for i in trange(0, n, batch_size):
cur_batch_size = min(n - i, batch_size)
outs = run(x[i : i + cur_batch_size], steps)
for j, out in enumerate(outs):
utils.to_pil_image(out).save(f"out_{i + j:05}.png")
run_all(x, t, steps, args.n, args.batch_size)

View File

@@ -1,3 +1,5 @@
#!/bin/bash
TD="$(cd $(dirname $0) && pwd)"
if [ -z "$PYTHON" ]; then
PYTHON="$(which python3)"
@@ -20,3 +22,5 @@ pip install -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html -
mkdir checkpoints
wget https://the-eye.eu/public/AI/models/v-diffusion/cc12m_1_cfg.pth -P checkpoints/
cp -r checkpoints/ v-diffusion-pytorch/

View File

@@ -1,114 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class Resnet101ModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"resnet101", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert True == compare_tensors(act_out, results)
if self.benchmark == True:
shark_module.shark_runner.benchmark_all_csv(
(input),
"resnet101",
dynamic,
device,
"torch",
)
class Resnet101ModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = Resnet101ModuleTester(self)
self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()

View File

@@ -1,112 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import get_vision_model, compare_tensors
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class Resnet18ModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"resnet18", dynamic
)
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert True == compare_tensors(act_out, results)
if self.benchmark == True:
shark_module.shark_runner.benchmark_all_csv(
(input),
"resnet18",
dynamic,
device,
"torch",
)
class Resnet18ModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = Resnet18ModuleTester(self)
self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()

View File

@@ -1,81 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_downloader import download_tf_model
from shark.parser import shark_args
import unittest
import numpy as np
import pytest
import numpy as np
class Resnet50ModuleTester:
def __init__(
self,
benchmark=False,
onnx_bench=False,
):
self.benchmark = benchmark
self.onnx_bench = onnx_bench
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model("resnet50")
shark_module = SharkInference(
model,
func_name,
device=device,
mlir_dialect="mhlo",
is_benchmark=self.benchmark,
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
if self.benchmark == True:
shark_args.enable_tf32 = True
shark_args.onnx_bench = self.onnx_bench
shark_module.shark_runner.benchmark_all_csv(
(inputs), "resnet50", dynamic, device, "tensorflow"
)
class Resnet50ModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = Resnet50ModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
self.module_tester.onnx_bench = pytestconfig.getoption("onnx_bench")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()

Some files were not shown because too many files have changed in this diff.