WIP: Intel-GPU integration

2026-01-11 14:58:11 -05:00 · 2022-08-10 11:24:00 -07:00
228 changed files with 3757 additions and 23002 deletions
--- a/.github/workflows/gh-pages-releases.yml
+++ b/.github/workflows/gh-pages-releases.yml
@@ -1,37 +0,0 @@
-# See: https://github.com/llvm/torch-mlir/issues/1374
-name: Publish releases page
-
-on:
-  workflow_dispatch:
-
-jobs:
-  scrape_and_publish_releases:
-    name: "Scrape and publish releases"
-    runs-on: ubuntu-latest
-
-    # Don't run this in everyone's forks.
-    if: github.repository == 'nod-ai/SHARK'
-
-    steps:
-      - name: Checking out repository
-        uses: actions/checkout@v2
-        with:
-          token: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      - name: Run scrape releases script
-        run: python ./build_tools/scrape_releases.py nod-ai SHARK > /tmp/index.html
-        shell: bash
-      - run: git fetch --all
-      - run: git switch github-pages
-      - run: git config --global user.email "none@none.com"
-      - run: git config --global user.name "nod-ai"
-      - run: mv /tmp/index.html package-index/index.html
-      - run: git add package-index/index.html
-
-      # Only try to make a commit if the file has changed.
-      - run: git diff --cached --exit-code || git commit -m "Update releases."
-
-      - name: GitHub Push
-        uses: ad-m/github-push-action@v0.6.0
-        with:
-          github_token: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-          branch: github-pages
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -9,87 +9,13 @@ on:
  workflow_dispatch:

 jobs:
-  windows-build:
-    runs-on: windows-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.10"]
-
-    steps:
-    - uses: actions/checkout@v3
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
-      with:
-        python-version: ${{ matrix.python-version }}
-
-    - name: Compute version
-      shell: powershell
-      run: |
-        $package_version = $(Get-Date -UFormat "%Y%m%d")+"."+${{ github.run_number }}
-        $package_version_ = $(Get-Date -UFormat "%Y%m%d")+"_"+${{ github.run_number }}
-        $tag_name=$package_version
-        echo "package_version=$package_version" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
-        echo "package_version_=$package_version_" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
-        echo "tag_name=$tag_name" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
-
-    - name: Create Release
-      id: create_release
-      uses: actions/create-release@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      with:
-        tag_name: ${{ env.tag_name }}
-        release_name: nod.ai SHARK ${{ env.tag_name }}
-        body: |
-          Automatic snapshot release of nod.ai SHARK.
-        draft: true
-        prerelease: false
-
-    - name: Build Package 
-      shell: powershell
-      run: |
-        ./setup_venv.ps1
-        pyinstaller web/shark_sd.spec
-        mv ./dist/shark_sd.exe ./dist/shark_sd_${{ env.package_version_ }}.exe
-
-        
-    # GHA windows VM OOMs so disable for now
-    #- name: Build and validate the SHARK Runtime package
-    #  shell: powershell
-    #  run: |
-    #    $env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
-    #    pip wheel -v -w dist . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
-
-    - uses: actions/upload-artifact@v2
-      with:
-        path: dist/*
-    
-    - name: Upload Release Assets
-      id: upload-release-assets
-      uses: dwenegar/upload-release-assets@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      with:
-        release_id: ${{ steps.create_release.outputs.id }}
-        assets_path: ./dist/*
-
-    - name: Publish Release
-      id: publish_release
-      uses: eregon/publish-release@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      with:
-        release_id: ${{ steps.create_release.outputs.id }}
-
-  linux-build:
+  build:

    runs-on: a100
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.10"]
-        backend: [IREE, SHARK]

    steps:
    - uses: actions/checkout@v3
@@ -105,55 +31,63 @@ jobs:
        key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
        restore-keys: |
          ${{ runner.os }}-pip-
-
+    
+    - name: Compute version
+      run: |
+        package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
+        tag_name="${package_version}"
+        echo "package_version=${package_version}" >> $GITHUB_ENV
+        echo "tag_name=${tag_name}" >> $GITHUB_ENV    
+    - name: Create Release
+      id: create_release
+      uses: actions/create-release@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
+      with:
+        tag_name: ${{ env.tag_name }}
+        release_name: nod.ai SHARK ${{ env.tag_name }}
+        body: |
+          Automatic snapshot release of nod.ai SHARK.
+        draft: true
+        prerelease: false        
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install flake8 pytest toml
-        if [ -f requirements.txt ]; then pip install -r requirements.txt -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html; fi
+        if [ -f requirements.txt ]; then pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/nightly/cpu  -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/SHARK-Runtime/releases; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude shark.venv,lit.cfg.py 
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude shark.venv,lit.cfg.py 
-    - name: Build and validate the IREE package
-      if: ${{ matrix.backend == 'IREE' }}
-      continue-on-error: true
-      run: |
-        cd $GITHUB_WORKSPACE
-        USE_IREE=1 VENV_DIR=iree.venv NIGHTLY=1 ./setup_venv.sh
-        source iree.venv/bin/activate
-        package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
-        SHARK_PACKAGE_VERSION=${package_version} \
-        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://iree-org.github.io/iree/pip-release-links.html
-        # Install the built wheel
-        pip install ./wheelhouse/nodai*
-        # Validate the Models
-        /bin/bash "$GITHUB_WORKSPACE/build_tools/populate_sharktank_ci.sh"
-        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" tank/test_models.py |
-          tail -n 1 |
-          tee -a pytest_results.txt
-        if !(grep -Fxq " failed" pytest_results.txt) 
-          then 
-            export SHA=$(git log -1 --format='%h')
-            gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/${DATE}_$SHA
-            gsutil -m cp -r gs://shark_tank/${DATE}_$SHA/* gs://shark_tank/latest/
-        fi
-        rm -rf ./wheelhouse/nodai*

-    - name: Build and validate the SHARK Runtime package
-      if: ${{ matrix.backend == 'SHARK' }}
+    - name: Build and validate the package
      run: |
        cd $GITHUB_WORKSPACE
-        NIGHTLY=1 ./setup_venv.sh
+        ./setup_venv.sh
        source shark.venv/bin/activate
        package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
        SHARK_PACKAGE_VERSION=${package_version} \
-        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
+        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/SHARK-Runtime/releases
        # Install the built wheel
        pip install ./wheelhouse/nodai*
        # Validate the Models
-        pytest --ci --ci_sha=${SHORT_SHA} tank/test_models.py |
-          tail -n 1 |
-          tee -a pytest_results.txt
+        pytest -k 'not benchmark' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/
+    
+    - name: Upload Release Assets
+      id: upload-release-assets
+      uses: dwenegar/upload-release-assets@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
+      with:
+        release_id: ${{ steps.create_release.outputs.id }}
+        assets_path: ./wheelhouse/nodai_*.whl
+
+    - name: Publish Release
+      id: publish_release
+      uses: eregon/publish-release@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
+      with:
+        release_id: ${{ steps.create_release.outputs.id }}
--- a/.github/workflows/test-models.yml
+++ b/.github/workflows/test-models.yml
@@ -10,21 +10,13 @@ on:
    branches: [ main ]
  workflow_dispatch:

-# Ensure that only a single job or workflow using the same
-# concurrency group will run at a time. This would cancel
-# any in-progress jobs in the same github workflow and github
-# ref (e.g. refs/heads/main or refs/pull/<pr_number>/merge).
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
 jobs:
  build-validate:
    strategy:
      fail-fast: true
      matrix:
-        os: [icelake, a100, MacStudio, ubuntu-latest]
-        suite: [cpu,cuda,vulkan]
+        os: [a100, MacStudio, ubuntu-latest]
+        suite: [cpu,gpu,vulkan]
        python-version: ["3.10"]
        include:
          - os: ubuntu-latest
@@ -33,38 +25,27 @@ jobs:
          - os: ubuntu-latest
            suite: vulkan
          - os: ubuntu-latest
-            suite: cuda
+            suite: gpu
          - os: ubuntu-latest
            suite: cpu
          - os: MacStudio
-            suite: cuda
+            suite: gpu
          - os: MacStudio
            suite: cpu
-          - os: icelake
-            suite: vulkan
-          - os: icelake
-            suite: cuda
-          - os: a100
-            suite: cpu

    runs-on: ${{ matrix.os }}

    steps:
    - uses: actions/checkout@v3
    
-    - name: Set Environment Variables
-      run: |
-        echo "SHORT_SHA=`git rev-parse --short=4 HEAD`" >> $GITHUB_ENV
-        echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
-        
    - name: Set up Python Version File ${{ matrix.python-version }}
-      if: matrix.os == 'a100' ||  matrix.os == 'ubuntu-latest' ||  matrix.os == 'icelake'
+      if: matrix.os == 'a100' ||  matrix.os == 'ubuntu-latest'
      run: |
        # See https://github.com/actions/setup-python/issues/433
        echo ${{ matrix.python-version }} >> $GITHUB_WORKSPACE/.python-version
    
    - name: Set up Python ${{ matrix.python-version }}
-      if: matrix.os == 'a100' ||  matrix.os == 'ubuntu-latest' ||  matrix.os == 'icelake'
+      if: matrix.os == 'a100' ||  matrix.os == 'ubuntu-latest'
      uses: actions/setup-python@v4
      with:
        python-version: '${{ matrix.python-version }}'
@@ -90,45 +71,26 @@ jobs:
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude lit.cfg.py

-    - name: Validate Models on CPU
+    - name: Validate CPU Models
      if: matrix.suite == 'cpu'
-      run: |
-        cd $GITHUB_WORKSPACE
-        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
-        source shark.venv/bin/activate
-        pytest --benchmark --ci --ci_sha=${SHORT_SHA} -s --local_tank_cache="/data/anush/shark_cache" tank/test_models.py -k cpu --update_tank
-        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
-        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv
-
-    - name: Validate Models on NVIDIA GPU
-      if: matrix.suite == 'cuda'
-      run: |
-        cd $GITHUB_WORKSPACE
-        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
-        source shark.venv/bin/activate
-        pytest --benchmark --ci --ci_sha=${SHORT_SHA} -s --local_tank_cache="/data/anush/shark_cache" tank/test_models.py -k cuda --update_tank
-        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
-        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
-
-    - name: Validate Vulkan Models (MacOS)
-      if: matrix.suite == 'vulkan' && matrix.os == 'MacStudio'
-      run: |
-        cd $GITHUB_WORKSPACE
-        PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
-        source shark.venv/bin/activate
-        echo "VULKAN SDK PATH wo setup: $VULKAN_SDK"
-        cd /Users/anush/VulkanSDK/1.3.224.1/
-        source setup-env.sh
-        cd $GITHUB_WORKSPACE
-        echo "VULKAN SDK PATH with setup: $VULKAN_SDK"
-        echo $PATH
-        pip list | grep -E "torch|iree"
-        pytest -s --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" tank/test_models.py -k vulkan --update_tank
-
-    - name: Validate Vulkan Models (a100)
-      if: matrix.suite == 'vulkan' && matrix.os != 'MacStudio'
      run: |
        cd $GITHUB_WORKSPACE
        PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --benchmark --ci --ci_sha=${SHORT_SHA} -s --local_tank_cache="/data/anush/shark_cache" tank/test_models.py -k vulkan --update_tank
+        pytest -k 'cpu' --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py 
+
+    - name: Validate GPU Models
+      if: matrix.suite == 'gpu'
+      run: |
+        cd $GITHUB_WORKSPACE
+        PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
+        source shark.venv/bin/activate
+        pytest -k "gpu" --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py 
+
+    - name: Validate Vulkan Models
+      if: matrix.suite == 'vulkan'
+      run: |
+        cd $GITHUB_WORKSPACE
+        PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
+        source shark.venv/bin/activate
+        pytest -k 'vulkan' --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py
--- a/.gitignore
+++ b/.gitignore
@@ -31,6 +31,7 @@ MANIFEST
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
+*.spec

 # Installer logs
 pip-log.txt
@@ -162,14 +163,7 @@ cython_debug/
 # Shark related artefacts
 *venv/
 shark_tmp/
-*.vmfb
-.use-iree
-tank/dict_configs.py

 # ORT related artefacts
 cache_models/
 onnx_models/
-
-#web logging
-web/logs/
-web/stored_results/stable_diffusion/
--- a/218
+++ b/218
@@ -1,218 +0,0 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-    1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-    2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-    3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-    4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-    5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-    6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-    7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-    8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-    9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-    END OF TERMS AND CONDITIONS
-
-    APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-    Copyright [yyyy] [name of copyright owner]
-
-    Licensed under the Apache License, Version 2.0 (the "License");
-    you may not use this file except in compliance with the License.
-    You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software
-    distributed under the License is distributed on an "AS IS" BASIS,
-    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-    See the License for the specific language governing permissions and
-    limitations under the License.
-
-
---- LLVM Exceptions to the Apache 2.0 License ----
-
-As an exception, if, as a result of your compiling your source code, portions
-of this Software are embedded into an Object form of such source code, you
-may redistribute such embedded portions in such Object form without complying
-with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
-
-In addition, if you combine or link compiled forms of this Software with
-software that is licensed under the GPLv2 ("Combined Software") and if a
-court of competent jurisdiction determines that the patent provision (Section
-3), the indemnity provision (Section 9) or other Section of the License
-conflicts with the conditions of the GPLv2, you may retroactively and
-prospectively choose to deem waived or otherwise exclude such Section(s) of
-the License, but only in their entirety and only with respect to the Combined
-Software.
--- a/README.md
+++ b/README.md
@@ -5,123 +5,25 @@ High Performance Machine Learning and Data Analytics for CPUs, GPUs, Accelerator
 [![Nightly Release](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml)
 [![Validate torch-models on Shark Runtime](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml)

+## Communication Channels

-## Installation (Windows, Linux and macOS)
-
-## Check out the code
-
-```shell
-git clone https://github.com/nod-ai/SHARK.git
-cd SHARK
-```
-
-## Setup your Python VirtualEnvironment and Dependencies
-
-### Windows 10/11 Users
-
-* Install the latest Python 3.10.x version from [here](https://www.python.org/downloads/windows/)
-
-* Install Git for Windows from [here](https://git-scm.com/download/win)
-
-#### Allow the install script to run in Powershell
-```powershell
-set-executionpolicy remotesigned
-```
-
-#### Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...)
-```powershell
-./setup_venv.ps1 #You can re-run this script to get the latest version
-```
-
-### Linux / macOS Users
-
-```shell
-./setup_venv.sh
-source shark.venv/bin/activate
-```
+*   [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real-time discussions with the SHARK team and other users
+*   [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bug reports, etc.


-### Run Stable Diffusion on your device - WebUI
-
-#### Windows 10/11 Users
-```powershell
-(shark.venv) PS C:\Users\nod\SHARK> cd web
-(shark.venv) PS C:\Users\nod\SHARK\web> python index.py
-```
-#### Linux Users
-```shell
-(shark.venv) > cd web
-(shark.venv) > python index.py
-```
-
-#### Access Stable Diffusion on http://localhost:8080/?__theme=dark
-
-
-<img width="1607" alt="webui" src="https://user-images.githubusercontent.com/74956/204939260-b8308bc2-8dc4-47f6-9ac0-f60b66edab99.png">
-
-
-
-### Run Stable Diffusion on your device - Commandline
-
-#### Install your hardware drivers
-* [AMD RDNA Users] Download the latest driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mril-iree)
-* [macOS Users] Download and install the latest Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home)
-* [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)
-
-Other users please ensure you have your latest vendor drivers and Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home) and if you are using vulkan check `vulkaninfo` works in a terminal window
-
-
-#### Windows 10/11 Users
-```powershell
-(shark.venv) PS C:\g\shark> python .\shark\examples\shark_inference\stable_diffusion\main.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
-```
-
-#### Linux / macOS Users
-```shell
-python3.10 shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
-```
-
-You can replace `vulkan` with `cpu` to run on your CPU or with `cuda` to run on CUDA devices. If you have multiple vulkan devices you can address them with `--device=vulkan://1` etc
-
-The output on a 6900XT would like:
-
-```shell 
-44it [00:08,  5.14it/s]i = 44 t = 120 (191ms)
-45it [00:08,  5.15it/s]i = 45 t = 100 (191ms)
-46it [00:08,  5.16it/s]i = 46 t = 80 (191ms)
-47it [00:09,  5.16it/s]i = 47 t = 60 (193ms)
-48it [00:09,  5.15it/s]i = 48 t = 40 (195ms)
-49it [00:09,  5.12it/s]i = 49 t = 20 (196ms)
-50it [00:09,  5.14it/s]
-Average step time: 192.8154182434082ms/it
-Total image generation runtime (s): 10.390909433364868
-(shark.venv) PS C:\g\shark>
-```
-
-Here are some samples generated:
-
-![tajmahal, snow, sunflowers, oil on canvas_0](https://user-images.githubusercontent.com/74956/204934186-141f7e43-6eb2-4e89-a99c-4704d20444b3.jpg)
-
-![a photo of a crab playing a trumpet](https://user-images.githubusercontent.com/74956/204933258-252e7240-8548-45f7-8253-97647d38313d.jpg)
-
-
-
-For more options to the Stable Diffusion model read [this](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md)
-
-Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware. 
-
+## Installation

 <details>
-  <summary>Binary Installation</summary>
+  <summary>Installation (Linux and macOS)</summary>

 ### Setup a new pip Virtual Environment

 This step sets up a new VirtualEnv for Python

 ```shell
-python --version #Check you have 3.10 on Linux, macOS or Windows Powershell
+python --version #Check you have Python 3.7 through 3.10 on Linux, or 3.10 on macOS
 python -m venv shark_venv
-source shark_venv/bin/activate   # Use shark_venv/Scripts/activate on Windows
+source shark_venv/bin/activate

 # If you are using conda create and activate a new conda env

@@ -136,21 +38,16 @@ python -m pip install --upgrade pip
 This step pip installs SHARK and related packages on Linux Python 3.7, 3.8, 3.9, 3.10 and macOS Python 3.10

 ```shell
-pip install nodai-shark -f https://nod-ai.github.io/SHARK/package-index/ -f https://llvm.github.io/torch-mlir/package-index/ -f  https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+pip install nodai-shark -f https://github.com/nod-ai/SHARK/releases -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/shark-runtime/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
 ```
-
-### Run shark tank model tests.
-```shell
-pytest tank/test_models.py
-```
-See tank/README.md for a more detailed walkthrough of our pytest suite and CLI.
+If you are on an Intel macOS machine you need this [workaround](https://github.com/nod-ai/SHARK/issues/102) for an upstream issue.

 ### Download and run Resnet50 sample

 ```shell
 curl -O https://raw.githubusercontent.com/nod-ai/SHARK/main/shark/examples/shark_inference/resnet50_script.py
 #Install deps for test script
-pip install --pre torch torchvision torchaudio tqdm pillow gsutil --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+pip install --pre torch torchvision torchaudio tqdm pillow --extra-index-url https://download.pytorch.org/whl/nightly/cpu
 python ./resnet50_script.py --device="cpu"  #use cuda or vulkan or metal
 ```

@@ -164,78 +61,78 @@ python ./minilm_jit.py --device="cpu"  #use cuda or vulkan or metal
 </details>


-
 <details>
-  <summary>Development, Testing and Benchmarks</summary>
+  <summary>Source Installation</summary>

-If you want to use Python3.10 and with TF Import tools you can use the environment variables like:
-Set `USE_IREE=1` to use upstream IREE
-```
-# PYTHON=python3.10 VENV_DIR=0617_venv IMPORTER=1 ./setup_venv.sh 
-```
+## Check out the code

-### Run any of the hundreds of SHARK tank models via the test framework
 ```shell
-python -m  shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
-# Or a pytest
-pytest tank/test_models.py -k "MiniLM"
+git clone https://github.com/nod-ai/SHARK.git
 ```
-  

-If you are a *Torch-mlir developer or an IREE developer* and want to test local changes you can uninstall
+## Set up your Python Virtual Environment and Dependencies
+```shell
+# Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...).
+./setup_venv.sh
+source shark.venv/bin/activate
+```
+For example, if you want to use Python 3.10 and upstream IREE with the TF import tools, you can set environment variables like this:
+```
+# PYTHON=python3.10 VENV_DIR=0617_venv IMPORTER=1 USE_IREE=1 ./setup_venv.sh 
+```
+
+If you are a Torch-mlir developer or an IREE developer and want to test local changes you can uninstall
 the provided packages with `pip uninstall torch-mlir` and / or `pip uninstall iree-compiler iree-runtime` and build locally
-with Python bindings and set your PYTHONPATH as mentioned [here](https://github.com/iree-org/iree/tree/main/docs/api_docs/python#install-iree-binaries)
+with Python bindings and set your PYTHONPATH as mentioned [here](https://google.github.io/iree/bindings/python/)
 for IREE and [here](https://github.com/llvm/torch-mlir/blob/main/development.md#setup-python-environment-to-export-the-built-python-packages)
 for Torch-MLIR.

-### How to use your locally built Torch-MLIR with SHARK
+### Run a demo script
 ```shell
-1.) Run `./setup_venv.sh in SHARK` and activate `shark.venv` virtual env.
-2.) Run `pip uninstall torch-mlir`.
-3.) Go to your local Torch-MLIR directory.
-4.) Activate mlir_venv virtual envirnoment.
-5.) Run `pip uninstall -r requirements.txt`.
-6.) Run `pip install -r requirements.txt`.
-7.) Build Torch-MLIR.
-8.) Activate shark.venv virtual environment from the Torch-MLIR directory.
-8.) Run `export PYTHONPATH=`pwd`/build/tools/torch-mlir/python_packages/torch_mlir:`pwd`/examples` in the Torch-MLIR directory.
-9.) Go to the SHARK directory.
-```
-Now the SHARK will use your locally build Torch-MLIR repo.
-
-
-## Benchmarking Dispatches
-
-To produce benchmarks of individual dispatches, you can add `--dispatch_benchmarks=All --dispatch_benchmarks_dir=<output_dir>` to your command line argument.  
-If you only want to compile specific dispatches, you can specify them with a space seperated string instead of `"All"`.  E.G. `--dispatch_benchmarks="0 1 2 10"`
-
-if you want to instead incorporate this into a python script, you can pass the `dispatch_benchmarks` and `dispatch_benchmarks_dir` commands when initializing `SharkInference`, and the benchmarks will be generated when compiled.  E.G:
-
-```
-shark_module = SharkInference(
-        mlir_model,
-        func_name,
-        device=args.device,
-        mlir_dialect="tm_tensor",
-        dispatch_benchmarks="all",
-        dispatch_benchmarks_dir="results"
-    )
+python -m  shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
+# Or a pytest
+pytest tank/tf/hf_masked_lm/albert-base-v2_test.py::AlbertBaseModuleTest::test_module_static_cpu
 ```

-Output will include:
- An ordered list ordered-dispatches.txt of all the dispatches with their runtime
- Inside the specified directory, there will be a directory for each dispatch (there will be mlir files for all dispatches, but only compiled binaries and benchmark data for the specified dispatches)
- An .mlir file containing the dispatch benchmark 
- A compiled .vmfb file containing the dispatch benchmark
- An .mlir file containing just the hal executable
- A compiled .vmfb file of the hal executable
- A .txt file containing benchmark output


-See tank/README.md for instructions on how to run model tests and benchmarks from the SHARK tank.

 </details>

+
+<details>
+  <summary>Testing</summary>
+
+### Run all model tests on CPU/GPU/VULKAN/Metal
+```shell
+pytest tank
+
+# If on Linux for quicker results:
+pytest tank -n auto
+```
+
+### Running specific tests
+```shell
+# Run tests for a specific model:
+pytest tank/<MODEL_NAME> # e.g., pytest tank/bert-base-uncased
+
+# Run tests for a specific case:
+pytest tank/<MODEL_NAME>/<MODEL_TEST>.py::<MODEL>ModuleTest::<CASE>
+# e.g., pytest tank/bert-base-uncased/bert-base-uncased_test.py::BertModuleTest::test_module_static_cpu
+# For frontends other than pytorch, if available for a model, add frontend to filename: tank/bert-base-uncased/bert-base-uncased_tf_test.py
+
+# Run all tests, including tests for benchmarking and SHARK modules:
+# From base SHARK directory,
+pytest
+```
+
+### Run all model benchmark tests on CPU/GPU/VULKAN/Metal
+```shell
+pytest benchmarks
+```
+</details>
+
+
 <details>
  <summary>API Reference</summary>

@@ -286,26 +183,160 @@ result = shark_module.forward((arg0, arg1))
 ```
 </details>

+
 ## Supported and Validated Models

-SHARK is maintained to support the latest innovations in ML Models: 
+<details>
+  <summary>PyTorch Models</summary>

-| TF HuggingFace Models | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
-|---------------------|----------|----------|-------------|
-| BERT                | :green_heart:         | :green_heart:         | :green_heart:            |
-| DistilBERT         | :green_heart:         | :green_heart:         | :green_heart:            |
-| GPT2         | :green_heart:         | :green_heart:         | :green_heart:            |
-| BLOOM         | :green_heart:         | :green_heart:         | :green_heart:            |
-| Stable Diffusion         | :green_heart:         | :green_heart:         | :green_heart:            |
-| Vision Transformer       | :green_heart:         | :green_heart:         | :green_heart:            |
-| ResNet50         | :green_heart:         | :green_heart:         | :green_heart:            |
+### Hugging Face PyTorch Models

-For a complete list of the models supported in SHARK, please refer to [tank/README.md](https://github.com/nod-ai/SHARK/blob/main/tank/README.md).
+| Hugging Face Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
+|---------------------|----------------------|----------|----------|-------------|
+| BERT                | :green_heart: (JIT)          | :green_heart:         | :green_heart:         | :green_heart:            |
+| Albert              | :green_heart: (JIT)            | :green_heart:         | :green_heart:         | :green_heart:            |
+| BigBird             | :green_heart: (AOT)            |          |          |             |
+| DistilBERT          | :green_heart: (JIT)            | :green_heart:         | :green_heart:         | :green_heart:            |
+| GPT2                | :broken_heart: (AOT)            |          |          |             |
+| MobileBert          | :green_heart: (JIT)            | :green_heart:         | :green_heart:         | :green_heart:            |

-## Communication Channels
+### Torchvision Models

-*   [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real time discussions with the SHARK team and other users
-*   [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bugs etc
+| TORCHVISION Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
+|--------------------|----------------------|----------|----------|-------------|
+| AlexNet            | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
+| DenseNet121        | :green_heart: (Script)         |          |          |             |
+| MNasNet1_0         | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
+| MobileNetV2        | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
+| MobileNetV3        | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
+| Unet               | :broken_heart: (Script)         |          |          |             |
+| Resnet18           | :green_heart: (Script)         | :green_heart:         |  :green_heart:        | :green_heart:            |
+| Resnet50           | :green_heart: (Script)         | :green_heart:         |   :green_heart:       | :green_heart:            |
+| Resnet101           | :green_heart: (Script)         | :green_heart:         |   :green_heart:       | :green_heart:            |
+| Resnext50_32x4d    | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
+| ShuffleNet_v2      | :broken_heart: (Script)         |          |          |             |
+| SqueezeNet         | :green_heart: (Script)         | :green_heart:         |   :green_heart:       | :green_heart:            |
+| EfficientNet       | :green_heart: (Script)         |          |          |             |
+| Regnet             | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
+| Resnest            | :broken_heart: (Script)         |          |          |             |
+| Vision Transformer | :green_heart: (Script)         |          |          |             |
+| VGG 16             | :green_heart: (Script)         | :green_heart:         |   :green_heart:       |             |
+| Wide Resnet        | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
+| RAFT               | :broken_heart: (JIT)            |          |          |             |
+
+For more information refer to [MODEL TRACKING SHEET](https://docs.google.com/spreadsheets/d/15PcjKeHZIrB5LfDyuw7DGEEE8XnQEX2aX8lm8qbxV8A/edit#gid=0)
+
+### PyTorch Training Models
+
+| Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
+|---------------------|----------------------|----------|----------|-------------|
+| BERT                | :broken_heart:           | :broken_heart:         |          |             |
+| FullyConnected                | :green_heart:           | :green_heart:         |          |             |
+
+</details>
+
+<details>
+  <summary>JAX Models</summary>
+
+
+### JAX Models
+
+| Models | JAX-MHLO lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
+|---------------------|----------------------|----------|----------|-------------|
+| DALL-E                | :broken_heart:           | :broken_heart:         |          |             |
+| FullyConnected                | :green_heart:           | :green_heart:         |          |             |
+
+</details>
+
+<details>
+  <summary>TFLite Models</summary>
+
+### TFLite Models
+
+| Models | TOSA/LinAlg  | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
+|---------------------|----------------------|----------|----------|-------------|
+| BERT                | :broken_heart:           | :broken_heart:         |          |             |
+| FullyConnected      | :green_heart:           | :green_heart:         |          |             |
+| albert | :green_heart:           | :green_heart:         |          |             |
+| asr_conformer | :green_heart:           | :green_heart:         |          |             |
+| bird_classifier | :green_heart:           | :green_heart:         |          |             |
+| cartoon_gan | :green_heart:           | :green_heart:         |          |             |
+| craft_text | :green_heart:           | :green_heart:         |          |             |
+| deeplab_v3 | :green_heart:           | :green_heart:         |          |             |
+| densenet | :green_heart:           | :green_heart:         |          |             |
+| east_text_detector | :green_heart:           | :green_heart:         |          |             |
+| efficientnet_lite0_int8 | :green_heart:           | :green_heart:         |          |             |
+| efficientnet | :green_heart:           | :green_heart:         |          |             |
+| gpt2 | :green_heart:           | :green_heart:         |          |             |
+| image_stylization | :green_heart:           | :green_heart:         |          |             |
+| inception_v4 | :green_heart:           | :green_heart:         |          |             |
+| inception_v4_uint8 | :green_heart:           | :green_heart:         |          |             |
+| lightning_fp16 | :green_heart:           | :green_heart:         |          |             |
+| lightning_i8 | :green_heart:           | :green_heart:         |          |             |
+| lightning | :green_heart:           | :green_heart:         |          |             |
+| magenta | :green_heart:           | :green_heart:         |          |             |
+| midas | :green_heart:           | :green_heart:         |          |             |
+| mirnet | :green_heart:           | :green_heart:         |          |             |
+| mnasnet | :green_heart:           | :green_heart:         |          |             |
+| mobilebert_edgetpu_s_float | :green_heart:           | :green_heart:         |          |             |
+| mobilebert_edgetpu_s_quant | :green_heart:           | :green_heart:         |          |             |
+| mobilebert | :green_heart:           | :green_heart:         |          |             |
+| mobilebert_tf2_float | :green_heart:           | :green_heart:         |          |             |
+| mobilebert_tf2_quant | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_ssd_quant | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_v1 | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_v1_uint8 | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_v2_int8 | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_v2 | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_v2_uint8 | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_v3-large | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_v3-large_uint8 | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_v35-int8 | :green_heart:           | :green_heart:         |          |             |
+| nasnet | :green_heart:           | :green_heart:         |          |             |
+| person_detect | :green_heart:           | :green_heart:         |          |             |
+| posenet | :green_heart:           | :green_heart:         |          |             |
+| resnet_50_int8 | :green_heart:           | :green_heart:         |          |             |
+| rosetta | :green_heart:           | :green_heart:         |          |             |
+| spice | :green_heart:           | :green_heart:         |          |             |
+| squeezenet | :green_heart:           | :green_heart:         |          |             |
+| ssd_mobilenet_v1 | :green_heart:           | :green_heart:         |          |             |
+| ssd_mobilenet_v1_uint8 | :green_heart:           | :green_heart:         |          |             |
+| ssd_mobilenet_v2_fpnlite | :green_heart:           | :green_heart:         |          |             |
+| ssd_mobilenet_v2_fpnlite_uint8 | :green_heart:           | :green_heart:         |          |             |
+| ssd_mobilenet_v2_int8 | :green_heart:           | :green_heart:         |          |             |
+| ssd_mobilenet_v2 | :green_heart:           | :green_heart:         |          |             |
+| ssd_spaghettinet_large | :green_heart:           | :green_heart:         |          |             |
+| ssd_spaghettinet_large_uint8 | :green_heart:           | :green_heart:         |          |             |
+| visual_wake_words_i8 | :green_heart:           | :green_heart:         |          |             |
+
+</details>
+
+<details>
+  <summary>TF Models</summary>
+
+### TensorFlow Models (Inference)
+
+| Hugging Face Models | tf-mhlo lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
+|---------------------|----------------------|----------|----------|-------------|
+| BERT                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
+| albert-base-v2              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
+| DistilBERT          | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
+| CamemBert                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
+| ConvBert              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
+| Deberta              |            |         |          |             |
+| electra          | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
+| funnel              |            |         |          |             |
+| layoutlm              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
+| longformer              |            |         |          |             |
+| mobile-bert                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
+| rembert              |            |         |          |             |
+| tapas              |            |         |          |             |
+| flaubert                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
+| roberta                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
+| xlm-roberta              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
+| mpnet              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
+
+</details>

 ## Related Projects

--- a/benchmarks/tests/test_benchmark.py
+++ b/benchmarks/tests/test_benchmark.py
@@ -42,7 +42,7 @@ class TFHuggingFaceLanguage(tf.Module):
            input_ids=x, attention_mask=y, token_type_ids=z, training=False
        )

-    @tf.function(input_signature=tf_bert_input, jit_compile=True)
+    @tf.function(input_signature=tf_bert_input)
    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.m.predict(input_ids, attention_mask, token_type_ids)

--- a/build_tools/populate_sharktank_ci.sh
+++ b/build_tools/populate_sharktank_ci.sh
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-IMPORTER=1 ./setup_venv.sh
-source $GITHUB_WORKSPACE/shark.venv/bin/activate
-python generate_sharktank.py --upload=False --ci_tank_dir=True
--- a/build_tools/scrape_releases.py
+++ b/build_tools/scrape_releases.py
@@ -1,37 +0,0 @@
-"""Scrapes the github releases API to generate a static pip-install-able releases page.
-
-See https://github.com/llvm/torch-mlir/issues/1374
-"""
-import argparse
-import json
-
-import requests
-
-# Parse arguments
-parser = argparse.ArgumentParser()
-parser.add_argument("owner", type=str)
-parser.add_argument("repo", type=str)
-args = parser.parse_args()
-
-# Get releases
-response = requests.get(
-    f"https://api.github.com/repos/{args.owner}/{args.repo}/releases"
-)
-body = json.loads(response.content)
-
-# Parse releases
-releases = []
-for row in body:
-    for asset in row["assets"]:
-        releases.append((asset["name"], asset["browser_download_url"]))
-
-# Output HTML
-html = """<!DOCTYPE html>
-<html>
-  <body>
-"""
-for name, url in releases:
-    html += f"    <a href='{url}'>{name}</a><br />\n"
-html += """  </body>
-</html>"""
-print(html)
--- a/build_tools/shark_versions.txt
+++ b/build_tools/shark_versions.txt
@@ -1,8 +0,0 @@
-# IREE Compiler/ Runtime Version:
-20221207.350
-# SHARK Compiler/ Runtime Version:
-20221207.236
-# Torch-MLIR Version for IREE:
-20221207.680
-# Torch-MLIR Version for SHARK:
-20221207.680
--- a/conftest.py
+++ b/conftest.py
@@ -1,5 +1,17 @@
 def pytest_addoption(parser):
    # Attaches SHARK command-line arguments to the pytest machinery.
+    parser.addoption(
+        "--save_mlir",
+        action="store_true",
+        default="False",
+        help="Pass option to save input MLIR",
+    )
+    parser.addoption(
+        "--save_vmfb",
+        action="store_true",
+        default="False",
+        help="Pass option to save IREE output .vmfb",
+    )
    parser.addoption(
        "--benchmark",
        action="store_true",
@@ -7,56 +19,8 @@ def pytest_addoption(parser):
        help="Pass option to benchmark and write results.csv",
    )
    parser.addoption(
-        "--onnx_bench",
+        "--save_temps",
        action="store_true",
        default="False",
-        help="Add ONNX benchmark results to pytest benchmarks.",
-    )
-    parser.addoption(
-        "--tf32",
-        action="store_true",
-        default="False",
-        help="Use TensorFloat-32 calculations.",
-    )
-    parser.addoption(
-        "--save_repro",
-        action="store_true",
-        default="False",
-        help="Pass option to save reproduction artifacts to SHARK/shark_tmp/test_case/",
-    )
-    parser.addoption(
-        "--save_fails",
-        action="store_true",
-        default="False",
-        help="Save reproduction artifacts for a test case only if it fails. Default is False.",
-    )
-    parser.addoption(
-        "--ci",
-        action="store_true",
-        default="False",
-        help="Enables uploading of reproduction artifacts upon test case failure during iree-compile or validation. Must be passed with --ci_sha option ",
-    )
-    parser.addoption(
-        "--update_tank",
-        action="store_false",
-        default="False",
-        help="Update local shark tank with latest artifacts.",
-    )
-    parser.addoption(
-        "--ci_sha",
-        action="store",
-        default="None",
-        help="Passes the github SHA of the CI workflow to include in google storage directory for reproduction artifacts.",
-    )
-    parser.addoption(
-        "--local_tank_cache",
-        action="store",
-        default="",
-        help="Specify the directory in which all downloaded shark_tank artifacts will be cached.",
-    )
-    parser.addoption(
-        "--tank_url",
-        type=str,
-        default="gs://shark_tank/latest",
-        help="URL to bucket from which to download SHARK tank artifacts. Default is gs://shark_tank/latest",
+        help="Saves IREE reproduction artifacts for filing upstream issues.",
    )
--- a/cpp/.gitignore
+++ b/cpp/.gitignore
@@ -1,3 +0,0 @@
-*.mlir
-*.vmfb
-*.ini
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -1,52 +0,0 @@
-# Copyright 2022 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-cmake_minimum_required(VERSION 3.21...3.23)
-
-#-------------------------------------------------------------------------------
-# Project configuration
-#-------------------------------------------------------------------------------
-
-project(iree-samples C CXX)
-set(CMAKE_C_STANDARD 11)
-set(CMAKE_CXX_STANDARD 17)
-set_property(GLOBAL PROPERTY USE_FOLDERS ON)
-
-#-------------------------------------------------------------------------------
-# Core project dependency
-#-------------------------------------------------------------------------------
-
-message(STATUS "Fetching core IREE repo (this may take a few minutes)...")
-# Note: for log output, set -DFETCHCONTENT_QUIET=OFF,
-# see https://gitlab.kitware.com/cmake/cmake/-/issues/18238#note_440475
-
-include(FetchContent)
-
-FetchContent_Declare(
-  iree
-  GIT_REPOSITORY https://github.com/nod-ai/shark-runtime.git
-  GIT_TAG shark 
-  GIT_SUBMODULES_RECURSE OFF
-  GIT_SHALLOW OFF
-  GIT_PROGRESS ON
-  USES_TERMINAL_DOWNLOAD ON
-)
-
-# Extend module path to find MLIR CMake modules.
-list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_BINARY_DIR}/lib/cmake/mlir")
-
-# Disable core project features not needed for these out of tree samples.
-set(IREE_BUILD_TESTS OFF CACHE BOOL "" FORCE)
-set(IREE_BUILD_SAMPLES OFF CACHE BOOL "" FORCE)
-
-FetchContent_MakeAvailable(iree)
-FetchContent_GetProperties(iree SOURCE_DIR IREE_SOURCE_DIR)
-
-#-------------------------------------------------------------------------------
-# Individual samples
-#-------------------------------------------------------------------------------
-
-add_subdirectory(vulkan_gui)
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -1,82 +0,0 @@
-# SHARK C/C++ Samples
-
-These C/C++ samples can be built using CMake. The samples depend on the main
-SHARK-Runtime project's C/C++ sources, including both the runtime and the compiler. 
-
-Individual samples may require additional dependencies. Watch CMake's output
-for information about which you are missing for individual samples.
-
-On Windows we recommend using https://github.com/microsoft/vcpkg to download packages for
-your system. The general setup flow looks like
-
-*Install and activate SHARK*
-
-```bash
-source shark.venv/bin/activate #follow main repo instructions to setup your venv
-```
-
-*Install Dependencies*
-
-```bash
-vcpkg install [library] --triplet [your platform]
-vcpkg integrate install
-
-# Then pass `-DCMAKE_TOOLCHAIN_FILE=[check logs for path]` when configuring CMake
-```
-
-In Ubuntu Linux you can install
-
-```bash
-sudo apt install libsdl2-dev
-```
-
-*Build*
-```bash
-cd cpp
-cmake -GNinja -B build/
-cmake --build build/
-```
-
-*Prepare the model*
-```bash
-wget https://storage.googleapis.com/shark_tank/latest/resnet50_tf/resnet50_tf.mlir
-iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvm-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 resnet50_tf.mlir -o resnet50_tf.vmfb
-```
-*Prepare the input*
-
-```bash
-python save_img.py
-```
-Note that this requires tensorflow, e.g.
-```bash
-python -m pip install tensorflow
-```
-
-*Run the vulkan_gui*
-```bash
-./build/vulkan_gui/iree-samples-resnet-vulkan-gui
-```
-
-## Other models
-A tool for benchmarking other models is built and can be invoked with a command like the following
-```bash
-./build/vulkan_gui/iree-vulkan-gui --module-file=path/to/.vmfb --function_input=...
-```
-see `./build/vulkan_gui/iree-vulkan-gui --help` for an explanation on the function input. For example, stable diffusion unet can be tested with the following commands:
-```bash
-wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/stable_diff_tf.mlir
-iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 stable_diff_tf.mlir -o stable_diff_tf.vmfb
-./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=2x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32
-```
-VAE and Autoencoder are also available
-```bash
-# VAE
-wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/vae_tf/vae.mlir
-iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 vae.mlir -o vae.vmfb
-./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=1x4x64x64xf32
-
-# CLIP Autoencoder
-wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/clip_tf/clip_autoencoder.mlir
-iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 clip_autoencoder.mlir -o clip_autoencoder.vmfb
-./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=1x77xi32 --function_input=1x77xi32
-```
--- a/cpp/dog_imagenet.jpg
+++ b/cpp/dog_imagenet.jpg
--- a/cpp/save_img.py
+++ b/cpp/save_img.py
@@ -1,18 +0,0 @@
-import numpy as np
-import tensorflow as tf
-from shark.shark_inference import SharkInference
-
-
-def load_and_preprocess_image(fname: str):
-    image = tf.io.read_file(fname)
-    image = tf.image.decode_image(image, channels=3)
-    image = tf.image.resize(image, (224, 224))
-    image = image[tf.newaxis, :]
-    # preprocessing pipeline
-    input_tensor = tf.keras.applications.resnet50.preprocess_input(image)
-    return input_tensor
-
-
-data = load_and_preprocess_image("dog_imagenet.jpg").numpy()
-
-data.tofile("dog.bin")
--- a/cpp/vision_inference/CMakeLists.txt
+++ b/cpp/vision_inference/CMakeLists.txt
@@ -1,84 +0,0 @@
-# Copyright 2022 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-# The sample needs both the LLVM CPU compiler backend and the embedded ELF
-# executable loader; skip it (without failing the configure) if either is off.
-if(NOT IREE_TARGET_BACKEND_LLVM_CPU OR
-   NOT IREE_HAL_EXECUTABLE_LOADER_EMBEDDED_ELF)
-  message(STATUS "Missing LLVM backend and/or embedded elf loader, skipping vision_inference sample")
-  return()
-endif()
-
-# stb supplies the stb_image.h decoder included by image_util.c.
-# vcpkg install stb
-#   tested with version 2021-09-10
-find_package(Stb)
-if(NOT Stb_FOUND)
-  message(STATUS "Could not find Stb, skipping vision inference sample")
-  return()
-endif()
-
-# Compile mnist.mlir to mnist.vmfb with iree-compile for the llvm-cpu backend.
-# The output is passed as the relative path "mnist.vmfb", so the command relies
-# on running in CMAKE_CURRENT_BINARY_DIR to produce the file named in OUTPUT.
-set(_COMPILE_TOOL_EXECUTABLE $<TARGET_FILE:iree-compile>)
-set(_COMPILE_ARGS)
-list(APPEND _COMPILE_ARGS "--iree-input-type=mhlo")
-list(APPEND _COMPILE_ARGS "--iree-hal-target-backends=llvm-cpu")
-list(APPEND _COMPILE_ARGS "${IREE_SOURCE_DIR}/samples/models/mnist.mlir")
-list(APPEND _COMPILE_ARGS "-o")
-list(APPEND _COMPILE_ARGS "mnist.vmfb")
-add_custom_command(
-  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/mnist.vmfb
-  COMMAND ${_COMPILE_TOOL_EXECUTABLE} ${_COMPILE_ARGS}
-  DEPENDS ${_COMPILE_TOOL_EXECUTABLE} "${IREE_SOURCE_DIR}/samples/models/mnist.mlir"
-)
-# Embed mnist.vmfb into a C file as mnist_bytecode_module_c.[h/c] using the
-# generate_embed_data tool. The --identifier value is the prefix of the
-# generated accessor that iree-run-mnist-module.c calls
-# (iree_samples_vision_inference_mnist_bytecode_module_create).
-set(_EMBED_DATA_EXECUTABLE $<TARGET_FILE:generate_embed_data>)
-set(_EMBED_ARGS)
-list(APPEND _EMBED_ARGS "--output_header=mnist_bytecode_module_c.h")
-list(APPEND _EMBED_ARGS "--output_impl=mnist_bytecode_module_c.c")
-list(APPEND _EMBED_ARGS "--identifier=iree_samples_vision_inference_mnist_bytecode_module")
-list(APPEND _EMBED_ARGS "--flatten")
-list(APPEND _EMBED_ARGS "${CMAKE_CURRENT_BINARY_DIR}/mnist.vmfb")
-add_custom_command(
-  OUTPUT "mnist_bytecode_module_c.h" "mnist_bytecode_module_c.c"
-  COMMAND ${_EMBED_DATA_EXECUTABLE} ${_EMBED_ARGS}
-  DEPENDS ${_EMBED_DATA_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/mnist.vmfb
-)
-# Define a library target for mnist_bytecode_module_c. It is an OBJECT library
-# whose only sources are the two generated files, so building it triggers the
-# embed command above (and, through its DEPENDS, the compile command).
-add_library(iree_samples_vision_inference_mnist_bytecode_module_c OBJECT)
-target_sources(iree_samples_vision_inference_mnist_bytecode_module_c
-  PRIVATE
-    mnist_bytecode_module_c.h
-    mnist_bytecode_module_c.c
-)
-
-# Define the sample executable from the image helpers and the driver program.
-set(_NAME "iree-run-mnist-module")
-add_executable(${_NAME} "")
-target_sources(${_NAME}
-  PRIVATE
-    "image_util.h"
-    "image_util.c"
-    "iree-run-mnist-module.c"
-)
-set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "iree-run-mnist-module")
-# The generated mnist_bytecode_module_c.h lives in the binary directory.
-target_include_directories(${_NAME} PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
-)
-# stb_image.h comes from the Stb package located earlier in this file.
-target_include_directories(${_NAME} PRIVATE
-    ${Stb_INCLUDE_DIR}
-)
-target_link_libraries(${_NAME}
-  iree_base_base
-  iree_base_tracing
-  iree_hal_hal
-  iree_runtime_runtime
-  iree_samples_vision_inference_mnist_bytecode_module_c
-)
-
-# Define a target that copies the test image into the build directory, and make
-# the executable depend on it so the image is in place whenever the sample is
-# built (the program defaults to loading "mnist_test.png").
-add_custom_target(iree_samples_vision_inference_test_image
-  COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/mnist_test.png" "${CMAKE_CURRENT_BINARY_DIR}/mnist_test.png")
-add_dependencies(${_NAME} iree_samples_vision_inference_test_image)
-
-message(STATUS "Configured vision_inference sample successfully")
--- a/cpp/vision_inference/README.md
+++ b/cpp/vision_inference/README.md
@@ -1,8 +0,0 @@
-# Vision Inference Sample (C code)
-
-This sample demonstrates how to run a MNIST handwritten digit detection vision
-model on an image using IREE's C API.
-
-A similar sample is implemented using a Python script and IREE's command line
-tools over in the primary iree repository at
-https://github.com/iree-org/iree/tree/main/samples/vision_inference
--- a/cpp/vision_inference/image_util.c
+++ b/cpp/vision_inference/image_util.c
@@ -1,224 +0,0 @@
-// Copyright 2021 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include "image_util.h"
-
-#include <math.h>
-
-#include "iree/base/internal/flags.h"
-#include "iree/base/tracing.h"
-
-#define STB_IMAGE_IMPLEMENTATION
-#include "stb_image.h"
-
-// Converts |buffer_length| uint8 pixels from |pixel_data| into floats written
-// to |out_buffer|, applying
-//   out = (pixel - 127.5) / 127.5 * scale + offset
-// where scale = |input_range[1] - input_range[0]| / 2 and
-// offset = (input_range[0] + input_range[1]) / 2.
-//
-// Returns INVALID_ARGUMENT when |range_length| is not 2.
-iree_status_t iree_tools_utils_pixel_rescaled_to_buffer(
-    const uint8_t* pixel_data, iree_host_size_t buffer_length,
-    const float* input_range, iree_host_size_t range_length,
-    float* out_buffer) {
-  IREE_TRACE_ZONE_BEGIN(z0);
-  if (range_length != 2) {
-    IREE_TRACE_ZONE_END(z0);
-    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
-                            "range defined as 2-element [min, max] array.");
-  }
-  float input_scale = fabsf(input_range[1] - input_range[0]) / 2.0f;
-  float input_offset = (input_range[0] + input_range[1]) / 2.0f;
-  const float kUint8Mean = 127.5f;
-  // The index uses the same unsigned type as |buffer_length|: an `int` counter
-  // would be compared signed-vs-unsigned and could overflow on large buffers.
-  for (iree_host_size_t i = 0; i < buffer_length; ++i) {
-    out_buffer[i] =
-        (((float)(pixel_data[i])) - kUint8Mean) / kUint8Mean * input_scale +
-        input_offset;
-  }
-  IREE_TRACE_ZONE_END(z0);
-  return iree_ok_status();
-}
-
-// Validates the image at |filename| against |shape| / |element_type| and, if
-// it matches, decodes it with stb_image into |out_pixel_data|.
-//
-// img_dims is filled by stbi_info/stbi_load in the order the checks below
-// compare it: [0] against the width dimension, [1] against the height
-// dimension, [2] against the channel dimension.
-//
-// Returns NOT_FOUND when stb_image cannot read the file, UNIMPLEMENTED for an
-// element type other than f32/i8/u8, and INVALID_ARGUMENT when the image
-// dimensions or |shape_rank| do not match.
-iree_status_t iree_tools_utils_load_pixel_data_impl(
-    const iree_string_view_t filename, const iree_hal_dim_t* shape,
-    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
-    uint8_t** out_pixel_data, iree_host_size_t* out_buffer_length) {
-  int img_dims[3];
-  // Probe the image header only; pixels are decoded after validation.
-  // NOTE(review): filename.data is handed to stb as a C string, so this assumes
-  // the string view is NUL-terminated -- confirm against callers.
-  if (stbi_info(filename.data, img_dims, &(img_dims[1]), &(img_dims[2])) == 0) {
-    return iree_make_status(IREE_STATUS_NOT_FOUND, "can't load image %.*s",
-                            (int)filename.size, filename.data);
-  }
-  if (!(element_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32 ||
-        element_type == IREE_HAL_ELEMENT_TYPE_SINT_8 ||
-        element_type == IREE_HAL_ELEMENT_TYPE_UINT_8)) {
-    char element_type_str[16];
-    IREE_RETURN_IF_ERROR(iree_hal_format_element_type(
-        element_type, sizeof(element_type_str), element_type_str, NULL));
-    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
-                            "element type %s not supported", element_type_str);
-  }
-  switch (shape_rank) {
-    case 2: {  // Assume tensor <height x width>
-      // A rank-2 tensor has no channel dimension, so only single-channel
-      // images are accepted.
-      if (img_dims[2] != 1 || (shape[0] != img_dims[1]) ||
-          (shape[1] != img_dims[0])) {
-        return iree_make_status(
-            IREE_STATUS_INVALID_ARGUMENT,
-            "image size: %dx%dx%d, expected: %" PRIdim "x%" PRIdim, img_dims[0],
-            img_dims[1], img_dims[2], shape[1], shape[0]);
-      }
-      break;
-    }
-    case 3: {  // Assume tensor <height x width x channel>
-      if (shape[0] != img_dims[1] || shape[1] != img_dims[0] ||
-          shape[2] != img_dims[2]) {
-        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
-                                "image size: %dx%dx%d, expected: %" PRIdim
-                                "x%" PRIdim "x%" PRIdim,
-                                img_dims[0], img_dims[1], img_dims[2], shape[1],
-                                shape[0], shape[2]);
-      }
-      break;
-    }
-    case 4: {  // Assume tensor <batch x height x width x channel>
-      // The batch dimension (shape[0]) is not checked against the image.
-      if (shape[1] != img_dims[1] || shape[2] != img_dims[0] ||
-          shape[3] != img_dims[2]) {
-        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
-                                "image size: %dx%dx%d, expected: %" PRIdim
-                                "x%" PRIdim "x%" PRIdim,
-                                img_dims[0], img_dims[1], img_dims[2], shape[2],
-                                shape[1], shape[3]);
-      }
-      break;
-    }
-    default:
-      return iree_make_status(
-          IREE_STATUS_INVALID_ARGUMENT,
-          "Input buffer shape rank %" PRIhsz " not supported", shape_rank);
-  }
-  // Drop the alpha channel if present: request exactly 3 channels for images
-  // with 3 or more, otherwise pass 0 and let stb_image choose.
-  int req_ch = (img_dims[2] >= 3) ? 3 : 0;
-  *out_pixel_data = stbi_load(filename.data, img_dims, &(img_dims[1]),
-                              &(img_dims[2]), req_ch);
-  if (*out_pixel_data == NULL) {
-    return iree_make_status(IREE_STATUS_NOT_FOUND, "can't load image %.*s",
-                            (int)filename.size, filename.data);
-  }
-  // Element count of the decoded buffer; the channel count is clamped to 3 to
-  // match the alpha-dropping request above.
-  *out_buffer_length =
-      img_dims[0] * img_dims[1] * (img_dims[2] > 3 ? 3 : img_dims[2]);
-  return iree_ok_status();
-}
-
-// Public entry point: runs iree_tools_utils_load_pixel_data_impl inside a
-// trace zone and hands its status back unchanged.
-iree_status_t iree_tools_utils_load_pixel_data(
-    const iree_string_view_t filename, const iree_hal_dim_t* shape,
-    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
-    uint8_t** out_pixel_data, iree_host_size_t* out_buffer_length) {
-  iree_status_t status;
-  IREE_TRACE_ZONE_BEGIN(z0);
-  status = iree_tools_utils_load_pixel_data_impl(filename, shape, shape_rank,
-                                                 element_type, out_pixel_data,
-                                                 out_buffer_length);
-  IREE_TRACE_ZONE_END(z0);
-  return status;
-}
-
-// Loads an i8/u8 image from |filename| and copies its pixels into a newly
-// allocated buffer view returned through |out_buffer_view|.
-//
-// Returns INVALID_ARGUMENT for any other |element_type|; otherwise returns the
-// status of the image load or of the buffer allocation.
-iree_status_t iree_tools_utils_buffer_view_from_image(
-    const iree_string_view_t filename, const iree_hal_dim_t* shape,
-    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
-    iree_hal_allocator_t* allocator, iree_hal_buffer_view_t** out_buffer_view) {
-  IREE_TRACE_ZONE_BEGIN(z0);
-  *out_buffer_view = NULL;
-  const bool is_8bit_int = element_type == IREE_HAL_ELEMENT_TYPE_SINT_8 ||
-                           element_type == IREE_HAL_ELEMENT_TYPE_UINT_8;
-  if (!is_8bit_int) {
-    IREE_TRACE_ZONE_END(z0);
-    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
-                            "element type should be i8 or u8");
-  }
-
-  uint8_t* decoded_pixels = NULL;
-  iree_host_size_t decoded_length;
-  iree_status_t status =
-      iree_tools_utils_load_pixel_data(filename, shape, shape_rank,
-                                       element_type, &decoded_pixels,
-                                       &decoded_length);
-  if (!iree_status_is_ok(status)) {
-    // Nothing was allocated on the device; release whatever stb returned.
-    stbi_image_free(decoded_pixels);
-    IREE_TRACE_ZONE_END(z0);
-    return status;
-  }
-
-  // SINT_8 and UINT_8 pixels are used as-is: the decoded bytes become the
-  // initial contents of the new buffer.
-  iree_host_size_t bytes_per_element =
-      iree_hal_element_dense_byte_count(element_type);
-  iree_hal_buffer_params_t buffer_params = {
-      .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
-      .access = IREE_HAL_MEMORY_ACCESS_READ,
-      .usage = IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE |
-               IREE_HAL_BUFFER_USAGE_TRANSFER,
-  };
-  status = iree_hal_buffer_view_allocate_buffer(
-      allocator, shape_rank, shape, element_type,
-      IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params,
-      iree_make_const_byte_span(decoded_pixels,
-                                bytes_per_element * decoded_length),
-      out_buffer_view);
-
-  stbi_image_free(decoded_pixels);
-  IREE_TRACE_ZONE_END(z0);
-  return status;
-}
-
-// Arguments for iree_tools_utils_buffer_view_load_image_rescaled, which
-// receives them through its opaque |user_data| pointer and forwards them to
-// iree_tools_utils_pixel_rescaled_to_buffer.
-typedef struct iree_tools_utils_buffer_view_load_params_t {
-  // Source uint8 pixels to rescale.
-  const uint8_t* pixel_data;
-  // Number of entries in |pixel_data|.
-  iree_host_size_t pixel_data_length;
-  // Target value range, passed on as the rescaler's |input_range|.
-  const float* input_range;
-  // Number of entries in |input_range|.
-  iree_host_size_t input_range_length;
-} iree_tools_utils_buffer_view_load_params_t;
-// Buffer-generation callback: unpacks the load params from |user_data| and
-// writes the rescaled float pixels into the mapped buffer contents.
-static iree_status_t iree_tools_utils_buffer_view_load_image_rescaled(
-    iree_hal_buffer_mapping_t* mapping, void* user_data) {
-  iree_tools_utils_buffer_view_load_params_t* load_args = user_data;
-  float* mapped_floats = (float*)mapping->contents.data;
-  return iree_tools_utils_pixel_rescaled_to_buffer(
-      load_args->pixel_data, load_args->pixel_data_length,
-      load_args->input_range, load_args->input_range_length, mapped_floats);
-}
-
-// Loads an image from |filename| and produces an f32 buffer view whose
-// contents are the pixels rescaled into |input_range|.
-//
-// Returns INVALID_ARGUMENT unless |element_type| is FLOAT_32; otherwise returns
-// the status of the image load or of the buffer generation.
-iree_status_t iree_tools_utils_buffer_view_from_image_rescaled(
-    const iree_string_view_t filename, const iree_hal_dim_t* shape,
-    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
-    iree_hal_allocator_t* allocator, const float* input_range,
-    iree_host_size_t input_range_length,
-    iree_hal_buffer_view_t** out_buffer_view) {
-  IREE_TRACE_ZONE_BEGIN(z0);
-  *out_buffer_view = NULL;
-  if (element_type != IREE_HAL_ELEMENT_TYPE_FLOAT_32) {
-    IREE_TRACE_ZONE_END(z0);
-    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
-                            "element type should be f32");
-  }
-
-  // stb_image only decodes into a fresh host allocation, so the pixels are
-  // loaded here first. A real application would decode straight into the
-  // mapped device buffer from the generation callback instead.
-  uint8_t* decoded_pixels = NULL;
-  iree_host_size_t decoded_length = 0;
-  IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_tools_utils_load_pixel_data(filename, shape, shape_rank,
-                                           element_type, &decoded_pixels,
-                                           &decoded_length));
-
-  // Everything the rescaling callback needs, passed as its user data.
-  iree_tools_utils_buffer_view_load_params_t rescale_args = {
-      .pixel_data = decoded_pixels,
-      .pixel_data_length = decoded_length,
-      .input_range = input_range,
-      .input_range_length = input_range_length,
-  };
-
-  // The buffer must be mappable so the callback can write into it.
-  iree_hal_buffer_params_t buffer_params = {
-      .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL |
-              IREE_HAL_MEMORY_TYPE_HOST_VISIBLE,
-      .usage = IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE |
-               IREE_HAL_BUFFER_USAGE_TRANSFER |
-               IREE_HAL_BUFFER_USAGE_MAPPING,
-  };
-
-  // Classic row-major image layout.
-  iree_status_t result = iree_hal_buffer_view_generate_buffer(
-      allocator, shape_rank, shape, element_type,
-      IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params,
-      iree_tools_utils_buffer_view_load_image_rescaled, &rescale_args,
-      out_buffer_view);
-
-  stbi_image_free(decoded_pixels);
-  IREE_TRACE_ZONE_END(z0);
-  return result;
-}
--- a/cpp/vision_inference/image_util.h
+++ b/cpp/vision_inference/image_util.h
@@ -1,77 +0,0 @@
-// Copyright 2021 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#ifndef IREE_SAMPLES_VISION_INFERENCE_IMAGE_UTIL_H_
-#define IREE_SAMPLES_VISION_INFERENCE_IMAGE_UTIL_H_
-
-#include "iree/base/api.h"
-#include "iree/hal/api.h"
-#include "iree/hal/buffer_view.h"
-
-#if __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-// Loads the image at |filename| into |out_pixel_data| and sets
-// |out_buffer_length| to its length.
-//
-// The image dimensions must match the width, height, and channel in |shape|,
-// while 2 <= |shape_rank| <= 4 to match the image tensor format.
-//
-// The file must be in a format supported by stb_image.h.
-// The returned |out_pixel_data| buffer must be released by the caller.
-iree_status_t iree_tools_utils_load_pixel_data(
-    const iree_string_view_t filename, const iree_hal_dim_t* shape,
-    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
-    uint8_t** out_pixel_data, iree_host_size_t* out_buffer_length);
-
-// Parses the content of the image file |filename| into a HAL buffer view
-// |out_buffer_view|. |out_buffer_view| properties are defined by |shape|,
-// |shape_rank|, and |element_type|, while being allocated by |allocator|.
-//
-// The |element_type| has to be SINT_8 or UINT_8; any other type yields
-// INVALID_ARGUMENT. For FLOAT_32, use
-// |iree_tools_utils_buffer_view_from_image_rescaled| instead.
-//
-// The returned |out_buffer_view| must be released by the caller.
-iree_status_t iree_tools_utils_buffer_view_from_image(
-    const iree_string_view_t filename, const iree_hal_dim_t* shape,
-    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
-    iree_hal_allocator_t* allocator, iree_hal_buffer_view_t** out_buffer_view);
-
-// Parses the content of the image file |filename| into a HAL buffer view
-// |out_buffer_view|. |out_buffer_view| properties are defined by |shape|,
-// |shape_rank|, and |element_type|, while being allocated by |allocator|.
-// The values in |out_buffer_view| are rescaled to |input_range|, a
-// |input_range_length|-element [min, max] array (the length must be 2).
-//
-// The |element_type| has to be FLOAT_32. For SINT_8 or UINT_8, use
-// |iree_tools_utils_buffer_view_from_image| instead.
-//
-// The returned |out_buffer_view| must be released by the caller.
-iree_status_t iree_tools_utils_buffer_view_from_image_rescaled(
-    const iree_string_view_t filename, const iree_hal_dim_t* shape,
-    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
-    iree_hal_allocator_t* allocator, const float* input_range,
-    iree_host_size_t input_range_length,
-    iree_hal_buffer_view_t** out_buffer_view);
-
-// Normalizes |pixel_count| uint8_t values from |pixel_data| into the float
-// buffer |out_buffer| using the 2-element [min, max] range |input_range|.
-//
-// float32_x = (uint8_x - 127.5) / 127.5 * input_scale + input_offset, where
-// input_scale = abs(|input_range[1]| - |input_range[0]|) / 2
-// input_offset = (|input_range[0]| + |input_range[1]|) / 2
-//
-// Returns INVALID_ARGUMENT if |input_range_length| is not 2.
-// |out_buffer| needs to be allocated before the call.
-iree_status_t iree_tools_utils_pixel_rescaled_to_buffer(
-    const uint8_t* pixel_data, iree_host_size_t pixel_count,
-    const float* input_range, iree_host_size_t input_range_length,
-    float* out_buffer);
-
-#if __cplusplus
-}
-#endif  // __cplusplus
-
-#endif  // IREE_SAMPLES_VISION_INFERENCE_IMAGE_UTIL_H_
--- a/cpp/vision_inference/iree-run-mnist-module.c
+++ b/cpp/vision_inference/iree-run-mnist-module.c
@@ -1,121 +0,0 @@
-// Copyright 2021 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-// This sample uses image_util to load a hand-written image as an
-// iree_hal_buffer_view_t then passes it to the bytecode module built from
-// mnist.mlir on the CPU backend with the local-task driver.
-
-#include <float.h>
-
-#include "image_util.h"
-#include "iree/runtime/api.h"
-#include "mnist_bytecode_module_c.h"
-
-// Loads the embedded mnist module on the "local-task" device, feeds it the
-// image at |image_path| as a tensor<1x28x28x1xf32>, invokes "module.predict"
-// and prints the index of the highest-scoring digit to stdout.
-//
-// Returns the first failing status, or OK after printing the prediction.
-// NOTE(review): the IREE_RETURN_IF_ERROR early exits return without releasing
-// the objects created so far (instance, device, session, call); acceptable for
-// a one-shot sample process, but worth a cleanup path if this code is reused.
-iree_status_t Run(const iree_string_view_t image_path) {
-  iree_runtime_instance_options_t instance_options;
-  iree_runtime_instance_options_initialize(IREE_API_VERSION_LATEST,
-                                           &instance_options);
-  iree_runtime_instance_options_use_all_available_drivers(&instance_options);
-  iree_runtime_instance_t* instance = NULL;
-  IREE_RETURN_IF_ERROR(iree_runtime_instance_create(
-      &instance_options, iree_allocator_system(), &instance));
-
-  // TODO(#5724): move device selection into the compiled modules.
-  iree_hal_device_t* device = NULL;
-  IREE_RETURN_IF_ERROR(iree_runtime_instance_try_create_default_device(
-      instance, iree_make_cstring_view("local-task"), &device));
-
-  // Create one session per loaded module to hold the module state.
-  iree_runtime_session_options_t session_options;
-  iree_runtime_session_options_initialize(&session_options);
-  iree_runtime_session_t* session = NULL;
-  IREE_RETURN_IF_ERROR(iree_runtime_session_create_with_device(
-      instance, &session_options, device,
-      iree_runtime_instance_host_allocator(instance), &session));
-  // Our reference is dropped here; from now on the device is only reached
-  // through the session (iree_runtime_session_device), never through |device|.
-  iree_hal_device_release(device);
-  device = NULL;
-
-  const struct iree_file_toc_t* module_file =
-      iree_samples_vision_inference_mnist_bytecode_module_create();
-
-  IREE_RETURN_IF_ERROR(iree_runtime_session_append_bytecode_module_from_memory(
-      session, iree_make_const_byte_span(module_file->data, module_file->size),
-      iree_allocator_null()));
-
-  iree_runtime_call_t call;
-  IREE_RETURN_IF_ERROR(iree_runtime_call_initialize_by_name(
-      session, iree_make_cstring_view("module.predict"), &call));
-
-  // Prepare the input hal buffer view with image_util library.
-  // The input of the mnist model is single 28x28 pixel image as a
-  // tensor<1x28x28x1xf32>, with pixels in [0.0, 1.0].
-  iree_hal_buffer_view_t* buffer_view = NULL;
-  iree_hal_dim_t buffer_shape[] = {1, 28, 28, 1};
-  iree_hal_element_type_t hal_element_type = IREE_HAL_ELEMENT_TYPE_FLOAT_32;
-  float input_range[2] = {0.0f, 1.0f};
-  IREE_RETURN_IF_ERROR(
-      iree_tools_utils_buffer_view_from_image_rescaled(
-          image_path, buffer_shape, IREE_ARRAYSIZE(buffer_shape),
-          hal_element_type,
-          iree_hal_device_allocator(iree_runtime_session_device(session)),
-          input_range, IREE_ARRAYSIZE(input_range), &buffer_view),
-      "load image");
-  IREE_RETURN_IF_ERROR(
-      iree_runtime_call_inputs_push_back_buffer_view(&call, buffer_view));
-  iree_hal_buffer_view_release(buffer_view);
-
-  IREE_RETURN_IF_ERROR(iree_runtime_call_invoke(&call, /*flags=*/0));
-
-  // Get the result buffers from the invocation.
-  iree_hal_buffer_view_t* ret_buffer_view = NULL;
-  IREE_RETURN_IF_ERROR(
-      iree_runtime_call_outputs_pop_front_buffer_view(&call, &ret_buffer_view));
-
-  // Read back the results. The output of the mnist model is a 1x10 prediction
-  // confidence values for each digit in [0, 9].
-  float predictions[1 * 10] = {0.0f};
-  IREE_RETURN_IF_ERROR(iree_hal_device_transfer_d2h(
-      iree_runtime_session_device(session),
-      iree_hal_buffer_view_buffer(ret_buffer_view), 0, predictions,
-      sizeof(predictions), IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT,
-      iree_infinite_timeout()));
-  iree_hal_buffer_view_release(ret_buffer_view);
-
-  // Get the highest index from the output. The running maximum starts at
-  // -FLT_MAX (the lowest finite float): FLT_MIN is the smallest *positive*
-  // float, so seeding with it would ignore zero or negative scores and always
-  // report digit 0 for such outputs.
-  float result_val = -FLT_MAX;
-  int result_idx = 0;
-  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(predictions); ++i) {
-    if (predictions[i] > result_val) {
-      result_val = predictions[i];
-      result_idx = (int)i;
-    }
-  }
-  fprintf(stdout, "Detected number: %d\n", result_idx);
-
-  iree_runtime_call_deinitialize(&call);
-  iree_runtime_session_release(session);
-  iree_runtime_instance_release(instance);
-  return iree_ok_status();
-}
-
-// Entry point: accepts at most one argument, the image to classify, and
-// exits with 0 on success or -1 on a usage error or failed run.
-int main(int argc, char** argv) {
-  if (argc > 2) {
-    fprintf(stderr, "Usage: iree-run-mnist-module <image file>\n");
-    return -1;
-  }
-  // Fall back to the bundled test image when no path is supplied.
-  const char* path_arg = (argc == 1) ? "mnist_test.png" : argv[1];
-  iree_status_t status = Run(iree_make_cstring_view(path_arg));
-  const int exit_code = iree_status_is_ok(status) ? 0 : -1;
-  if (exit_code != 0) {
-    iree_status_fprint(stderr, status);
-  }
-  iree_status_ignore(status);
-  return exit_code;
-}
--- a/cpp/vision_inference/mnist_test.png
+++ b/cpp/vision_inference/mnist_test.png
--- a/cpp/vulkan_gui/CMakeLists.txt
+++ b/cpp/vulkan_gui/CMakeLists.txt
@@ -1,116 +0,0 @@
-# Copyright 2022 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-# The sample needs both the Vulkan SPIR-V compiler backend and the Vulkan HAL
-# driver; skip it (without failing the configure) if either is disabled.
-if(NOT IREE_TARGET_BACKEND_VULKAN_SPIRV OR
-   NOT IREE_HAL_DRIVER_VULKAN)
-  message(STATUS "Missing Vulkan backend and/or driver, skipping vulkan_gui sample")
-  return()
-endif()
-
-# This target statically links against Vulkan.
-# One way to achieve this is by installing the Vulkan SDK from
-# https://vulkan.lunarg.com/.
-include(FindVulkan)
-if(NOT Vulkan_FOUND)
-  message(STATUS "Could not find Vulkan, skipping vulkan_gui sample")
-  return()
-endif()
-
-# SDL2 provides the window and Vulkan surface used by the GUI.
-# vcpkg install sdl2[vulkan]
-#   tested with versions 2.0.14#4 - 2.0.22#1
-find_package(SDL2)
-if(NOT SDL2_FOUND)
-  message(STATUS "Could not find SDL2, skipping vulkan_gui sample")
-  return()
-endif()
-
-# Fetch Dear ImGui at a pinned release instead of the moving `master` branch.
-# The sources listed in iree_vulkan_sample() reference
-# backends/imgui_impl_sdl.cpp, which later imgui releases renamed to
-# imgui_impl_sdl2.cpp, so tracking master eventually breaks the build.
-include(FetchContent)
-FetchContent_Declare(
-  imgui
-  GIT_REPOSITORY https://github.com/ocornut/imgui
-  GIT_TAG        v1.88
-)
-
-FetchContent_MakeAvailable(imgui)
-
-# Dear ImGui
-# Use the source directory reported by FetchContent rather than assuming the
-# default ${CMAKE_BINARY_DIR}/_deps layout (FETCHCONTENT_BASE_DIR can move it).
-set(IMGUI_DIR ${imgui_SOURCE_DIR})
-message("Looking for Imgui in ${IMGUI_DIR}")
-include_directories(${IMGUI_DIR} ${IMGUI_DIR}/backends ..)
-
-
-# iree_vulkan_sample(NAME <target> SRCS <file>...)
-#
-# Defines an executable <target> built from SRCS plus the Dear ImGui core and
-# its SDL/Vulkan backends (taken from IMGUI_DIR), linked against SDL2, Vulkan
-# and the IREE runtime/tooling libraries listed below.
-function(iree_vulkan_sample)
-
-  # One-value keyword NAME and multi-value keyword SRCS, exposed as
-  # _RULE_NAME / _RULE_SRCS.
-  cmake_parse_arguments(
-    _RULE
-    ""
-    "NAME"
-    "SRCS"
-    ${ARGN}
-  )
-
-
-  # Define the sample executable.
-  set(_NAME "${_RULE_NAME}")
-  set(SRCS "${_RULE_SRCS}")
-  add_executable(${_NAME} "")
-  # ImGui is compiled directly into each sample rather than built as a library.
-  target_sources(${_NAME}
-    PRIVATE
-      ${SRCS}
-      "${IMGUI_DIR}/backends/imgui_impl_sdl.cpp"
-      "${IMGUI_DIR}/backends/imgui_impl_vulkan.cpp"
-      "${IMGUI_DIR}/imgui.cpp"
-      "${IMGUI_DIR}/imgui_draw.cpp"
-      "${IMGUI_DIR}/imgui_demo.cpp"
-      "${IMGUI_DIR}/imgui_tables.cpp"
-      "${IMGUI_DIR}/imgui_widgets.cpp"
-  )
-  set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "${_NAME}")
-  target_include_directories(${_NAME} PUBLIC
-      $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
-  )
-  target_link_libraries(${_NAME}
-    SDL2::SDL2
-    Vulkan::Vulkan
-    iree_runtime_runtime
-    iree_base_internal_main
-    iree_hal_drivers_vulkan_registration_registration
-    iree_modules_hal_hal
-    iree_vm_vm
-    iree_vm_bytecode_module
-    iree_vm_cc
-    iree_tooling_vm_util_cc
-    iree_tooling_context_util
-  )
-
-  # On Windows, link as a console-subsystem program; no extra link options
-  # elsewhere.
-  if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
-    set(_GUI_LINKOPTS "-SUBSYSTEM:CONSOLE")
-  else()
-    set(_GUI_LINKOPTS "")
-  endif()
-
-  target_link_options(${_NAME}
-    PRIVATE
-      ${_GUI_LINKOPTS}
-  )
-endfunction()
-
-# ResNet sample GUI, built from vulkan_resnet_inference_gui.cc.
-iree_vulkan_sample(
-    NAME
-      iree-samples-resnet-vulkan-gui
-
-    SRCS
-      vulkan_resnet_inference_gui.cc
-)
-
-# Generic GUI that runs a module given on the command line, built from
-# vulkan_inference_gui.cc.
-iree_vulkan_sample(
-    NAME
-      iree-vulkan-gui
-
-    SRCS
-      vulkan_inference_gui.cc
-)
-
-message(STATUS "Configured vulkan_gui sample successfully")
--- a/cpp/vulkan_gui/simple_mul.mlir
+++ b/cpp/vulkan_gui/simple_mul.mlir
@@ -1,4 +0,0 @@
-// Element-wise product of two 4-element f32 tensors.
-func.func @simple_mul(%lhs: tensor<4xf32>, %rhs: tensor<4xf32>) -> tensor<4xf32> {
-  %product = arith.mulf %lhs, %rhs : tensor<4xf32>
-  return %product : tensor<4xf32>
-}
--- a/cpp/vulkan_gui/snail_imagenet.jpg
+++ b/cpp/vulkan_gui/snail_imagenet.jpg
--- a/cpp/vulkan_gui/stb_image.h
+++ b/cpp/vulkan_gui/stb_image.h
--- a/cpp/vulkan_gui/vulkan_inference_gui.cc
+++ b/cpp/vulkan_gui/vulkan_inference_gui.cc
@@ -1,957 +0,0 @@
-// Copyright 2019 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-// Vulkan Graphics + IREE API Integration Sample.
-
-#include <SDL.h>
-#include <SDL_vulkan.h>
-#include <imgui.h>
-#include <imgui_impl_sdl.h>
-#include <imgui_impl_vulkan.h>
-#include <vulkan/vulkan.h>
-
-
-#include <cstring>
-#include <set>
-#include <vector>
-#include <fstream>
-#include <array>
-#include <cstdio>
-#include <cstdlib>
-#include <iterator>
-#include <string>
-#include <utility>
-
-#include "iree/hal/drivers/vulkan/api.h"
-
-// IREE's C API:
-#include "iree/base/api.h"
-#include "iree/hal/api.h"
-#include "iree/hal/drivers/vulkan/registration/driver_module.h"
-#include "iree/modules/hal/module.h"
-#include "iree/vm/api.h"
-#include "iree/vm/bytecode_module.h"
-#include "iree/vm/ref_cc.h"
-
-// iree-run-module
-#include "iree/base/internal/flags.h"
-#include "iree/base/status_cc.h"
-#include "iree/base/tracing.h"
-#include "iree/modules/hal/types.h"
-#include "iree/tooling/comparison.h"
-#include "iree/tooling/context_util.h"
-#include "iree/tooling/vm_util_cc.h"
-
-// Other dependencies (helpers, etc.)
-#include "iree/base/internal/main.h"
-
-#define IMGUI_UNLIMITED_FRAME_RATE
-
-#define STB_IMAGE_IMPLEMENTATION
-#include "stb_image.h"
-
-IREE_FLAG(string, entry_function, "",
-          "Name of a function contained in the module specified by module_file "
-          "to run.");
-
-// TODO(benvanik): move --function_input= flag into a util.
-// Flag parse callback: appends the raw flag |value| to the
-// std::vector<std::string> passed as |storage|. Always succeeds.
-static iree_status_t parse_function_io(iree_string_view_t flag_name,
-                                       void* storage,
-                                       iree_string_view_t value) {
-  auto& collected = *static_cast<std::vector<std::string>*>(storage);
-  collected.emplace_back(value.data, value.size);
-  return iree_ok_status();
-}
-// Flag print callback: writes one `--<flag>="<value>"` line per stored entry
-// to |file|, or a commented-out empty flag line when nothing is stored.
-static void print_function_io(iree_string_view_t flag_name, void* storage,
-                              FILE* file) {
-  const auto& collected = *static_cast<std::vector<std::string>*>(storage);
-  const int name_length = (int)flag_name.size;
-  if (collected.empty()) {
-    fprintf(file, "# --%.*s=\n", name_length, flag_name.data);
-    return;
-  }
-  for (const std::string& entry : collected) {
-    fprintf(file, "--%.*s=\"%s\"\n", name_length, flag_name.data,
-            entry.c_str());
-  }
-}
-// Raw --function_input= values in command-line order. The flag registered
-// below fills it through parse_function_io and dumps it through
-// print_function_io.
-static std::vector<std::string> FLAG_function_inputs;
-IREE_FLAG_CALLBACK(
-    parse_function_io, print_function_io, &FLAG_function_inputs, function_input,
-    "An input (a) value or (b) buffer of the format:\n"
-    "  (a) scalar value\n"
-    "     value\n"
-    "     e.g.: --function_input=\"3.14\"\n"
-    "  (b) buffer:\n"
-    "     [shape]xtype=[value]\n"
-    "     e.g.: --function_input=\"2x2xi32=1 2 3 4\"\n"
-    "Optionally, brackets may be used to separate the element values:\n"
-    "  2x2xi32=[[1 2][3 4]]\n"
-    "Raw binary files can be read to provide buffer contents:\n"
-    "  2x2xi32=@some/file.bin\n"
-    "numpy npy files (from numpy.save) can be read to provide 1+ values:\n"
-    "  @some.npy\n"
-    "Each occurrence of the flag indicates an input in the order they were\n"
-    "specified on the command line.");
-
-// Describes one file held in memory: its name plus a pointer/length pair for
-// its bytes.
-typedef struct iree_file_toc_t {
-  const char* name;  // the file's original name
-  char* data;        // beginning of the file
-  size_t size;       // length of the file
-} iree_file_toc_t;
-
-// Reads the whole file |filename| into a buffer allocated with malloc().
-//
-// On success returns true, stores the buffer in |*pOut| and its length in
-// bytes in |*pSize|; the caller owns the buffer and must free() it.
-// On any failure (file cannot be opened, size cannot be determined, file is
-// empty, allocation fails, or the read is short) returns false with
-// |*pOut| == NULL and |*pSize| == 0, and nothing is leaked.
-bool load_file(const char* filename, char** pOut, size_t* pSize)
-{
-    *pOut = NULL;
-    *pSize = 0;
-
-    FILE* f = fopen(filename, "rb");
-    if (f == NULL)
-    {
-        fprintf(stderr, "Can't open %s\n", filename);
-        return false;
-    }
-
-    // ftell() returns -1 on error; converting that straight to size_t would
-    // request a huge allocation, so validate the length first.
-    long length = -1;
-    if (fseek(f, 0L, SEEK_END) == 0)
-    {
-        length = ftell(f);
-    }
-    if (length <= 0 || fseek(f, 0L, SEEK_SET) != 0)
-    {
-        fclose(f);
-        return false;
-    }
-
-    char* buffer = (char*)malloc((size_t)length);
-    if (buffer == NULL)
-    {
-        fclose(f);
-        return false;
-    }
-
-    // Read the file as a single item so a short read is reported as 0 items.
-    size_t items_read = fread(buffer, (size_t)length, 1, f);
-    fclose(f);
-    if (items_read != 1)
-    {
-        free(buffer);
-        return false;
-    }
-
-    *pOut = buffer;
-    *pSize = (size_t)length;
-    return true;
-}
-
-static VkAllocationCallbacks* g_Allocator = NULL;
-static VkInstance g_Instance = VK_NULL_HANDLE;
-static VkPhysicalDevice g_PhysicalDevice = VK_NULL_HANDLE;
-static VkDevice g_Device = VK_NULL_HANDLE;
-static uint32_t g_QueueFamily = (uint32_t)-1;
-static VkQueue g_Queue = VK_NULL_HANDLE;
-static VkPipelineCache g_PipelineCache = VK_NULL_HANDLE;
-static VkDescriptorPool g_DescriptorPool = VK_NULL_HANDLE;
-
-static ImGui_ImplVulkanH_Window g_MainWindowData;
-static uint32_t g_MinImageCount = 2;
-static bool g_SwapChainRebuild = false;
-static int g_SwapChainResizeWidth = 0;
-static int g_SwapChainResizeHeight = 0;
-
-// Aborts the process after logging the code when |err| is anything other than
-// success (0); returns normally otherwise.
-static void check_vk_result(VkResult err) {
-  if (err != 0) {
-    fprintf(stderr, "VkResult: %d\n", err);
-    abort();
-  }
-}
-
-// Shared implementation of GetIreeLayers / GetIreeExtensions: queries IREE for
-// the names in |extensibility_set| under |features| using the usual two-pass
-// pattern (first ask for the count with zero capacity, then fill a vector of
-// that size).
-//
-// |required_count| starts at 0 so a query that does not write the count yields
-// an empty vector instead of one sized from an uninitialized value.
-// NOTE(review): the return values of iree_hal_vulkan_query_extensibility_set
-// are discarded, exactly as before this refactor -- confirm whether they
-// should be checked.
-static std::vector<const char*> QueryIreeExtensibilitySet(
-    iree_hal_vulkan_extensibility_set_t extensibility_set,
-    iree_hal_vulkan_features_t features) {
-  iree_host_size_t required_count = 0;
-  iree_hal_vulkan_query_extensibility_set(
-      features, extensibility_set, /*string_capacity=*/0, &required_count,
-      /*out_string_values=*/NULL);
-  std::vector<const char*> names(required_count);
-  iree_hal_vulkan_query_extensibility_set(features, extensibility_set,
-                                          names.size(), &required_count,
-                                          names.data());
-  return names;
-}
-
-// Returns the names of the Vulkan layers used for the given IREE
-// |extensibility_set| and |features|.
-std::vector<const char*> GetIreeLayers(
-    iree_hal_vulkan_extensibility_set_t extensibility_set,
-    iree_hal_vulkan_features_t features) {
-  return QueryIreeExtensibilitySet(extensibility_set, features);
-}
-
-// Returns the names of the Vulkan extensions used for the given IREE
-// |extensibility_set| and |features|.
-std::vector<const char*> GetIreeExtensions(
-    iree_hal_vulkan_extensibility_set_t extensibility_set,
-    iree_hal_vulkan_features_t features) {
-  return QueryIreeExtensibilitySet(extensibility_set, features);
-}
-
-// Returns the names of the Vulkan device extensions to enable for the given
-// IREE |vulkan_features| on |physical_device|: "VK_KHR_swapchain", every
-// extension IREE requires, and each IREE-optional extension that the device
-// actually reports. Each name appears once, ordered by name.
-std::vector<const char*> GetDeviceExtensions(
-    VkPhysicalDevice physical_device,
-    iree_hal_vulkan_features_t vulkan_features) {
-  std::vector<const char*> iree_required_extensions = GetIreeExtensions(
-      IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_REQUIRED,
-      vulkan_features);
-  std::vector<const char*> iree_optional_extensions = GetIreeExtensions(
-      IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
-      vulkan_features);
-
-  // Two-pass enumeration of what the device supports: count, then fill.
-  uint32_t extension_count = 0;
-  check_vk_result(vkEnumerateDeviceExtensionProperties(
-      physical_device, nullptr, &extension_count, nullptr));
-  std::vector<VkExtensionProperties> extension_properties(extension_count);
-  check_vk_result(vkEnumerateDeviceExtensionProperties(
-      physical_device, nullptr, &extension_count, extension_properties.data()));
-
-  // Merge extensions lists, including optional and required for simplicity.
-  // The set compares the strings themselves: a plain std::set<const char*>
-  // orders by pointer address, so the same extension name coming from two
-  // different sources would be listed twice.
-  auto name_less = [](const char* lhs, const char* rhs) {
-    return strcmp(lhs, rhs) < 0;
-  };
-  std::set<const char*, decltype(name_less)> ext_set(name_less);
-  ext_set.insert("VK_KHR_swapchain");
-  ext_set.insert(iree_required_extensions.begin(),
-                 iree_required_extensions.end());
-  for (size_t i = 0; i < iree_optional_extensions.size(); ++i) {
-    const char* optional_extension = iree_optional_extensions[i];
-    // Only request an optional extension if the device reports it.
-    for (uint32_t j = 0; j < extension_count; ++j) {
-      if (strcmp(optional_extension, extension_properties[j].extensionName) ==
-          0) {
-        ext_set.insert(optional_extension);
-        break;
-      }
-    }
-  }
-  std::vector<const char*> extensions(ext_set.begin(), ext_set.end());
-  return extensions;
-}
-
-std::vector<const char*> GetInstanceLayers(
-    iree_hal_vulkan_features_t vulkan_features) {
-  // Query the layers that IREE wants / needs.
-  std::vector<const char*> required_layers = GetIreeLayers(
-      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_REQUIRED, vulkan_features);
-  std::vector<const char*> optional_layers = GetIreeLayers(
-      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_OPTIONAL, vulkan_features);
-
-  // Query the layers that are available on the Vulkan ICD.
-  uint32_t layer_property_count = 0;
-  check_vk_result(
-      vkEnumerateInstanceLayerProperties(&layer_property_count, NULL));
-  std::vector<VkLayerProperties> layer_properties(layer_property_count);
-  check_vk_result(vkEnumerateInstanceLayerProperties(&layer_property_count,
-                                                     layer_properties.data()));
-
-  // Match between optional/required and available layers.
-  std::vector<const char*> layers;
-  for (const char* layer_name : required_layers) {
-    bool found = false;
-    for (const auto& layer_property : layer_properties) {
-      if (std::strcmp(layer_name, layer_property.layerName) == 0) {
-        found = true;
-        layers.push_back(layer_name);
-        break;
-      }
-    }
-    if (!found) {
-      fprintf(stderr, "Required layer %s not available\n", layer_name);
-      abort();
-    }
-  }
-  for (const char* layer_name : optional_layers) {
-    for (const auto& layer_property : layer_properties) {
-      if (std::strcmp(layer_name, layer_property.layerName) == 0) {
-        layers.push_back(layer_name);
-        break;
-      }
-    }
-  }
-
-  return layers;
-}
-
-std::vector<const char*> GetInstanceExtensions(
-    SDL_Window* window, iree_hal_vulkan_features_t vulkan_features) {
-  // Ask SDL for its list of required instance extensions.
-  uint32_t sdl_extensions_count = 0;
-  SDL_Vulkan_GetInstanceExtensions(window, &sdl_extensions_count, NULL);
-  std::vector<const char*> sdl_extensions(sdl_extensions_count);
-  SDL_Vulkan_GetInstanceExtensions(window, &sdl_extensions_count,
-                                   sdl_extensions.data());
-
-  std::vector<const char*> iree_required_extensions = GetIreeExtensions(
-      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_REQUIRED,
-      vulkan_features);
-  std::vector<const char*> iree_optional_extensions = GetIreeExtensions(
-      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_OPTIONAL,
-      vulkan_features);
-
-  // Merge extensions lists, including optional and required for simplicity.
-  std::set<const char*> ext_set;
-  ext_set.insert(sdl_extensions.begin(), sdl_extensions.end());
-  ext_set.insert(iree_required_extensions.begin(),
-                 iree_required_extensions.end());
-  ext_set.insert(iree_optional_extensions.begin(),
-                 iree_optional_extensions.end());
-  std::vector<const char*> extensions(ext_set.begin(), ext_set.end());
-  return extensions;
-}
-
-void SetupVulkan(iree_hal_vulkan_features_t vulkan_features,
-                 const char** instance_layers, uint32_t instance_layers_count,
-                 const char** instance_extensions,
-                 uint32_t instance_extensions_count,
-                 const VkAllocationCallbacks* allocator, VkInstance* instance,
-                 uint32_t* queue_family_index,
-                 VkPhysicalDevice* physical_device, VkQueue* queue,
-                 VkDevice* device, VkDescriptorPool* descriptor_pool) {
-  VkResult err;
-
-  // Create Vulkan Instance
-  {
-    VkInstanceCreateInfo create_info = {};
-    create_info.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
-    create_info.enabledLayerCount = instance_layers_count;
-    create_info.ppEnabledLayerNames = instance_layers;
-    create_info.enabledExtensionCount = instance_extensions_count;
-    create_info.ppEnabledExtensionNames = instance_extensions;
-    err = vkCreateInstance(&create_info, allocator, instance);
-    check_vk_result(err);
-  }
-
-  // Select GPU
-  {
-    uint32_t gpu_count;
-    err = vkEnumeratePhysicalDevices(*instance, &gpu_count, NULL);
-    check_vk_result(err);
-    IM_ASSERT(gpu_count > 0);
-
-    VkPhysicalDevice* gpus =
-        (VkPhysicalDevice*)malloc(sizeof(VkPhysicalDevice) * gpu_count);
-    err = vkEnumeratePhysicalDevices(*instance, &gpu_count, gpus);
-    check_vk_result(err);
-
-    // Use the first reported GPU for simplicity.
-    *physical_device = gpus[0];
-
-    VkPhysicalDeviceProperties properties;
-    vkGetPhysicalDeviceProperties(*physical_device, &properties);
-    fprintf(stdout, "Selected Vulkan device: '%s'\n", properties.deviceName);
-    free(gpus);
-  }
-
-  // Select queue family. We want a single queue with graphics and compute for
-  // simplicity, but we could also discover and use separate queues for each.
-  {
-    uint32_t count;
-    vkGetPhysicalDeviceQueueFamilyProperties(*physical_device, &count, NULL);
-    VkQueueFamilyProperties* queues = (VkQueueFamilyProperties*)malloc(
-        sizeof(VkQueueFamilyProperties) * count);
-    vkGetPhysicalDeviceQueueFamilyProperties(*physical_device, &count, queues);
-    for (uint32_t i = 0; i < count; i++) {
-      if (queues[i].queueFlags &
-          (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT)) {
-        *queue_family_index = i;
-        break;
-      }
-    }
-    free(queues);
-    IM_ASSERT(*queue_family_index != (uint32_t)-1);
-  }
-
-  // Create Logical Device (with 1 queue)
-  {
-    std::vector<const char*> device_extensions =
-        GetDeviceExtensions(*physical_device, vulkan_features);
-    const float queue_priority[] = {1.0f};
-    VkDeviceQueueCreateInfo queue_info = {};
-    queue_info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
-    queue_info.queueFamilyIndex = *queue_family_index;
-    queue_info.queueCount = 1;
-    queue_info.pQueuePriorities = queue_priority;
-    VkDeviceCreateInfo create_info = {};
-    create_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
-    create_info.queueCreateInfoCount = 1;
-    create_info.pQueueCreateInfos = &queue_info;
-    create_info.enabledExtensionCount =
-        static_cast<uint32_t>(device_extensions.size());
-    create_info.ppEnabledExtensionNames = device_extensions.data();
-
-    // Enable timeline semaphores.
-    VkPhysicalDeviceFeatures2 features2;
-    memset(&features2, 0, sizeof(features2));
-    features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
-    create_info.pNext = &features2;
-    VkPhysicalDeviceTimelineSemaphoreFeatures semaphore_features;
-    memset(&semaphore_features, 0, sizeof(semaphore_features));
-    semaphore_features.sType =
-        VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES;
-    semaphore_features.pNext = features2.pNext;
-    features2.pNext = &semaphore_features;
-    semaphore_features.timelineSemaphore = VK_TRUE;
-
-    err = vkCreateDevice(*physical_device, &create_info, allocator, device);
-    check_vk_result(err);
-    vkGetDeviceQueue(*device, *queue_family_index, 0, queue);
-  }
-
-  // Create Descriptor Pool
-  {
-    VkDescriptorPoolSize pool_sizes[] = {
-        {VK_DESCRIPTOR_TYPE_SAMPLER, 1000},
-        {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1000},
-        {VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 1000},
-        {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1000},
-        {VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, 1000},
-        {VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, 1000},
-        {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1000},
-        {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1000},
-        {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, 1000},
-        {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC, 1000},
-        {VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT, 1000}};
-    VkDescriptorPoolCreateInfo pool_info = {};
-    pool_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
-    pool_info.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT;
-    pool_info.maxSets = 1000 * IREE_ARRAYSIZE(pool_sizes);
-    pool_info.poolSizeCount = (uint32_t)IREE_ARRAYSIZE(pool_sizes);
-    pool_info.pPoolSizes = pool_sizes;
-    err =
-        vkCreateDescriptorPool(*device, &pool_info, allocator, descriptor_pool);
-    check_vk_result(err);
-  }
-}
-
-void SetupVulkanWindow(ImGui_ImplVulkanH_Window* wd,
-                       const VkAllocationCallbacks* allocator,
-                       VkInstance instance, uint32_t queue_family_index,
-                       VkPhysicalDevice physical_device, VkDevice device,
-                       VkSurfaceKHR surface, int width, int height,
-                       uint32_t min_image_count) {
-  wd->Surface = surface;
-
-  // Check for WSI support
-  VkBool32 res;
-  vkGetPhysicalDeviceSurfaceSupportKHR(physical_device, queue_family_index,
-                                       wd->Surface, &res);
-  if (res != VK_TRUE) {
-    fprintf(stderr, "Error no WSI support on physical device 0\n");
-    exit(-1);
-  }
-
-  // Select Surface Format
-  const VkFormat requestSurfaceImageFormat[] = {
-      VK_FORMAT_B8G8R8A8_UNORM, VK_FORMAT_R8G8B8A8_UNORM,
-      VK_FORMAT_B8G8R8_UNORM, VK_FORMAT_R8G8B8_UNORM};
-  const VkColorSpaceKHR requestSurfaceColorSpace =
-      VK_COLORSPACE_SRGB_NONLINEAR_KHR;
-  wd->SurfaceFormat = ImGui_ImplVulkanH_SelectSurfaceFormat(
-      physical_device, wd->Surface, requestSurfaceImageFormat,
-      (size_t)IREE_ARRAYSIZE(requestSurfaceImageFormat),
-      requestSurfaceColorSpace);
-
-  // Select Present Mode
-#ifdef IMGUI_UNLIMITED_FRAME_RATE
-  VkPresentModeKHR present_modes[] = {VK_PRESENT_MODE_MAILBOX_KHR,
-                                      VK_PRESENT_MODE_IMMEDIATE_KHR,
-                                      VK_PRESENT_MODE_FIFO_KHR};
-#else
-  VkPresentModeKHR present_modes[] = {VK_PRESENT_MODE_FIFO_KHR};
-#endif
-  wd->PresentMode = ImGui_ImplVulkanH_SelectPresentMode(
-      physical_device, wd->Surface, &present_modes[0],
-      IREE_ARRAYSIZE(present_modes));
-
-  // Create SwapChain, RenderPass, Framebuffer, etc.
-  IM_ASSERT(min_image_count >= 2);
-  ImGui_ImplVulkanH_CreateOrResizeWindow(instance, physical_device, device, wd,
-                                         queue_family_index, allocator, width,
-                                         height, min_image_count);
-
-  // Set clear color.
-  ImVec4 clear_color = ImVec4(0.45f, 0.55f, 0.60f, 1.00f);
-  memcpy(&wd->ClearValue.color.float32[0], &clear_color, 4 * sizeof(float));
-}
-
-void RenderFrame(ImGui_ImplVulkanH_Window* wd, VkDevice device, VkQueue queue) {
-  VkResult err;
-
-  VkSemaphore image_acquired_semaphore =
-      wd->FrameSemaphores[wd->SemaphoreIndex].ImageAcquiredSemaphore;
-  VkSemaphore render_complete_semaphore =
-      wd->FrameSemaphores[wd->SemaphoreIndex].RenderCompleteSemaphore;
-  err = vkAcquireNextImageKHR(device, wd->Swapchain, UINT64_MAX,
-                              image_acquired_semaphore, VK_NULL_HANDLE,
-                              &wd->FrameIndex);
-  check_vk_result(err);
-
-  ImGui_ImplVulkanH_Frame* fd = &wd->Frames[wd->FrameIndex];
-  {
-    err = vkWaitForFences(
-        device, 1, &fd->Fence, VK_TRUE,
-        UINT64_MAX);  // wait indefinitely instead of periodically checking
-    check_vk_result(err);
-
-    err = vkResetFences(device, 1, &fd->Fence);
-    check_vk_result(err);
-  }
-  {
-    err = vkResetCommandPool(device, fd->CommandPool, 0);
-    check_vk_result(err);
-    VkCommandBufferBeginInfo info = {};
-    info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
-    info.flags |= VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
-    err = vkBeginCommandBuffer(fd->CommandBuffer, &info);
-    check_vk_result(err);
-  }
-  {
-    VkRenderPassBeginInfo info = {};
-    info.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
-    info.renderPass = wd->RenderPass;
-    info.framebuffer = fd->Framebuffer;
-    info.renderArea.extent.width = wd->Width;
-    info.renderArea.extent.height = wd->Height;
-    info.clearValueCount = 1;
-    info.pClearValues = &wd->ClearValue;
-    vkCmdBeginRenderPass(fd->CommandBuffer, &info, VK_SUBPASS_CONTENTS_INLINE);
-  }
-
-  // Record Imgui Draw Data and draw funcs into command buffer
-  ImGui_ImplVulkan_RenderDrawData(ImGui::GetDrawData(), fd->CommandBuffer);
-
-  // Submit command buffer
-  vkCmdEndRenderPass(fd->CommandBuffer);
-  {
-    VkPipelineStageFlags wait_stage =
-        VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
-    VkSubmitInfo info = {};
-    info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
-    info.waitSemaphoreCount = 1;
-    info.pWaitSemaphores = &image_acquired_semaphore;
-    info.pWaitDstStageMask = &wait_stage;
-    info.commandBufferCount = 1;
-    info.pCommandBuffers = &fd->CommandBuffer;
-    info.signalSemaphoreCount = 1;
-    info.pSignalSemaphores = &render_complete_semaphore;
-
-    err = vkEndCommandBuffer(fd->CommandBuffer);
-    check_vk_result(err);
-    err = vkQueueSubmit(queue, 1, &info, fd->Fence);
-    check_vk_result(err);
-  }
-}
-
-void PresentFrame(ImGui_ImplVulkanH_Window* wd, VkQueue queue) {
-  VkSemaphore render_complete_semaphore =
-      wd->FrameSemaphores[wd->SemaphoreIndex].RenderCompleteSemaphore;
-  VkPresentInfoKHR info = {};
-  info.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
-  info.waitSemaphoreCount = 1;
-  info.pWaitSemaphores = &render_complete_semaphore;
-  info.swapchainCount = 1;
-  info.pSwapchains = &wd->Swapchain;
-  info.pImageIndices = &wd->FrameIndex;
-  VkResult err = vkQueuePresentKHR(queue, &info);
-  check_vk_result(err);
-  wd->SemaphoreIndex =
-      (wd->SemaphoreIndex + 1) %
-      wd->ImageCount;  // Now we can use the next set of semaphores
-}
-
-static void CleanupVulkan() {
-  vkDestroyDescriptorPool(g_Device, g_DescriptorPool, g_Allocator);
-
-  vkDestroyDevice(g_Device, g_Allocator);
-  vkDestroyInstance(g_Instance, g_Allocator);
-}
-
-static void CleanupVulkanWindow() {
-  ImGui_ImplVulkanH_DestroyWindow(g_Instance, g_Device, &g_MainWindowData,
-                                  g_Allocator);
-}
-
-namespace iree {
-
-extern "C" int iree_main(int argc, char** argv) {
-
-  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv);
-  if (argc > 1) {
-    // Avoid iree-run-module spinning endlessly on stdin if the user uses single
-    // dashes for flags.
-    printf(
-        "[ERROR] unexpected positional argument (expected none)."
-        " Did you use pass a flag with a single dash ('-')?"
-        " Use '--' instead.\n");
-    return 1;
-  }
-
-  // --------------------------------------------------------------------------
-  // Create a window.
-  if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) != 0) {
-    fprintf(stderr, "Failed to initialize SDL\n");
-    abort();
-    return 1;
-  }
-
-  // Setup window
-  // clang-format off
-  SDL_WindowFlags window_flags = (SDL_WindowFlags)(
-      SDL_WINDOW_VULKAN | SDL_WINDOW_RESIZABLE | SDL_WINDOW_ALLOW_HIGHDPI);
-  // clang-format on
-  SDL_Window* window = SDL_CreateWindow(
-      "IREE Samples - Vulkan Inference GUI", SDL_WINDOWPOS_CENTERED,
-      SDL_WINDOWPOS_CENTERED, 1280, 720, window_flags);
-  if (window == nullptr)
-  {
-    const char* sdl_err = SDL_GetError();
-    fprintf(stderr, "Error, SDL_CreateWindow returned: %s\n", sdl_err);
-    abort();
-    return 1;
-  }
-
-  // Setup Vulkan
-  iree_hal_vulkan_features_t iree_vulkan_features =
-      static_cast<iree_hal_vulkan_features_t>(
-          IREE_HAL_VULKAN_FEATURE_ENABLE_VALIDATION_LAYERS |
-          IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS);
-  std::vector<const char*> layers = GetInstanceLayers(iree_vulkan_features);
-  std::vector<const char*> extensions =
-      GetInstanceExtensions(window, iree_vulkan_features);
-  SetupVulkan(iree_vulkan_features, layers.data(),
-              static_cast<uint32_t>(layers.size()), extensions.data(),
-              static_cast<uint32_t>(extensions.size()), g_Allocator,
-              &g_Instance, &g_QueueFamily, &g_PhysicalDevice, &g_Queue,
-              &g_Device, &g_DescriptorPool);
-
-  // Create Window Surface
-  VkSurfaceKHR surface;
-  VkResult err;
-  if (SDL_Vulkan_CreateSurface(window, g_Instance, &surface) == 0) {
-    fprintf(stderr, "Failed to create Vulkan surface.\n");
-    abort();
-    return 1;
-  }
-
-  // Create Framebuffers
-  int w, h;
-  SDL_GetWindowSize(window, &w, &h);
-  ImGui_ImplVulkanH_Window* wd = &g_MainWindowData;
-  SetupVulkanWindow(wd, g_Allocator, g_Instance, g_QueueFamily,
-                    g_PhysicalDevice, g_Device, surface, w, h, g_MinImageCount);
-
-  // Setup Dear ImGui context
-  IMGUI_CHECKVERSION();
-  ImGui::CreateContext();
-  ImGuiIO& io = ImGui::GetIO();
-  (void)io;
-
-  ImGui::StyleColorsDark();
-
-  // Setup Platform/Renderer bindings
-  ImGui_ImplSDL2_InitForVulkan(window);
-  ImGui_ImplVulkan_InitInfo init_info = {};
-  init_info.Instance = g_Instance;
-  init_info.PhysicalDevice = g_PhysicalDevice;
-  init_info.Device = g_Device;
-  init_info.QueueFamily = g_QueueFamily;
-  init_info.Queue = g_Queue;
-  init_info.PipelineCache = g_PipelineCache;
-  init_info.DescriptorPool = g_DescriptorPool;
-  init_info.Allocator = g_Allocator;
-  init_info.MinImageCount = g_MinImageCount;
-  init_info.ImageCount = wd->ImageCount;
-  init_info.CheckVkResultFn = check_vk_result;
-  ImGui_ImplVulkan_Init(&init_info, wd->RenderPass);
-
-  // Upload Fonts
-  {
-    // Use any command queue
-    VkCommandPool command_pool = wd->Frames[wd->FrameIndex].CommandPool;
-    VkCommandBuffer command_buffer = wd->Frames[wd->FrameIndex].CommandBuffer;
-
-    err = vkResetCommandPool(g_Device, command_pool, 0);
-    check_vk_result(err);
-    VkCommandBufferBeginInfo begin_info = {};
-    begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
-    begin_info.flags |= VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
-    err = vkBeginCommandBuffer(command_buffer, &begin_info);
-    check_vk_result(err);
-
-    ImGui_ImplVulkan_CreateFontsTexture(command_buffer);
-
-    VkSubmitInfo end_info = {};
-    end_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
-    end_info.commandBufferCount = 1;
-    end_info.pCommandBuffers = &command_buffer;
-    err = vkEndCommandBuffer(command_buffer);
-    check_vk_result(err);
-    err = vkQueueSubmit(g_Queue, 1, &end_info, VK_NULL_HANDLE);
-    check_vk_result(err);
-
-    err = vkDeviceWaitIdle(g_Device);
-    check_vk_result(err);
-    ImGui_ImplVulkan_DestroyFontUploadObjects();
-  }
-
-  // Demo state.
-  bool show_iree_window = true;
-  // --------------------------------------------------------------------------
-  // Setup IREE.
-
-  // Check API version.
-  iree_api_version_t actual_version;
-  iree_status_t status =
-      iree_api_version_check(IREE_API_VERSION_LATEST, &actual_version);
-  if (iree_status_is_ok(status)) {
-    fprintf(stdout, "IREE runtime API version: %d\n", actual_version);
-  } else {
-    fprintf(stderr, "Unsupported runtime API version: %d\n", actual_version);
-    abort();
-  }
-
-  // Create a runtime Instance.
-  iree_vm_instance_t* iree_instance = nullptr;
-  IREE_CHECK_OK(
-      iree_vm_instance_create(iree_allocator_system(), &iree_instance));
-
-  // Register HAL drivers and VM module types.
-  IREE_CHECK_OK(iree_hal_vulkan_driver_module_register(
-      iree_hal_driver_registry_default()));
-  IREE_CHECK_OK(iree_hal_module_register_all_types(iree_instance));
-
-  // Create IREE Vulkan Driver and Device, sharing our VkInstance/VkDevice.
-  fprintf(stdout, "Creating Vulkan driver/device\n");
-  // Load symbols from our static `vkGetInstanceProcAddr` for IREE to use.
-  iree_hal_vulkan_syms_t* iree_vk_syms = nullptr;
-  IREE_CHECK_OK(iree_hal_vulkan_syms_create(
-      reinterpret_cast<void*>(&vkGetInstanceProcAddr), iree_allocator_system(),
-      &iree_vk_syms));
-  // Create the driver sharing our VkInstance.
-  iree_hal_driver_t* iree_vk_driver = nullptr;
-  iree_string_view_t driver_identifier = iree_make_cstring_view("vulkan");
-  iree_hal_vulkan_driver_options_t driver_options;
-  driver_options.api_version = VK_API_VERSION_1_0;
-  driver_options.requested_features = static_cast<iree_hal_vulkan_features_t>(
-      IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS);
-  IREE_CHECK_OK(iree_hal_vulkan_driver_create_using_instance(
-      driver_identifier, &driver_options, iree_vk_syms, g_Instance,
-      iree_allocator_system(), &iree_vk_driver));
-  // Create a device sharing our VkDevice and queue.
-  // We could also create a separate (possibly low priority) compute queue for
-  // IREE, and/or provide a dedicated transfer queue.
-  iree_string_view_t device_identifier = iree_make_cstring_view("vulkan");
-  iree_hal_vulkan_queue_set_t compute_queue_set;
-  compute_queue_set.queue_family_index = g_QueueFamily;
-  compute_queue_set.queue_indices = 1 << 0;
-  iree_hal_vulkan_queue_set_t transfer_queue_set;
-  transfer_queue_set.queue_indices = 0;
-  iree_hal_device_t* iree_vk_device = nullptr;
-  IREE_CHECK_OK(iree_hal_vulkan_wrap_device(
-      device_identifier, &driver_options.device_options, iree_vk_syms,
-      g_Instance, g_PhysicalDevice, g_Device, &compute_queue_set,
-      &transfer_queue_set, iree_allocator_system(), &iree_vk_device));
-  // Create a HAL module using the HAL device.
-  iree_vm_module_t* hal_module = nullptr;
-  IREE_CHECK_OK(iree_hal_module_create(iree_instance, iree_vk_device,
-                                       IREE_HAL_MODULE_FLAG_NONE,
-                                       iree_allocator_system(), &hal_module));
-
-
-  // Load bytecode module
-  //iree_file_toc_t module_file_toc;
-  //const char network_model[] = "resnet50_tf.vmfb";
-  //fprintf(stdout, "Loading: %s\n", network_model);
-  //if (load_file(network_model, &module_file_toc.data, &module_file_toc.size) == false)
-  //{
-  //    abort();
-  //    return 1;
-  //}
-  //fprintf(stdout, "module size: %zu\n", module_file_toc.size);
-
-  iree_vm_module_t* bytecode_module = nullptr;
-  iree_status_t module_status = iree_tooling_load_module_from_flags(
-      iree_instance, iree_allocator_system(), &bytecode_module);
-  if (!iree_status_is_ok(module_status))
-    return -1;
-  //IREE_CHECK_OK(iree_vm_bytecode_module_create(
-  //    iree_instance,
-  //    iree_const_byte_span_t{
-  //        reinterpret_cast<const uint8_t*>(module_file_toc.data),
-  //        module_file_toc.size},
-  //    iree_allocator_null(), iree_allocator_system(), &bytecode_module));
-  //// Query for details about what is in the loaded module.
-  //iree_vm_module_signature_t bytecode_module_signature =
-  //    iree_vm_module_signature(bytecode_module);
-  //fprintf(stdout, "Module loaded, have <%" PRIhsz "> exported functions:\n",
-  //        bytecode_module_signature.export_function_count);
-  //for (int i = 0; i < bytecode_module_signature.export_function_count; ++i) {
-  //  iree_vm_function_t function;
-  //  IREE_CHECK_OK(iree_vm_module_lookup_function_by_ordinal(
-  //      bytecode_module, IREE_VM_FUNCTION_LINKAGE_EXPORT, i, &function));
-  //  auto function_name = iree_vm_function_name(&function);
-  //  auto function_signature = iree_vm_function_signature(&function);
-
-  //  fprintf(stdout, "  %d: '%.*s' with calling convention '%.*s'\n", i,
-  //          (int)function_name.size, function_name.data,
-  //          (int)function_signature.calling_convention.size,
-  //          function_signature.calling_convention.data);
-  //}
-
-  // Allocate a context that will hold the module state across invocations.
-  iree_vm_context_t* iree_context = nullptr;
-  std::vector<iree_vm_module_t*> modules = {hal_module, bytecode_module};
-  IREE_CHECK_OK(iree_vm_context_create_with_modules(
-      iree_instance, IREE_VM_CONTEXT_FLAG_NONE, modules.size(), modules.data(),
-      iree_allocator_system(), &iree_context));
-  fprintf(stdout, "Context with modules is ready for use\n");
-
-  // Lookup the entry point function.
-  iree_vm_function_t main_function;
-  const char kMainFunctionName[] = "module.forward";
-  IREE_CHECK_OK(iree_vm_context_resolve_function(
-      iree_context,
-      iree_string_view_t{kMainFunctionName, sizeof(kMainFunctionName) - 1},
-      &main_function));
-  iree_string_view_t main_function_name = iree_vm_function_name(&main_function);
-  fprintf(stdout, "Resolved main function named '%.*s'\n",
-          (int)main_function_name.size, main_function_name.data);
-
-  // --------------------------------------------------------------------------
-
-        // Write inputs into mappable buffers.
-        iree_hal_allocator_t* allocator =
-            iree_hal_device_allocator(iree_vk_device);
-        //iree_hal_memory_type_t input_memory_type =
-        //    static_cast<iree_hal_memory_type_t>(
-        //        IREE_HAL_MEMORY_TYPE_HOST_LOCAL |
-        //        IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE);
-        //iree_hal_buffer_usage_t input_buffer_usage =
-        //    static_cast<iree_hal_buffer_usage_t>(IREE_HAL_BUFFER_USAGE_DEFAULT);
-        //iree_hal_buffer_params_t buffer_params;
-        //buffer_params.type = input_memory_type;
-        //buffer_params.usage = input_buffer_usage;
-        //buffer_params.access = IREE_HAL_MEMORY_ACCESS_READ | IREE_HAL_MEMORY_ACCESS_WRITE;
-
-       // Wrap input buffers in buffer views.
-
-        vm::ref<iree_vm_list_t> inputs;
-        iree_status_t input_status = ParseToVariantList(
-            allocator,
-            iree::span<const std::string>{FLAG_function_inputs.data(),
-                                          FLAG_function_inputs.size()},
-            iree_allocator_system(), &inputs);
-        if (!iree_status_is_ok(input_status))
-            return -1;
-        //vm::ref<iree_vm_list_t> inputs;
-        //IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, 6, iree_allocator_system(), &inputs));
-
-        //iree_hal_buffer_view_t* input0_buffer_view = nullptr;
-        //constexpr iree_hal_dim_t input_buffer_shape[] = {1, 224, 224, 3};
-        //IREE_CHECK_OK(iree_hal_buffer_view_allocate_buffer(
-        //    allocator,
-        //    /*shape_rank=*/4, /*shape=*/input_buffer_shape,
-        //    IREE_HAL_ELEMENT_TYPE_FLOAT_32,
-        //    IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params,
-        //    iree_make_const_byte_span(&input_res50, sizeof(input_res50)),
-        //    &input0_buffer_view));
-
-        //auto input0_buffer_view_ref = iree_hal_buffer_view_move_ref(input0_buffer_view);
-        //IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs.get(), &input0_buffer_view_ref));
-
-        // Prepare outputs list to accept results from the invocation.
-
-        vm::ref<iree_vm_list_t> outputs;
-        constexpr iree_hal_dim_t kOutputCount = 1000;
-        IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, kOutputCount * sizeof(float), iree_allocator_system(), &outputs));
-
-  // --------------------------------------------------------------------------
-
-  // Main loop.
-  bool done = false;
-  while (!done) {
-    SDL_Event event;
-
-    while (SDL_PollEvent(&event)) {
-      if (event.type == SDL_QUIT) {
-        done = true;
-      }
-
-      ImGui_ImplSDL2_ProcessEvent(&event);
-      if (event.type == SDL_QUIT) done = true;
-      if (event.type == SDL_WINDOWEVENT &&
-          event.window.event == SDL_WINDOWEVENT_RESIZED &&
-          event.window.windowID == SDL_GetWindowID(window)) {
-        g_SwapChainResizeWidth = (int)event.window.data1;
-        g_SwapChainResizeHeight = (int)event.window.data2;
-        g_SwapChainRebuild = true;
-      }
-    }
-
-    if (g_SwapChainRebuild) {
-      g_SwapChainRebuild = false;
-      ImGui_ImplVulkan_SetMinImageCount(g_MinImageCount);
-      ImGui_ImplVulkanH_CreateOrResizeWindow(
-          g_Instance, g_PhysicalDevice, g_Device, &g_MainWindowData,
-          g_QueueFamily, g_Allocator, g_SwapChainResizeWidth,
-          g_SwapChainResizeHeight, g_MinImageCount);
-      g_MainWindowData.FrameIndex = 0;
-    }
-
-    // Start the Dear ImGui frame
-    ImGui_ImplVulkan_NewFrame();
-    ImGui_ImplSDL2_NewFrame(window);
-    ImGui::NewFrame();
-
-    // Custom window.
-    {
-      ImGui::Begin("IREE Vulkan Integration Demo", &show_iree_window);
-
-      ImGui::Separator();
-
-      // ImGui Inputs for two input tensors.
-      // Run computation whenever any of the values changes.
-      static bool dirty = true;
-      if (dirty) {
-
-        // Synchronously invoke the function.
-        IREE_CHECK_OK(iree_vm_invoke(iree_context, main_function,
-                                     IREE_VM_INVOCATION_FLAG_NONE,
-                                     /*policy=*/nullptr, inputs.get(),
-                                     outputs.get(), iree_allocator_system()));
-
-
-        // we want to run continuously so we can use tools like RenderDoc, RGP, etc...
-        dirty = true;
-      }
-
-      // Framerate counter.
-      ImGui::Text("Application average %.3f ms/frame (%.1f FPS)",
-                  1000.0f / ImGui::GetIO().Framerate, ImGui::GetIO().Framerate);
-
-      ImGui::End();
-    }
-
-    // Rendering
-    ImGui::Render();
-    RenderFrame(wd, g_Device, g_Queue);
-
-    PresentFrame(wd, g_Queue);
-  }
-  // --------------------------------------------------------------------------
-
-  // --------------------------------------------------------------------------
-  // Cleanup
-  iree_vm_module_release(hal_module);
-  iree_vm_module_release(bytecode_module);
-  iree_vm_context_release(iree_context);
-  iree_hal_device_release(iree_vk_device);
-  iree_hal_allocator_release(allocator);
-  iree_hal_driver_release(iree_vk_driver);
-  iree_hal_vulkan_syms_release(iree_vk_syms);
-  iree_vm_instance_release(iree_instance);
-
-  err = vkDeviceWaitIdle(g_Device);
-  check_vk_result(err);
-  ImGui_ImplVulkan_Shutdown();
-  ImGui_ImplSDL2_Shutdown();
-  ImGui::DestroyContext();
-
-  CleanupVulkanWindow();
-  CleanupVulkan();
-
-  SDL_DestroyWindow(window);
-  SDL_Quit();
-  // --------------------------------------------------------------------------
-
-  return 0;
-}
-
-}  // namespace iree
--- a/cpp/vulkan_gui/vulkan_resnet_inference_gui.cc
+++ b/cpp/vulkan_gui/vulkan_resnet_inference_gui.cc
--- a/generate_sharktank.py
+++ b/generate_sharktank.py
@@ -2,23 +2,19 @@
 """SHARK Tank"""
 # python generate_sharktank.py, you have to give a csv tile with [model_name, model_download_url]
 # will generate local shark tank folder like this:
-#   HOME
-#     /.local
-#       /shark_tank
-#           /albert_lite_base
-#           /...model_name...
+#   /SHARK
+#     /gen_shark_tank
+#       /albert_lite_base
+#       /...model_name...
 #

 import os
 import csv
 import argparse
 from shark.shark_importer import SharkImporter
-from shark.parser import shark_args
 import tensorflow as tf
-import subprocess as sp
 import hashlib
 import numpy as np
-from pathlib import Path

 visible_default = tf.config.list_physical_devices("GPU")
 try:
@@ -30,6 +26,9 @@ except:
    # Invalid device or cannot modify virtual devices once initialized.
    pass

+# All generated models and metadata will be saved under this directory.
+WORKDIR = os.path.join(os.path.dirname(__file__), "gen_shark_tank")
+

 def create_hash(file_name):
    with open(file_name, "rb") as f:
@@ -43,7 +42,6 @@ def create_hash(file_name):
 def save_torch_model(torch_model_list):
    from tank.model_utils import get_hf_model
    from tank.model_utils import get_vision_model
-    from tank.model_utils import get_hf_img_cls_model

    with open(torch_model_list) as csvfile:
        torch_reader = csv.reader(csvfile, delimiter=",")
@@ -52,10 +50,8 @@ def save_torch_model(torch_model_list):
            torch_model_name = row[0]
            tracing_required = row[1]
            model_type = row[2]
-            is_dynamic = row[3]

            tracing_required = False if tracing_required == "False" else True
-            is_dynamic = False if is_dynamic == "False" else True

            model = None
            input = None
@@ -63,8 +59,6 @@ def save_torch_model(torch_model_list):
                model, input, _ = get_vision_model(torch_model_name)
            elif model_type == "hf":
                model, input, _ = get_hf_model(torch_model_name)
-            elif model_type == "hf_img_cls":
-                model, input, _ = get_hf_img_cls_model(torch_model_name)

            torch_model_name = torch_model_name.replace("/", "_")
            torch_model_dir = os.path.join(
@@ -90,22 +84,17 @@ def save_torch_model(torch_model_list):
            )
            np.save(os.path.join(torch_model_dir, "hash"), np.array(mlir_hash))
            # Generate torch dynamic models.
-            if is_dynamic:
-                mlir_importer.import_debug(
-                    is_dynamic=True,
-                    tracing_required=tracing_required,
-                    dir=torch_model_dir,
-                    model_name=torch_model_name + "_dynamic",
-                )
+            mlir_importer.import_debug(
+                is_dynamic=True,
+                tracing_required=tracing_required,
+                dir=torch_model_dir,
+                model_name=torch_model_name + "_dynamic",
+            )


 def save_tf_model(tf_model_list):
-    from tank.model_utils_tf import (
-        get_causal_image_model,
-        get_causal_lm_model,
-        get_keras_model,
-        get_TFhf_model,
-    )
+    from tank.model_utils_tf import get_causal_lm_model
+    from tank.model_utils_tf import get_causal_image_model

    with open(tf_model_list) as csvfile:
        tf_reader = csv.reader(csvfile, delimiter=",")
@@ -116,15 +105,11 @@ def save_tf_model(tf_model_list):

            model = None
            input = None
-            print(f"Generating artifacts for model {tf_model_name}")
+            print(f"Generating {model_type} model {tf_model_name}")  # name it
            if model_type == "hf":
                model, input, _ = get_causal_lm_model(tf_model_name)
            if model_type == "img":
                model, input, _ = get_causal_image_model(tf_model_name)
-            if model_type == "keras":
-                model, input, _ = get_keras_model(tf_model_name)
-            if model_type == "TFhf":
-                model, input, _ = get_TFhf_model(tf_model_name)

            tf_model_name = tf_model_name.replace("/", "_")
            tf_model_dir = os.path.join(WORKDIR, str(tf_model_name) + "_tf")
@@ -205,14 +190,14 @@ if __name__ == "__main__":
    parser.add_argument(
        "--torch_model_csv",
        type=lambda x: is_valid_file(x),
-        default="./tank/torch_model_list.csv",
+        default="./tank/pytorch/torch_model_list.csv",
        help="""Contains the file with torch_model name and args.
-             Please see: https://github.com/nod-ai/SHARK/blob/main/tank/torch_model_list.csv""",
+             Please see: https://github.com/nod-ai/SHARK/blob/main/tank/pytorch/torch_model_list.csv""",
    )
    parser.add_argument(
        "--tf_model_csv",
        type=lambda x: is_valid_file(x),
-        default="./tank/tf_model_list.csv",
+        default="./tank/tf/tf_model_list.csv",
        help="Contains the file with tf model name and args.",
    )
    parser.add_argument(
@@ -221,21 +206,9 @@ if __name__ == "__main__":
        default="./tank/tflite/tflite_model_list.csv",
        help="Contains the file with tf model name and args.",
    )
-    parser.add_argument(
-        "--ci_tank_dir",
-        type=bool,
-        default=False,
-    )
    parser.add_argument("--upload", type=bool, default=False)

    args = parser.parse_args()
-
-    home = str(Path.home())
-    if args.ci_tank_dir == True:
-        WORKDIR = os.path.join(os.path.dirname(__file__), "gen_shark_tank")
-    else:
-        WORKDIR = os.path.join(home, ".local/shark_tank/")
-
    if args.torch_model_csv:
        save_torch_model(args.torch_model_csv)

@@ -246,6 +219,5 @@ if __name__ == "__main__":
        save_tflite_model(args.tflite_model_csv)

    if args.upload:
-        git_hash = sp.getoutput("git log -1 --format='%h'") + "/"
-        print("uploading files to gs://shark_tank/" + git_hash)
-        os.system(f"gsutil cp -r {WORKDIR}* gs://shark_tank/" + git_hash)
+        print("uploading files to gs://shark_tank/")
+        os.system(f"gsutil cp -r {WORKDIR}/* gs://shark_tank/")  # cwd-agnostic
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,9 +4,9 @@ requires = [
    "wheel",
    "packaging",

-    "numpy>=1.22.4",
-    "torch-mlir>=20221021.633",
-    "iree-compiler>=20221022.190",
-    "iree-runtime>=20221022.190",
+    "numpy==1.22.4",
+    "torch-mlir>=20220428.420",
+    "iree-compiler>=20220427.13",
+    "iree-runtime>=20220427.13",
 ]
 build-backend = "setuptools.build_meta"
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,3 +1,3 @@
 [pytest]
 addopts = --verbose -p no:warnings
-norecursedirs = inference tank/tflite examples benchmarks shark 
+norecursedirs = inference tank/tflite 
--- a/requirements-importer-macos.txt
+++ b/requirements-importer-macos.txt
@@ -1,4 +1,4 @@
-f https://download.pytorch.org/whl/nightly/cpu/
+-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
 --pre

 numpy
@@ -19,17 +19,13 @@ tensorflow-macos
 tensorflow-metal
 #tf-models-nightly
 #tensorflow-text-nightly
-transformers
+transformers==4.18.0
 tensorflow-probability
 #jax[cpu]

 # tflitehub dependencies.
 Pillow

-# web dependecies.
-gradio
-altair
-
 # Testing and support.
 #lit
 #pyyaml
--- a/requirements-importer.txt
+++ b/requirements-importer.txt
@@ -2,6 +2,7 @@
 --pre

 numpy==1.22.4
+torch
 torchvision

 tqdm
@@ -13,12 +14,10 @@ iree-tools-tf

 # TensorFlow and JAX.
 gin-config
-tensorflow==2.10
-keras==2.10
+tensorflow
 #tf-models-nightly
 #tensorflow-text-nightly
-transformers
-diffusers
+transformers==4.18.0
 #tensorflow-probability
 #jax[cpu]

@@ -29,13 +28,6 @@ Pillow
 # Testing and support.
 lit
 pyyaml
-python-dateutil
-sacremoses
-
-# web dependecies.
-gradio
-altair
-scipy

 #ONNX and ORT for benchmarking
 #--extra-index-url https://test.pypi.org/simple/
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,23 +1,13 @@
 setuptools
 wheel
-pyinstaller

 # SHARK Runner
 tqdm

 # SHARK Downloader
-google-cloud-storage
+gsutil

 # Testing
 pytest
 pytest-xdist
 Pillow
-parameterized
-
-# Add transformers, diffusers and scipy since it most commonly used
-transformers
-diffusers
-scipy
-ftfy
-gradio
-altair
--- a/setup.py
+++ b/setup.py
@@ -6,19 +6,7 @@ import os
 with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

-with open("build_tools/shark_versions.txt", "r") as sv:
-    lines = [line.rstrip() for line in sv]
-    TM_VERSION = lines[7]
-    IREE_VERSION = lines[3]
-
-
 PACKAGE_VERSION = os.environ.get("SHARK_PACKAGE_VERSION") or "0.0.4"
-backend_deps = []
-if "NO_BACKEND" in os.environ.keys():
-    backend_deps = [
-        f"iree-compiler=={IREE_VERSION}",
-        f"iree-runtime>={IREE_VERSION}",
-    ]

 setup(
    name="nodai-SHARK",
@@ -39,11 +27,12 @@ setup(
        "Operating System :: OS Independent",
    ],
    packages=find_packages(exclude=("examples")),
-    python_requires=">=3.9",
+    python_requires=">=3.7",
    install_requires=[
        "numpy",
        "PyYAML",
-        f"torch-mlir=={TM_VERSION}",
-    ]
-    + backend_deps,
+        "torch-mlir>=20220428.420",
+        "iree-compiler>=20220427.13",
+        "iree-runtime>=20220427.13",
+    ],
 )
--- a/setup_venv.ps1
+++ b/setup_venv.ps1
@@ -1,39 +0,0 @@
-#Write-Host "Installing python"
-
-#Start-Process winget install Python.Python.3.10 '/quiet InstallAllUsers=1 PrependPath=1' -wait -NoNewWindow
-
-#Write-Host "python installation completed successfully"
-
-#Write-Host "Reload environment variables"
-#$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
-#Write-Host "Reloaded environment variables"
-
-
-# redirect stderr into stdout
-$p = &{python -V} 2>&1
-# check if an ErrorRecord was returned
-$version = if($p -is [System.Management.Automation.ErrorRecord])
-{
-    # grab the version string from the error message
-    $p.Exception.Message
-}
-else
-{
-    # otherwise return as is
-    $p
-}
-
-Write-Host "Python version found is"
-Write-Host $p
-
-
-Write-Host "Installing Build Dependencies"
-python -m venv .\shark.venv\
-.\shark.venv\Scripts\activate
-pip install -r requirements.txt
-pip install --pre torch-mlir torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/
-pip install --upgrade -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html iree-compiler iree-runtime
-Write-Host "Building SHARK..."
-pip install -e . -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
-Write-Host "Build and installation completed successfully"
-Write-Host "Source your venv with ./shark.venv/Scripts/activate"
--- a/setup_venv.sh
+++ b/setup_venv.sh
@@ -7,8 +7,6 @@
 # VENV_DIR=myshark.venv #create a venv called myshark.venv
 # USE_IREE=1 #use stock IREE instead of Nod.ai's SHARK build
 # IMPORTER=1 #Install importer deps
-# BENCHMARK=1 #Install benchmark deps
-# NO_BACKEND=1 #Don't install iree or shark backend
 # if you run the script from a conda env it will install in your conda env

 TD="$(cd $(dirname $0) && pwd)"
@@ -76,19 +74,11 @@ fi
 $PYTHON -m pip install --upgrade pip || die "Could not upgrade pip"
 $PYTHON -m pip install --upgrade -r "$TD/requirements.txt"
 if [ "$torch_mlir_bin" = true ]; then
-  TM_VERSION=$(sed '8q;d' build_tools/shark_versions.txt)
-  if [[ $(uname -s) = 'Darwin' ]]; then
-    echo "MacOS detected. Installing torch-mlir from .whl, to avoid dependency problems with torch."
-    $PYTHON -m pip install --pre --no-cache-dir  torch-mlir==${TM_VERSION} -f https://llvm.github.io/torch-mlir/package-index/ -f https://download.pytorch.org/whl/nightly/torch/
-  elif [[ ! -z "${NIGHTLY}" ]]; then
-    $PYTHON -m pip install --pre torch-mlir -f https://llvm.github.io/torch-mlir/package-index/
+  $PYTHON -m pip install --find-links https://github.com/llvm/torch-mlir/releases torch-mlir --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+  if [ $? -eq 0 ];then
+    echo "Successfully Installed torch-mlir"
  else
-    $PYTHON -m pip install --pre torch-mlir==${TM_VERSION} -f https://llvm.github.io/torch-mlir/package-index/
-    if [ $? -eq 0 ]; then
-      echo "Successfully Installed torch-mlir"
-    else
-      echo "Could not install torch-mlir" >&2
-    fi
+    echo "Could not install torch-mlir" >&2
  fi
 else
  echo "${Red}No binaries found for Python $PYTHON_VERSION_X_Y on $(uname -s)"
@@ -97,65 +87,34 @@ else
  exit 1
 fi
 if [[ -z "${USE_IREE}" ]]; then
-  rm .use-iree
-  RUNTIME="https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html"
-  RUNTIME_VERSION=$(sed '4q;d' build_tools/shark_versions.txt)
-  TM_VERSION=$(sed '8q;d' build_tools/shark_versions.txt)
+  RUNTIME="nod-ai/SHARK-Runtime"
 else
-  touch ./.use-iree
-  RUNTIME="https://iree-org.github.io/iree/pip-release-links.html"
-  RUNTIME_VERSION=$(sed '2q;d' build_tools/shark_versions.txt)
-  TM_VERSION=$(sed '6q;d' build_tools/shark_versions.txt)
-fi
-if [[ -z "${NO_BACKEND}" ]]; then
-  echo "Installing ${RUNTIME}..."
-  $PYTHON -m pip install --upgrade --find-links ${RUNTIME} iree-compiler==${RUNTIME_VERSION} iree-runtime==${RUNTIME_VERSION}
-else
-  echo "Not installing a backend, please make sure to add your backend to PYTHONPATH"
+  RUNTIME="google/iree"
 fi
+echo "Installing ${RUNTIME}..."
+$PYTHON -m pip install --find-links https://github.com/${RUNTIME}/releases iree-compiler iree-runtime

 if [[ ! -z "${IMPORTER}" ]]; then
  echo "${Yellow}Installing importer tools.."
  if [[ $(uname -s) = 'Linux' ]]; then
    echo "${Yellow}Linux detected.. installing Linux importer tools"
-    #Always get the importer tools from upstream IREE
-    $PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer.txt" -f https://iree-org.github.io/iree/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+    $PYTHON -m pip install --upgrade -r "$TD/requirements-importer.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://test.pypi.org/simple/ --extra-index-url https://download.pytorch.org/whl/nightly/cu116
  elif [[ $(uname -s) = 'Darwin' ]]; then
    echo "${Yellow}macOS detected.. installing macOS importer tools"
    #Conda seems to have some problems installing these packages and hope they get resolved upstream.
-    $PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer-macos.txt" -f ${RUNTIME} --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+    $PYTHON -m pip install --upgrade -r "$TD/requirements-importer-macos.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
  fi
 fi

-$PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f https://download.pytorch.org/whl/nightly/torch/
+$PYTHON -m pip install -e . --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://github.com/llvm/torch-mlir/releases -f https://github.com/${RUNTIME}/releases

-
-if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
+if [[ $(uname -s) = 'Linux' && ! -z "${IMPORTER}" ]]; then
  $PYTHON -m pip uninstall -y torch torchvision
-  $PYTHON -m pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu117
+  $PYTHON -m pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116
  if [ $? -eq 0 ];then
-    echo "Successfully Installed torch + cu117."
+    echo "Successfully Installed torch + cu116."
  else
-    echo "Could not install torch + cu117." >&2
-  fi
-fi
-
-if [[ ! -z "${ONNX}" ]]; then
-  echo "${Yellow}Installing ONNX and onnxruntime for benchmarks..."
-  $PYTHON -m pip install onnx onnxruntime psutil
-  if [ $? -eq 0 ];then
-    echo "Successfully installed ONNX and ONNX runtime."
-  else
-    echo "Could not install ONNX." >&2
-  fi
-fi
-
-if [[ ! -z "${NIGHTLY}" ]]; then
-  $PYTHON -m pip install --upgrade --pre iree-compiler iree-runtime torch-mlir -f https://llvm.github.io/torch-mlir/package-index/ -f $RUNTIME -f https://download.pytorch.org/whl/nightly/torch/
-  if [ $? -eq 0 ];then
-    echo "Successfully Installed latest packages for nightly job."
-  else
-    echo "Could not install latest IREE and Torch-MLIR." >&2
+    echo "Could not install torch + cu116." >&2
  fi
 fi

--- a/shark/examples/shark_dynamo/basic_examples.py
+++ b/shark/examples/shark_dynamo/basic_examples.py
@@ -1,70 +0,0 @@
-import torchdynamo
-import torch
-import torch_mlir
-from shark.sharkdynamo.utils import make_shark_compiler
-
-
-import warnings, logging
-
-warnings.simplefilter("ignore")
-torchdynamo.config.log_level = logging.ERROR
-
-
-torchdynamo.reset()
-
-
-@torchdynamo.optimize(
-    make_shark_compiler(use_tracing=False, device="cuda", verbose=False)
-)
-def foo(t):
-    return 2 * t
-
-
-example_input = torch.rand((2, 3))
-x = foo(example_input)
-print(x)
-
-
-torchdynamo.reset()
-
-
-@torchdynamo.optimize(
-    make_shark_compiler(use_tracing=False, device="cuda", verbose=False)
-)
-def foo(a, b):
-    x = a / (a + 1)
-    if b.sum() < 0:
-        b = b * -1
-    return x * b
-
-
-print(foo(torch.rand((2, 3)), -torch.rand((2, 3))))
-
-
-torchdynamo.reset()
-
-
-@torchdynamo.optimize(
-    make_shark_compiler(use_tracing=False, device="cuda", verbose=True)
-)
-def foo(a):
-    for i in range(10):
-        a += 1.0
-    return a
-
-
-print(foo(torch.rand((1, 2))))
-
-torchdynamo.reset()
-
-
-@torchdynamo.optimize(
-    make_shark_compiler(use_tracing=False, device="cuda", verbose=True)
-)
-def test_unsupported_types(t, y):
-    return t, 2 * y
-
-
-str_input = "hello"
-tensor_input = torch.randn(2)
-print(test_unsupported_types(str_input, tensor_input))
--- a/shark/examples/shark_eager/squeezenet_lockstep.py
+++ b/shark/examples/shark_eager/squeezenet_lockstep.py
@@ -1,73 +0,0 @@
-import torch
-import numpy as np
-
-model = torch.hub.load(
-    "pytorch/vision:v0.10.0", "squeezenet1_0", pretrained=True
-)
-model.eval()
-
-# from PIL import Image
-# from torchvision import transforms
-# import urllib
-#
-# url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg")
-# try: urllib.URLopener().retrieve(url, filename)
-# except: urllib.request.urlretrieve(url, filename)
-#
-#
-# input_image = Image.open(filename)
-# preprocess = transforms.Compose([
-#     transforms.Resize(256),
-#     transforms.CenterCrop(224),
-#     transforms.ToTensor(),
-#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
-# ])
-# input_tensor = preprocess(input_image)
-# input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model
-# print(input_batch.shape) # size = [1, 3, 224, 224]
-
-# The above is code for generating sample inputs from an image. We can just use
-# random values for accuracy testing though
-input_batch = torch.randn(1, 3, 224, 224)
-
-
-# Focus on CPU for now
-if False and torch.cuda.is_available():
-    input_batch = input_batch.to("cuda")
-    model.to("cuda")
-
-with torch.no_grad():
-    output = model(input_batch)
-# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes
-golden_confidences = output[0]
-# The output has unnormalized scores. To get probabilities, you can run a softmax on it.
-golden_probabilities = torch.nn.functional.softmax(
-    golden_confidences, dim=0
-).numpy()
-
-golden_confidences = golden_confidences.numpy()
-
-from shark.torch_mlir_lockstep_tensor import TorchMLIRLockstepTensor
-
-input_detached_clone = input_batch.clone()
-eager_input_batch = TorchMLIRLockstepTensor(input_detached_clone)
-
-print("getting torch-mlir result")
-
-output = model(eager_input_batch)
-
-static_output = output.elem
-confidences = static_output[0]
-probabilities = torch.nn.functional.softmax(
-    torch.from_numpy(confidences), dim=0
-).numpy()
-
-print("The obtained result via shark is: ", confidences)
-print("The golden result is:", golden_confidences)
-
-np.testing.assert_allclose(
-    golden_confidences, confidences, rtol=1e-02, atol=1e-03
-)
-np.testing.assert_allclose(
-    golden_probabilities, probabilities, rtol=1e-02, atol=1e-03
-)
--- a/shark/examples/shark_inference/CLIPModel_tf.py
+++ b/shark/examples/shark_inference/CLIPModel_tf.py
@@ -22,7 +22,7 @@ class CLIPModule(tf.Module):
            input_ids=x, attention_mask=y, pixel_values=z
        )

-    @tf.function(input_signature=clip_vit_inputs, jit_compile=True)
+    @tf.function(input_signature=clip_vit_inputs)
    def forward(self, input_ids, attention_mask, pixel_values):
        return self.m.predict(
            input_ids, attention_mask, pixel_values
--- a/shark/examples/shark_inference/ESRGAN/README.md
+++ b/shark/examples/shark_inference/ESRGAN/README.md
@@ -1,15 +0,0 @@
-## Running ESRGAN
-
-```
-1. pip install numpy opencv-python
-2. mkdir InputImages
-   (this is where all the input images will reside in)
-3. mkdir OutputImages
-   (this is where the model will generate all the images)
-4. mkdir models
-   (save the .pth checkpoint file here)
-5. python esrgan.py
-```
-
- Download [RRDB_ESRGAN_x4.pth](https://drive.google.com/drive/u/0/folders/17VYV_SoZZesU6mbxz2dMAIccSSlqLecY) and place it in the `models` directory as mentioned above in step 4.
- Credits : [ESRGAN](https://github.com/xinntao/ESRGAN)
--- a/shark/examples/shark_inference/ESRGAN/esrgan.py
+++ b/shark/examples/shark_inference/ESRGAN/esrgan.py
@@ -1,240 +0,0 @@
-from ast import arg
-import os.path as osp
-import glob
-import cv2
-import numpy as np
-import torch
-
-from torch.fx.experimental.proxy_tensor import make_fx
-from torch._decomp import get_decompositions
-from shark.shark_inference import SharkInference
-import torch_mlir
-import tempfile
-import functools
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-def make_layer(block, n_layers):
-    layers = []
-    for _ in range(n_layers):
-        layers.append(block())
-    return nn.Sequential(*layers)
-
-
-class ResidualDenseBlock_5C(nn.Module):
-    def __init__(self, nf=64, gc=32, bias=True):
-        super(ResidualDenseBlock_5C, self).__init__()
-        # gc: growth channel, i.e. intermediate channels
-        self.conv1 = nn.Conv2d(nf, gc, 3, 1, 1, bias=bias)
-        self.conv2 = nn.Conv2d(nf + gc, gc, 3, 1, 1, bias=bias)
-        self.conv3 = nn.Conv2d(nf + 2 * gc, gc, 3, 1, 1, bias=bias)
-        self.conv4 = nn.Conv2d(nf + 3 * gc, gc, 3, 1, 1, bias=bias)
-        self.conv5 = nn.Conv2d(nf + 4 * gc, nf, 3, 1, 1, bias=bias)
-        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
-
-        # initialization
-        # mutil.initialize_weights([self.conv1, self.conv2, self.conv3, self.conv4, self.conv5], 0.1)
-
-    def forward(self, x):
-        x1 = self.lrelu(self.conv1(x))
-        x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
-        x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
-        x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
-        x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
-        return x5 * 0.2 + x
-
-
-class RRDB(nn.Module):
-    """Residual in Residual Dense Block"""
-
-    def __init__(self, nf, gc=32):
-        super(RRDB, self).__init__()
-        self.RDB1 = ResidualDenseBlock_5C(nf, gc)
-        self.RDB2 = ResidualDenseBlock_5C(nf, gc)
-        self.RDB3 = ResidualDenseBlock_5C(nf, gc)
-
-    def forward(self, x):
-        out = self.RDB1(x)
-        out = self.RDB2(out)
-        out = self.RDB3(out)
-        return out * 0.2 + x
-
-
-class RRDBNet(nn.Module):
-    def __init__(self, in_nc, out_nc, nf, nb, gc=32):
-        super(RRDBNet, self).__init__()
-        RRDB_block_f = functools.partial(RRDB, nf=nf, gc=gc)
-
-        self.conv_first = nn.Conv2d(in_nc, nf, 3, 1, 1, bias=True)
-        self.RRDB_trunk = make_layer(RRDB_block_f, nb)
-        self.trunk_conv = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
-        #### upsampling
-        self.upconv1 = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
-        self.upconv2 = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
-        self.HRconv = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
-        self.conv_last = nn.Conv2d(nf, out_nc, 3, 1, 1, bias=True)
-
-        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
-
-    def forward(self, x):
-        fea = self.conv_first(x)
-        trunk = self.trunk_conv(self.RRDB_trunk(fea))
-        fea = fea + trunk
-
-        fea = self.lrelu(
-            self.upconv1(F.interpolate(fea, scale_factor=2, mode="nearest"))
-        )
-        fea = self.lrelu(
-            self.upconv2(F.interpolate(fea, scale_factor=2, mode="nearest"))
-        )
-        out = self.conv_last(self.lrelu(self.HRconv(fea)))
-
-        return out
-
-
-############### Parsing args #####################
-import argparse
-
-p = argparse.ArgumentParser(
-    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
-)
-
-p.add_argument("--device", type=str, default="cpu", help="the device to use")
-p.add_argument(
-    "--mlir_loc",
-    type=str,
-    default=None,
-    help="location of the model's mlir file",
-)
-args = p.parse_args()
-###################################################
-
-
-def inference(input_m):
-    return model(input_m)
-
-
-def load_mlir(mlir_loc):
-    import os
-
-    if mlir_loc == None:
-        return None
-    print(f"Trying to load the model from {mlir_loc}.")
-    with open(os.path.join(mlir_loc)) as f:
-        mlir_module = f.read()
-    return mlir_module
-
-
-def compile_through_fx(model, inputs, mlir_loc=None):
-
-    module = load_mlir(mlir_loc)
-    if module == None:
-        fx_g = make_fx(
-            model,
-            decomposition_table=get_decompositions(
-                [
-                    torch.ops.aten.embedding_dense_backward,
-                    torch.ops.aten.native_layer_norm_backward,
-                    torch.ops.aten.slice_backward,
-                    torch.ops.aten.select_backward,
-                    torch.ops.aten.norm.ScalarOpt_dim,
-                    torch.ops.aten.native_group_norm,
-                    torch.ops.aten.upsample_bilinear2d.vec,
-                    torch.ops.aten.split.Tensor,
-                    torch.ops.aten.split_with_sizes,
-                ]
-            ),
-        )(inputs)
-
-        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
-        fx_g.recompile()
-
-        def strip_overloads(gm):
-            """
-            Modifies the target of graph nodes in :attr:`gm` to strip overloads.
-            Args:
-                gm(fx.GraphModule): The input Fx graph module to be modified
-            """
-            for node in gm.graph.nodes:
-                if isinstance(node.target, torch._ops.OpOverload):
-                    node.target = node.target.overloadpacket
-            gm.recompile()
-
-        strip_overloads(fx_g)
-
-        ts_g = torch.jit.script(fx_g)
-
-        print("Torchscript graph generated successfully")
-        module = torch_mlir.compile(
-            ts_g,
-            inputs,
-            torch_mlir.OutputType.LINALG_ON_TENSORS,
-            use_tracing=False,
-            verbose=False,
-        )
-
-    mlir_model = str(module)
-    func_name = "forward"
-    shark_module = SharkInference(
-        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
-    )
-    shark_module.compile()
-
-    return shark_module
-
-
-model_path = "models/RRDB_ESRGAN_x4.pth"  # models/RRDB_ESRGAN_x4.pth OR models/RRDB_PSNR_x4.pth
-# device = torch.device('cuda')  # if you want to run on CPU, change 'cuda' -> cpu
-device = torch.device("cpu")
-
-test_img_folder = "InputImages/*"
-
-model = RRDBNet(3, 3, 64, 23, gc=32)
-model.load_state_dict(torch.load(model_path), strict=True)
-model.eval()
-model = model.to(device)
-
-print("Model path {:s}. \nTesting...".format(model_path))
-
-if __name__ == "__main__":
-    idx = 0
-    for path in glob.glob(test_img_folder):
-        idx += 1
-        base = osp.splitext(osp.basename(path))[0]
-        print(idx, base)
-        # read images
-        img = cv2.imread(path, cv2.IMREAD_COLOR)
-        img = img * 1.0 / 255
-        img = torch.from_numpy(
-            np.transpose(img[:, :, [2, 1, 0]], (2, 0, 1))
-        ).float()
-        img_LR = img.unsqueeze(0)
-        img_LR = img_LR.to(device)
-
-        with torch.no_grad():
-            shark_module = compile_through_fx(inference, img_LR)
-            shark_output = shark_module.forward((img_LR,))
-            shark_output = torch.from_numpy(shark_output)
-            shark_output = (
-                shark_output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
-            )
-            esrgan_output = (
-                model(img_LR).data.squeeze().float().cpu().clamp_(0, 1).numpy()
-            )
-        # SHARK OUTPUT
-        shark_output = np.transpose(shark_output[[2, 1, 0], :, :], (1, 2, 0))
-        shark_output = (shark_output * 255.0).round()
-        cv2.imwrite(
-            "OutputImages/{:s}_rlt_shark_output.png".format(base), shark_output
-        )
-        print("Generated SHARK's output")
-        # ESRGAN OUTPUT
-        esrgan_output = np.transpose(esrgan_output[[2, 1, 0], :, :], (1, 2, 0))
-        esrgan_output = (esrgan_output * 255.0).round()
-        cv2.imwrite(
-            "OutputImages/{:s}_rlt_esrgan_output.png".format(base),
-            esrgan_output,
-        )
-        print("Generated ESRGAN's output")
--- a/shark/examples/shark_inference/albert_maskfill_pt.py
+++ b/shark/examples/shark_inference/albert_maskfill_pt.py
@@ -18,23 +18,14 @@ class AlbertModule(torch.nn.Module):
        self.model.eval()

    def forward(self, input_ids, attention_mask):
-        return self.model(
-            input_ids=input_ids, attention_mask=attention_mask
-        ).logits
-
+        return self.model(input_ids=input_ids, attention_mask=attention_mask).logits

 if __name__ == "__main__":
    # Prepping Data
    tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
    text = "This [MASK] is very tasty."
-    encoded_inputs = tokenizer(
-        text,
-        padding="max_length",
-        truncation=True,
-        max_length=MAX_SEQUENCE_LENGTH,
-        return_tensors="pt",
-    )
-    inputs = (encoded_inputs["input_ids"], encoded_inputs["attention_mask"])
+    encoded_inputs = tokenizer(text, padding='max_length', truncation=True, max_length=MAX_SEQUENCE_LENGTH, return_tensors="pt")
+    inputs = (encoded_inputs["input_ids"],encoded_inputs["attention_mask"])
    mlir_importer = SharkImporter(
        AlbertModule(),
        inputs,
@@ -43,46 +34,26 @@ if __name__ == "__main__":
    minilm_mlir, func_name = mlir_importer.import_mlir(
        is_dynamic=False, tracing_required=True
    )
-    shark_module = SharkInference(
-        minilm_mlir, func_name, mlir_dialect="linalg"
-    )
+    shark_module = SharkInference(minilm_mlir, func_name, mlir_dialect="linalg")
    shark_module.compile()
    token_logits = torch.tensor(shark_module.forward(inputs))
-    mask_id = torch.where(
-        encoded_inputs["input_ids"] == tokenizer.mask_token_id
-    )[1]
+    mask_id = torch.where(encoded_inputs["input_ids"] == tokenizer.mask_token_id)[1]
    mask_token_logits = token_logits[0, mask_id, :]
    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
    for token in top_5_tokens:
-        print(
-            f"'>>> Sample/Warmup output: {text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
-        )
+        print(f"'>>> Sample/Warmup output: {text.replace(tokenizer.mask_token, tokenizer.decode(token))}'")    
    while True:
        try:
            new_text = input("Give me a sentence with [MASK] to fill: ")
-            encoded_inputs = tokenizer(
-                new_text,
-                padding="max_length",
-                truncation=True,
-                max_length=MAX_SEQUENCE_LENGTH,
-                return_tensors="pt",
-            )
-            inputs = (
-                encoded_inputs["input_ids"],
-                encoded_inputs["attention_mask"],
-            )
+            encoded_inputs = tokenizer(new_text, padding='max_length', truncation=True, max_length=MAX_SEQUENCE_LENGTH, return_tensors="pt")
+            inputs = (encoded_inputs["input_ids"],encoded_inputs["attention_mask"])
            token_logits = torch.tensor(shark_module.forward(inputs))
-            mask_id = torch.where(
-                encoded_inputs["input_ids"] == tokenizer.mask_token_id
-            )[1]
+            mask_id = torch.where(encoded_inputs["input_ids"] == tokenizer.mask_token_id)[1]
            mask_token_logits = token_logits[0, mask_id, :]
-            top_5_tokens = (
-                torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
-            )
+            top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
            for token in top_5_tokens:
-                print(
-                    f"'>>> {new_text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
-                )
+                print(f"'>>> {new_text.replace(tokenizer.mask_token, tokenizer.decode(token))}'")    
        except KeyboardInterrupt:
            print("Exiting program.")
            break
+
--- a/shark/examples/shark_inference/albert_maskfill_tf.py
+++ b/shark/examples/shark_inference/albert_maskfill_tf.py
@@ -18,17 +18,15 @@ BATCH_SIZE = 1
 # Create a set of inputs
 t5_inputs = [
    tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
-    tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
+    tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32)
 ]
-
-
 class AlbertModule(tf.Module):
    def __init__(self):
        super(AlbertModule, self).__init__()
        self.m = TFAutoModelForMaskedLM.from_pretrained("albert-base-v2")
-        self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)
+        self.m.predict = lambda x,y: self.m(input_ids=x, attention_mask=y)

-    @tf.function(input_signature=t5_inputs, jit_compile=True)
+    @tf.function(input_signature=t5_inputs)
    def forward(self, input_ids, attention_mask):
        return self.m.predict(input_ids, attention_mask)

@@ -38,14 +36,8 @@ if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
    # text = "This is a great [MASK]."
    text = "This [MASK] is very tasty."
-    encoded_inputs = tokenizer(
-        text,
-        padding="max_length",
-        truncation=True,
-        max_length=MAX_SEQUENCE_LENGTH,
-        return_tensors="tf",
-    )
-    inputs = (encoded_inputs["input_ids"], encoded_inputs["attention_mask"])
+    encoded_inputs = tokenizer(text, padding='max_length', truncation=True, max_length=MAX_SEQUENCE_LENGTH, return_tensors="tf")
+    inputs = (encoded_inputs["input_ids"],encoded_inputs["attention_mask"])
    mlir_importer = SharkImporter(
        AlbertModule(),
        inputs,
@@ -59,42 +51,22 @@ if __name__ == "__main__":
    output_idx = 0
    data_idx = 1
    token_logits = shark_module.forward(inputs)[output_idx][data_idx]
-    mask_id = np.where(
-        tf.squeeze(encoded_inputs["input_ids"]) == tokenizer.mask_token_id
-    )
+    mask_id = np.where(tf.squeeze(encoded_inputs["input_ids"]) == tokenizer.mask_token_id)
    mask_token_logits = token_logits[0, mask_id, :]
    top_5_tokens = np.flip(np.argsort(mask_token_logits)).squeeze()[0:5]
    for token in top_5_tokens:
-        print(
-            f"'>>> Sample/Warmup output: {text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
-        )
+        print(f"'>>> Sample/Warmup output: {text.replace(tokenizer.mask_token, tokenizer.decode(token))}'")    
    while True:
        try:
            new_text = input("Give me a sentence with [MASK] to fill: ")
-            encoded_inputs = tokenizer(
-                new_text,
-                padding="max_length",
-                truncation=True,
-                max_length=MAX_SEQUENCE_LENGTH,
-                return_tensors="tf",
-            )
-            inputs = (
-                encoded_inputs["input_ids"],
-                encoded_inputs["attention_mask"],
-            )
+            encoded_inputs = tokenizer(new_text, padding='max_length', truncation=True, max_length=MAX_SEQUENCE_LENGTH, return_tensors="tf")
+            inputs = (encoded_inputs["input_ids"],encoded_inputs["attention_mask"])
            token_logits = shark_module.forward(inputs)[output_idx][data_idx]
-            mask_id = np.where(
-                tf.squeeze(encoded_inputs["input_ids"])
-                == tokenizer.mask_token_id
-            )
+            mask_id = np.where(tf.squeeze(encoded_inputs["input_ids"]) == tokenizer.mask_token_id)
            mask_token_logits = token_logits[0, mask_id, :]
-            top_5_tokens = np.flip(np.argsort(mask_token_logits)).squeeze()[
-                0:5
-            ]
+            top_5_tokens = np.flip(np.argsort(mask_token_logits)).squeeze()[0:5]
            for token in top_5_tokens:
-                print(
-                    f"'>>> {new_text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
-                )
+                print(f"'>>> {new_text.replace(tokenizer.mask_token, tokenizer.decode(token))}'")    
        except KeyboardInterrupt:
            print("Exiting program.")
            sys.exit()
--- a/shark/examples/shark_inference/albert_maskfill_torch.py
+++ b/shark/examples/shark_inference/albert_maskfill_torch.py
@@ -0,0 +1,29 @@
+# Reference run of ALBERT masked-token filling with the Hugging Face PyTorch
+# model only; nothing in this script goes through SHARK. It prints the raw
+# logits, the argsort of the [MASK] logits and the top-5 filled sentences.
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+import torch
+import numpy as np
+
+MAX_SEQUENCE_LENGTH = 512
+BATCH_SIZE = 1
+
+if __name__ == "__main__":
+    # Prepping Data
+    model = AutoModelForMaskedLM.from_pretrained("albert-base-v2")
+    tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
+    text = "This [MASK] is very tasty."
+    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=MAX_SEQUENCE_LENGTH, return_tensors="pt")
+    token_logits = model(**inputs).logits
+    print(token_logits)
+    # Find the location of [MASK] and extract its logits
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+    # print(mask_token_logits)
+    # Pick the [MASK] candidates with the highest logits
+    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+    print(np.argsort(mask_token_logits.detach().numpy()))
+    # print(top_5_tokens)
+
+    for token in top_5_tokens:
+        print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
--- a/shark/examples/shark_inference/bloom_tank.py
+++ b/shark/examples/shark_inference/bloom_tank.py
@@ -1,14 +0,0 @@
-from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_model
-
-mlir_model, func_name, inputs, golden_out = download_model(
-    "bloom", frontend="torch"
-)
-
-shark_module = SharkInference(
-    mlir_model, func_name, device="cpu", mlir_dialect="tm_tensor"
-)
-shark_module.compile()
-result = shark_module.forward(inputs)
-print("The obtained result via shark is: ", result)
-print("The golden result is:", golden_out)
--- a/shark/examples/shark_inference/gpt2_tf.py
+++ b/shark/examples/shark_inference/gpt2_tf.py
@@ -19,7 +19,7 @@ class GPT2Module(tf.Module):

        self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)

-    @tf.function(input_signature=gpt2_inputs, jit_compile=True)
+    @tf.function(input_signature=gpt2_inputs)
    def forward(self, input_ids, attention_mask):
        return self.m.predict(input_ids, attention_mask)

--- a/shark/examples/shark_inference/minilm_benchmark_tf.py
+++ b/shark/examples/shark_inference/minilm_benchmark_tf.py
@@ -26,7 +26,7 @@ class BertModule(tf.Module):
            input_ids=x, attention_mask=y, token_type_ids=z, training=False
        )

-    @tf.function(input_signature=bert_input, jit_compile=True)
+    @tf.function(input_signature=bert_input)
    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.m.predict(input_ids, attention_mask, token_type_ids)

--- a/shark/examples/shark_inference/minilm_jit.py
+++ b/shark/examples/shark_inference/minilm_jit.py
@@ -1,15 +1,14 @@
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_model
+from shark.shark_downloader import download_torch_model


-mlir_model, func_name, inputs, golden_out = download_model(
-    "microsoft/MiniLM-L12-H384-uncased",
-    frontend="torch",
+mlir_model, func_name, inputs, golden_out = download_torch_model(
+    "microsoft/MiniLM-L12-H384-uncased"
 )


 shark_module = SharkInference(
-    mlir_model, func_name, device="cpu", mlir_dialect="linalg"
+    mlir_model, func_name, mlir_dialect="linalg"
 )
 shark_module.compile()
 result = shark_module.forward(inputs)
--- a/shark/examples/shark_inference/minilm_tf.py
+++ b/shark/examples/shark_inference/minilm_tf.py
@@ -26,7 +26,7 @@ class BertModule(tf.Module):
            input_ids=x, attention_mask=y, token_type_ids=z, training=False
        )

-    @tf.function(input_signature=bert_input, jit_compile=True)
+    @tf.function(input_signature=bert_input)
    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.m.predict(input_ids, attention_mask, token_type_ids)

--- a/shark/examples/shark_inference/resnest.py
+++ b/shark/examples/shark_inference/resnest.py
@@ -33,7 +33,9 @@ mlir_importer = SharkImporter(

 print(golden_out)

-shark_module = SharkInference(vision_mlir, func_name, mlir_dialect="linalg")
+shark_module = SharkInference(
+    vision_mlir, func_name, device="cpu", mlir_dialect="linalg"
+)
 shark_module.compile()
 result = shark_module.forward((input,))
 print("Obtained result", result)
--- a/shark/examples/shark_inference/resnet50_fp16.py
+++ b/shark/examples/shark_inference/resnet50_fp16.py
@@ -1,76 +0,0 @@
-from shark.shark_inference import SharkInference
-from shark.parser import shark_args
-
-import torch
-import numpy as np
-import sys
-import torchvision.models as models
-import torch_mlir
-
-torch.manual_seed(0)
-
-
-class VisionModule(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.model = models.resnet50(pretrained=True)
-        self.train(False)
-
-    def forward(self, input):
-        return self.model.forward(input)
-
-
-model = VisionModule()
-test_input = torch.randn(1, 3, 224, 224)
-actual_out = model(test_input)
-
-test_input_fp16 = test_input.to(device=torch.device("cuda"), dtype=torch.half)
-model_fp16 = model.half()
-model_fp16.eval()
-model_fp16.to("cuda")
-actual_out_fp16 = model_fp16(test_input_fp16)
-
-ts_g = torch.jit.trace(model_fp16, [test_input_fp16])
-
-module = torch_mlir.compile(
-    ts_g,
-    (test_input_fp16),
-    torch_mlir.OutputType.LINALG_ON_TENSORS,
-    use_tracing=True,
-    verbose=False,
-)
-
-# from contextlib import redirect_stdout
-
-# with open('resnet50_fp16_linalg_ir.mlir', 'w') as f:
-#     with redirect_stdout(f):
-#         print(module.operation.get_asm())
-
-mlir_model = module
-func_name = "forward"
-
-shark_module = SharkInference(
-    mlir_model, func_name, device="cuda", mlir_dialect="linalg"
-)
-shark_module.compile()
-
-
-def shark_result(x):
-    x_ny = x.cpu().detach().numpy()
-    inputs = (x_ny,)
-    result = shark_module.forward(inputs)
-    return torch.from_numpy(result)
-
-
-observed_out = shark_result(test_input_fp16)
-
-print("Golden result:", actual_out_fp16)
-print("SHARK result:", observed_out)
-
-actual_out_fp16 = actual_out_fp16.to(device=torch.device("cpu"))
-
-print(
-    torch.testing.assert_allclose(
-        actual_out_fp16, observed_out, rtol=1e-2, atol=1e-2
-    )
-)
--- a/shark/examples/shark_inference/resnet50_script.py
+++ b/shark/examples/shark_inference/resnet50_script.py
@@ -5,7 +5,7 @@ import torchvision.models as models
 from torchvision import transforms
 import sys
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_model
+from shark.shark_downloader import download_torch_model


 ################################## Preprocessing inputs and model ############
@@ -66,14 +66,10 @@ labels = load_labels()


 ## Can pass any img or input to the forward module.
-mlir_model, func_name, inputs, golden_out = download_model(
-    "resnet50", frontend="torch"
-)
+mlir_model, func_name, inputs, golden_out = download_torch_model("resnet50")

 shark_module = SharkInference(mlir_model, func_name, mlir_dialect="linalg")
 shark_module.compile()
-path = shark_module.save_module()
-shark_module.load_module(path)
 result = shark_module.forward((img.detach().numpy(),))

 print("The top 3 results obtained via shark_runner is:")
--- a/shark/examples/shark_inference/simple_dlrm.py
+++ b/shark/examples/shark_inference/simple_dlrm.py
@@ -1,392 +0,0 @@
-# Description: an implementation of a deep learning recommendation model (DLRM)
-# The model input consists of dense and sparse features. The former is a vector
-# of floating point values. The latter is a list of sparse indices into
-# embedding tables, which consist of vectors of floating point values.
-# The selected vectors are passed to mlp networks denoted by triangles,
-# in some cases the vectors are interacted through operators (Ops).
-#
-# output:
-#                         vector of values
-# model:                        |
-#                              /\
-#                             /__\
-#                               |
-#       _____________________> Op  <___________________
-#     /                         |                      \
-#    /\                        /\                      /\
-#   /__\                      /__\           ...      /__\
-#    |                          |                       |
-#    |                         Op                      Op
-#    |                    ____/__\_____           ____/__\____
-#    |                   |_Emb_|____|__|    ...  |_Emb_|__|___|
-# input:
-# [ dense features ]     [sparse indices] , ..., [sparse indices]
-#
-# More precise definition of model layers:
-# 1) fully connected layers of an mlp
-# z = f(y)
-# y = Wx + b
-#
-# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk])
-# z = Op(e1,...,ek)
-# obtain vectors e1=E[:,p1], ..., ek=E[:,pk]
-#
-# 3) Operator Op can be one of the following
-# Sum(e1,...,ek) = e1 + ... + ek
-# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek]
-# Cat(e1,...,ek) = [e1', ..., ek']'
-# where ' denotes transpose operation
-#
-# References:
-# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang,
-# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu,
-# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii,
-# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko,
-# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong,
-# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and
-# Recommendation Systems", CoRR, arXiv:1906.00091, 2019
-
-
-import argparse
-import sys
-import numpy as np
-import torch
-import torch.nn as nn
-from shark.shark_inference import SharkInference
-from shark.shark_importer import SharkImporter
-
-
-torch.manual_seed(0)
-np.random.seed(0)
-
-
-### define dlrm in PyTorch ###
-class DLRM_Net(nn.Module):
-    def create_mlp(self, ln, sigmoid_layer):
-        # build MLP layer by layer
-        layers = nn.ModuleList()
-        for i in range(0, ln.size - 1):
-            n = ln[i]
-            m = ln[i + 1]
-
-            # construct fully connected operator
-            LL = nn.Linear(int(n), int(m), bias=True)
-
-            # initialize the weights
-            # with torch.no_grad():
-            # custom Xavier input, output or two-sided fill
-
-            mean = 0.0  # std_dev = np.sqrt(variance)
-            std_dev = np.sqrt(2 / (m + n))  # np.sqrt(1 / m) # np.sqrt(1 / n)
-            W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32)
-            std_dev = np.sqrt(1 / m)  # np.sqrt(2 / (m + 1))
-            bt = np.random.normal(mean, std_dev, size=m).astype(np.float32)
-            LL.weight.data = torch.tensor(W, requires_grad=True)
-            LL.bias.data = torch.tensor(bt, requires_grad=True)
-
-            # approach 2
-            # LL.weight.data.copy_(torch.tensor(W))
-            # LL.bias.data.copy_(torch.tensor(bt))
-            # approach 3
-            # LL.weight = Parameter(torch.tensor(W),requires_grad=True)
-            # LL.bias = Parameter(torch.tensor(bt),requires_grad=True)
-            layers.append(LL)
-
-            # construct sigmoid or relu operator
-            if i == sigmoid_layer:
-                layers.append(nn.Sigmoid())
-            else:
-                layers.append(nn.ReLU())
-
-        # approach 1: use ModuleList
-        # return layers
-        # approach 2: use Sequential container to wrap all layers
-        return torch.nn.Sequential(*layers)
-
-    def create_emb(self, m, ln, weighted_pooling=None):
-        emb_l = nn.ModuleList()
-        v_W_l = []
-        for i in range(0, ln.size):
-            n = ln[i]
-
-            # construct embedding operator
-            EE = nn.EmbeddingBag(n, m, mode="sum")
-            # initialize embeddings
-            # nn.init.uniform_(EE.weight, a=-np.sqrt(1 / n), b=np.sqrt(1 / n))
-            W = np.random.uniform(
-                low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m)
-            ).astype(np.float32)
-            # approach 1
-            print(W)
-            EE.weight.data = torch.tensor(W, requires_grad=True)
-            # approach 2
-            # EE.weight.data.copy_(torch.tensor(W))
-            # approach 3
-            # EE.weight = Parameter(torch.tensor(W),requires_grad=True)
-            if weighted_pooling is None:
-                v_W_l.append(None)
-            else:
-                v_W_l.append(torch.ones(n, dtype=torch.float32))
-            emb_l.append(EE)
-        return emb_l, v_W_l
-
-    def __init__(
-        self,
-        m_spa=None,
-        ln_emb=None,
-        ln_bot=None,
-        ln_top=None,
-        arch_interaction_op=None,
-        arch_interaction_itself=False,
-        sigmoid_bot=-1,
-        sigmoid_top=-1,
-        weighted_pooling=None,
-    ):
-        super(DLRM_Net, self).__init__()
-
-        if (
-            (m_spa is not None)
-            and (ln_emb is not None)
-            and (ln_bot is not None)
-            and (ln_top is not None)
-            and (arch_interaction_op is not None)
-        ):
-
-            # save arguments
-            self.output_d = 0
-            self.arch_interaction_op = arch_interaction_op
-            self.arch_interaction_itself = arch_interaction_itself
-            if weighted_pooling is not None and weighted_pooling != "fixed":
-                self.weighted_pooling = "learned"
-            else:
-                self.weighted_pooling = weighted_pooling
-
-            # create operators
-            self.emb_l, w_list = self.create_emb(
-                m_spa, ln_emb, weighted_pooling
-            )
-            if self.weighted_pooling == "learned":
-                self.v_W_l = nn.ParameterList()
-                for w in w_list:
-                    self.v_W_l.append(nn.Parameter(w))
-            else:
-                self.v_W_l = w_list
-            self.bot_l = self.create_mlp(ln_bot, sigmoid_bot)
-            self.top_l = self.create_mlp(ln_top, sigmoid_top)
-
-    def apply_mlp(self, x, layers):
-        return layers(x)
-
-    def apply_emb(self, lS_o, lS_i, emb_l, v_W_l):
-        # WARNING: notice that we are processing the batch at once. We implicitly
-        # assume that the data is laid out such that:
-        # 1. each embedding is indexed with a group of sparse indices,
-        #   corresponding to a single lookup
-        # 2. for each embedding the lookups are further organized into a batch
-        # 3. for a list of embedding tables there is a list of batched lookups
-        # TORCH-MLIR
-        # We are passing all the embeddings as arguments for easy parsing.
-
-        ly = []
-        for k, sparse_index_group_batch in enumerate(lS_i):
-            sparse_offset_group_batch = lS_o[k]
-
-            # embedding lookup
-            # We are using EmbeddingBag, which implicitly uses sum operator.
-            # The embeddings are represented as tall matrices, with sum
-            # happening vertically across 0 axis, resulting in a row vector
-            # E = emb_l[k]
-
-            if v_W_l[k] is not None:
-                per_sample_weights = v_W_l[k].gather(
-                    0, sparse_index_group_batch
-                )
-            else:
-                per_sample_weights = None
-
-            E = emb_l[k]
-            V = E(
-                sparse_index_group_batch,
-                sparse_offset_group_batch,
-                per_sample_weights=per_sample_weights,
-            )
-
-            ly.append(V)
-
-        return ly
-
-    def interact_features(self, x, ly):
-
-        if self.arch_interaction_op == "dot":
-            # concatenate dense and sparse features
-            (batch_size, d) = x.shape
-            T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d))
-            # perform a dot product
-            Z = torch.bmm(T, torch.transpose(T, 1, 2))
-            # append dense feature with the interactions (into a row vector)
-            # approach 1: all
-            # Zflat = Z.view((batch_size, -1))
-            # approach 2: unique
-            _, ni, nj = Z.shape
-            # approach 1: tril_indices
-            # offset = 0 if self.arch_interaction_itself else -1
-            # li, lj = torch.tril_indices(ni, nj, offset=offset)
-            # approach 2: custom
-            offset = 1 if self.arch_interaction_itself else 0
-            li = torch.tensor(
-                [i for i in range(ni) for j in range(i + offset)]
-            )
-            lj = torch.tensor(
-                [j for i in range(nj) for j in range(i + offset)]
-            )
-            Zflat = Z[:, li, lj]
-            # concatenate dense features and interactions
-            R = torch.cat([x] + [Zflat], dim=1)
-        elif self.arch_interaction_op == "cat":
-            # concatenation features (into a row vector)
-            R = torch.cat([x] + ly, dim=1)
-        else:
-            sys.exit(
-                "ERROR: --arch-interaction-op="
-                + self.arch_interaction_op
-                + " is not supported"
-            )
-
-        return R
-
-    def forward(self, dense_x, lS_o, *lS_i):
-        return self.sequential_forward(dense_x, lS_o, lS_i)
-
-    def sequential_forward(self, dense_x, lS_o, lS_i):
-        # process dense features (using bottom mlp), resulting in a row vector
-        x = self.apply_mlp(dense_x, self.bot_l)
-        # debug prints
-        # print("intermediate")
-        # print(x.detach().cpu().numpy())
-
-        # process sparse features(using embeddings), resulting in a list of row vectors
-        ly = self.apply_emb(lS_o, lS_i, self.emb_l, self.v_W_l)
-        # for y in ly:
-        #     print(y.detach().cpu().numpy())
-
-        # interact features (dense and sparse)
-        z = self.interact_features(x, ly)
-        # print(z.detach().cpu().numpy())
-
-        # obtain probability of a click (using top mlp)
-        p = self.apply_mlp(z, self.top_l)
-
-        # # clamp output if needed
-        # if 0.0 < self.loss_threshold and self.loss_threshold < 1.0:
-        # z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold))
-        # else:
-        # z = p
-
-        return p
-
-
-def dash_separated_ints(value):
-    vals = value.split("-")
-    for val in vals:
-        try:
-            int(val)
-        except ValueError:
-            raise argparse.ArgumentTypeError(
-                "%s is not a valid dash separated list of ints" % value
-            )
-
-    return value
-
-
-# model related parameters
-parser = argparse.ArgumentParser(
-    description="Train Deep Learning Recommendation Model (DLRM)"
-)
-parser.add_argument("--arch-sparse-feature-size", type=int, default=2)
-parser.add_argument(
-    "--arch-embedding-size", type=dash_separated_ints, default="4-3-2"
-)
-# j will be replaced with the table number
-parser.add_argument(
-    "--arch-mlp-bot", type=dash_separated_ints, default="4-3-2"
-)
-parser.add_argument(
-    "--arch-mlp-top", type=dash_separated_ints, default="8-2-1"
-)
-parser.add_argument(
-    "--arch-interaction-op", type=str, choices=["dot", "cat"], default="dot"
-)
-parser.add_argument(
-    "--arch-interaction-itself", action="store_true", default=False
-)
-parser.add_argument("--weighted-pooling", type=str, default=None)
-
-args = parser.parse_args()
-
-ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")
-ln_top = np.fromstring(args.arch_mlp_top, dtype=int, sep="-")
-m_den = ln_bot[0]
-ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
-m_spa = args.arch_sparse_feature_size
-ln_emb = np.asarray(ln_emb)
-num_fea = ln_emb.size + 1  # num sparse + num dense features
-
-
-# Initialize the model.
-dlrm_model = DLRM_Net(
-    m_spa=m_spa,
-    ln_emb=ln_emb,
-    ln_bot=ln_bot,
-    ln_top=ln_top,
-    arch_interaction_op=args.arch_interaction_op,
-)
-
-
-# Inputs to the model.
-dense_inp = torch.tensor([[0.6965, 0.2861, 0.2269, 0.5513]])
-vs0 = torch.tensor([[0], [0], [0]], dtype=torch.int64)
-vsi = torch.tensor([1, 2, 3]), torch.tensor([1]), torch.tensor([1])
-
-input_dlrm = (dense_inp, vs0, *vsi)
-
-golden_output = dlrm_model(dense_inp, vs0, *vsi)
-
-mlir_importer = SharkImporter(
-    dlrm_model,
-    input_dlrm,
-    frontend="torch",
-)
-
-(dlrm_mlir, func_name), inputs, golden_out = mlir_importer.import_debug(
-    tracing_required=True
-)
-
-shark_module = SharkInference(
-    dlrm_mlir, func_name, device="vulkan", mlir_dialect="linalg"
-)
-shark_module.compile()
-result = shark_module.forward(input_dlrm)
-np.testing.assert_allclose(
-    golden_output.detach().numpy(), result, rtol=1e-02, atol=1e-03
-)
-
-
-# Verified via torch-mlir.
-# import torch_mlir
-# from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
-
-
-# module = torch_mlir.compile(
-# dlrm_model, inputs, use_tracing=True, output_type="linalg-on-tensors"
-# )
-# backend = refbackend.RefBackendLinalgOnTensorsBackend()
-# compiled = backend.compile(module)
-# jit_module = backend.load(compiled)
-
-# dense_numpy = dense_inp.numpy()
-# vs0_numpy = vs0.numpy()
-# vsi_numpy = [inp.numpy() for inp in vsi]
-
-# numpy_inp = (dense_numpy, vs0_numpy, *vsi_numpy)
-
-# print(jit_module.forward(*numpy_inp))
--- a/shark/examples/shark_inference/sparse_arch.py
+++ b/shark/examples/shark_inference/sparse_arch.py
@@ -1,314 +0,0 @@
-import torch
-from torch import nn
-from torchrec.datasets.utils import Batch
-from torchrec.modules.crossnet import LowRankCrossNet
-from torchrec.sparse.jagged_tensor import KeyedJaggedTensor, KeyedTensor
-from torchrec.modules.embedding_configs import EmbeddingBagConfig
-from torchrec.modules.embedding_modules import EmbeddingBagCollection
-from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
-from typing import Dict, List, Optional, Tuple
-from torchrec.models.dlrm import (
-    choose,
-    DenseArch,
-    DLRM,
-    InteractionArch,
-    SparseArch,
-    OverArch,
-)
-from shark.shark_inference import SharkInference
-from shark.shark_importer import SharkImporter
-import numpy as np
-
-torch.manual_seed(0)
-
-np.random.seed(0)
-
-
-def calculate_offsets(tensor_list, prev_values, prev_offsets):
-    offset_init = 0
-    offset_list = []
-    values_list = []
-
-    if prev_offsets != None:
-        offset_init = prev_values.shape[-1]
-    for tensor in tensor_list:
-        offset_list.append(offset_init)
-        offset_init += tensor.shape[0]
-
-    concatendated_tensor_list = torch.cat(tensor_list)
-
-    if prev_values != None:
-        concatendated_tensor_list = torch.cat(
-            [prev_values, concatendated_tensor_list]
-        )
-
-    concatenated_offsets = torch.tensor(offset_list)
-
-    if prev_offsets != None:
-        concatenated_offsets = torch.cat([prev_offsets, concatenated_offsets])
-
-    return concatendated_tensor_list, concatenated_offsets
-
-
-# Have to make combined_keys as dict as to which embedding bags they
-# point to. {f1: 0, f3: 0, f2: 1}
-# The result will be a triple containing values, indices and pointer tensor.
-def to_list(key_jagged, combined_keys):
-    key_jagged_dict = key_jagged.to_dict()
-    combined_list = []
-
-    for key in combined_keys:
-        prev_values, prev_offsets = calculate_offsets(
-            key_jagged_dict[key].to_dense(), None, None
-        )
-        print(prev_values)
-        print(prev_offsets)
-        combined_list.append(prev_values)
-        combined_list.append(prev_offsets)
-        combined_list.append(torch.tensor(combined_keys[key]))
-
-    return combined_list
-
-
-class SparseArchShark(nn.Module):
-    def create_emb(self, embedding_dim, num_embeddings_list):
-        embedding_list = nn.ModuleList()
-        for i in range(0, num_embeddings_list.size):
-            num_embeddings = num_embeddings_list[i]
-            EE = nn.EmbeddingBag(num_embeddings, embedding_dim, mode="sum")
-            W = np.random.uniform(
-                low=-np.sqrt(1 / num_embeddings),
-                high=np.sqrt(1 / num_embeddings),
-                size=(num_embeddings, embedding_dim),
-            ).astype(np.float32)
-            EE.weight.data = torch.tensor(W, requires_grad=True)
-            embedding_list.append(EE)
-        return embedding_list
-
-    def __init__(
-        self,
-        embedding_dim,
-        total_features,
-        num_embeddings_list,
-    ):
-        super(SparseArchShark, self).__init__()
-        self.embedding_dim = embedding_dim
-        self.num_features = total_features
-        self.embedding_list = self.create_emb(
-            embedding_dim, num_embeddings_list
-        )
-
-    def forward(self, *batched_inputs):
-
-        concatenated_list = []
-        input_enum, embedding_enum = 0, 0
-
-        for k in range(len(batched_inputs) // 3):
-            values = batched_inputs[input_enum]
-            input_enum += 1
-            offsets = batched_inputs[input_enum]
-            input_enum += 1
-            embedding_pointer = int(batched_inputs[input_enum])
-            input_enum += 1
-
-            E = self.embedding_list[embedding_pointer]
-            V = E(values, offsets)
-            concatenated_list.append(V)
-
-        return torch.cat(concatenated_list, dim=1).reshape(
-            -1, self.num_features, self.embedding_dim
-        )
-
-
-def test_sparse_arch() -> None:
-
-    D = 3
-    eb1_config = EmbeddingBagConfig(
-        name="t1",
-        embedding_dim=D,
-        num_embeddings=10,
-        feature_names=["f1", "f3"],
-    )
-    eb2_config = EmbeddingBagConfig(
-        name="t2",
-        embedding_dim=D,
-        num_embeddings=10,
-        feature_names=["f2"],
-    )
-
-    ebc = EmbeddingBagCollection(tables=[eb1_config, eb2_config])
-
-    w1 = ebc.embedding_bags["t1"].weight
-    w2 = ebc.embedding_bags["t2"].weight
-
-    sparse_arch = SparseArch(ebc)
-
-    keys = ["f1", "f2", "f3", "f4", "f5"]
-    offsets = torch.tensor([0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 19])
-    features = KeyedJaggedTensor.from_offsets_sync(
-        keys=keys,
-        values=torch.tensor(
-            [1, 2, 4, 5, 4, 3, 2, 9, 1, 2, 4, 5, 4, 3, 2, 9, 1, 2, 3]
-        ),
-        offsets=offsets,
-    )
-    sparse_archi = SparseArchShark(D, 3, np.array([10, 10]))
-    sparse_archi.embedding_list[0].weight = w1
-    sparse_archi.embedding_list[1].weight = w2
-    inputs = to_list(features, {"f1": 0, "f3": 0, "f2": 1})
-
-    test_results = sparse_archi(*inputs)
-    sparse_features = sparse_arch(features)
-
-    torch.allclose(
-        sparse_features,
-        test_results,
-        rtol=1e-4,
-        atol=1e-4,
-    )
-
-
-test_sparse_arch()
-
-
-class DLRMShark(nn.Module):
-    def __init__(
-        self,
-        embedding_dim,
-        total_features,
-        num_embeddings_list,
-        dense_in_features: int,
-        dense_arch_layer_sizes: List[int],
-        over_arch_layer_sizes: List[int],
-    ) -> None:
-        super().__init__()
-
-        self.sparse_arch: SparseArchShark = SparseArchShark(
-            embedding_dim, total_features, num_embeddings_list
-        )
-        num_sparse_features: int = total_features
-
-        self.dense_arch = DenseArch(
-            in_features=dense_in_features,
-            layer_sizes=dense_arch_layer_sizes,
-        )
-
-        self.inter_arch = InteractionArch(
-            num_sparse_features=num_sparse_features,
-        )
-
-        over_in_features: int = (
-            embedding_dim
-            + choose(num_sparse_features, 2)
-            + num_sparse_features
-        )
-
-        self.over_arch = OverArch(
-            in_features=over_in_features,
-            layer_sizes=over_arch_layer_sizes,
-        )
-
-    def forward(
-        self, dense_features: torch.Tensor, *sparse_features
-    ) -> torch.Tensor:
-
-        embedded_dense = self.dense_arch(dense_features)
-        embedded_sparse = self.sparse_arch(*sparse_features)
-        concatenated_dense = self.inter_arch(
-            dense_features=embedded_dense, sparse_features=embedded_sparse
-        )
-        logits = self.over_arch(concatenated_dense)
-        return logits
-
-
-def test_dlrm() -> None:
-    B = 2
-    D = 8
-    dense_in_features = 100
-
-    eb1_config = EmbeddingBagConfig(
-        name="t1",
-        embedding_dim=D,
-        num_embeddings=100,
-        feature_names=["f1", "f3"],
-    )
-    eb2_config = EmbeddingBagConfig(
-        name="t2",
-        embedding_dim=D,
-        num_embeddings=100,
-        feature_names=["f2"],
-    )
-
-    ebc = EmbeddingBagCollection(tables=[eb1_config, eb2_config])
-
-    sparse_features = KeyedJaggedTensor.from_offsets_sync(
-        keys=["f1", "f3", "f2"],
-        values=torch.tensor([1, 2, 4, 5, 4, 3, 2, 9, 1, 2, 3]),
-        offsets=torch.tensor([0, 2, 4, 6, 8, 10, 11]),
-    )
-    ebc = EmbeddingBagCollection(tables=[eb1_config, eb2_config])
-    sparse_nn = DLRM(
-        embedding_bag_collection=ebc,
-        dense_in_features=dense_in_features,
-        dense_arch_layer_sizes=[20, D],
-        over_arch_layer_sizes=[5, 1],
-    )
-    sparse_nn_nod = DLRMShark(
-        embedding_dim=8,
-        total_features=3,
-        num_embeddings_list=np.array([100, 100]),
-        dense_in_features=dense_in_features,
-        dense_arch_layer_sizes=[20, D],
-        over_arch_layer_sizes=[5, 1],
-    )
-
-    dense_features = torch.rand((B, dense_in_features))
-
-    x = to_list(sparse_features, {"f1": 0, "f3": 0, "f2": 1})
-
-    w1 = ebc.embedding_bags["t1"].weight
-    w2 = ebc.embedding_bags["t2"].weight
-
-    sparse_nn_nod.sparse_arch.embedding_list[0].weight = w1
-    sparse_nn_nod.sparse_arch.embedding_list[1].weight = w2
-
-    sparse_nn_nod.dense_arch.load_state_dict(sparse_nn.dense_arch.state_dict())
-    sparse_nn_nod.inter_arch.load_state_dict(sparse_nn.inter_arch.state_dict())
-    sparse_nn_nod.over_arch.load_state_dict(sparse_nn.over_arch.state_dict())
-
-    logits = sparse_nn(
-        dense_features=dense_features,
-        sparse_features=sparse_features,
-    )
-    logits_nod = sparse_nn_nod(dense_features, *x)
-
-    # print(logits)
-    # print(logits_nod)
-
-    # Import the module and print.
-    mlir_importer = SharkImporter(
-        sparse_nn_nod,
-        (dense_features, *x),
-        frontend="torch",
-    )
-
-    (dlrm_mlir, func_name), inputs, golden_out = mlir_importer.import_debug(
-        tracing_required=True
-    )
-
-    shark_module = SharkInference(
-        dlrm_mlir, func_name, device="cpu", mlir_dialect="linalg"
-    )
-    shark_module.compile()
-    result = shark_module.forward(inputs)
-    np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
-
-    torch.allclose(
-        logits,
-        logits_nod,
-        rtol=1e-4,
-        atol=1e-4,
-    )
-
-
-test_dlrm()
--- a/shark/examples/shark_inference/stable_diff.py
+++ b/shark/examples/shark_inference/stable_diff.py
@@ -1,272 +0,0 @@
-from transformers import CLIPTextModel, CLIPTokenizer
-from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
-import torch
-from PIL import Image
-from diffusers import LMSDiscreteScheduler
-from tqdm.auto import tqdm
-from shark.shark_inference import SharkInference
-from torch.fx.experimental.proxy_tensor import make_fx
-from torch._decomp import get_decompositions
-import torch_mlir
-import tempfile
-import numpy as np
-
-# pip install diffusers
-# pip install scipy
-
-############### Parsing args #####################
-import argparse
-
-p = argparse.ArgumentParser(
-    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
-)
-
-p.add_argument(
-    "--prompt",
-    type=str,
-    default="a photograph of an astronaut riding a horse",
-    help="the text prompt to use",
-)
-p.add_argument("--device", type=str, default="cpu", help="the device to use")
-p.add_argument("--steps", type=int, default=10, help="the device to use")
-p.add_argument("--mlir_loc", type=str, default=None, help="the device to use")
-p.add_argument("--vae_loc", type=str, default=None, help="the device to use")
-args = p.parse_args()
-
-#####################################################
-
-
-def load_mlir(mlir_loc):
-    import os
-
-    if mlir_loc == None:
-        return None
-    print(f"Trying to load the model from {mlir_loc}.")
-    with open(os.path.join(mlir_loc)) as f:
-        mlir_module = f.read()
-    return mlir_module
-
-
-def compile_through_fx(model, inputs, mlir_loc=None, extra_args=[]):
-
-    module = load_mlir(mlir_loc)
-    if mlir_loc == None:
-        fx_g = make_fx(
-            model,
-            decomposition_table=get_decompositions(
-                [
-                    torch.ops.aten.embedding_dense_backward,
-                    torch.ops.aten.native_layer_norm_backward,
-                    torch.ops.aten.slice_backward,
-                    torch.ops.aten.select_backward,
-                    torch.ops.aten.norm.ScalarOpt_dim,
-                    torch.ops.aten.native_group_norm,
-                    torch.ops.aten.upsample_bilinear2d.vec,
-                    torch.ops.aten.split.Tensor,
-                    torch.ops.aten.split_with_sizes,
-                ]
-            ),
-        )(*inputs)
-
-        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
-        fx_g.recompile()
-
-        def strip_overloads(gm):
-            """
-            Modifies the target of graph nodes in :attr:`gm` to strip overloads.
-            Args:
-                gm(fx.GraphModule): The input Fx graph module to be modified
-            """
-            for node in gm.graph.nodes:
-                if isinstance(node.target, torch._ops.OpOverload):
-                    node.target = node.target.overloadpacket
-            gm.recompile()
-
-        strip_overloads(fx_g)
-
-        ts_g = torch.jit.script(fx_g)
-
-        module = torch_mlir.compile(
-            ts_g,
-            inputs,
-            torch_mlir.OutputType.LINALG_ON_TENSORS,
-            use_tracing=False,
-            verbose=False,
-        )
-
-    mlir_model = module
-    func_name = "forward"
-
-    shark_module = SharkInference(
-        mlir_model,
-        func_name,
-        device=args.device,
-        mlir_dialect="tm_tensor",
-    )
-    shark_module.compile(extra_args)
-
-    return shark_module
-
-
-if __name__ == "__main__":
-
-    YOUR_TOKEN = "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"
-
-    # 1. Load the autoencoder model which will be used to decode the latents into image space.
-    vae = AutoencoderKL.from_pretrained(
-        "CompVis/stable-diffusion-v1-4",
-        subfolder="vae",
-        use_auth_token=YOUR_TOKEN,
-    )
-
-    # 2. Load the tokenizer and text encoder to tokenize and encode the text.
-    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-    text_encoder = CLIPTextModel.from_pretrained(
-        "openai/clip-vit-large-patch14"
-    )
-
-    class VaeModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.vae = AutoencoderKL.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
-                subfolder="vae",
-                use_auth_token=YOUR_TOKEN,
-            )
-
-        def forward(self, input):
-            return self.vae.decode(input, return_dict=False)[0]
-
-    vae = VaeModel()
-    vae_input = torch.rand(1, 4, 64, 64)
-    shark_vae = compile_through_fx(vae, (vae_input,), args.vae_loc)
-
-    # Wrap the unet model to return tuples.
-    class UnetModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.unet = UNet2DConditionModel.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
-                subfolder="unet",
-                use_auth_token=YOUR_TOKEN,
-            )
-            self.in_channels = self.unet.in_channels
-            self.train(False)
-
-        def forward(self, x, y, z):
-            return self.unet.forward(x, y, z, return_dict=False)[0]
-
-    # 3. The UNet model for generating the latents.
-    unet = UnetModel()
-    latent_model_input = torch.rand([2, 4, 64, 64])
-    text_embeddings = torch.rand([2, 77, 768])
-    shark_unet = compile_through_fx(
-        unet,
-        (latent_model_input, torch.tensor([1.0]), text_embeddings),
-        args.mlir_loc,
-        ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
-    )
-
-    # torch.jit.script(unet)
-
-    scheduler = LMSDiscreteScheduler(
-        beta_start=0.00085,
-        beta_end=0.012,
-        beta_schedule="scaled_linear",
-        num_train_timesteps=1000,
-    )
-
-    prompt = [args.prompt]
-
-    height = 512  # default height of Stable Diffusion
-    width = 512  # default width of Stable Diffusion
-
-    num_inference_steps = args.steps  # Number of denoising steps
-
-    guidance_scale = 7.5  # Scale for classifier-free guidance
-
-    generator = torch.manual_seed(
-        42
-    )  # Seed generator to create the inital latent noise
-
-    batch_size = len(prompt)
-
-    text_input = tokenizer(
-        prompt,
-        padding="max_length",
-        max_length=tokenizer.model_max_length,
-        truncation=True,
-        return_tensors="pt",
-    )
-
-    text_embeddings = text_encoder(text_input.input_ids)[0]
-
-    max_length = text_input.input_ids.shape[-1]
-    uncond_input = tokenizer(
-        [""] * batch_size,
-        padding="max_length",
-        max_length=max_length,
-        return_tensors="pt",
-    )
-    uncond_embeddings = text_encoder(uncond_input.input_ids)[0]
-
-    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-    latents = torch.randn(
-        (batch_size, unet.in_channels, height // 8, width // 8),
-        generator=generator,
-    )
-    # latents = latents.to(torch_device)
-
-    scheduler.set_timesteps(num_inference_steps)
-
-    latents = latents * scheduler.sigmas[0]
-    # print(latents, latents.shape)
-
-    for i, t in tqdm(enumerate(scheduler.timesteps)):
-
-        print(f"i = {i} t = {t}")
-        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
-        latent_model_input = torch.cat([latents] * 2)
-        sigma = scheduler.sigmas[i]
-        latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
-
-        # predict the noise residual
-
-        # with torch.no_grad():
-        # noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)
-
-        latent_model_input_numpy = latent_model_input.detach().numpy()
-        text_embeddings_numpy = text_embeddings.detach().numpy()
-
-        noise_pred = shark_unet.forward(
-            (
-                latent_model_input_numpy,
-                np.array([t]).astype(np.float32),
-                text_embeddings_numpy,
-            )
-        )
-        noise_pred = torch.from_numpy(noise_pred)
-
-        # perform guidance
-        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-        noise_pred = noise_pred_uncond + guidance_scale * (
-            noise_pred_text - noise_pred_uncond
-        )
-
-        # compute the previous noisy sample x_t -> x_t-1
-        latents = scheduler.step(noise_pred, i, latents)["prev_sample"]
-
-    # print("Latents shape : ", latents.shape)
-
-    # scale and decode the image latents with vae
-    latents = 1 / 0.18215 * latents
-    latents_numpy = latents.detach().numpy()
-    image = shark_vae.forward((latents_numpy,))
-    image = torch.from_numpy(image)
-
-    image = (image / 2 + 0.5).clamp(0, 1)
-    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
-    images = (image * 255).round().astype("uint8")
-    pil_images = [Image.fromarray(image) for image in images]
-    pil_images[0].save("astro.jpg")
--- a/shark/examples/shark_inference/stable_diff_f16.py
+++ b/shark/examples/shark_inference/stable_diff_f16.py
@@ -1,280 +0,0 @@
-from transformers import CLIPTextModel, CLIPTokenizer
-from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
-import torch
-from PIL import Image
-from diffusers import LMSDiscreteScheduler
-from tqdm.auto import tqdm
-from shark.shark_inference import SharkInference
-from torch.fx.experimental.proxy_tensor import make_fx
-from torch._decomp import get_decompositions
-import torch_mlir
-import tempfile
-import numpy as np
-
-# pip install diffusers
-# pip install scipy
-
-############### Parsing args #####################
-import argparse
-
-p = argparse.ArgumentParser(
-    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
-)
-
-p.add_argument(
-    "--prompt",
-    type=str,
-    default="a photograph of an astronaut riding a horse",
-    help="the text prompt to use",
-)
-p.add_argument("--device", type=str, default="cpu", help="the device to use")
-p.add_argument("--steps", type=int, default=50, help="the device to use")
-p.add_argument("--mlir_loc", type=str, default=None, help="the device to use")
-p.add_argument("--vae_loc", type=str, default=None, help="the device to use")
-args = p.parse_args()
-
-#####################################################
-
-
-def fp16_unet():
-    from shark.shark_downloader import download_model
-
-    mlir_model, func_name, inputs, golden_out = download_model(
-        "stable_diff_f16_18_OCT",
-        tank_url="gs://shark_tank/prashant_nod",
-        frontend="torch",
-    )
-    shark_module = SharkInference(
-        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
-    )
-    shark_module.compile()
-    return shark_module
-
-
-def load_mlir(mlir_loc):
-    import os
-
-    if mlir_loc == None:
-        return None
-    print(f"Trying to load the model from {mlir_loc}.")
-    with open(os.path.join(mlir_loc)) as f:
-        mlir_module = f.read()
-    return mlir_module
-
-
-def compile_through_fx(model, inputs, mlir_loc=None):
-
-    module = load_mlir(mlir_loc)
-    if mlir_loc == None:
-        fx_g = make_fx(
-            model,
-            decomposition_table=get_decompositions(
-                [
-                    torch.ops.aten.embedding_dense_backward,
-                    torch.ops.aten.native_layer_norm_backward,
-                    torch.ops.aten.slice_backward,
-                    torch.ops.aten.select_backward,
-                    torch.ops.aten.norm.ScalarOpt_dim,
-                    torch.ops.aten.native_group_norm,
-                    torch.ops.aten.upsample_bilinear2d.vec,
-                    torch.ops.aten.split.Tensor,
-                    torch.ops.aten.split_with_sizes,
-                ]
-            ),
-        )(*inputs)
-
-        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
-        fx_g.recompile()
-
-        def strip_overloads(gm):
-            """
-            Modifies the target of graph nodes in :attr:`gm` to strip overloads.
-            Args:
-                gm(fx.GraphModule): The input Fx graph module to be modified
-            """
-            for node in gm.graph.nodes:
-                if isinstance(node.target, torch._ops.OpOverload):
-                    node.target = node.target.overloadpacket
-            gm.recompile()
-
-        strip_overloads(fx_g)
-
-        ts_g = torch.jit.script(fx_g)
-
-        module = torch_mlir.compile(
-            ts_g,
-            inputs,
-            torch_mlir.OutputType.LINALG_ON_TENSORS,
-            use_tracing=False,
-            verbose=False,
-        )
-
-    mlir_model = module
-    func_name = "forward"
-
-    shark_module = SharkInference(
-        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
-    )
-    shark_module.compile()
-
-    return shark_module
-
-
-if __name__ == "__main__":
-
-    YOUR_TOKEN = "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"
-
-    # 1. Load the autoencoder model which will be used to decode the latents into image space.
-    vae = AutoencoderKL.from_pretrained(
-        "CompVis/stable-diffusion-v1-4",
-        subfolder="vae",
-        use_auth_token=YOUR_TOKEN,
-    )
-
-    # 2. Load the tokenizer and text encoder to tokenize and encode the text.
-    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-    text_encoder = CLIPTextModel.from_pretrained(
-        "openai/clip-vit-large-patch14"
-    )
-
-    class VaeModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.vae = AutoencoderKL.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
-                subfolder="vae",
-                use_auth_token=YOUR_TOKEN,
-            )
-
-        def forward(self, input):
-            return self.vae.decode(input, return_dict=False)[0]
-
-    vae = VaeModel()
-    vae_input = torch.rand(1, 4, 64, 64)
-    shark_vae = compile_through_fx(vae, (vae_input,), args.vae_loc)
-
-    # Wrap the unet model to return tuples.
-    class UnetModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.unet = UNet2DConditionModel.from_pretrained(
-                "CompVis/stable-diffusion-v1-4",
-                subfolder="unet",
-                use_auth_token=YOUR_TOKEN,
-            )
-            self.in_channels = self.unet.in_channels
-            self.train(False)
-
-    def forward(self, x, y, z):
-        return self.unet.forward(x, y, z, return_dict=False)[0]
-
-    # # 3. The UNet model for generating the latents.
-    unet = UnetModel()
-
-    shark_unet = fp16_unet()
-
-    scheduler = LMSDiscreteScheduler(
-        beta_start=0.00085,
-        beta_end=0.012,
-        beta_schedule="scaled_linear",
-        num_train_timesteps=1000,
-    )
-
-    prompt = [args.prompt]
-
-    height = 512  # default height of Stable Diffusion
-    width = 512  # default width of Stable Diffusion
-
-    num_inference_steps = args.steps  # Number of denoising steps
-
-    guidance_scale = 7.5  # Scale for classifier-free guidance
-
-    generator = torch.manual_seed(
-        42
-    )  # Seed generator to create the inital latent noise
-
-    batch_size = len(prompt)
-
-    text_input = tokenizer(
-        prompt,
-        padding="max_length",
-        max_length=tokenizer.model_max_length,
-        truncation=True,
-        return_tensors="pt",
-    )
-
-    text_embeddings = text_encoder(text_input.input_ids)[0]
-
-    max_length = text_input.input_ids.shape[-1]
-    uncond_input = tokenizer(
-        [""] * batch_size,
-        padding="max_length",
-        max_length=max_length,
-        return_tensors="pt",
-    )
-    uncond_embeddings = text_encoder(uncond_input.input_ids)[0]
-
-    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-    latents = torch.randn(
-        (batch_size, unet.in_channels, height // 8, width // 8),
-        generator=generator,
-    )
-    # latents = latents.to(torch_device)
-
-    scheduler.set_timesteps(num_inference_steps)
-
-    latents = latents * scheduler.sigmas[0]
-    # print(latents, latents.shape)
-
-    for i, t in tqdm(enumerate(scheduler.timesteps)):
-
-        print(f"i = {i} t = {t}")
-        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
-        latent_model_input = torch.cat([latents] * 2)
-        sigma = scheduler.sigmas[i]
-        latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
-
-        # predict the noise residual
-
-        # with torch.no_grad():
-        # noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)
-
-        latent_model_input_numpy = (
-            latent_model_input.detach().numpy().astype(np.half)
-        )
-        text_embeddings_numpy = (
-            text_embeddings.detach().numpy().astype(np.half)
-        )
-
-        noise_pred = shark_unet.forward(
-            (
-                latent_model_input_numpy,
-                np.array([t]).astype(np.half),
-                text_embeddings_numpy,
-            )
-        )
-        noise_pred = torch.from_numpy(noise_pred).to(torch.float32)
-
-        # perform guidance
-        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-        noise_pred = noise_pred_uncond + guidance_scale * (
-            noise_pred_text - noise_pred_uncond
-        )
-
-        # compute the previous noisy sample x_t -> x_t-1
-        latents = scheduler.step(noise_pred, i, latents)["prev_sample"]
-
-    # print("Latents shape : ", latents.shape)
-
-    # scale and decode the image latents with vae
-    latents = 1 / 0.18215 * latents
-    latents_numpy = latents.detach().numpy()
-    image = shark_vae.forward((latents_numpy,))
-    image = torch.from_numpy(image)
-
-    image = (image / 2 + 0.5).clamp(0, 1)
-    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
-    images = (image * 255).round().astype("uint8")
-    pil_images = [Image.fromarray(image) for image in images]
-    pil_images[0].save("astro.jpg")
--- a/shark/examples/shark_inference/stable_diff_tf.py
+++ b/shark/examples/shark_inference/stable_diff_tf.py
@@ -1,313 +0,0 @@
-import math
-import numpy as np
-import tensorflow as tf
-from tensorflow import keras
-from keras_cv.models.generative.stable_diffusion.clip_tokenizer import (
-    SimpleTokenizer,
-)
-from keras_cv.models.generative.stable_diffusion.constants import (
-    _ALPHAS_CUMPROD,
-)
-from keras_cv.models.generative.stable_diffusion.constants import (
-    _UNCONDITIONAL_TOKENS,
-)
-from keras_cv.models.generative.stable_diffusion.decoder import Decoder
-from keras_cv.models.generative.stable_diffusion.text_encoder import (
-    TextEncoder,
-)
-
-from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_model
-from PIL import Image
-
-# pip install "git+https://github.com/keras-team/keras-cv.git"
-# pip install tensorflow_dataset
-
-############### Parsing args #####################
-import argparse
-
-p = argparse.ArgumentParser(
-    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
-)
-
-p.add_argument(
-    "--prompt",
-    type=str,
-    default="a photograph of an astronaut riding a horse",
-    help="the text prompt to use",
-)
-p.add_argument("--device", type=str, default="cpu", help="the device to use")
-p.add_argument(
-    "--steps", type=int, default=10, help="the number of steps to use"
-)
-p.add_argument(
-    "--save_path",
-    type=str,
-    default=None,
-    help="the file to save the resulting image to. (default to <input prompt>.jpg)",
-)
-args = p.parse_args()
-
-#####################################################
-
-MAX_PROMPT_LENGTH = 77
-
-
-class SharkStableDiffusion:
-    """Shark implementation of Stable Diffusion based on model from keras_cv.
-    Stable Diffusion is a powerful image generation model that can be used,
-    among other things, to generate pictures according to a short text description
-    (called a "prompt").
-    Arguments:
-        device: Device to use with SHARK. Default: cpu
-        jit_compile: Whether to compile the underlying models to XLA.
-            This can lead to a significant speedup on some systems. Default: False.
-    References:
-    - [About Stable Diffusion](https://stability.ai/blog/stable-diffusion-announcement)
-    - [Original implementation](https://github.com/CompVis/stable-diffusion)
-    """
-
-    def __init__(self, device="cpu", jit_compile=True):
-        self.img_height = 512
-        self.img_width = 512
-        self.tokenizer = SimpleTokenizer()
-
-        # Create models
-        self.text_encoder = TextEncoder(MAX_PROMPT_LENGTH)
-
-        mlir_model, func_name, inputs, golden_out = download_model(
-            "stable_diff", tank_url="gs://shark_tank/quinn", frontend="tf"
-        )
-        shark_module = SharkInference(
-            mlir_model, func_name, device=device, mlir_dialect="mhlo"
-        )
-        shark_module.compile()
-        self.diffusion_model = shark_module
-        self.decoder = Decoder(self.img_height, self.img_width)
-        if jit_compile:
-            self.text_encoder.compile(jit_compile=True)
-            self.decoder.compile(jit_compile=True)
-
-        print(
-            "By using this model checkpoint, you acknowledge that its usage is "
-            "subject to the terms of the CreativeML Open RAIL-M license at "
-            "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/LICENSE"
-        )
-        # Load weights
-        text_encoder_weights_fpath = keras.utils.get_file(
-            origin="https://huggingface.co/fchollet/stable-diffusion/resolve/main/kcv_encoder.h5",
-            file_hash="4789e63e07c0e54d6a34a29b45ce81ece27060c499a709d556c7755b42bb0dc4",
-        )
-        decoder_weights_fpath = keras.utils.get_file(
-            origin="https://huggingface.co/fchollet/stable-diffusion/resolve/main/kcv_decoder.h5",
-            file_hash="ad350a65cc8bc4a80c8103367e039a3329b4231c2469a1093869a345f55b1962",
-        )
-        self.text_encoder.load_weights(text_encoder_weights_fpath)
-        self.decoder.load_weights(decoder_weights_fpath)
-
-    def text_to_image(
-        self,
-        prompt,
-        batch_size=1,
-        num_steps=25,
-        unconditional_guidance_scale=7.5,
-        seed=None,
-    ):
-        encoded_text = self.encode_text(prompt)
-
-        return self.generate_image(
-            encoded_text,
-            batch_size=batch_size,
-            num_steps=num_steps,
-            unconditional_guidance_scale=unconditional_guidance_scale,
-            seed=seed,
-        )
-
-    def encode_text(self, prompt):
-        """Encodes a prompt into a latent text encoding.
-        The encoding produced by this method should be used as the
-        `encoded_text` parameter of `StableDiffusion.generate_image`. Encoding
-        text separately from generating an image can be used to arbitrarily
-        modify the text encoding priot to image generation, e.g. for walking
-        between two prompts.
-        Args:
-            prompt: a string to encode, must be 77 tokens or shorter.
-        Example:
-        ```python
-        from keras_cv.models import StableDiffusion
-        model = StableDiffusion(img_height=512, img_width=512, jit_compile=True)
-        encoded_text  = model.encode_text("Tacos at dawn")
-        img = model.generate_image(encoded_text)
-        ```
-        """
-        # Tokenize prompt (i.e. starting context)
-        inputs = self.tokenizer.encode(prompt)
-        if len(inputs) > MAX_PROMPT_LENGTH:
-            raise ValueError(
-                f"Prompt is too long (should be <= {MAX_PROMPT_LENGTH} tokens)"
-            )
-        phrase = inputs + [49407] * (MAX_PROMPT_LENGTH - len(inputs))
-        phrase = tf.convert_to_tensor([phrase], dtype=tf.int32)
-
-        context = self.text_encoder.predict_on_batch(
-            [phrase, self._get_pos_ids()]
-        )
-
-        return context
-
-    def generate_image(
-        self,
-        encoded_text,
-        batch_size=1,
-        num_steps=25,
-        unconditional_guidance_scale=7.5,
-        diffusion_noise=None,
-        seed=None,
-    ):
-        """Generates an image based on encoded text.
-        The encoding passed to this method should be derived from
-        `StableDiffusion.encode_text`.
-        Args:
-            encoded_text: Tensor of shape (`batch_size`, 77, 768), or a Tensor
-            of shape (77, 768). When the batch axis is omitted, the same encoded
-            text will be used to produce every generated image.
-            batch_size: number of images to generate. Default: 1.
-            num_steps: number of diffusion steps (controls image quality).
-                Default: 25.
-            unconditional_guidance_scale: float controling how closely the image
-                should adhere to the prompt. Larger values result in more
-                closely adhering to the prompt, but will make the image noisier.
-                Default: 7.5.
-            diffusion_noise: Tensor of shape (`batch_size`, img_height // 8,
-                img_width // 8, 4), or a Tensor of shape (img_height // 8,
-                img_width // 8, 4). Optional custom noise to seed the diffusion
-                process. When the batch axis is omitted, the same noise will be
-                used to seed diffusion for every generated image.
-            seed: integer which is used to seed the random generation of
-                diffusion noise, only to be specified if `diffusion_noise` is
-                None.
-        Example:
-        ```python
-        from keras_cv.models import StableDiffusion
-        batch_size = 8
-        model = StableDiffusion(img_height=512, img_width=512, jit_compile=True)
-        e_tacos = model.encode_text("Tacos at dawn")
-        e_watermelons = model.encode_text("Watermelons at dusk")
-        e_interpolated = tf.linspace(e_tacos, e_watermelons, batch_size)
-        images = model.generate_image(e_interpolated, batch_size=batch_size)
-        ```
-        """
-        if diffusion_noise is not None and seed is not None:
-            raise ValueError(
-                "`diffusion_noise` and `seed` should not both be passed to "
-                "`generate_image`. `seed` is only used to generate diffusion "
-                "noise when it's not already user-specified."
-            )
-
-        encoded_text = tf.squeeze(encoded_text)
-        if encoded_text.shape.rank == 2:
-            encoded_text = tf.repeat(
-                tf.expand_dims(encoded_text, axis=0), batch_size, axis=0
-            )
-
-        context = encoded_text
-        unconditional_context = tf.repeat(
-            self._get_unconditional_context(), batch_size, axis=0
-        )
-        context = tf.concat([context, unconditional_context], 0)
-
-        if diffusion_noise is not None:
-            diffusion_noise = tf.squeeze(diffusion_noise)
-            if diffusion_noise.shape.rank == 3:
-                diffusion_noise = tf.repeat(
-                    tf.expand_dims(diffusion_noise, axis=0), batch_size, axis=0
-                )
-            latent = diffusion_noise
-        else:
-            latent = self._get_initial_diffusion_noise(batch_size, seed)
-
-        # Iterative reverse diffusion stage
-        timesteps = tf.range(1, 1000, 1000 // num_steps)
-        alphas, alphas_prev = self._get_initial_alphas(timesteps)
-        progbar = keras.utils.Progbar(len(timesteps))
-        iteration = 0
-        for index, timestep in list(enumerate(timesteps))[::-1]:
-            latent_prev = latent  # Set aside the previous latent vector
-            t_emb = self._get_timestep_embedding(timestep, batch_size)
-
-            # Prepare the latent and unconditional latent to be run with a single forward call
-            latent = tf.concat([latent, latent], 0)
-            t_emb = tf.concat([t_emb, t_emb], 0)
-            latent_numpy = self.diffusion_model.forward(
-                [latent.numpy(), t_emb.numpy(), context.numpy()]
-            )
-            latent = tf.convert_to_tensor(latent_numpy, dtype=tf.float32)
-            latent, unconditional_latent = tf.split(latent, 2)
-
-            latent = unconditional_latent + unconditional_guidance_scale * (
-                latent - unconditional_latent
-            )
-            a_t, a_prev = alphas[index], alphas_prev[index]
-            pred_x0 = (latent_prev - math.sqrt(1 - a_t) * latent) / math.sqrt(
-                a_t
-            )
-            latent = (
-                latent * math.sqrt(1.0 - a_prev) + math.sqrt(a_prev) * pred_x0
-            )
-            iteration += 1
-            progbar.update(iteration)
-
-        # Decoding stage
-        decoded = self.decoder.predict_on_batch(latent)
-        decoded = ((decoded + 1) / 2) * 255
-        return np.clip(decoded, 0, 255).astype("uint8")
-
-    def _get_unconditional_context(self):
-        unconditional_tokens = tf.convert_to_tensor(
-            [_UNCONDITIONAL_TOKENS], dtype=tf.int32
-        )
-        unconditional_context = self.text_encoder.predict_on_batch(
-            [unconditional_tokens, self._get_pos_ids()]
-        )
-
-        return unconditional_context
-
-    def _get_timestep_embedding(
-        self, timestep, batch_size, dim=320, max_period=10000
-    ):
-        half = dim // 2
-        freqs = tf.math.exp(
-            -math.log(max_period) * tf.range(0, half, dtype=tf.float32) / half
-        )
-        args = tf.convert_to_tensor([timestep], dtype=tf.float32) * freqs
-        embedding = tf.concat([tf.math.cos(args), tf.math.sin(args)], 0)
-        embedding = tf.reshape(embedding, [1, -1])
-        return tf.repeat(embedding, batch_size, axis=0)
-
-    def _get_initial_alphas(self, timesteps):
-        alphas = [_ALPHAS_CUMPROD[t] for t in timesteps]
-        alphas_prev = [1.0] + alphas[:-1]
-
-        return alphas, alphas_prev
-
-    def _get_initial_diffusion_noise(self, batch_size, seed):
-        return tf.random.normal(
-            (batch_size, self.img_height // 8, self.img_width // 8, 4),
-            seed=seed,
-        )
-
-    @staticmethod
-    def _get_pos_ids():
-        return tf.convert_to_tensor(
-            [list(range(MAX_PROMPT_LENGTH))], dtype=tf.int32
-        )
-
-
-if __name__ == "__main__":
-    SD = SharkStableDiffusion(device=args.device)
-    images = SD.text_to_image(args.prompt, num_steps=args.steps)
-    pil_images = [Image.fromarray(image) for image in images]
-    save_fname = args.prompt + ".jpg"
-    if args.save_path is not None:
-        save_fname = args.save_path
-    pil_images[0].save(save_fname)
--- a/shark/examples/shark_inference/stable_diffusion/.gitignore
+++ b/shark/examples/shark_inference/stable_diffusion/.gitignore
@@ -1,2 +0,0 @@
-*.vmfb
-*.jpg
--- a/shark/examples/shark_inference/stable_diffusion/README.md
+++ b/shark/examples/shark_inference/stable_diffusion/README.md
@@ -1,44 +0,0 @@
-# STABLE DIFFUSION
-
-## Installation
-
-Follow setup instructions in the main [README.md](https://github.com/nod-ai/SHARK#readme) for regular usage. 
-
-## Debug commands and other advanced usage follows.
-
-```shell
-python main.py --precision="fp32"|"fp16" --device="cpu"|"cuda"|"vulkan" --import_mlir|--no-import_mlir --prompt "enter the text" 
-
-```
-
-## dump all dispatch .spv and isa using amdllpc
-
-```shell
-python main.py --precision="fp16" --device="vulkan" --iree-vulkan-target-triple=rdna3-unknown-linux --no-load_vmfb --dispatch_benchmarks="all" --dispatch_benchmarks_dir="SD_dispatches" --dump_isa
-```
-
-## Compile and save the .vmfb (using vulkan fp16 as an example):
-
-```shell
-python shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb
-```
-
-## Capture an RGP trace
-
-```shell
-python shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb --enable_rgp
-```
-
-## Run the vae module with iree-benchmark-module (NCHW, fp16, vulkan, for example):
-
-```shell
-iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --device=vulkan --function_input=1x4x64x64xf16  
-```
-
-## Run the unet module with iree-benchmark-module (same config as above):
-```shell
-##if you want to use .npz inputs:
-unzip ~/.local/shark_tank/<your unet>/inputs.npz
-
-iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --function_input=@arr_0.npy --function_input=1xf16 --function_input=@arr_2.npy --function_input=@arr_3.npy --function_input=@arr_4.npy  
-```
--- a/shark/examples/shark_inference/stable_diffusion/download_hf_models.py
+++ b/shark/examples/shark_inference/stable_diffusion/download_hf_models.py
@@ -1,25 +0,0 @@
-from PIL import Image
-import requests
-
-from transformers import CLIPProcessor, CLIPModel
-
-model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
-processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
-
-url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-image = Image.open(requests.get(url, stream=True).raw)
-
-inputs = processor(
-    text=["a photo of a cat", "a photo of a dog"],
-    images=image,
-    return_tensors="pt",
-    padding=True,
-)
-
-outputs = model(**inputs)
-logits_per_image = (
-    outputs.logits_per_image
-)  # this is the image-text similarity score
-probs = logits_per_image.softmax(
-    dim=1
-)  # we can take the softmax to get the label probabilities
--- a/shark/examples/shark_inference/stable_diffusion/main.py
+++ b/shark/examples/shark_inference/stable_diffusion/main.py
@@ -1,188 +0,0 @@
-from transformers import CLIPTextModel, CLIPTokenizer
-import torch
-from PIL import Image
-from diffusers import (
-    LMSDiscreteScheduler,
-    PNDMScheduler,
-    DDIMScheduler,
-    DPMSolverMultistepScheduler,
-    EulerDiscreteScheduler,
-)
-from tqdm.auto import tqdm
-import numpy as np
-from stable_args import args
-from utils import get_shark_model, set_iree_runtime_flags
-from opt_params import get_unet, get_vae, get_clip
-import time
-from model_wrappers import get_vae_mlir
-from shark.iree_utils.compile_utils import dump_isas
-
-# Helper function to profile the vulkan device.
-def start_profiling(file_path="foo.rdc", profiling_mode="queue"):
-    if args.vulkan_debug_utils and "vulkan" in args.device:
-        import iree
-
-        print(f"Profiling and saving to {file_path}.")
-        vulkan_device = iree.runtime.get_device(args.device)
-        vulkan_device.begin_profiling(mode=profiling_mode, file_path=file_path)
-        return vulkan_device
-    return None
-
-
-def end_profiling(device):
-    if device:
-        return device.end_profiling()
-
-
-if __name__ == "__main__":
-
-    dtype = torch.float32 if args.precision == "fp32" else torch.half
-
-    prompt = args.prompts
-    height = 512  # default height of Stable Diffusion
-    width = 512  # default width of Stable Diffusion
-    if args.version == "v2":
-        height = 768
-        width = 768
-
-    num_inference_steps = args.steps  # Number of denoising steps
-
-    # Scale for classifier-free guidance
-    guidance_scale = torch.tensor(args.guidance_scale).to(torch.float32)
-
-    generator = torch.manual_seed(
-        args.seed
-    )  # Seed generator to create the inital latent noise
-
-    batch_size = len(prompt)
-
-    set_iree_runtime_flags()
-    unet = get_unet()
-    vae = get_vae()
-    clip = get_clip()
-    if args.dump_isa:
-        dump_isas(args.dispatch_benchmarks_dir)
-
-    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-    scheduler = DPMSolverMultistepScheduler.from_pretrained(
-        "CompVis/stable-diffusion-v1-4",
-        subfolder="scheduler",
-    )
-    if args.version == "v2":
-        tokenizer = CLIPTokenizer.from_pretrained(
-            "stabilityai/stable-diffusion-2", subfolder="tokenizer"
-        )
-
-        scheduler = DPMSolverMultistepScheduler.from_pretrained(
-            "stabilityai/stable-diffusion-2",
-            subfolder="scheduler",
-        )
-
-    if args.version == "v2.1base":
-        tokenizer = CLIPTokenizer.from_pretrained(
-            "stabilityai/stable-diffusion-2-1-base", subfolder="tokenizer"
-        )
-
-        scheduler = EulerDiscreteScheduler.from_pretrained(
-            "stabilityai/stable-diffusion-2-1-base",
-            subfolder="scheduler",
-        )
-    start = time.time()
-
-    text_input = tokenizer(
-        prompt,
-        padding="max_length",
-        max_length=args.max_length,
-        truncation=True,
-        return_tensors="pt",
-    )
-
-    clip_inf_start = time.time()
-    text_embeddings = clip.forward((text_input.input_ids,))
-    clip_inf_end = time.time()
-    text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
-    max_length = text_input.input_ids.shape[-1]
-    uncond_input = tokenizer(
-        [""] * batch_size,
-        padding="max_length",
-        max_length=max_length,
-        return_tensors="pt",
-    )
-    uncond_clip_inf_start = time.time()
-    uncond_embeddings = clip.forward((uncond_input.input_ids,))
-    uncond_clip_inf_end = time.time()
-    uncond_embeddings = torch.from_numpy(uncond_embeddings).to(dtype)
-
-    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-    latents = torch.randn(
-        (batch_size, 4, height // 8, width // 8),
-        generator=generator,
-        dtype=torch.float32,
-    ).to(dtype)
-
-    scheduler.set_timesteps(num_inference_steps)
-    scheduler.is_scale_input_called = True
-
-    latents = latents * scheduler.init_noise_sigma
-    text_embeddings_numpy = text_embeddings.detach().numpy()
-    avg_ms = 0
-
-    for i, t in tqdm(enumerate(scheduler.timesteps)):
-        step_start = time.time()
-        print(f"i = {i} t = {t}", end="")
-        timestep = torch.tensor([t]).to(dtype).detach().numpy()
-        latent_model_input = scheduler.scale_model_input(latents, t)
-        latents_numpy = latent_model_input.detach().numpy()
-
-        profile_device = start_profiling(file_path="unet.rdc")
-
-        noise_pred = unet.forward(
-            (
-                latents_numpy,
-                timestep,
-                text_embeddings_numpy,
-                guidance_scale,
-            )
-        )
-
-        end_profiling(profile_device)
-
-        noise_pred = torch.from_numpy(noise_pred)
-        step_time = time.time() - step_start
-        avg_ms += step_time
-        step_ms = int((step_time) * 1000)
-        print(f" ({step_ms}ms)")
-
-        latents = scheduler.step(noise_pred, t, latents).prev_sample
-
-    avg_ms = 1000 * avg_ms / args.steps
-    print(f"Average step time: {avg_ms}ms/it")
-
-    # scale and decode the image latents with vae
-    latents = 1 / 0.18215 * latents
-    # latents = latents.
-    latents_numpy = latents.detach().numpy()
-    profile_device = start_profiling(file_path="vae.rdc")
-    vae_start = time.time()
-    image = vae.forward((latents_numpy,))
-    vae_end = time.time()
-    end_profiling(profile_device)
-    image = torch.from_numpy(image)
-    image = image.detach().cpu().permute(0, 2, 3, 1) * 255.0
-    images = image.numpy().round().astype("uint8")
-    total_end = time.time()
-
-    clip_inf_time = (clip_inf_end - clip_inf_start) * 1000
-    uncond_clip_inf_time = (uncond_clip_inf_end - uncond_clip_inf_start) * 1000
-    avg_clip_inf = (clip_inf_time + uncond_clip_inf_time) / 2
-    vae_inf_time = (vae_end - vae_start) * 1000
-    print(
-        f"Clip Inference Avg time (ms) = ({clip_inf_time:.3f} + {uncond_clip_inf_time:.3f}) / 2 = {avg_clip_inf:.3f}"
-    )
-    print(f"VAE Inference time (ms): {vae_inf_time:.3f}")
-    print(f"Total image generation runtime (s): {total_end - start:.4f}")
-
-    pil_images = [Image.fromarray(image) for image in images]
-    for i in range(batch_size):
-        pil_images[i].save(f"{args.prompts[i]}_{i}.jpg")
--- a/shark/examples/shark_inference/stable_diffusion/model_wrappers.py
+++ b/shark/examples/shark_inference/stable_diffusion/model_wrappers.py
@@ -1,184 +0,0 @@
-from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
-from transformers import CLIPTextModel
-from utils import compile_through_fx
-from stable_args import args
-import torch
-
-BATCH_SIZE = len(args.prompts)
-
-model_config = {
-    "v2": "stabilityai/stable-diffusion-2",
-    "v2.1base": "stabilityai/stable-diffusion-2-1-base",
-    "v1.4": "CompVis/stable-diffusion-v1-4",
-}
-
-model_input = {
-    "v2": {
-        "clip": (torch.randint(1, 2, (1, 77)),),
-        "vae": (torch.randn(1, 4, 96, 96),),
-        "unet": (
-            torch.randn(1, 4, 96, 96),  # latents
-            torch.tensor([1]).to(torch.float32),  # timestep
-            torch.randn(2, 77, 1024),  # embedding
-            torch.tensor(1).to(torch.float32),  # guidance_scale
-        ),
-    },
-    "v2.1base": {
-        "clip": (torch.randint(1, 2, (1, 77)),),
-        "vae": (torch.randn(1, 4, 64, 64),),
-        "unet": (
-            torch.randn(1, 4, 64, 64),  # latents
-            torch.tensor([1]).to(torch.float32),  # timestep
-            torch.randn(2, 77, 1024),  # embedding
-            torch.tensor(1).to(torch.float32),  # guidance_scale
-        ),
-    },
-    "v1.4": {
-        "clip": (torch.randint(1, 2, (1, 77)),),
-        "vae": (torch.randn(1, 4, 64, 64),),
-        "unet": (
-            torch.randn(1, 4, 64, 64),
-            torch.tensor([1]).to(torch.float32),  # timestep
-            torch.randn(2, 77, 768),
-            torch.tensor(1).to(torch.float32),
-        ),
-    },
-}
-
-# revision param for from_pretrained defaults to "main" => fp32
-model_revision = "fp16" if args.precision == "fp16" else "main"
-
-
-def get_clip_mlir(model_name="clip_text", extra_args=[]):
-
-    text_encoder = CLIPTextModel.from_pretrained(
-        "openai/clip-vit-large-patch14"
-    )
-    if args.version == "v2":
-        text_encoder = CLIPTextModel.from_pretrained(
-            model_config[args.version], subfolder="text_encoder"
-        )
-
-    class CLIPText(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.text_encoder = text_encoder
-
-        def forward(self, input):
-            return self.text_encoder(input)[0]
-
-    clip_model = CLIPText()
-    shark_clip = compile_through_fx(
-        clip_model,
-        model_input[args.version]["clip"],
-        model_name=model_name,
-        extra_args=extra_args,
-    )
-    return shark_clip
-
-
-def get_vae_mlir(model_name="vae", extra_args=[]):
-    class VaeModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.vae = AutoencoderKL.from_pretrained(
-                model_config[args.version],
-                subfolder="vae",
-                revision=model_revision,
-            )
-
-        def forward(self, input):
-            x = self.vae.decode(input, return_dict=False)[0]
-            return (x / 2 + 0.5).clamp(0, 1)
-
-    vae = VaeModel()
-    if args.precision == "fp16":
-        vae = vae.half().cuda()
-        inputs = tuple(
-            [
-                inputs.half().cuda()
-                for inputs in model_input[args.version]["vae"]
-            ]
-        )
-    else:
-        inputs = model_input[args.version]["vae"]
-
-    shark_vae = compile_through_fx(
-        vae,
-        inputs,
-        model_name=model_name,
-        extra_args=extra_args,
-    )
-    return shark_vae
-
-
-def get_vae_encode_mlir(model_name="vae_encode", extra_args=[]):
-    class VaeEncodeModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.vae = AutoencoderKL.from_pretrained(
-                model_config[args.version],
-                subfolder="vae",
-                revision="fp16",
-            )
-
-        def forward(self, x):
-            input = 2 * (x - 0.5)
-            return self.vae.encode(input, return_dict=False)[0]
-
-    vae = VaeEncodeModel()
-    vae = vae.half().cuda()
-    inputs = tuple(
-        [inputs.half().cuda() for inputs in model_input[args.version]["vae"]]
-    )
-    shark_vae = compile_through_fx(
-        vae,
-        inputs,
-        model_name=model_name,
-        extra_args=extra_args,
-    )
-    return shark_vae
-
-
-def get_unet_mlir(model_name="unet", extra_args=[]):
-    class UnetModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.unet = UNet2DConditionModel.from_pretrained(
-                model_config[args.version],
-                subfolder="unet",
-                revision=model_revision,
-            )
-            self.in_channels = self.unet.in_channels
-            self.train(False)
-
-        def forward(self, latent, timestep, text_embedding, guidance_scale):
-            # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
-            latents = torch.cat([latent] * 2)
-            unet_out = self.unet.forward(
-                latents, timestep, text_embedding, return_dict=False
-            )[0]
-            noise_pred_uncond, noise_pred_text = unet_out.chunk(2)
-            noise_pred = noise_pred_uncond + guidance_scale * (
-                noise_pred_text - noise_pred_uncond
-            )
-            return noise_pred
-
-    unet = UnetModel()
-    if args.precision == "fp16":
-        unet = unet.half().cuda()
-        inputs = tuple(
-            [
-                inputs.half().cuda() if len(inputs.shape) != 0 else inputs
-                for inputs in model_input[args.version]["unet"]
-            ]
-        )
-    else:
-        inputs = model_input[args.version]["unet"]
-    shark_unet = compile_through_fx(
-        unet,
-        inputs,
-        model_name=model_name,
-        extra_args=extra_args,
-    )
-    return shark_unet
--- a/shark/examples/shark_inference/stable_diffusion/opt_params.py
+++ b/shark/examples/shark_inference/stable_diffusion/opt_params.py
@@ -1,153 +0,0 @@
-import sys
-from model_wrappers import (
-    get_vae_mlir,
-    get_vae_encode_mlir,
-    get_unet_mlir,
-    get_clip_mlir,
-)
-from stable_args import args
-from utils import get_shark_model
-
-BATCH_SIZE = len(args.prompts)
-if BATCH_SIZE != 1:
-    sys.exit("Only batch size 1 is supported.")
-
-
-def get_unet():
-    iree_flags = []
-    if len(args.iree_vulkan_target_triple) > 0:
-        iree_flags.append(
-            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
-        )
-    # Tuned model is present for `fp16` precision.
-    if args.precision == "fp16":
-        if args.use_tuned:
-            bucket = "gs://shark_tank/vivian"
-            model_name = "unet_1dec_fp16_tuned"
-            return get_shark_model(bucket, model_name, iree_flags)
-        else:
-            bucket = "gs://shark_tank/stable_diffusion"
-            model_name = "unet_8dec_fp16"
-            if args.version == "v2.1base":
-                model_name = "unet2base_8dec_fp16"
-            iree_flags += [
-                "--iree-flow-enable-padding-linalg-ops",
-                "--iree-flow-linalg-ops-padding-size=32",
-                "--iree-flow-enable-conv-img2col-transform",
-            ]
-            if args.import_mlir:
-                return get_unet_mlir(model_name, iree_flags)
-            return get_shark_model(bucket, model_name, iree_flags)
-
-    # Tuned model is not present for `fp32` case.
-    if args.precision == "fp32":
-        bucket = "gs://shark_tank/stable_diffusion"
-        model_name = "unet_1dec_fp32"
-        iree_flags += [
-            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
-            "--iree-flow-enable-padding-linalg-ops",
-            "--iree-flow-linalg-ops-padding-size=16",
-        ]
-        if args.import_mlir:
-            return get_unet_mlir(model_name, iree_flags)
-        return get_shark_model(bucket, model_name, iree_flags)
-
-    if args.precision == "int8":
-        bucket = "gs://shark_tank/prashant_nod"
-        model_name = "unet_int8"
-        iree_flags += [
-            "--iree-flow-enable-padding-linalg-ops",
-            "--iree-flow-linalg-ops-padding-size=32",
-        ]
-        sys.exit("int8 model is currently in maintenance.")
-        # # TODO: Pass iree_flags to the exported model.
-        # if args.import_mlir:
-        # sys.exit(
-        # "--import_mlir is not supported for the int8 model, try --no-import_mlir flag."
-        # )
-        # return get_shark_model(bucket, model_name, iree_flags)
-
-
-def get_vae():
-    iree_flags = []
-    if len(args.iree_vulkan_target_triple) > 0:
-        iree_flags.append(
-            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
-        )
-    if args.precision in ["fp16", "int8"]:
-        bucket = "gs://shark_tank/stable_diffusion"
-        model_name = "vae_8dec_fp16"
-        if args.version == "v2.1base":
-            model_name = "vae2base_8dec_fp16"
-        iree_flags += [
-            "--iree-flow-enable-padding-linalg-ops",
-            "--iree-flow-linalg-ops-padding-size=32",
-            "--iree-flow-enable-conv-img2col-transform",
-        ]
-        if args.import_mlir:
-            return get_vae_mlir(model_name, iree_flags)
-        return get_shark_model(bucket, model_name, iree_flags)
-
-    if args.precision == "fp32":
-        bucket = "gs://shark_tank/stable_diffusion"
-        model_name = "vae_1dec_fp32"
-        iree_flags += [
-            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
-            "--iree-flow-enable-padding-linalg-ops",
-            "--iree-flow-linalg-ops-padding-size=16",
-        ]
-        if args.import_mlir:
-            return get_vae_mlir(model_name, iree_flags)
-        return get_shark_model(bucket, model_name, iree_flags)
-
-
-def get_vae_encode():
-    iree_flags = []
-    if len(args.iree_vulkan_target_triple) > 0:
-        iree_flags.append(
-            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
-        )
-    if args.precision in ["fp16", "int8"]:
-        bucket = "gs://shark_tank/stable_diffusion"
-        model_name = "vae_encode_1dec_fp16"
-        if args.version == "v2":
-            model_name = "vae2_encode_29nov_fp16"
-        iree_flags += [
-            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
-            "--iree-flow-enable-padding-linalg-ops",
-            "--iree-flow-linalg-ops-padding-size=32",
-        ]
-        if args.import_mlir:
-            return get_vae_encode_mlir(model_name, iree_flags)
-        return get_shark_model(bucket, model_name, iree_flags)
-
-    if args.precision == "fp32":
-        bucket = "gs://shark_tank/stable_diffusion"
-        model_name = "vae_encode_1dec_fp32"
-        iree_flags += [
-            "--iree-flow-enable-conv-nchw-to-nhwc-transform",
-            "--iree-flow-enable-padding-linalg-ops",
-            "--iree-flow-linalg-ops-padding-size=16",
-        ]
-        if args.import_mlir:
-            return get_vae_mlir(model_name, iree_flags)
-        return get_shark_model(bucket, model_name, iree_flags)
-
-
-def get_clip():
-    iree_flags = []
-    if len(args.iree_vulkan_target_triple) > 0:
-        iree_flags.append(
-            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
-        )
-    bucket = "gs://shark_tank/stable_diffusion"
-    model_name = "clip_8dec_fp32"
-    if args.version == "v2.1base":
-        model_name = "clip2base_8dec_fp32"
-    iree_flags += [
-        "--iree-flow-linalg-ops-padding-size=16",
-        "--iree-flow-enable-padding-linalg-ops",
-    ]
-    if args.import_mlir:
-        return get_clip_mlir(model_name, iree_flags)
-    return get_shark_model(bucket, model_name, iree_flags)
--- a/shark/examples/shark_inference/stable_diffusion/profiling_with_iree.md
+++ b/shark/examples/shark_inference/stable_diffusion/profiling_with_iree.md
@@ -1,44 +0,0 @@
-Compile / Run Instructions:
-
-To compile .vmfb for SD (vae, unet, CLIP), run the following commands with the .mlir in your local shark_tank cache (default location for Linux users is `~/.local/shark_tank`). These will be available once the script from [this README](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md) is run once.
-Running the script mentioned above with the `--save_vmfb` flag will also save the .vmfb in your SHARK base directory if you want to skip straight to benchmarks.
-
-Compile Commands FP32/FP16: 
-
-```shell
-Vulkan AMD: 
-iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
-
-#  add --mlir-print-debuginfo --mlir-print-op-on-diagnostic=true for debug
-#  use –iree-input-type=mhlo for tf models
-
-CUDA NVIDIA:
-iree-compile --iree-input-type=none --iree-hal-target-backends=cuda --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
-
-CPU:
-iree-compile --iree-input-type=none --iree-hal-target-backends=llvm-cpu  --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
-```
-
-
-
-Run / Benchmark Command (FP32 - NCHW):
-(NEED to use BS=2 since we do two forward passes to unet as a result of classifier free guidance.)
-
-```shell
-## Vulkan AMD:
-iree-benchmark-module --module_file=/path/to/output/vmfb --entry_function=forward --device=vulkan --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
-
-## CUDA:
-iree-benchmark-module --module_file=/path/to/vmfb --entry_function=forward --device=cuda  --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
-
-## CPU:
-iree-benchmark-module --module_file=/path/to/vmfb --entry_function=forward --device=local-task  --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
-
-```
-
-Run via vulkan_gui for RGP Profiling:
-
-To build the vulkan app for profiling UNet follow the instructions [here](https://github.com/nod-ai/SHARK/tree/main/cpp) and then run the following command from the cpp directory with your compiled stable_diff.vmfb
-```shell
-./build/vulkan_gui/iree-vulkan-gui --module_file=/path/to/unet.vmfb --function_input=1x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32 --function_input=f32=1.0 --function_input=f32=1.0
-```
--- a/shark/examples/shark_inference/stable_diffusion/stable_args.py
+++ b/shark/examples/shark_inference/stable_diffusion/stable_args.py
@@ -1,128 +0,0 @@
-import argparse
-
-p = argparse.ArgumentParser(
-    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
-)
-
-p.add_argument(
-    "--prompts",
-    nargs="+",
-    default=["a photograph of an astronaut riding a horse"],
-    help="text of which images to be generated.",
-)
-p.add_argument(
-    "--device", type=str, default="cpu", help="device to run the model."
-)
-p.add_argument(
-    "--steps",
-    type=int,
-    default=50,
-    help="the no. of steps to do the sampling.",
-)
-
-p.add_argument(
-    "--version",
-    type=str,
-    default="v2.1base",
-    help="Specify version of stable diffusion model",
-)
-
-p.add_argument(
-    "--seed",
-    type=int,
-    default=42,
-    help="the seed to use.",
-)
-p.add_argument(
-    "--guidance_scale",
-    type=float,
-    default=7.5,
-    help="the value to be used for guidance scaling.",
-)
-
-p.add_argument(
-    "--import_mlir",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="imports the model from torch module to shark_module otherwise downloads the model from shark_tank.",
-)
-
-p.add_argument(
-    "--precision", type=str, default="fp16", help="precision to run the model."
-)
-
-p.add_argument(
-    "--max_length",
-    type=int,
-    default=77,
-    help="max length of the tokenizer output.",
-)
-
-p.add_argument(
-    "--load_vmfb",
-    default=True,
-    action=argparse.BooleanOptionalAction,
-    help="attempts to load the model from a precompiled flatbuffer and compiles + saves it if not found.",
-)
-
-p.add_argument(
-    "--save_vmfb",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="saves the compiled flatbuffer to the local directory",
-)
-
-p.add_argument(
-    "--iree-vulkan-target-triple",
-    type=str,
-    default="",
-    help="Specify target triple for vulkan",
-)
-
-p.add_argument(
-    "--vulkan_debug_utils",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="Profiles vulkan device and collects the .rdc info",
-)
-
-p.add_argument(
-    "--use_tuned",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="Download and use the tuned version of the model if available",
-)
-
-p.add_argument(
-    "--dump_isa",
-    default=False,
-    action="store_true",
-    help="When enabled call amdllpc to get ISA dumps. use with dispatch benchmarks.",
-)
-
-p.add_argument(
-    "--dispatch_benchmarks",
-    default=None,
-    help='dispatches to return benchamrk data on.  use "All" for all, and None for none.',
-)
-
-p.add_argument(
-    "--dispatch_benchmarks_dir",
-    default="temp_dispatch_benchmarks",
-    help='directory where you want to store dispatch data generated with "--dispatch_benchmarks"',
-)
-
-p.add_argument(
-    "--vulkan_large_heap_block_size",
-    default="4294967296",
-    help="flag for setting VMA preferredLargeHeapBlockSize for vulkan device, default is 4G",
-)
-
-p.add_argument(
-    "--enable_rgp",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="flag for inserting debug frames between iterations for use with rgp.",
-)
-
-args = p.parse_args()
--- a/shark/examples/shark_inference/stable_diffusion/stable_diffusion_amd.md
+++ b/shark/examples/shark_inference/stable_diffusion/stable_diffusion_amd.md
@@ -1,111 +0,0 @@
-# Stable Diffusion optimized for AMD RDNA2/RDNA3 GPUs
-
-## Install the latest AMD Drivers
-
-### RDNA2 Drivers:
-*AMD Software: Adrenalin Edition 22.11.1 for MLIR/IREE Driver Version 22.20.29.09 for Windows® 10 and Windows® 11 (Windows Driver Store Version 31.0.12029.9003)*
-
-https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mlir-iree
-
-Note that if you previously tried Stable Diffusion with a different driver, it may be necessary to clear vulkan cache after changing drivers.
-
-For Windows users this can be done by clearing the contents of `C:\Users\<username>\AppData\Local\AMD\VkCache\`. On Linux the same cache is typically located at `~/.cache/AMD/VkCache/`.
-
-## Installation
-
-Download the latest Windows SHARK SD binary [here](https://github.com/nod-ai/SHARK/releases/download/20221213.383/shark_sd_20221213_383.exe). Accept if Windows warns of an unsigned .exe.
-
-#### Access Stable Diffusion on http://localhost:8080/?__theme=dark
-
-
-<img width="1607" alt="webui" src="https://user-images.githubusercontent.com/74956/204939260-b8308bc2-8dc4-47f6-9ac0-f60b66edab99.png">
-
-
-Here are some samples generated:
-
-![tajmahal, snow, sunflowers, oil on canvas_0](https://user-images.githubusercontent.com/74956/204934186-141f7e43-6eb2-4e89-a99c-4704d20444b3.jpg)
-
-![a photo of a crab playing a trumpet](https://user-images.githubusercontent.com/74956/204933258-252e7240-8548-45f7-8253-97647d38313d.jpg)
-
-
-<details>
-  <summary>Advanced Installation </summary>
-
-## Setup your Python VirtualEnvironment and Dependencies
-
-### Windows 10/11 Users
-
-* Install the latest Python 3.10.x version from [here](https://www.python.org/downloads/windows/)
-
-* Install Git for Windows from [here](https://git-scm.com/download/win)
-
-#### Allow the install script to run in Powershell
-```powershell
-set-executionpolicy remotesigned 
-```
-
-#### Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...)
-```powershell
-git clone https://github.com/nod-ai/SHARK.git
-cd SHARK
-./setup_venv.ps1 #You can re-run this script to get the latest version
-```
-
-### Linux
-
-```shell
-git clone https://github.com/nod-ai/SHARK.git
-cd SHARK
-./setup_venv.sh
-source shark.venv/bin/activate
-```
-
-### Run Stable Diffusion on your device - WebUI
-
-#### Windows 10/11 Users
-```powershell
-(shark.venv) PS C:\Users\nod\SHARK> cd web
-(shark.venv) PS C:\Users\nod\SHARK\web> python index.py
-```
-#### Linux Users
-```shell
-(shark.venv) > cd web
-(shark.venv) > python index.py
-```
-
-
-
-### Run Stable Diffusion on your device - Commandline
-
-#### Windows 10/11 Users
-```powershell
-(shark.venv) PS C:\g\shark> python .\shark\examples\shark_inference\stable_diffusion\main.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
-```
-
-#### Linux
-```shell
-python3.10 shark/examples/shark_inference/stable_diffusion/main.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
-```
-
-The output on a 6900XT would like:
-
-```shell 
-44it [00:08,  5.14it/s]i = 44 t = 120 (191ms)
-45it [00:08,  5.15it/s]i = 45 t = 100 (191ms)
-46it [00:08,  5.16it/s]i = 46 t = 80 (191ms)
-47it [00:09,  5.16it/s]i = 47 t = 60 (193ms)
-48it [00:09,  5.15it/s]i = 48 t = 40 (195ms)
-49it [00:09,  5.12it/s]i = 49 t = 20 (196ms)
-50it [00:09,  5.14it/s]
-Average step time: 192.8154182434082ms/it
-Total image generation runtime (s): 10.390909433364868
-(shark.venv) PS C:\g\shark>
-```
-
-
-For more options to the Stable Diffusion model read [this](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md)
-</details>
-<details>
-  <summary>Discord link</summary>
-Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware. 
-</details>
--- a/shark/examples/shark_inference/stable_diffusion/utils.py
+++ b/shark/examples/shark_inference/stable_diffusion/utils.py
@@ -1,83 +0,0 @@
-import os
-
-import torch
-from shark.shark_inference import SharkInference
-from stable_args import args
-from shark.shark_importer import import_with_fx
-from shark.iree_utils.vulkan_utils import set_iree_vulkan_runtime_flags
-
-
-def _compile_module(shark_module, model_name, extra_args=[]):
-    if args.load_vmfb or args.save_vmfb:
-        device = (
-            args.device
-            if "://" not in args.device
-            else "-".join(args.device.split("://"))
-        )
-        extended_name = "{}_{}".format(model_name, device)
-        vmfb_path = os.path.join(os.getcwd(), extended_name + ".vmfb")
-        if args.load_vmfb and os.path.isfile(vmfb_path) and not args.save_vmfb:
-            print(f"loading existing vmfb from: {vmfb_path}")
-            shark_module.load_module(vmfb_path, extra_args=extra_args)
-        else:
-            if args.save_vmfb:
-                print("Saving to {}".format(vmfb_path))
-            else:
-                print(
-                    "No vmfb found. Compiling and saving to {}".format(
-                        vmfb_path
-                    )
-                )
-            path = shark_module.save_module(
-                os.getcwd(), extended_name, extra_args
-            )
-            shark_module.load_module(path, extra_args=extra_args)
-    else:
-        shark_module.compile(extra_args)
-    return shark_module
-
-
-# Downloads the model from shark_tank and returns the shark_module.
-def get_shark_model(tank_url, model_name, extra_args=[]):
-    from shark.shark_downloader import download_model
-
-    mlir_model, func_name, inputs, golden_out = download_model(
-        model_name,
-        tank_url=tank_url,
-        frontend="torch",
-    )
-    shark_module = SharkInference(
-        mlir_model, func_name, device=args.device, mlir_dialect="linalg"
-    )
-    return _compile_module(shark_module, model_name, extra_args)
-
-
-# Converts the torch-module into a shark_module.
-def compile_through_fx(model, inputs, model_name, extra_args=[]):
-
-    mlir_module, func_name = import_with_fx(model, inputs)
-
-    shark_module = SharkInference(
-        mlir_module,
-        func_name,
-        device=args.device,
-        mlir_dialect="linalg",
-    )
-
-    return _compile_module(shark_module, model_name, extra_args)
-
-
-def set_iree_runtime_flags():
-
-    vulkan_runtime_flags = [
-        f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}",
-    ]
-    if args.enable_rgp:
-        vulkan_runtime_flags += [
-            f"--enable_rgp=true",
-            f"--vulkan_debug_utils=true",
-        ]
-    if "vulkan" in args.device:
-        set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)
-
-    return
--- a/shark/examples/shark_inference/t5_pt_en_to_fr.py
+++ b/shark/examples/shark_inference/t5_pt_en_to_fr.py
@@ -0,0 +1,47 @@
+from PIL import Image
+import requests
+
+from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Model
+import torch
+from shark.shark_inference import SharkInference
+from shark.shark_importer import SharkImporter
+from iree.compiler import tf as tfc
+from iree.compiler import compile_str
+from iree import runtime as ireert
+import os
+
+MAX_SEQUENCE_LENGTH = 512
+BATCH_SIZE = 1
+
+
+class T5Module(torch.nn.Module):  # torch wrapper around pretrained "t5-small"
+    def __init__(self):
+        super().__init__()
+        self.model = T5ForConditionalGeneration.from_pretrained("t5-small")
+        self.model.eval()  # put the wrapped model in evaluation mode
+
+    def forward(self, input_ids):
+        return self.model.generate(input_ids)  # forward() delegates to generate()
+
+
+if __name__ == "__main__":
+    # Tokenize one prompt, padded/truncated to MAX_SEQUENCE_LENGTH tokens.
+    tokenizer = T5Tokenizer.from_pretrained("t5-small")
+    text = "I love the distilled version of models."
+    # The prefix selects the T5 task; French matches this en_to_fr example.
+    task_prefix = "translate English to French: "
+    encoded_input = tokenizer(task_prefix + text, padding='max_length', truncation=True, max_length=MAX_SEQUENCE_LENGTH, return_tensors="pt").input_ids
+    # Trailing comma matters: "(x)" is just x, while "(x,)" is a 1-tuple.
+    inputs = (encoded_input,)
+    mlir_importer = SharkImporter(
+        T5Module(),
+        inputs,
+        frontend="torch",
+    )
+    minilm_mlir, func_name = mlir_importer.import_mlir(
+        is_dynamic=True, tracing_required=True
+    )
+    shark_module = SharkInference(minilm_mlir, func_name, mlir_dialect="linalg")
+    shark_module.compile()
+    output = shark_module.forward(inputs)
+    print(tokenizer.batch_decode(output, skip_special_tokens=True))
--- a/shark/examples/shark_inference/t5_tf.py
+++ b/shark/examples/shark_inference/t5_tf.py
@@ -18,7 +18,7 @@ class T5Module(tf.Module):
        self.m = TFT5Model.from_pretrained("t5-small")
        self.m.predict = lambda x, y: self.m(input_ids=x, decoder_input_ids=y)

-    @tf.function(input_signature=t5_inputs, jit_compile=True)
+    @tf.function(input_signature=t5_inputs)
    def forward(self, input_ids, decoder_input_ids):
        return self.m.predict(input_ids, decoder_input_ids)

--- a/shark/examples/shark_inference/t5_tf_en_to_fr.py
+++ b/shark/examples/shark_inference/t5_tf_en_to_fr.py
@@ -0,0 +1,51 @@
+from PIL import Image
+import requests
+
+from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration
+import tensorflow as tf
+from shark.shark_inference import SharkInference
+from shark.shark_importer import SharkImporter
+from iree.compiler import tf as tfc
+from iree.compiler import compile_str
+from iree import runtime as ireert
+import os
+
+MAX_SEQUENCE_LENGTH = 512
+BATCH_SIZE = 1
+
+# Create a set of inputs
+t5_inputs = [
+    tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32)
+]
+
+class T5Module(tf.Module):  # tf.Module wrapper around pretrained "t5-small"
+    def __init__(self):
+        super(T5Module, self).__init__()
+        self.m = TFT5ForConditionalGeneration.from_pretrained("t5-small")
+        self.m.predict = lambda x: self.m.generate(input_ids=x)  # rebind predict to generate()
+
+    @tf.function(input_signature=t5_inputs)  # signature comes from module-level t5_inputs
+    def forward(self, input_ids):
+        return self.m.predict(input_ids)  # calls the lambda bound in __init__
+
+
+if __name__ == "__main__":
+    # Prep data; the task prefix picks French to match this en_to_fr example.
+    tokenizer = T5Tokenizer.from_pretrained("t5-small")
+    text = "I love the distilled version of models."
+    task_prefix = "translate English to French: "
+    encoded_input = tokenizer(task_prefix + text, padding='max_length', truncation=True, max_length=MAX_SEQUENCE_LENGTH, return_tensors="tf").input_ids
+    # Trailing comma matters: "(x)" is just x, while "(x,)" is a 1-tuple.
+    inputs = (encoded_input,)
+    mlir_importer = SharkImporter(
+        T5Module(),
+        inputs,
+        frontend="tf",
+    )
+    minilm_mlir, func_name = mlir_importer.import_mlir(
+        is_dynamic=False, tracing_required=False
+    )
+    shark_module = SharkInference(minilm_mlir, func_name, mlir_dialect="mhlo")
+    shark_module.compile()
+    output = shark_module.forward(inputs)
+    print(tokenizer.batch_decode(output, skip_special_tokens=True))
--- a/shark/examples/shark_inference/unet_script.py
+++ b/shark/examples/shark_inference/unet_script.py
@@ -1,9 +1,8 @@
 import torch
-import numpy as np
-from shark.shark_inference import SharkInference
-from shark.shark_importer import SharkImporter
+from shark_runner import SharkInference


+# Currently not supported: aten.transpose_conv2d is missing.
 class UnetModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
@@ -15,7 +14,7 @@ class UnetModule(torch.nn.Module):
            init_features=32,
            pretrained=True,
        )
-        self.model.eval()
+        self.train(False)

    def forward(self, input):
        return self.model(input)
@@ -23,17 +22,10 @@ class UnetModule(torch.nn.Module):

 input = torch.randn(1, 3, 224, 224)

-mlir_importer = SharkImporter(
+print(input)
+shark_module = SharkInference(
    UnetModule(),
    (input,),
-    frontend="torch",
 )
-
-(vision_mlir, func_name), inputs, golden_out = mlir_importer.import_debug(
-    tracing_required=False
-)
-
-shark_module = SharkInference(vision_mlir, func_name, mlir_dialect="linalg")
-shark_module.compile()
-result = shark_module.forward((input,))
-np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
+shark_module.benchmark_forward((input,))
+print(input)
--- a/shark/examples/shark_inference/v_diffusion.py
+++ b/shark/examples/shark_inference/v_diffusion.py
@@ -1,13 +1,11 @@
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_model
+from shark.shark_downloader import download_torch_model


-mlir_model, func_name, inputs, golden_out = download_model(
-    "v_diffusion", frontend="torch"
-)
+mlir_model, func_name, inputs, golden_out = download_torch_model("v_diffusion")

 shark_module = SharkInference(
-    mlir_model, func_name, device="vulkan", mlir_dialect="linalg"
+    mlir_model, func_name, mlir_dialect="linalg"
 )
 shark_module.compile()
 result = shark_module.forward(inputs)
--- a/shark/examples/shark_training/bert_training_tf.py
+++ b/shark/examples/shark_training/bert_training_tf.py
@@ -52,8 +52,7 @@ class BertModule(tf.Module):
        input_signature=[
            bert_input,  # inputs
            tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32),  # labels
-        ],
-        jit_compile=True,
+        ]
    )
    def forward(self, inputs, labels):
        with tf.GradientTape() as tape:
--- a/shark/examples/shark_training/stable-diffusion-img2img/README.md
+++ b/shark/examples/shark_training/stable-diffusion-img2img/README.md
@@ -1,41 +0,0 @@
-# Stable Diffusion Img2Img model
-
-## Installation
-
-<details>
-  <summary>Installation (Linux)</summary>
-
-### Activate shark.venv Virtual Environment
-
-```shell
-source shark.venv/bin/activate
-
-# Some older pip installs may not be able to handle the recent PyTorch deps
-python -m pip install --upgrade pip
-```
-
-### Install dependencies
-
-# Run the setup.sh script
-
-```shell
-./setup.sh
-```
-
-### Run the Stable diffusion Img2Img model
-
-To run the model with the default set of images and params, run:
-```shell
-python stable_diffusion_img2img.py
-```
-To run the model with your set of images, and parameters you need to specify the following params:
-1.) Input images directory with the arg `--input_dir` containing 3-5 images.
-2.) What to teach the model? Using the arg `--what_to_teach`, allowed values are `object` or `style`.
-3.) Placeholder token using the arg `--placeholder_token`, that represents your new concept. It should be passed with the opening and closing angle brackets. For ex: token is `cat-toy`, it should be passed as `<cat-toy>`.
-4.) Initializer token using the arg `--initializer_token`, which summarise what is your new concept.
-
-For the result, you need to pass the text prompt with the arg: `--prompt`. The prompt string should contain a "*s" in it, which will be replaced by the placeholder token during the inference.
-
-By default the result images will go into the `sd_result` dir. To specify your output dir use the arg: `--output_dir`.
-
-The default value of max_training_steps is `3000`, which takes some hours to complete. You can pass the smaller value with the arg `--training_steps`. Specify the number of images to be sampled for the result with the `--num_inference_samples` arg.
--- a/shark/examples/shark_training/stable-diffusion-img2img/setup.sh
+++ b/shark/examples/shark_training/stable-diffusion-img2img/setup.sh
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-TD="$(cd $(dirname $0) && pwd)"
-if [ -z "$PYTHON" ]; then
-  PYTHON="$(which python3)"
-fi
-
-function die() {
-  echo "Error executing command: $*"
-  exit 1
-}
-
-PYTHON_VERSION_X_Y=`${PYTHON} -c 'import sys; version=sys.version_info[:2]; print("{0}.{1}".format(*version))'`
-
-echo "Python: $PYTHON"
-echo "Python version: $PYTHON_VERSION_X_Y"
-
-mkdir input_images
-
-wget https://huggingface.co/datasets/valhalla/images/resolve/main/2.jpeg -P input_images/
-wget https://huggingface.co/datasets/valhalla/images/resolve/main/3.jpeg -P input_images/
-wget https://huggingface.co/datasets/valhalla/images/resolve/main/5.jpeg -P input_images/
-wget https://huggingface.co/datasets/valhalla/images/resolve/main/6.jpeg -P input_images/
-
-pip install diffusers["training"]==0.4.1 transformers ftfy opencv-python
--- a/shark/examples/shark_training/stable-diffusion-img2img/stable_diffusion_img2img.py
+++ b/shark/examples/shark_training/stable-diffusion-img2img/stable_diffusion_img2img.py
@@ -1,597 +0,0 @@
-# Textual-inversion fine-tuning for Stable Diffusion using diffusers
-# This script shows how to "teach" Stable Diffusion a new concept via
-# textual-inversion using 🤗 Hugging Face [🧨 Diffusers library](https://github.com/huggingface/diffusers).
-# By using just 3-5 images you can teach new concepts to Stable Diffusion
-# and personalize the model on your own images.
-
-import argparse
-import itertools
-import math
-import os
-import random
-import cv2
-
-import numpy as np
-import torch
-import torch.nn.functional as F
-import torch.utils.checkpoint
-from torch.utils.data import Dataset
-
-import PIL
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from diffusers import (
-    AutoencoderKL,
-    DDPMScheduler,
-    PNDMScheduler,
-    StableDiffusionPipeline,
-    UNet2DConditionModel,
-)
-from diffusers.hub_utils import init_git_repo, push_to_hub
-from diffusers.optimization import get_scheduler
-from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
-from PIL import Image
-from torchvision import transforms
-from tqdm.auto import tqdm
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
-
-YOUR_TOKEN = "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"
-
-p = argparse.ArgumentParser(
-    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
-)
-p.add_argument(
-    "--input_dir",
-    type=str,
-    default="input_images/",
-    help="the directory contains the images used for fine tuning",
-)
-p.add_argument(
-    "--output_dir",
-    type=str,
-    default="sd_result",
-    help="the directory contains the images used for fine tuning",
-)
-p.add_argument(
-    "--training_steps",
-    type=int,
-    default=3000,
-    help="the maximum number of training steps",
-)
-p.add_argument("--seed", type=int, default=42, help="the random seed")
-p.add_argument(
-    "--what_to_teach",
-    type=str,
-    choices=["object", "style"],
-    default="object",
-    help="what is it that you are teaching?",
-)
-p.add_argument(
-    "--placeholder_token",
-    type=str,
-    default="<cat-toy>",
-    help="It is the token you are going to use to represent your new concept",
-)
-p.add_argument(
-    "--initializer_token",
-    type=str,
-    default="toy",
-    help="It is a word that can summarise what is your new concept",
-)
-p.add_argument(
-    "--inference_steps",
-    type=int,
-    default=50,
-    help="the number of steps for inference",
-)
-p.add_argument(
-    "--num_inference_samples",
-    type=int,
-    default=4,
-    help="the number of samples for inference",
-)
-p.add_argument(
-    "--prompt",
-    type=str,
-    default="a grafitti in a wall with a *s on it",
-    help="the text prompt to use",
-)
-args = p.parse_args()
-
-if "*s" not in args.prompt:
-    raise ValueError(
-        f'The prompt should have a "*s" which will be replaced by a placeholder token.'
-    )
-
-prompt1, prompt2 = args.prompt.split("*s")
-args.prompt = prompt1 + args.placeholder_token + prompt2
-
-pretrained_model_name_or_path = "CompVis/stable-diffusion-v1-4"
-
-# Load input images.
-images = []
-for filename in os.listdir(args.input_dir):
-    img = cv2.imread(os.path.join(args.input_dir, filename))
-    if img is not None:
-        images.append(img)
-
-# Setup the prompt templates for training
-imagenet_templates_small = [
-    "a photo of a {}",
-    "a rendering of a {}",
-    "a cropped photo of the {}",
-    "the photo of a {}",
-    "a photo of a clean {}",
-    "a photo of a dirty {}",
-    "a dark photo of the {}",
-    "a photo of my {}",
-    "a photo of the cool {}",
-    "a close-up photo of a {}",
-    "a bright photo of the {}",
-    "a cropped photo of a {}",
-    "a photo of the {}",
-    "a good photo of the {}",
-    "a photo of one {}",
-    "a close-up photo of the {}",
-    "a rendition of the {}",
-    "a photo of the clean {}",
-    "a rendition of a {}",
-    "a photo of a nice {}",
-    "a good photo of a {}",
-    "a photo of the nice {}",
-    "a photo of the small {}",
-    "a photo of the weird {}",
-    "a photo of the large {}",
-    "a photo of a cool {}",
-    "a photo of a small {}",
-]
-
-imagenet_style_templates_small = [
-    "a painting in the style of {}",
-    "a rendering in the style of {}",
-    "a cropped painting in the style of {}",
-    "the painting in the style of {}",
-    "a clean painting in the style of {}",
-    "a dirty painting in the style of {}",
-    "a dark painting in the style of {}",
-    "a picture in the style of {}",
-    "a cool painting in the style of {}",
-    "a close-up painting in the style of {}",
-    "a bright painting in the style of {}",
-    "a cropped painting in the style of {}",
-    "a good painting in the style of {}",
-    "a close-up painting in the style of {}",
-    "a rendition in the style of {}",
-    "a nice painting in the style of {}",
-    "a small painting in the style of {}",
-    "a weird painting in the style of {}",
-    "a large painting in the style of {}",
-]
-
-# Setup the dataset
-class TextualInversionDataset(Dataset):
-    def __init__(
-        self,
-        data_root,
-        tokenizer,
-        learnable_property="object",  # [object, style]
-        size=512,
-        repeats=100,
-        interpolation="bicubic",
-        flip_p=0.5,
-        set="train",
-        placeholder_token="*",
-        center_crop=False,
-    ):
-
-        self.data_root = data_root
-        self.tokenizer = tokenizer
-        self.learnable_property = learnable_property
-        self.size = size
-        self.placeholder_token = placeholder_token
-        self.center_crop = center_crop
-        self.flip_p = flip_p
-
-        self.image_paths = [
-            os.path.join(self.data_root, file_path)
-            for file_path in os.listdir(self.data_root)
-        ]
-
-        self.num_images = len(self.image_paths)
-        self._length = self.num_images
-
-        if set == "train":
-            self._length = self.num_images * repeats
-
-        self.interpolation = {
-            "linear": PIL.Image.LINEAR,
-            "bilinear": PIL.Image.BILINEAR,
-            "bicubic": PIL.Image.BICUBIC,
-            "lanczos": PIL.Image.LANCZOS,
-        }[interpolation]
-
-        self.templates = (
-            imagenet_style_templates_small
-            if learnable_property == "style"
-            else imagenet_templates_small
-        )
-        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
-
-    def __len__(self):
-        return self._length
-
-    def __getitem__(self, i):
-        example = {}
-        image = Image.open(self.image_paths[i % self.num_images])
-
-        if not image.mode == "RGB":
-            image = image.convert("RGB")
-
-        placeholder_string = self.placeholder_token
-        text = random.choice(self.templates).format(placeholder_string)
-
-        example["input_ids"] = self.tokenizer(
-            text,
-            padding="max_length",
-            truncation=True,
-            max_length=self.tokenizer.model_max_length,
-            return_tensors="pt",
-        ).input_ids[0]
-
-        # default to score-sde preprocessing
-        img = np.array(image).astype(np.uint8)
-
-        if self.center_crop:
-            crop = min(img.shape[0], img.shape[1])
-            h, w, = (
-                img.shape[0],
-                img.shape[1],
-            )
-            img = img[
-                (h - crop) // 2 : (h + crop) // 2,
-                (w - crop) // 2 : (w + crop) // 2,
-            ]
-
-        image = Image.fromarray(img)
-        image = image.resize(
-            (self.size, self.size), resample=self.interpolation
-        )
-
-        image = self.flip_transform(image)
-        image = np.array(image).astype(np.uint8)
-        image = (image / 127.5 - 1.0).astype(np.float32)
-
-        example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
-        return example
-
-
-# Setting up the model
-# Load the tokenizer and add the placeholder token as a additional special token.
-# Please read and if you agree accept the LICENSE
-# [here](https://huggingface.co/CompVis/stable-diffusion-v1-4) if you see an error
-tokenizer = CLIPTokenizer.from_pretrained(
-    pretrained_model_name_or_path,
-    subfolder="tokenizer",
-    use_auth_token=YOUR_TOKEN,
-)
-
-# Add the placeholder token in tokenizer
-num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
-if num_added_tokens == 0:
-    raise ValueError(
-        f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
-        " `placeholder_token` that is not already in the tokenizer."
-    )
-
-# Get token ids for our placeholder and initializer token.
-# This code block will complain if initializer string is not a single token
-# Convert the initializer_token, placeholder_token to ids
-token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
-# Check if initializer_token is a single token or a sequence of tokens
-if len(token_ids) > 1:
-    raise ValueError("The initializer token must be a single token.")
-
-initializer_token_id = token_ids[0]
-placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)
-
-# Load the Stable Diffusion model
-# Load models and create wrapper for stable diffusion
-text_encoder = CLIPTextModel.from_pretrained(
-    pretrained_model_name_or_path,
-    subfolder="text_encoder",
-    use_auth_token=YOUR_TOKEN,
-)
-vae = AutoencoderKL.from_pretrained(
-    pretrained_model_name_or_path,
-    subfolder="vae",
-    use_auth_token=YOUR_TOKEN,
-)
-unet = UNet2DConditionModel.from_pretrained(
-    pretrained_model_name_or_path,
-    subfolder="unet",
-    use_auth_token=YOUR_TOKEN,
-)
-
-# We have added the `placeholder_token` in the `tokenizer` so we resize the token embeddings here,
-#  this will a new embedding vector in the token embeddings for our `placeholder_token`
-text_encoder.resize_token_embeddings(len(tokenizer))
-
-# Initialise the newly added placeholder token with the embeddings of the initializer token
-token_embeds = text_encoder.get_input_embeddings().weight.data
-token_embeds[placeholder_token_id] = token_embeds[initializer_token_id]
-
-# In Textual-Inversion we only train the newly added embedding vector,
-# so lets freeze rest of the model parameters here.
-
-
-def freeze_params(params):
-    for param in params:
-        param.requires_grad = False
-
-
-# Freeze vae and unet
-freeze_params(vae.parameters())
-freeze_params(unet.parameters())
-# Freeze all parameters except for the token embeddings in text encoder
-params_to_freeze = itertools.chain(
-    text_encoder.text_model.encoder.parameters(),
-    text_encoder.text_model.final_layer_norm.parameters(),
-    text_encoder.text_model.embeddings.position_embedding.parameters(),
-)
-freeze_params(params_to_freeze)
-
-# Creating our training data
-
-train_dataset = TextualInversionDataset(
-    data_root=args.input_dir,
-    tokenizer=tokenizer,
-    size=512,
-    placeholder_token=args.placeholder_token,
-    repeats=100,
-    learnable_property=args.what_to_teach,  # Option selected above between object and style
-    center_crop=False,
-    set="train",
-)
-
-
-def create_dataloader(train_batch_size=1):
-    return torch.utils.data.DataLoader(
-        train_dataset, batch_size=train_batch_size, shuffle=True
-    )
-
-
-# Create noise_scheduler for training.
-noise_scheduler = DDPMScheduler(
-    beta_start=0.00085,
-    beta_end=0.012,
-    beta_schedule="scaled_linear",
-    num_train_timesteps=1000,
-    tensor_format="pt",
-)
-
-# Define hyperparameters for our training
-hyperparameters = {
-    "learning_rate": 5e-04,
-    "scale_lr": True,
-    "max_train_steps": args.training_steps,
-    "train_batch_size": 1,
-    "gradient_accumulation_steps": 4,
-    "seed": args.seed,
-    "output_dir": "sd-concept-output",
-}
-
-
-def training_function(text_encoder, vae, unet):
-    logger = get_logger(__name__)
-
-    train_batch_size = hyperparameters["train_batch_size"]
-    gradient_accumulation_steps = hyperparameters[
-        "gradient_accumulation_steps"
-    ]
-    learning_rate = hyperparameters["learning_rate"]
-    max_train_steps = hyperparameters["max_train_steps"]
-    output_dir = hyperparameters["output_dir"]
-
-    accelerator = Accelerator(
-        gradient_accumulation_steps=gradient_accumulation_steps,
-    )
-
-    train_dataloader = create_dataloader(train_batch_size)
-
-    if hyperparameters["scale_lr"]:
-        learning_rate = (
-            learning_rate
-            * gradient_accumulation_steps
-            * train_batch_size
-            * accelerator.num_processes
-        )
-
-    # Initialize the optimizer
-    optimizer = torch.optim.AdamW(
-        text_encoder.get_input_embeddings().parameters(),  # only optimize the embeddings
-        lr=learning_rate,
-    )
-
-    text_encoder, optimizer, train_dataloader = accelerator.prepare(
-        text_encoder, optimizer, train_dataloader
-    )
-
-    # Move vae and unet to device
-    vae.to(accelerator.device)
-    unet.to(accelerator.device)
-
-    # Keep vae and unet in eval model as we don't train these
-    vae.eval()
-    unet.eval()
-
-    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
-    num_update_steps_per_epoch = math.ceil(
-        len(train_dataloader) / gradient_accumulation_steps
-    )
-    num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
-
-    # Train!
-    total_batch_size = (
-        train_batch_size
-        * accelerator.num_processes
-        * gradient_accumulation_steps
-    )
-
-    logger.info("***** Running training *****")
-    logger.info(f"  Num examples = {len(train_dataset)}")
-    logger.info(f"  Instantaneous batch size per device = {train_batch_size}")
-    logger.info(
-        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
-    )
-    logger.info(
-        f"  Gradient Accumulation steps = {gradient_accumulation_steps}"
-    )
-    logger.info(f"  Total optimization steps = {max_train_steps}")
-    # Only show the progress bar once on each machine.
-    progress_bar = tqdm(
-        range(max_train_steps), disable=not accelerator.is_local_main_process
-    )
-    progress_bar.set_description("Steps")
-    global_step = 0
-
-    for epoch in range(num_train_epochs):
-        text_encoder.train()
-        for step, batch in enumerate(train_dataloader):
-            with accelerator.accumulate(text_encoder):
-                # Convert images to latent space
-                latents = (
-                    vae.encode(batch["pixel_values"])
-                    .latent_dist.sample()
-                    .detach()
-                )
-                latents = latents * 0.18215
-
-                # Sample noise that we'll add to the latents
-                noise = torch.randn(latents.shape).to(latents.device)
-                bsz = latents.shape[0]
-                # Sample a random timestep for each image
-                timesteps = torch.randint(
-                    0,
-                    noise_scheduler.num_train_timesteps,
-                    (bsz,),
-                    device=latents.device,
-                ).long()
-
-                # Add noise to the latents according to the noise magnitude at each timestep
-                # (this is the forward diffusion process)
-                noisy_latents = noise_scheduler.add_noise(
-                    latents, noise, timesteps
-                )
-
-                # Get the text embedding for conditioning
-                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
-
-                # Predict the noise residual
-                noise_pred = unet(
-                    noisy_latents, timesteps, encoder_hidden_states
-                ).sample
-
-                loss = (
-                    F.mse_loss(noise_pred, noise, reduction="none")
-                    .mean([1, 2, 3])
-                    .mean()
-                )
-                accelerator.backward(loss)
-
-                # Zero out the gradients for all token embeddings except the newly added
-                # embeddings for the concept, as we only want to optimize the concept embeddings
-                if accelerator.num_processes > 1:
-                    grads = (
-                        text_encoder.module.get_input_embeddings().weight.grad
-                    )
-                else:
-                    grads = text_encoder.get_input_embeddings().weight.grad
-                # Get the index for tokens that we want to zero the grads for
-                index_grads_to_zero = (
-                    torch.arange(len(tokenizer)) != placeholder_token_id
-                )
-                grads.data[index_grads_to_zero, :] = grads.data[
-                    index_grads_to_zero, :
-                ].fill_(0)
-
-                optimizer.step()
-                optimizer.zero_grad()
-
-            # Checks if the accelerator has performed an optimization step behind the scenes
-            if accelerator.sync_gradients:
-                progress_bar.update(1)
-                global_step += 1
-
-            logs = {"loss": loss.detach().item()}
-            progress_bar.set_postfix(**logs)
-
-            if global_step >= max_train_steps:
-                break
-
-        accelerator.wait_for_everyone()
-
-    # Create the pipeline using using the trained modules and save it.
-    if accelerator.is_main_process:
-        pipeline = StableDiffusionPipeline(
-            text_encoder=accelerator.unwrap_model(text_encoder),
-            vae=vae,
-            unet=unet,
-            tokenizer=tokenizer,
-            scheduler=PNDMScheduler(
-                beta_start=0.00085,
-                beta_end=0.012,
-                beta_schedule="scaled_linear",
-                skip_prk_steps=True,
-            ),
-            safety_checker=StableDiffusionSafetyChecker.from_pretrained(
-                "CompVis/stable-diffusion-safety-checker"
-            ),
-            feature_extractor=CLIPFeatureExtractor.from_pretrained(
-                "openai/clip-vit-base-patch32"
-            ),
-        )
-        pipeline.save_pretrained(output_dir)
-        # Also save the newly trained embeddings
-        learned_embeds = (
-            accelerator.unwrap_model(text_encoder)
-            .get_input_embeddings()
-            .weight[placeholder_token_id]
-        )
-        learned_embeds_dict = {
-            args.placeholder_token: learned_embeds.detach().cpu()
-        }
-        torch.save(
-            learned_embeds_dict, os.path.join(output_dir, "learned_embeds.bin")
-        )
-
-
-import accelerate
-
-accelerate.notebook_launcher(
-    training_function, args=(text_encoder, vae, unet), num_processes=1
-)
-
-# Set up the pipeline
-pipe = StableDiffusionPipeline.from_pretrained(
-    hyperparameters["output_dir"],
-    # torch_dtype=torch.float16,
-)
-
-all_images = []
-for _ in range(args.num_inference_samples):
-    images = pipe(
-        [args.prompt],
-        num_inference_steps=args.inference_steps,
-        guidance_scale=7.5,
-    ).images
-    all_images.extend(images)
-
-# output_path = os.path.abspath(os.path.join(os.getcwd(), args.output_dir))
-if not os.path.isdir(args.output_dir):
-    os.mkdir(args.output_dir)
-
-[
-    image.save(f"{args.output_dir}/{i}.jpeg")
-    for i, image in enumerate(all_images)
-]
--- a/shark/iree_eager_backend.py
+++ b/shark/iree_eager_backend.py
@@ -48,8 +48,8 @@ class EagerModeIREELinalgOnTensorsBackend(TorchMLIREagerBackend):

    def __init__(self, device: str):
        self.torch_device_str = device
-        self.config = ireert.Config(IREE_DEVICE_MAP[device])
-        self.raw_device_str = device
+        self.iree_device_str = IREE_DEVICE_MAP[device]
+        self.config = ireert.Config(self.iree_device_str)

    def get_torch_metadata(
        self, tensor: DeviceArray, kwargs: Dict[str, Any]
@@ -71,7 +71,7 @@ class EagerModeIREELinalgOnTensorsBackend(TorchMLIREagerBackend):
            "EagerMode",
        )
        callable, _ = get_iree_compiled_module(
-            imported_module, self.raw_device_str, func_name=fn_name
+            imported_module, self.iree_device_str, func_name=fn_name
        )
        return callable

--- a/shark/iree_utils/_common.py
+++ b/shark/iree_utils/_common.py
@@ -37,51 +37,30 @@ def run_cmd(cmd):
        sys.exit("Exiting program due to error running:", cmd)


-def iree_device_map(device):
-    uri_parts = device.split("://", 2)
-    if len(uri_parts) == 1:
-        return _IREE_DEVICE_MAP[uri_parts[0]]
-    else:
-        return f"{_IREE_DEVICE_MAP[uri_parts[0]]}://{uri_parts[1]}"
-
-
-def get_supported_device_list():
-    return list(_IREE_DEVICE_MAP.keys())
-
-
-_IREE_DEVICE_MAP = {
+IREE_DEVICE_MAP = {
    "cpu": "local-task",
+    "gpu": "cuda",
    "cuda": "cuda",
    "vulkan": "vulkan",
    "metal": "vulkan",
    "rocm": "rocm",
-    "intel-gpu": "level_zero",
+    "intel-gpu" : "level_zero",
 }

-
-def iree_target_map(device):
-    if "://" in device:
-        device = device.split("://")[0]
-    return _IREE_TARGET_MAP[device]
-
-
-_IREE_TARGET_MAP = {
-    "cpu": "llvm-cpu",
+IREE_TARGET_MAP = {
+    "cpu": "dylib",
+    "gpu": "cuda",
    "cuda": "cuda",
    "vulkan": "vulkan",
    "metal": "vulkan",
    "rocm": "rocm",
-    "intel-gpu": "opencl-spirv",
+    "intel-gpu" : "opencl-spirv",
 }

-
 # Finds whether the required drivers are installed for the given device.
 def check_device_drivers(device):
    """Checks necessary drivers present for gpu and vulkan devices"""
-    if "://" in device:
-        device = device.split("://")[0]
-
-    if device == "cuda":
+    if device in ["gpu", "cuda"]:
        try:
            subprocess.check_output("nvidia-smi")
        except Exception:
@@ -92,18 +71,10 @@ def check_device_drivers(device):
        except Exception:
            return True
    elif device in ["intel-gpu"]:
-        try:
-            subprocess.check_output(["dpkg", "-L", "intel-level-zero-gpu"])
-            return False
-        except Exception:
-            return True
+        # TODO: Add intel gpu check.
+        return False
    elif device == "cpu":
        return False
-    elif device == "rocm":
-        try:
-            subprocess.check_output("rocminfo")
-        except Exception:
-            return True
    # Unknown device.
    else:
        return True
@@ -113,11 +84,9 @@ def check_device_drivers(device):

 # Installation info for the missing device drivers.
 def device_driver_info(device):
-    if device == "cuda":
+    if device in ["gpu", "cuda"]:
        return "nvidia-smi not found, please install the required drivers from https://www.nvidia.in/Download/index.aspx?lang=en-in"
    elif device in ["metal", "vulkan"]:
        return "vulkaninfo not found, Install from https://vulkan.lunarg.com/sdk/home or your distribution"
-    elif device == "rocm":
-        return "rocm info not found. Please install rocm"
    else:
        return f"{device} is not supported."
--- a/shark/iree_utils/benchmark_utils.py
+++ b/shark/iree_utils/benchmark_utils.py
@@ -13,13 +13,12 @@
 # limitations under the License.

 import iree.runtime.scripts.iree_benchmark_module as benchmark_module
-from shark.iree_utils._common import run_cmd, iree_device_map
-from shark.iree_utils.cpu_utils import get_cpu_count
+from shark.iree_utils._common import run_cmd, IREE_DEVICE_MAP
 import numpy as np
 import os
 import re

-UNIT_TO_SECOND_MAP = {"us": 1e-6, "ms": 0.001, "s": 1}
+UNIT_TO_SECOND_MAP = {"ms": 0.001, "s": 1}


 def tensor_to_type_str(input_tensors: tuple, mlir_dialect: str):
@@ -35,12 +34,9 @@ def tensor_to_type_str(input_tensors: tuple, mlir_dialect: str):
            dtype_string = str(input_tensor.dtype).replace("torch.", "")
        elif mlir_dialect in ["mhlo", "tflite"]:
            dtype = input_tensor.dtype
-            try:
-                dtype_string = re.findall("'[^\"]*'", str(dtype))[0].replace(
-                    "'", ""
-                )
-            except IndexError:
-                dtype_string = str(dtype)
+            dtype_string = re.findall("'[^\"]*'", str(dtype))[0].replace(
+                "'", ""
+            )
        regex_split = re.compile("([a-zA-Z]+)([0-9]+)")
        match = regex_split.match(dtype_string)
        mlir_type_string = str(match.group(1)[0]) + str(match.group(2))
@@ -70,40 +66,10 @@ def build_benchmark_args(
        # TODO: Replace name of train with actual train fn name.
        fn_name = "train"
    benchmark_cl.append(f"--entry_function={fn_name}")
-    benchmark_cl.append(f"--device={iree_device_map(device)}")
+    benchmark_cl.append(f"--device={IREE_DEVICE_MAP[device]}")
    mlir_input_types = tensor_to_type_str(input_tensors, mlir_dialect)
    for mlir_input in mlir_input_types:
        benchmark_cl.append(f"--function_input={mlir_input}")
-    if device == "cpu":
-        num_cpus = get_cpu_count()
-        if num_cpus is not None:
-            benchmark_cl.append(f"--task_topology_max_group_count={num_cpus}")
-    time_extractor = "| awk 'END{{print $2 $3}}'"
-    benchmark_cl.append(time_extractor)
-    return benchmark_cl
-
-
-def build_benchmark_args_non_tensor_input(
-    input_file: str,
-    device: str,
-    inputs: tuple,
-    mlir_dialect: str,
-    function_name: str,
-):
-    """
-    Inputs: input_file leading to vmfb, input_tensor to function, target device,
-    and whether it is training or not.
-    Outputs: string that execute benchmark-module on target model.
-    """
-    path = benchmark_module.__path__[0]
-    benchmarker_path = os.path.join(path, "..", "..", "iree-benchmark-module")
-    benchmark_cl = [benchmarker_path, f"--module_file={input_file}"]
-    # TODO: The function named can be passed as one of the args.
-    if function_name:
-        benchmark_cl.append(f"--entry_function={function_name}")
-    benchmark_cl.append(f"--device={iree_device_map(device)}")
-    for input in inputs:
-        benchmark_cl.append(f"--function_input={input}")
    time_extractor = "| awk 'END{{print $2 $3}}'"
    benchmark_cl.append(time_extractor)
    return benchmark_cl
--- a/shark/iree_utils/compile_utils.py
+++ b/shark/iree_utils/compile_utils.py
@@ -13,34 +13,24 @@
 # limitations under the License.
 import iree.runtime as ireert
 import iree.compiler as ireec
-from shark.iree_utils._common import iree_device_map, iree_target_map
-from shark.iree_utils.benchmark_utils import *
-from shark.parser import shark_args
+from shark.iree_utils._common import IREE_DEVICE_MAP, IREE_TARGET_MAP
 import numpy as np
 import os
-import re
-

 # Get the iree-compile arguments given device.
-def get_iree_device_args(device, extra_args=[]):
-    if "://" in device:
-        device = device.split("://")[0]
+def get_iree_device_args(device):
    if device == "cpu":
        from shark.iree_utils.cpu_utils import get_iree_cpu_args

        return get_iree_cpu_args()
-    if device == "cuda":
+    if device in ["gpu", "cuda"]:
        from shark.iree_utils.gpu_utils import get_iree_gpu_args

        return get_iree_gpu_args()
    if device in ["metal", "vulkan"]:
        from shark.iree_utils.vulkan_utils import get_iree_vulkan_args

-        return get_iree_vulkan_args(extra_args=extra_args)
-    if device == "rocm":
-        from shark.iree_utils.gpu_utils import get_iree_rocm_args
-
-        return get_iree_rocm_args()
+        return get_iree_vulkan_args()
    return []


@@ -64,182 +54,17 @@ def get_iree_common_args():
    return [
        "--iree-stream-resource-index-bits=64",
        "--iree-vm-target-index-bits=64",
-        "--iree-util-zero-fill-elided-attrs",
    ]


-# Args that are suitable only for certain models or groups of models.
-# shark_args are passed down from pytests to control which models compile with these flags,
-# but they can also be set in shark/parser.py
-def get_model_specific_args():
-    ms_args = []
-    if shark_args.enable_conv_transform == True:
-        ms_args += ["--iree-flow-enable-conv-nchw-to-nhwc-transform"]
-    return ms_args
-
-
-def create_dispatch_dirs(bench_dir, device):
-    protected_files = ["ordered-dispatches.txt"]
-    bench_dir_path = bench_dir.split("/")
-    bench_dir_path[-1] = "temp_" + bench_dir_path[-1]
-    tmp_bench_dir = "/".join(bench_dir_path)
-    for f_ in os.listdir(bench_dir):
-        if os.path.isfile(f"{bench_dir}/{f_}") and f_ not in protected_files:
-            dir_name = re.sub("\.\S*$", "", f_)
-            if os.path.exists(f"{bench_dir}/{dir_name}"):
-                os.system(f"rm -rf {bench_dir}/{dir_name}")
-            os.system(f"mkdir {bench_dir}/{dir_name}")
-            os.system(f"mv {bench_dir}/{f_} {bench_dir}/{dir_name}/{f_}")
-    for f_ in os.listdir(tmp_bench_dir):
-        if os.path.isfile(f"{tmp_bench_dir}/{f_}"):
-            dir_name = ""
-            for d_ in os.listdir(bench_dir):
-                if re.search(f"{d_}(?=\D)", f_):
-                    dir_name = d_
-            if dir_name != "":
-                os.system(
-                    f"mv {tmp_bench_dir}/{f_} {bench_dir}/{dir_name}/{dir_name}_benchmark.mlir"
-                )
-
-
-def dump_isas(bench_dir):
-    for d_ in os.listdir(bench_dir):
-        if os.path.isdir(f"{bench_dir}/{d_}"):
-            for f_ in os.listdir(f"{bench_dir}/{d_}"):
-                if f_.endswith(".spv"):
-                    os.system(
-                        f"amdllpc -gfxip 11.0 {bench_dir}/{d_}/{f_} -v > \
-                         {bench_dir}/{d_}/isa.txt"
-                    )
-
-
-def compile_benchmark_dirs(bench_dir, device, dispatch_benchmarks):
-    benchmark_runtimes = {}
-    dispatch_list = []
-    all_dispatches = False
-
-    if dispatch_benchmarks.lower().strip() == "all":
-        all_dispatches = True
-    else:
-        try:
-            dispatch_list = [
-                int(dispatch_index)
-                for dispatch_index in dispatch_benchmarks.split(" ")
-            ]
-        except:
-            print("ERROR: Invalid dispatch benchmarks")
-            return None
-    for d_ in os.listdir(bench_dir):
-        if os.path.isdir(f"{bench_dir}/{d_}"):
-            in_dispatches = False
-            for dispatch in dispatch_list:
-                if str(dispatch) in d_:
-                    in_dispatches = True
-            if all_dispatches or in_dispatches:
-                for f_ in os.listdir(f"{bench_dir}/{d_}"):
-
-                    if "benchmark.mlir" in f_:
-                        dispatch_file = open(f"{bench_dir}/{d_}/{f_}", "r")
-                        module = dispatch_file.read()
-                        dispatch_file.close()
-
-                        flatbuffer_blob = ireec.compile_str(
-                            module, target_backends=[iree_target_map(device)]
-                        )
-
-                        vmfb_file = open(
-                            f"{bench_dir}/{d_}/{d_}_benchmark.vmfb", "wb"
-                        )
-                        vmfb_file.write(flatbuffer_blob)
-                        vmfb_file.close()
-
-                        config = get_iree_runtime_config(device)
-                        vm_module = ireert.VmModule.from_flatbuffer(
-                            config.vm_instance, flatbuffer_blob
-                        )
-
-                        benchmark_cl = build_benchmark_args_non_tensor_input(
-                            input_file=f"{bench_dir}/{d_}/{d_}_benchmark.vmfb",
-                            device=device,
-                            inputs=(0,),
-                            mlir_dialect="linalg",
-                            function_name="",
-                        )
-
-                        benchmark_bash = open(
-                            f"{bench_dir}/{d_}/{d_}_benchmark.sh", "w+"
-                        )
-                        benchmark_bash.write("#!/bin/bash\n")
-                        benchmark_bash.write(" ".join(benchmark_cl))
-                        benchmark_bash.close()
-
-                        benchmark_data = run_benchmark_module(benchmark_cl)
-
-                        benchmark_file = open(
-                            f"{bench_dir}/{d_}/{d_}_data.txt", "w+"
-                        )
-                        benchmark_file.write(f"DISPATCH: {d_}\n")
-                        benchmark_file.write(str(benchmark_data) + "\n")
-                        benchmark_file.write(
-                            "SHARK BENCHMARK RESULT: "
-                            + str(1 / (benchmark_data * 0.001))
-                            + "\n"
-                        )
-                        benchmark_file.close()
-
-                        benchmark_runtimes[d_] = 1 / (benchmark_data * 0.001)
-
-                    elif ".mlir" in f_ and "benchmark" not in f_:
-                        dispatch_file = open(f"{bench_dir}/{d_}/{f_}", "r")
-                        module = dispatch_file.read()
-                        dispatch_file.close()
-
-                        module = re.sub(
-                            "hal.executable private",
-                            "hal.executable public",
-                            module,
-                        )
-
-                        flatbuffer_blob = ireec.compile_str(
-                            module,
-                            target_backends=[iree_target_map(device)],
-                            extra_args=["--compile-mode=hal-executable"],
-                        )
-
-                        spirv_file = open(
-                            f"{bench_dir}/{d_}/{d_}_spirv.vmfb", "wb"
-                        )
-                        spirv_file.write(flatbuffer_blob)
-                        spirv_file.close()
-
-    ordered_dispatches = [
-        (k, v)
-        for k, v in sorted(
-            benchmark_runtimes.items(), key=lambda item: item[1]
-        )
-    ][::-1]
-    f_ = open(f"{bench_dir}/ordered-dispatches.txt", "w+")
-    for dispatch in ordered_dispatches:
-        f_.write(f"{dispatch[0]}: {dispatch[1]}ms\n")
-    f_.close()
-
-
 def compile_module_to_flatbuffer(
-    module,
-    device,
-    frontend,
-    func_name,
-    model_config_path,
-    extra_args,
-    model_name="None",
+    module, device, frontend, func_name, model_config_path
 ):
    # Setup Compile arguments wrt to frontends.
    input_type = ""
    args = get_iree_frontend_args(frontend)
-    args += get_iree_device_args(device, extra_args)
-    args += get_iree_common_args()
-    args += get_model_specific_args()
-    args += extra_args
+    args += get_iree_device_args(device)
+    # args += get_iree_common_args()

    if frontend in ["tensorflow", "tf"]:
        input_type = "mhlo"
@@ -247,24 +72,24 @@ def compile_module_to_flatbuffer(
        input_type = frontend
    elif frontend in ["tflite", "tflite-tosa"]:
        input_type = "tosa"
-    elif frontend in ["tm_tensor"]:
-        input_type = ireec.InputType.TM_TENSOR

    # TODO: make it simpler.
    # Compile according to the input type, else just try compiling.
+    if input_type not in ["mhlo", "tosa"]:
+        module = str(module)
    if input_type != "":
        # Currently for MHLO/TOSA.
        flatbuffer_blob = ireec.compile_str(
            module,
-            target_backends=[iree_target_map(device)],
+            target_backends=[IREE_TARGET_MAP[device]],
            extra_args=args,
            input_type=input_type,
        )
    else:
        # Currently for Torch.
        flatbuffer_blob = ireec.compile_str(
-            module,
-            target_backends=[iree_target_map(device)],
+            str(module),
+            target_backends=[IREE_TARGET_MAP[device]],
            extra_args=args,
        )

@@ -273,10 +98,8 @@ def compile_module_to_flatbuffer(

 def get_iree_module(flatbuffer_blob, device, func_name):
    # Returns the compiled module and the configs.
-    config = get_iree_runtime_config(device)
-    vm_module = ireert.VmModule.from_flatbuffer(
-        config.vm_instance, flatbuffer_blob
-    )
+    vm_module = ireert.VmModule.from_flatbuffer(flatbuffer_blob)
+    config = ireert.Config(IREE_DEVICE_MAP[device])
    ctx = ireert.SystemContext(config=config)
    ctx.add_vm_module(vm_module)
    ModuleCompiled = ctx.modules.module[func_name]
@@ -289,44 +112,27 @@ def get_iree_compiled_module(
    frontend: str = "torch",
    func_name: str = "forward",
    model_config_path: str = None,
-    extra_args: list = [],
 ):
    """Given a module returns the compiled .vmfb and configs"""
    flatbuffer_blob = compile_module_to_flatbuffer(
-        module, device, frontend, func_name, model_config_path, extra_args
+        module, device, frontend, func_name, model_config_path
    )
    return get_iree_module(flatbuffer_blob, device, func_name)


-def load_flatbuffer(
-    flatbuffer_path: str, device: str, func_name: str = "forward"
-):
-
-    with open(os.path.join(flatbuffer_path), "rb") as f:
-        flatbuffer_blob = f.read()
-
-    return get_iree_module(flatbuffer_blob, device, func_name)
-
-
 def export_iree_module_to_vmfb(
    module,
    device: str,
    directory: str,
-    mlir_dialect: str = "linalg",
+    frontend: str = "torch",
    func_name: str = "forward",
    model_config_path: str = None,
-    module_name: str = None,
-    extra_args: list = [],
 ):
    # Compiles the module given specs and saves it as .vmfb file.
    flatbuffer_blob = compile_module_to_flatbuffer(
-        module, device, mlir_dialect, func_name, model_config_path, extra_args
+        module, device, frontend, func_name, model_config_path
    )
-    if module_name is None:
-        device_name = (
-            device if "://" not in device else "-".join(device.split("://"))
-        )
-        module_name = f"{mlir_dialect}_{func_name}_{device_name}"
+    module_name = f"{frontend}_{func_name}_{device}"
    filename = os.path.join(directory, module_name + ".vmfb")
    print(f"Saved vmfb in {filename}.")
    with open(filename, "wb") as f:
@@ -362,10 +168,4 @@ def get_results(compiled_vm, input, config, frontend="torch"):
        res = np.array(data, dtype=object)
        return np.copy(res)
    else:
-        return result.to_host()
-
-
-def get_iree_runtime_config(device):
-    device = iree_device_map(device)
-    config = ireert.Config(device=ireert.get_device(device))
-    return config
+        return np.copy(np.asarray(result, dtype=result.dtype))
--- a/shark/iree_utils/cpu_utils.py
+++ b/shark/iree_utils/cpu_utils.py
@@ -16,17 +16,6 @@

 import subprocess

-
-def get_cpu_count():
-    import multiprocessing
-
-    try:
-        cpu_count = multiprocessing.cpu_count()
-        return cpu_count
-    except NotImplementedError:
-        return None
-
-
 # Get the default cpu args.
 def get_iree_cpu_args():
    find_triple_cmd = "uname -s -m"
--- a/shark/iree_utils/gpu_utils.py
+++ b/shark/iree_utils/gpu_utils.py
@@ -16,7 +16,6 @@

 import iree.runtime as ireert
 import ctypes
-from shark.parser import shark_args

 # Get the default gpu args given the architecture.
 def get_iree_gpu_args():
@@ -24,9 +23,7 @@ def get_iree_gpu_args():
    ireert.flags.parse_flags("--cuda_allow_inline_execution")
    # TODO: Give the user_interface to pass the sm_arch.
    sm_arch = get_cuda_sm_cc()
-    if (
-        sm_arch in ["sm_70", "sm_72", "sm_75", "sm_80", "sm_84", "sm_86"]
-    ) and (shark_args.enable_tf32 == True):
+    if sm_arch in ["sm_70", "sm_72", "sm_75", "sm_80", "sm_84", "sm_86"]:
        return [
            "--iree-hal-cuda-disable-loop-nounroll-wa",
            f"--iree-hal-cuda-llvm-target-arch={sm_arch}",
@@ -35,18 +32,6 @@ def get_iree_gpu_args():
        return ["--iree-hal-cuda-disable-loop-nounroll-wa"]


-# Get the default gpu args given the architecture.
-def get_iree_rocm_args():
-    ireert.flags.FUNCTION_INPUT_VALIDATION = False
-    # TODO: find a way to get arch from code.
-    rocm_arch = "gfx908"
-    return [
-        f"--iree-rocm-target-chip={rocm_arch}",
-        "--iree-rocm-link-bc=true",
-        "--iree-rocm-bc-dir=/opt/rocm/amdgcn/bitcode",
-    ]
-
-
 # Some constants taken from cuda.h
 CUDA_SUCCESS = 0
 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
--- a/shark/iree_utils/vulkan_utils.py
+++ b/shark/iree_utils/vulkan_utils.py
@@ -14,69 +14,27 @@

 # All the iree_vulkan related functionalities go here.

-from os import linesep
 from shark.iree_utils._common import run_cmd
-import iree.runtime as ireert
-from sys import platform


-def get_vulkan_device_name():
-    vulkaninfo_dump = run_cmd("vulkaninfo").split(linesep)
-    vulkaninfo_list = [s.strip() for s in vulkaninfo_dump if "deviceName" in s]
-    if len(vulkaninfo_list) == 0:
-        raise ValueError("No device name found in VulkanInfo!")
-    if len(vulkaninfo_list) > 1:
-        print(
-            f"Found {len(vulkaninfo_list)} device names. choosing first one: {vulkaninfo_list[0]}"
-        )
-    return vulkaninfo_list[0]
-
-
-def get_os_name():
-    if platform.startswith("linux"):
-        return "linux"
-    elif platform == "darwin":
-        return "macos"
-    elif platform == "win32":
-        return "windows"
-    else:
-        print("Cannot detect OS type, defaulting to linux.")
-        return "linux"
-
-
-def get_vulkan_triple_flag(extra_args=[]):
-    if "-iree-vulkan-target-triple=" in " ".join(extra_args):
-        print(f"Using target triple from command line args")
-        return None
-    system_os = get_os_name()
-    vulkan_device = get_vulkan_device_name()
-    if all(x in vulkan_device for x in ("Apple", "M1")):
-        print(f"Found {vulkan_device} Device. Using m1-moltenvk-macos")
+def get_vulkan_triple_flag():
+    vulkan_device_cmd = "vulkaninfo | grep deviceName | awk 'END{{print $NF}}'"
+    vulkan_device = run_cmd(vulkan_device_cmd).strip()
+    if vulkan_device == "Ultra":
+        print("Found MacStudio M1 Device. Using m1-moltenvk-macos")
        return "-iree-vulkan-target-triple=m1-moltenvk-macos"
-    elif all(x in vulkan_device for x in ("Apple", "M2")):
+    elif vulkan_device == "M2":
        print("Found Apple M2 Device. Using m1-moltenvk-macos")
        return "-iree-vulkan-target-triple=m1-moltenvk-macos"
-    elif all(x in vulkan_device for x in ("A100", "SXM4")):
-        print(
-            f"Found {vulkan_device} Device. Using ampere-rtx3080-{system_os}"
-        )
-        return f"-iree-vulkan-target-triple=ampere-rtx3080-{system_os}"
-    elif all(x in vulkan_device for x in ("RTX", "3090")):
-        print(
-            f"Found {vulkan_device} Device. Using ampere-rtx3090-{system_os}"
-        )
-        return f"-iree-vulkan-target-triple=ampere-rtx3090-{system_os}"
-    elif all(x in vulkan_device for x in ("RTX", "4090")):
-        print(
-            f"Found {vulkan_device} Device. Using ampere-rtx3090-{system_os}"
-        )
-        return f"-iree-vulkan-target-triple=ampere-rtx3090-{system_os}"
-    elif all(x in vulkan_device for x in ("AMD", "7900")):
-        print(f"Found {vulkan_device} Device. Using rdna3-7900-{system_os}")
-        return f"-iree-vulkan-target-triple=rdna3-7900-{system_os}"
-    elif any(x in vulkan_device for x in ("AMD", "Radeon")):
-        print(f"Found AMD device. Using rdna2-unknown-{system_os}")
-        return f"-iree-vulkan-target-triple=rdna2-unknown-{system_os}"
+    elif vulkan_device == "M1":
+        print("Found Apple M1 Device. Using m1-moltenvk-macos")
+        return "-iree-vulkan-target-triple=m1-moltenvk-macos"
+    elif vulkan_device == "A100-SXM4-40GB":
+        print("Found Nvidia Device. Using ampere-rtx3080-linux")
+        return "-iree-vulkan-target-triple=ampere-rtx3080-linux"
+    elif vulkan_device == "3090":
+        print("Found Nvidia Device. Using ampere-rtx3090-linux")
+        return "-iree-vulkan-target-triple=ampere-rtx3090-linux"
    else:
        print(
            """Optimized kernel for your target device is not added yet.
@@ -87,16 +45,10 @@ def get_vulkan_triple_flag(extra_args=[]):
        return None


-def get_iree_vulkan_args(extra_args=[]):
+def get_iree_vulkan_args():
    # vulkan_flag = ["--iree-flow-demote-i64-to-i32"]
    vulkan_flag = []
-    vulkan_triple_flag = get_vulkan_triple_flag(extra_args)
+    vulkan_triple_flag = get_vulkan_triple_flag()
    if vulkan_triple_flag is not None:
        vulkan_flag.append(vulkan_triple_flag)
    return vulkan_flag
-
-
-def set_iree_vulkan_runtime_flags(flags):
-    for flag in flags:
-        ireert.flags.parse_flags(flag)
-    return
--- a/shark/model_annotation.py
+++ b/shark/model_annotation.py
@@ -12,34 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-"""
-Usage:
-This function takes the model mlir file and the tuned config file as input,
-and output a new mlir file with lowering configs annotated on certain ops.
-There are two ways to utilize the function:
-1. Call model_annotation function within another python script
-from shark.model_annotation import model_annotation
-with create_context() as ctx:
-   module = model_annotation(ctx, input_contents=..., config_path=..., search_op=...)
-2. Run model_annotation.py directly
-python model_annotation.py path_to_original_mlir path_to_config_file
-"""
-
+import sys
 import json
 import os
-import sys
-from typing import Dict, List
+from typing import List, Dict

 from iree.compiler import ir
 from iree.compiler.transforms import ireec as ireec_trans

+MATMUL_OP_NAMES = set(
+    ["linalg.matmul", "linalg.batch_matmul", "mhlo.dot", "mhlo.dot_general"]
+)
+idx = 0
+

 def model_annotation(
-    ctx: ir.Context,
-    *,
-    input_contents: str,
-    config_path: str,
-    search_op: str = "matmul",
+    ctx: ir.Context, *, input_contents: str, config_path: str
 ):
    if os.path.isfile(input_contents):
        with open(input_contents, "rb") as f:
@@ -53,35 +41,21 @@ def model_annotation(

    # The Python API does not expose a general walk() function, so we just
    # do it ourselves.
-    walk_children(module.operation, configs, 0, search_op)
+    walk_children(module.operation, configs)

    if not module.operation.verify():
        raise RuntimeError("Modified program does not verify!")

+    # More efficient than: print(module)
+    #   - Disables verification (already done above)
+    #   - Writes as binary, avoiding costly unicode conversions
+    sys.stdout.buffer.write(
+        module.operation.get_asm(assume_verified=True, binary=True)
+    )
    return module


-def walk_children(
-    op: ir.Operation, configs: List[Dict], idx: int, search_op: str
-):
-    if search_op == "matmul":
-        op_names = ["linalg.matmul", "mhlo.dot"]
-    elif search_op == "bmm":
-        op_names = ["linalg.batch_matmul", "mhlo.dot_general"]
-    elif search_op == "conv":
-        op_names = ["mhlo.convolution", "linalg.conv_2d_nhwc_hwcf"]
-    elif search_op == "all":
-        op_names = [
-            "mhlo.dot",
-            "mhlo.dot_general",
-            "mhlo.convolution",
-            "linalg.matmul",
-            "linalg.batch_matmul",
-            "linalg.conv_2d_nhwc_hwcf",
-        ]
-    else:
-        raise ValueError(f"{search_op} op is not tunable.")
-
+def walk_children(op: ir.Operation, configs: List[Dict]):
    for region in op.regions:
        for block in region.blocks:
            for child_op in block.operations:
@@ -89,38 +63,34 @@ def walk_children(
                # 'operation' and 'name' attributes.
                if isinstance(child_op, ir.OpView):
                    child_op = child_op.operation
-                if child_op.name in op_names and idx < len(configs):
-                    add_attributes(child_op, configs[idx])
+                if child_op.name in MATMUL_OP_NAMES:
+                    global idx
+                    (
+                        tile_sizes,
+                        pipeline,
+                        workgroup_size,
+                        split_k,
+                        pipeline_depth,
+                    ) = parse_config(configs[idx])
+
+                    add_compilation_info(
+                        child_op,
+                        tile_sizes=tile_sizes,
+                        pipeline=pipeline,
+                        workgroup_size=workgroup_size,
+                        pipeline_depth=pipeline_depth,
+                    )
+
+                    if split_k:
+                        add_split_k(child_op, split_k)
+
                    idx = idx + 1
                    print(f"Updated op {child_op}", file=sys.stderr)
-                walk_children(child_op, configs, idx, search_op)
-
-
-def add_attributes(op: ir.Operation, config: Dict):
-    (
-        tile_sizes,
-        pipeline,
-        workgroup_size,
-        split_k,
-        pipeline_depth,
-    ) = parse_config(config)
-
-    add_compilation_info(
-        op,
-        tile_sizes=tile_sizes,
-        pipeline=pipeline,
-        workgroup_size=workgroup_size,
-        pipeline_depth=pipeline_depth,
-    )
-
-    if split_k:
-        add_attribute_by_name(op, "iree_flow_split_k", split_k)
+                walk_children(child_op, configs)


 def parse_config(config: Dict):
-    split_k = None
-    pipeline_depth = None
-    if "GPU" in config["pipeline"]:
+    if config["pipeline"] == "GPU" or config["pipeline"] == "GPU_TENSORCORE":
        pipeline = (
            "LLVMGPUMatmulSimt"
            if config["pipeline"] == "GPU"
@@ -128,31 +98,24 @@ def parse_config(config: Dict):
        )
        tile_sizes = [config["work_group_tile_sizes"]]
        workgroup_size = config["work_group_sizes"]
-        if "pipeline_depth" in config.keys():
+        try:
            pipeline_depth = config["pipeline_depth"]
-        if "split_k" in config.keys():
+        except:
+            pipeline_depth = None
+        try:
            split_k = config["split_k"]
-    elif "SPIRV" in config["pipeline"]:
-        pipeline = config["pipeline"]
-        tile_sizes = [
-            config["work_group_tile_sizes"],
-            config["parallel_tile_sizes"],
-            config["reduction_tile_sizes"],
-        ]
-        if "vector_tile_sizes" in config.keys():
-            tile_sizes += [config["vector_tile_sizes"]]
-        if "window_tile_sizes" in config.keys():
-            tile_sizes += [config["window_tile_sizes"]]
-        workgroup_size = config["work_group_sizes"]
+        except:
+            split_k = None
    else:
-        # For IREE CPU pipelines
        pipeline = config["pipeline"]
        tile_sizes = [
            config["work_group_tile_sizes"],
-            config["parallel_tile_sizes"],
-            config["reduction_tile_sizes"],
+            config["l1_tile_sizes"],
+            config["vector_tile_sizes"],
        ]
        workgroup_size = []
+        split_k = None
+        pipeline_depth = None
    return tile_sizes, pipeline, workgroup_size, split_k, pipeline_depth


@@ -182,9 +145,9 @@ def add_compilation_info(
    op.attributes["compilation_info"] = attr


-def add_attribute_by_name(op: ir.Operation, name: str, val: int):
-    attr = ir.IntegerAttr.get(ir.IntegerType.get_signless(64), val)
-    op.attributes[name] = attr
+def add_split_k(op: ir.Operation, k: int):
+    attr = ir.IntegerAttr.get(ir.IntegerType.get_signless(64), k)
+    op.attributes["iree_flow_split_k"] = attr


 def create_context() -> ir.Context:
@@ -196,14 +159,6 @@ def create_context() -> ir.Context:

 if __name__ == "__main__":
    with create_context() as ctx:
-        module = model_annotation(
-            ctx,
-            input_contents=sys.argv[1],
-            config_path=sys.argv[2],
-            search_op="all",
+        model_annotation(
+            ctx, input_contents=sys.argv[1], config_path=sys.argv[2]
        )
-        mlir_str = str(module)
-        filename = "tuned_model.mlir"
-        with open(filename, "w") as f:
-            f.write(mlir_str)
-        print(f"Saved mlir in {filename}.")
--- a/shark/parser.py
+++ b/shark/parser.py
@@ -38,7 +38,7 @@ parser.add_argument(
    "--device",
    type=str,
    default="cpu",
-    help="Device on which shark_runner runs. options are cpu, cuda, and vulkan",
+    help="Device on which shark_runner runs. options are cpu, gpu, and vulkan",
 )
 parser.add_argument(
    "--repro_dir",
@@ -47,10 +47,16 @@ parser.add_argument(
    default="./shark_tmp",
 )
 parser.add_argument(
-    "--enable_tf32",
-    type=bool,
+    "--save_mlir",
    default=False,
-    help="Enables TF32 precision calculations on supported GPUs.",
+    action="store_true",
+    help="Saves input MLIR module to /tmp/ directory.",
+)
+parser.add_argument(
+    "--save_vmfb",
+    default=False,
+    action="store_true",
+    help="Saves iree .vmfb module to /tmp/ directory.",
 )
 parser.add_argument(
    "--model_config_path",
@@ -61,55 +67,14 @@ parser.add_argument(
 parser.add_argument(
    "--num_warmup_iterations",
    type=int,
-    default=5,
+    default=2,
    help="Run the model for the specified number of warmup iterations.",
 )
 parser.add_argument(
    "--num_iterations",
    type=int,
-    default=100,
+    default=1,
    help="Run the model for the specified number of iterations.",
 )
-parser.add_argument(
-    "--onnx_bench",
-    default=False,
-    action="store_true",
-    help="When enabled, pytest bench results will include ONNX benchmark results.",
-)
-parser.add_argument(
-    "--shark_prefix",
-    default="latest",
-    help="gs://shark_tank/<this_flag>/model_directories",
-)
-parser.add_argument(
-    "--update_tank",
-    default=False,
-    action="store_true",
-    help="When enabled, SHARK downloader will update local shark_tank if local hash is different from latest upstream hash.",
-)
-parser.add_argument(
-    "--local_tank_cache",
-    default="",
-    help="Specify where to save downloaded shark_tank artifacts. If this is not set, the default is ~/.local/shark_tank/.",
-)
-
-parser.add_argument(
-    "--dispatch_benchmarks",
-    default=None,
-    help='dispatches to return benchamrk data on.  use "All" for all, and None for none.',
-)
-
-parser.add_argument(
-    "--dispatch_benchmarks_dir",
-    default="temp_dispatch_benchmarks",
-    help='directory where you want to store dispatch data generated with "--dispatch_benchmarks"',
-)
-
-parser.add_argument(
-    "--enable_conv_transform",
-    default=False,
-    action="store_false",
-    help="Enables the --iree-flow-enable-conv-nchw-to-nhwc-transform flag.",
-)

 shark_args, unknown = parser.parse_known_args()
--- a/shark/shark_benchmark_runner.py
+++ b/shark/shark_benchmark_runner.py
@@ -19,74 +19,37 @@ from shark.iree_utils.benchmark_utils import (
    run_benchmark_module,
 )
 from shark.parser import shark_args
+from tank.model_utils import get_torch_model
 from datetime import datetime
 import time
 import csv
 import os


-class OnnxFusionOptions(object):
-    def __init__(self):
-        self.disable_gelu = False
-        self.disable_layer_norm = False
-        self.disable_attention = False
-        self.disable_skip_layer_norm = False
-        self.disable_embed_layer_norm = False
-        self.disable_bias_skip_layer_norm = False
-        self.disable_bias_gelu = False
-        self.enable_gelu_approximation = False
-        self.use_mask_index = False
-        self.no_attention_mask = False
-
-
-def check_requirements(frontend):
-    import importlib
-
-    has_pkgs = False
-    if frontend == "torch":
-        tv_spec = importlib.util.find_spec("torchvision")
-        has_pkgs = tv_spec is not None
-
-    elif frontend in ["tensorflow", "tf"]:
-        keras_spec = importlib.util.find_spec("keras")
-        tf_spec = importlib.util.find_spec("tensorflow")
-        has_pkgs = keras_spec is not None and tf_spec is not None
-
-    return has_pkgs
-
-
 class SharkBenchmarkRunner(SharkRunner):
    # SharkRunner derived class with Benchmarking capabilities.
    def __init__(
        self,
-        mlir_module: bytes,
+        mlir_module: str,
        function_name: str = "forward",
        device: str = "none",
        mlir_dialect: str = "linalg",
-        extra_args: list = [],
+        frontend: str = "torch",
    ):
        self.device = shark_args.device if device == "none" else device
+        self.frontend = frontend
        self.frontend_model = None
        self.vmfb_file = None
-        self.mlir_dialect = mlir_dialect
-        self.extra_args = extra_args
        SharkRunner.__init__(
            self,
            mlir_module,
            function_name,
            device,
-            self.mlir_dialect,
-            self.extra_args,
-            compile_vmfb=True,
+            mlir_dialect,
        )
        if self.vmfb_file == None:
            self.vmfb_file = export_iree_module_to_vmfb(
-                mlir_module,
-                device,
-                shark_args.repro_dir,
-                self.mlir_dialect,
-                function_name,
-                extra_args=self.extra_args,
+                mlir_module, device, shark_args.repro_dir, self.frontend
            )

    def setup_cl(self, input_tensors):
@@ -97,25 +60,23 @@ class SharkBenchmarkRunner(SharkRunner):
            mlir_dialect=self.mlir_dialect,
        )

-    def benchmark_frontend(self, modelname):
-        if self.mlir_dialect in ["linalg", "torch"]:
+    def benchmark_frontend(self, inputs, modelname):
+        if self.frontend in ["pytorch", "torch"]:
            return self.benchmark_torch(modelname)
-
-        elif self.mlir_dialect in ["mhlo", "tf"]:
-            return self.benchmark_tf(modelname)
+        elif self.frontend in ["tensorflow", "tf"]:
+            return self.benchmark_tf(inputs, modelname)

    def benchmark_torch(self, modelname):
        import torch
-        from tank.model_utils import get_torch_model

-        if self.device == "cuda":
+        if self.device == "gpu":
            torch.set_default_tensor_type(torch.cuda.FloatTensor)
        else:
            torch.set_default_tensor_type(torch.FloatTensor)
        torch_device = torch.device(
-            "cuda:0" if self.device == "cuda" else "cpu"
+            "cuda:0" if self.device == "gpu" else "cpu"
        )
-        HFmodel, input = get_torch_model(modelname)[:2]
+        HFmodel, input, act_out = get_torch_model(modelname)
        frontend_model = HFmodel.model
        frontend_model.to(torch_device)
        input.to(torch_device)
@@ -137,49 +98,27 @@ class SharkBenchmarkRunner(SharkRunner):
            f"{((end-begin)/shark_args.num_iterations)*1000}",
        ]

-    def benchmark_tf(self, modelname):
-        import tensorflow as tf
+    def benchmark_tf(self, frontend_model, inputs):
+        for i in range(shark_args.num_warmup_iterations):
+            frontend_model.forward(*inputs)

-        visible_default = tf.config.list_physical_devices("GPU")
-        try:
-            tf.config.set_visible_devices([], "GPU")
-            visible_devices = tf.config.get_visible_devices()
-            for device in visible_devices:
-                assert device.device_type != "GPU"
-        except:
-            # Invalid device or cannot modify virtual devices once initialized.
-            pass
-
-        from tank.model_utils_tf import get_tf_model
-
-        # tf_device = "/GPU:0" if self.device == "cuda" else "/CPU:0"
-        tf_device = "/CPU:0"
-        with tf.device(tf_device):
-            model, input, = get_tf_model(
-                modelname
-            )[:2]
-            frontend_model = model
-
-            for i in range(shark_args.num_warmup_iterations):
-                frontend_model.forward(*input)
-
-            begin = time.time()
-            for i in range(shark_args.num_iterations):
-                out = frontend_model.forward(*input)
-                if i == shark_args.num_iterations - 1:
-                    end = time.time()
-                    break
-            print(
-                f"TF benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
-            )
-            return [
-                f"{shark_args.num_iterations/(end-begin)}",
-                f"{((end-begin)/shark_args.num_iterations)*1000}",
-            ]
+        begin = time.time()
+        for i in range(shark_args.num_iterations):
+            out = frontend_model.forward(*inputs)
+            if i == shark_args.num_iterations - 1:
+                end = time.time()
+                break
+        print(
+            f"TF benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
+        )
+        return [
+            f"{shark_args.num_iterations/(end-begin)}",
+            f"{((end-begin)/shark_args.num_iterations)*1000}",
+        ]

    def benchmark_c(self):
        result = run_benchmark_module(self.benchmark_cl)
-        print(f"Shark-IREE-C benchmark:{result} iter/second")
+        print(f"Shark-{self.frontend} C-benchmark:{result} iter/second")
        return [f"{result}", f"{1000/result}"]

    def benchmark_python(self, inputs):
@@ -193,134 +132,32 @@ class SharkBenchmarkRunner(SharkRunner):
            if i == shark_args.num_iterations - 1:
                end = time.time()
        print(
-            f"Shark-IREE Python benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
+            f"Shark-{self.frontend} Python-benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
        )
        return [
            f"{shark_args.num_iterations/(end-begin)}",
            f"{((end-begin)/shark_args.num_iterations)*1000}",
        ]

-    def benchmark_onnx(self, modelname, inputs):
-        if self.device == "cuda":
-            print(
-                "Currently GPU benchmarking on ONNX is not supported in SHARK."
-            )
-            return ["N/A", "N/A"]
-        else:
-            from onnxruntime.transformers.benchmark import run_onnxruntime
-            from onnxruntime.transformers.huggingface_models import MODELS
-            from onnxruntime.transformers.benchmark_helper import (
-                ConfigModifier,
-                Precision,
-            )
-            import psutil
-
-            if modelname == "microsoft/MiniLM-L12-H384-uncased":
-                modelname = "bert-base-uncased"
-            if modelname not in MODELS:
-                print(
-                    f"{modelname} is currently not supported in ORT's HF. Check \
-https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/huggingface_models.py \
-for currently supported models. Exiting benchmark ONNX."
-                )
-                return ["N/A", "N/A"]
-            use_gpu = self.device == "cuda"
-            num_threads = psutil.cpu_count(logical=False)
-            batch_sizes = [1]
-            sequence_lengths = [128]
-            cache_dir = os.path.join(".", "cache_models")
-            onnx_dir = os.path.join(".", "onnx_models")
-            verbose = False
-            input_counts = [1]
-            optimize_onnx = True
-            validate_onnx = False
-            disable_ort_io_binding = False
-            use_raw_attention_mask = True
-            model_fusion_statistics = {}
-            overwrite = False
-            model_source = "pt"  # Either "pt" or "tf"
-            provider = None
-            config_modifier = ConfigModifier(None)
-            onnx_args = OnnxFusionOptions()
-            result = run_onnxruntime(
-                use_gpu,
-                provider,
-                (modelname,),
-                None,
-                config_modifier,
-                Precision.FLOAT32,
-                num_threads,
-                batch_sizes,
-                sequence_lengths,
-                shark_args.num_iterations,
-                input_counts,
-                optimize_onnx,
-                validate_onnx,
-                cache_dir,
-                onnx_dir,
-                verbose,
-                overwrite,
-                disable_ort_io_binding,
-                use_raw_attention_mask,
-                model_fusion_statistics,
-                model_source,
-                onnx_args,
-            )
-            print(
-                f"ONNX ORT-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
-            )
-            return [
-                result[0]["QPS"],
-                result[0]["average_latency_ms"],
-            ]
-
-    def get_metadata(self, modelname):
-        with open("./tank/model_metadata.csv", mode="r") as csvfile:
-            torch_reader = csv.reader(csvfile, delimiter=",")
-            fields = next(torch_reader)
-            for row in torch_reader:
-                torch_model_name = row[0]
-                if torch_model_name == modelname:
-                    param_count = row[3]
-                    model_tags = row[4]
-                    model_notes = row[5]
-                    return [param_count, model_tags, model_notes]
-
-    def compare_bench_results(self, baseline: str, result: str):
-        if baseline is not None:
-            # Takes a baseline and a result string and calculates a comparison, e.g. "1.04x baseline".
-            a = float(baseline)
-            b = float(result)
-            comparison = a / b
-            comp_str = f"{round(comparison, 2)}x baseline"
-        else:
-            comp_str = "N/A"
-
-        return comp_str
+    def benchmark_all(self, inputs: tuple):
+        self.benchmark_frontend(inputs)
+        self.benchmark_python(inputs)
+        self.benchmark_c()

    def benchmark_all_csv(
        self, inputs: tuple, modelname, dynamic, device_str, frontend
    ):
        self.setup_cl(inputs)
        field_names = [
+            "platform",
            "model",
-            "engine",
-            "dialect",
+            "dynamic",
            "device",
-            "shape_type",
-            "data_type",
            "iter/sec",
            "ms/iter",
-            "vs. PyTorch/TF",
-            "iterations",
-            "param_count",
-            "tags",
-            "notes",
            "datetime",
        ]
-        engines = ["frontend", "shark_python", "shark_iree_c"]
-        if shark_args.onnx_bench == True:
-            engines.append("onnxruntime")
+        platforms = ["frontend", "shark_python", "shark_iree_c"]

        if not os.path.exists("bench_results.csv"):
            with open("bench_results.csv", mode="w", newline="") as f:
@@ -332,69 +169,26 @@ for currently supported models. Exiting benchmark ONNX."
            bench_result = {}
            bench_result["model"] = modelname
            if dynamic == True:
-                bench_result["shape_type"] = "dynamic"
+                bench_result["dynamic"] = "True"
            else:
-                bench_result["shape_type"] = "static"
+                bench_result["dynamic"] = "False"
            bench_result["device"] = device_str
-            bench_result["data_type"] = inputs[0].dtype
-            for e in engines:
-                (
-                    bench_result["param_count"],
-                    bench_result["tags"],
-                    bench_result["notes"],
-                ) = ["", "", ""]
-                if e == "frontend":
-                    bench_result["engine"] = frontend
-                    if check_requirements(frontend):
-                        (
-                            bench_result["iter/sec"],
-                            bench_result["ms/iter"],
-                        ) = self.benchmark_frontend(modelname)
-                        self.frontend_result = bench_result["ms/iter"]
-                        bench_result["vs. PyTorch/TF"] = "baseline"
-                        (
-                            bench_result["param_count"],
-                            bench_result["tags"],
-                            bench_result["notes"],
-                        ) = self.get_metadata(modelname)
-                    else:
-                        self.frontend_result = None
-                        continue
-
-                elif e == "shark_python":
-                    bench_result["engine"] = "shark_python"
-                    (
-                        bench_result["iter/sec"],
-                        bench_result["ms/iter"],
-                    ) = self.benchmark_python(inputs)
-
-                    bench_result[
-                        "vs. PyTorch/TF"
-                    ] = self.compare_bench_results(
-                        self.frontend_result, bench_result["ms/iter"]
-                    )
-
-                elif e == "shark_iree_c":
-                    bench_result["engine"] = "shark_iree_c"
-                    (
-                        bench_result["iter/sec"],
-                        bench_result["ms/iter"],
-                    ) = self.benchmark_c()
-
-                    bench_result[
-                        "vs. PyTorch/TF"
-                    ] = self.compare_bench_results(
-                        self.frontend_result, bench_result["ms/iter"]
-                    )
-
-                elif e == "onnxruntime":
-                    bench_result["engine"] = "onnxruntime"
-                    (
-                        bench_result["iter/sec"],
-                        bench_result["ms/iter"],
-                    ) = self.benchmark_onnx(modelname, inputs)
-
-                bench_result["dialect"] = self.mlir_dialect
-                bench_result["iterations"] = shark_args.num_iterations
+            for p in platforms:
+                if p == "frontend":
+                    bench_result["platform"] = frontend
+                    bench_result["iter/sec"] = self.benchmark_frontend(
+                        inputs, modelname
+                    )[0]
+                    bench_result["ms/iter"] = self.benchmark_frontend(
+                        inputs, modelname
+                    )[1]
+                elif p == "shark_python":
+                    bench_result["platform"] = "shark_python"
+                    bench_result["iter/sec"] = self.benchmark_python(inputs)[0]
+                    bench_result["ms/iter"] = self.benchmark_python(inputs)[1]
+                else:
+                    bench_result["platform"] = "shark_iree_c"
+                    bench_result["iter/sec"] = self.benchmark_c()[0]
+                    bench_result["ms/iter"] = self.benchmark_c()[1]
                bench_result["datetime"] = str(datetime.now())
                writer.writerow(bench_result)
--- a/shark/shark_downloader.py
+++ b/shark/shark_downloader.py
@@ -14,51 +14,10 @@

 import numpy as np
 import os
-import sys
+import urllib.request
+import json
+import hashlib
 from pathlib import Path
-from shark.parser import shark_args
-from google.cloud import storage
-
-
-def download_public_file(
-    full_gs_url, destination_folder_name, single_file=False
-):
-    """Downloads a public blob from the bucket."""
-    # bucket_name = "gs://your-bucket-name/path/to/file"
-    # destination_file_name = "local/path/to/file"
-
-    storage_client = storage.Client.create_anonymous_client()
-    bucket_name = full_gs_url.split("/")[2]
-    source_blob_name = None
-    dest_filename = None
-    desired_file = None
-    if single_file:
-
-        desired_file = full_gs_url.split("/")[-1]
-        source_blob_name = "/".join(full_gs_url.split("/")[3:-1])
-        destination_folder_name, dest_filename = os.path.split(
-            destination_folder_name
-        )
-    else:
-        source_blob_name = "/".join(full_gs_url.split("/")[3:])
-    bucket = storage_client.bucket(bucket_name)
-    blobs = bucket.list_blobs(prefix=source_blob_name)
-    if not os.path.exists(destination_folder_name):
-        os.mkdir(destination_folder_name)
-    for blob in blobs:
-        blob_name = blob.name.split("/")[-1]
-        if single_file:
-            if blob_name == desired_file:
-                destination_filename = os.path.join(
-                    destination_folder_name, dest_filename
-                )
-                blob.download_to_filename(destination_filename)
-            else:
-                continue
-
-        destination_filename = os.path.join(destination_folder_name, blob_name)
-        blob.download_to_filename(destination_filename)
-

 input_type_to_np_dtype = {
    "float32": np.float32,
@@ -70,27 +29,11 @@ input_type_to_np_dtype = {
    "int8": np.int8,
 }

+
 # Save the model in the home local so it needn't be fetched everytime in the CI.
 home = str(Path.home())
-alt_path = os.path.join(os.path.dirname(__file__), "../gen_shark_tank/")
-custom_path = shark_args.local_tank_cache
-if os.path.exists(alt_path):
-    WORKDIR = alt_path
-    print(
-        f"Using {WORKDIR} as shark_tank directory. Delete this directory if you aren't working from locally generated shark_tank."
-    )
-if custom_path:
-    if not os.path.exists(custom_path):
-        os.mkdir(custom_path)
-
-    WORKDIR = custom_path
-
-    print(f"Using {WORKDIR} as local shark_tank cache directory.")
-else:
-    WORKDIR = os.path.join(home, ".local/shark_tank/")
-    print(
-        f"shark_tank local cache is located at {WORKDIR} . You may change this by setting the --local_tank_cache= flag"
-    )
+WORKDIR = os.path.join(home, ".local/shark_tank/")
+print(WORKDIR)


 # Checks whether the directory and files exists.
@@ -118,64 +61,57 @@ def check_dir_exists(model_name, frontend="torch", dynamic=""):
            and os.path.isfile(os.path.join(model_dir, "golden_out.npz"))
            and os.path.isfile(os.path.join(model_dir, "hash.npy"))
        ):
-            print(f"""Using cached models from {WORKDIR}...""")
+            print(
+                f"""The models are present in the {WORKDIR}. If you want a fresh 
+                download, consider deleting the directory."""
+            )
            return True
    return False


 # Downloads the torch model from gs://shark_tank dir.
-def download_model(
-    model_name,
-    dynamic=False,
-    tank_url="gs://shark_tank/latest",
-    frontend=None,
-    tuned=None,
-):
+def download_torch_model(model_name, dynamic=False):
    model_name = model_name.replace("/", "_")
    dyn_str = "_dynamic" if dynamic else ""
    os.makedirs(WORKDIR, exist_ok=True)
-    model_dir_name = model_name + "_" + frontend
-    model_dir = os.path.join(WORKDIR, model_dir_name)
-    full_gs_url = tank_url.rstrip("/") + "/" + model_dir_name
+    model_dir_name = model_name + "_torch"

-    if not check_dir_exists(
-        model_dir_name, frontend=frontend, dynamic=dyn_str
-    ):
-        print(f"Downloading artifacts for model {model_name}...")
-        download_public_file(full_gs_url, model_dir)
+    def gs_download_model():
+        gs_command = (
+            'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank'
+            + "/"
+            + model_dir_name
+            + " "
+            + WORKDIR
+        )
+        if os.system(gs_command) != 0:
+            raise Exception("model not present in the tank. Contact Nod Admin")
+
+    if not check_dir_exists(model_dir_name, frontend="torch", dynamic=dyn_str):
+        gs_download_model()
    else:
-        if not _internet_connected():
-            print(
-                "No internet connection. Using the model already present in the tank."
-            )
-        else:
-            local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
-            gs_hash_url = (
-                tank_url.rstrip("/") + "/" + model_dir_name + "/hash.npy"
-            )
-            download_public_file(
-                gs_hash_url,
-                os.path.join(model_dir, "upstream_hash.npy"),
-                single_file=True,
-            )
-            upstream_hash = str(
-                np.load(os.path.join(model_dir, "upstream_hash.npy"))
-            )
-            if local_hash != upstream_hash:
-                if shark_args.update_tank == True:
-                    print(f"Updating artifacts for model {model_name}...")
-                    download_public_file(full_gs_url, WORKDIR)
-                else:
-                    print(
-                        "Hash does not match upstream in gs://shark_tank/. If you are using SHARK Downloader with locally generated artifacts, this is working as intended."
-                    )
+        model_dir = os.path.join(WORKDIR, model_dir_name)
+        local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
+        gs_hash = (
+            'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank'
+            + "/"
+            + model_dir_name
+            + "/hash.npy"
+            + " "
+            + os.path.join(model_dir, "upstream_hash.npy")
+        )
+        if os.system(gs_hash) != 0:
+            raise Exception("hash of the model not present in the tank.")
+        upstream_hash = str(
+            np.load(os.path.join(model_dir, "upstream_hash.npy"))
+        )
+        if local_hash != upstream_hash:
+            gs_download_model()

    model_dir = os.path.join(WORKDIR, model_dir_name)
-    tuned_str = "" if tuned is None else "_" + tuned
-    suffix = f"{dyn_str}_{frontend}{tuned_str}.mlir"
-    filename = os.path.join(model_dir, model_name + suffix)
-
-    with open(filename, mode="rb") as f:
+    with open(
+        os.path.join(model_dir, model_name + dyn_str + "_torch.mlir")
+    ) as f:
        mlir_file = f.read()

    function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
@@ -187,11 +123,106 @@ def download_model(
    return mlir_file, function_name, inputs_tuple, golden_out_tuple


-def _internet_connected():
-    import requests as req
+# Downloads the tflite model from gs://shark_tank dir.
+def download_tflite_model(model_name, dynamic=False):
+    dyn_str = "_dynamic" if dynamic else ""
+    os.makedirs(WORKDIR, exist_ok=True)
+    model_dir_name = model_name + "_tflite"

-    try:
-        req.get("http://1.1.1.1")
-        return True
-    except:
-        return False
+    def gs_download_model():
+        gs_command = (
+            'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank'
+            + "/"
+            + model_dir_name
+            + " "
+            + WORKDIR
+        )
+        if os.system(gs_command) != 0:
+            raise Exception("model not present in the tank. Contact Nod Admin")
+
+    if not check_dir_exists(
+        model_dir_name, frontend="tflite", dynamic=dyn_str
+    ):
+        gs_download_model()
+    else:
+        model_dir = os.path.join(WORKDIR, model_dir_name)
+        local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
+        gs_hash = (
+            'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank'
+            + "/"
+            + model_dir_name
+            + "/hash.npy"
+            + " "
+            + os.path.join(model_dir, "upstream_hash.npy")
+        )
+        if os.system(gs_hash) != 0:
+            raise Exception("hash of the model not present in the tank.")
+        upstream_hash = str(
+            np.load(os.path.join(model_dir, "upstream_hash.npy"))
+        )
+        if local_hash != upstream_hash:
+            gs_download_model()
+
+    model_dir = os.path.join(WORKDIR, model_dir_name)
+    with open(
+        os.path.join(model_dir, model_name + dyn_str + "_tflite.mlir")
+    ) as f:
+        mlir_file = f.read()
+
+    function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
+    inputs = np.load(os.path.join(model_dir, "inputs.npz"))
+    golden_out = np.load(os.path.join(model_dir, "golden_out.npz"))
+
+    inputs_tuple = tuple([inputs[key] for key in inputs])
+    golden_out_tuple = tuple([golden_out[key] for key in golden_out])
+    return mlir_file, function_name, inputs_tuple, golden_out_tuple
+
+
+def download_tf_model(model_name):
+    model_name = model_name.replace("/", "_")
+    os.makedirs(WORKDIR, exist_ok=True)
+    model_dir_name = model_name + "_tf"
+
+    def gs_download_model():
+        gs_command = (
+            'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank'
+            + "/"
+            + model_dir_name
+            + " "
+            + WORKDIR
+        )
+        if os.system(gs_command) != 0:
+            raise Exception("model not present in the tank. Contact Nod Admin")
+
+    if not check_dir_exists(model_dir_name, frontend="tf"):
+        gs_download_model()
+    else:
+        model_dir = os.path.join(WORKDIR, model_dir_name)
+        local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
+        gs_hash = (
+            'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank'
+            + "/"
+            + model_dir_name
+            + "/hash.npy"
+            + " "
+            + os.path.join(model_dir, "upstream_hash.npy")
+        )
+        if os.system(gs_hash) != 0:
+            raise Exception("hash of the model not present in the tank.")
+        upstream_hash = str(
+            np.load(os.path.join(model_dir, "upstream_hash.npy"))
+        )
+        if local_hash != upstream_hash:
+            gs_download_model()
+
+    model_dir = os.path.join(WORKDIR, model_dir_name)
+    with open(os.path.join(model_dir, model_name + "_tf.mlir")) as f:
+        mlir_file = f.read()
+
+    function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
+    inputs = np.load(os.path.join(model_dir, "inputs.npz"))
+    golden_out = np.load(os.path.join(model_dir, "golden_out.npz"))
+
+    inputs_tuple = tuple([inputs[key] for key in inputs])
+    golden_out_tuple = tuple([golden_out[key] for key in golden_out])
+    return mlir_file, function_name, inputs_tuple, golden_out_tuple
--- a/shark/shark_importer.py
+++ b/shark/shark_importer.py
@@ -75,24 +75,21 @@ class SharkImporter:
            self.module, self.inputs, is_dynamic, tracing_required
        )

-    def _tf_mlir(self, func_name, save_dir="./shark_tmp/"):
+    def _tf_mlir(self, func_name):
        from iree.compiler import tf as tfc

        return tfc.compile_module(
-            self.module,
-            exported_names=[func_name],
-            import_only=True,
-            output_file=save_dir,
+            self.module, exported_names=[func_name], import_only=True
        )

-    def _tflite_mlir(self, func_name, save_dir="./shark_tmp/"):
+    def _tflite_mlir(self, func_name):
        from iree.compiler import tflite as tflitec
+        from shark.iree_utils._common import IREE_TARGET_MAP

        self.mlir_model = tflitec.compile_file(
            self.raw_model_file,  # in tflite, it is a path to .tflite file, not a tflite interpreter
            input_type="tosa",
            import_only=True,
-            output_file=save_dir,
        )
        return self.mlir_model

@@ -102,7 +99,6 @@ class SharkImporter:
        is_dynamic=False,
        tracing_required=False,
        func_name="forward",
-        save_dir="./shark_tmp/",
    ):
        if self.frontend in ["torch", "pytorch"]:
            if self.inputs == None:
@@ -112,15 +108,15 @@ class SharkImporter:
                sys.exit(1)
            return self._torch_mlir(is_dynamic, tracing_required), func_name
        if self.frontend in ["tf", "tensorflow"]:
-            return self._tf_mlir(func_name, save_dir), func_name
+            return self._tf_mlir(func_name), func_name
        if self.frontend in ["tflite", "tf-lite"]:
            func_name = "main"
-            return self._tflite_mlir(func_name, save_dir), func_name
+            return self._tflite_mlir(func_name), func_name

    # Converts the frontend specific tensors into np array.
    def convert_to_numpy(self, array_tuple: tuple):
        if self.frontend in ["torch", "pytorch"]:
-            return [x.detach().cpu().numpy() for x in array_tuple]
+            return [x.detach().numpy() for x in array_tuple]
        if self.frontend in ["tf", "tensorflow"]:
            return [x.numpy() for x in array_tuple]

@@ -134,20 +130,19 @@ class SharkImporter:
        outputs_name = "golden_out.npz"
        func_file_name = "function_name"
        model_name_mlir = model_name + "_" + self.frontend + ".mlir"
-        try:
-            inputs = [x.cpu().detach() for x in inputs]
-        except AttributeError:
-            try:
-                inputs = [x.numpy() for x in inputs]
-            except AttributeError:
-                inputs = [x for x in inputs]
        np.savez(os.path.join(dir, inputs_name), *inputs)
        np.savez(os.path.join(dir, outputs_name), *outputs)
        np.save(os.path.join(dir, func_file_name), np.array(func_name))

+        mlir_str = mlir_data
        if self.frontend == "torch":
-            with open(os.path.join(dir, model_name_mlir), "wb") as mlir_file:
-                mlir_file.write(mlir_data)
+            mlir_str = mlir_data.operation.get_asm()
+        elif self.frontend == "tf":
+            mlir_str = mlir_data.decode("utf-8")
+        elif self.frontend == "tflite":
+            mlir_str = mlir_data.decode("utf-8")
+        with open(os.path.join(dir, model_name_mlir), "w") as mlir_file:
+            mlir_file.write(mlir_str)

        return

@@ -164,13 +159,9 @@ class SharkImporter:
                f"There is no input provided: {self.inputs}, please provide inputs or simply run import_mlir."
            )
            sys.exit(1)
-        model_name_mlir = model_name + "_" + self.frontend + ".mlir"
-        artifact_path = os.path.join(dir, model_name_mlir)
+
        imported_mlir = self.import_mlir(
-            is_dynamic,
-            tracing_required,
-            func_name,
-            save_dir=artifact_path,
+            is_dynamic, tracing_required, func_name
        )
        # TODO: Make sure that any generic function name is accepted. Currently takes in the default function names.
        # TODO: Check for multiple outputs.
@@ -180,7 +171,7 @@ class SharkImporter:
            golden_out = self.module(*self.inputs)
            if torch.is_tensor(golden_out):
                golden_out = tuple(
-                    golden_out.detach().cpu().numpy(),
+                    golden_out.detach().numpy(),
                )
            else:
                golden_out = self.convert_to_numpy(golden_out)
@@ -208,11 +199,9 @@ class SharkImporter:
                )
            elif golden_out is tuple:
                golden_out = self.convert_to_numpy(golden_out)
-            elif hasattr(golden_out, "logits"):
+            else:
                # from transformers import TFSequenceClassifierOutput
                golden_out = golden_out.logits
-            else:
-                golden_out = golden_out.last_hidden_state
            # Save the artifacts in the directory dir.
            self.save_data(
                dir,
@@ -243,59 +232,3 @@ class SharkImporter:
                self.inputs,
                golden_out,
            )
-
-
-# Applies fx conversion to the model and imports the mlir.
-def import_with_fx(model, inputs, debug=False):
-    import torch
-    from torch.fx.experimental.proxy_tensor import make_fx
-    from torch._decomp import get_decompositions
-
-    # TODO: Control the decompositions.
-    fx_g = make_fx(
-        model,
-        decomposition_table=get_decompositions(
-            [
-                torch.ops.aten.embedding_dense_backward,
-                torch.ops.aten.native_layer_norm_backward,
-                torch.ops.aten.slice_backward,
-                torch.ops.aten.select_backward,
-                torch.ops.aten.norm.ScalarOpt_dim,
-                torch.ops.aten.native_group_norm,
-                torch.ops.aten.upsample_bilinear2d.vec,
-                torch.ops.aten.split.Tensor,
-                torch.ops.aten.split_with_sizes,
-                torch.ops.aten.native_layer_norm,
-            ]
-        ),
-    )(*inputs)
-
-    fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
-    fx_g.recompile()
-
-    def strip_overloads(gm):
-        """
-        Modifies the target of graph nodes in :attr:`gm` to strip overloads.
-        Args:
-            gm(fx.GraphModule): The input Fx graph module to be modified
-        """
-        for node in gm.graph.nodes:
-            if isinstance(node.target, torch._ops.OpOverload):
-                node.target = node.target.overloadpacket
-        gm.recompile()
-
-    strip_overloads(fx_g)
-
-    mlir_importer = SharkImporter(
-        fx_g,
-        inputs,
-        frontend="torch",
-    )
-
-    if debug:
-        (mlir_module, func_name), _, _ = mlir_importer.import_debug()
-        return mlir_module, func_name
-
-    mlir_module, func_name = mlir_importer.import_mlir()
-
-    return mlir_module, func_name
--- a/shark/shark_inference.py
+++ b/shark/shark_inference.py
@@ -9,15 +9,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from shark.iree_utils.compile_utils import (
-    export_iree_module_to_vmfb,
-    load_flatbuffer,
-    create_dispatch_dirs,
-    compile_benchmark_dirs,
-)
-import os
 from shark.shark_runner import SharkRunner
-from shark.parser import shark_args
 import numpy as np


@@ -39,7 +31,7 @@ class SharkInference:
    Attributes
    ----------
    mlir_module : str
-        mlir_module represented in string; modules from torch-mlir are serialized in bytecode format.
+        mlir_module represented in string.
    function_name : str
        function to execute in the given mlir_module.
    device : str
@@ -65,48 +57,21 @@ class SharkInference:

    def __init__(
        self,
-        mlir_module: bytes,
+        mlir_module: str,
        function_name: str = "forward",
        device: str = "none",
        mlir_dialect: str = "linalg",
        is_benchmark: bool = False,
-        dispatch_benchmark: str = None,
-        dispatch_benchmark_dir: str = "temp_dispatch_benchmarks",
    ):
        self.mlir_module = mlir_module
        self.function_name = function_name
-        self.device = shark_args.device if device == "none" else device
+        self.device = device
        self.mlir_dialect = mlir_dialect
        self.is_benchmark = is_benchmark
-        self.dispatch_benchmarks = (
-            shark_args.dispatch_benchmarks
-            if dispatch_benchmark is None
-            else dispatch_benchmark
-        )
-        self.dispatch_benchmarks_dir = (
-            shark_args.dispatch_benchmarks_dir
-            if dispatch_benchmark_dir == "temp_dispatch_benchmarks"
-            else dispatch_benchmark_dir
-        )

        self.shark_runner = None

-    def compile(self, extra_args=[]):
-
-        if self.dispatch_benchmarks is not None:
-            extra_args.append(
-                f"--iree-hal-dump-executable-sources-to={self.dispatch_benchmarks_dir}"
-            )
-            extra_args.append(
-                f"--iree-hal-dump-executable-binaries-to={self.dispatch_benchmarks_dir}"
-            )
-            temp_dir = self.dispatch_benchmarks_dir.split("/")
-            temp_dir[-1] = "temp_" + temp_dir[-1]
-            temp_dir = "/".join(temp_dir)
-            self.temp_dispatch_benchmarks_dir = temp_dir
-            extra_args.append(
-                f"--iree-hal-dump-executable-benchmarks-to={self.temp_dispatch_benchmarks_dir}"
-            )
+    def compile(self):

        if self.is_benchmark == True:
            from shark.shark_benchmark_runner import SharkBenchmarkRunner
@@ -116,7 +81,6 @@ class SharkInference:
                self.function_name,
                self.device,
                self.mlir_dialect,
-                extra_args=extra_args,
            )

        else:
@@ -125,18 +89,8 @@ class SharkInference:
                self.function_name,
                self.device,
                self.mlir_dialect,
-                extra_args=extra_args,
            )

-        if self.dispatch_benchmarks is not None:
-            create_dispatch_dirs(self.dispatch_benchmarks_dir, self.device)
-            compile_benchmark_dirs(
-                self.dispatch_benchmarks_dir,
-                self.device,
-                self.dispatch_benchmarks,
-            )
-            os.system(f"rm -rf {self.temp_dispatch_benchmarks_dir}")
-
    # inputs are considered to be tuple of np.array.
    def forward(self, inputs: tuple):
        return self.shark_runner.run(inputs)
@@ -181,34 +135,3 @@ class SharkInference:
                )
            )
        return tuple(inputs)
-
-    # TODO: Instead of passing directory and having names decided by the module
-    # , user may want to save the module with manual names.
-    def save_module(self, dir=os.getcwd(), module_name=None, extra_args=[]):
-        return export_iree_module_to_vmfb(
-            self.mlir_module,
-            self.device,
-            dir,
-            self.mlir_dialect,
-            self.function_name,
-            module_name=module_name,
-            extra_args=extra_args,
-        )
-
-    # load and return the module.
-    def load_module(self, path, extra_args=[]):
-        self.shark_runner = SharkRunner(
-            function_name=self.function_name,
-            device=self.device,
-            compile_vmfb=False,
-            extra_args=extra_args,
-        )
-        (
-            self.shark_runner.iree_compilation_module,
-            self.shark_runner.iree_config,
-        ) = load_flatbuffer(
-            path,
-            self.device,
-            self.function_name,
-        )
-        return
--- a/shark/shark_runner.py
+++ b/shark/shark_runner.py
@@ -16,7 +16,6 @@ from shark.iree_utils.compile_utils import (
    get_iree_compiled_module,
    get_results,
    export_iree_module_to_vmfb,
-    load_flatbuffer,
 )
 from shark.iree_utils._common import check_device_drivers, device_driver_info
 from shark.parser import shark_args
@@ -25,7 +24,7 @@ import sys


 # supported dialects by the shark-runtime.
-supported_dialects = {"linalg", "mhlo", "tosa", "tf-lite", "tm_tensor"}
+supported_dialects = {"linalg", "mhlo", "tosa", "tf-lite"}


 class SharkRunner:
@@ -61,35 +60,30 @@ class SharkRunner:

    def __init__(
        self,
-        mlir_module: bytes = None,
+        mlir_module: str,
        function_name: str = "forward",
        device: str = "none",
        mlir_dialect: str = "linalg",
-        extra_args: list = [],
-        compile_vmfb: bool = True,
    ):
        self.mlir_module = mlir_module
        self.function_name = function_name
        self.device = shark_args.device if device == "none" else device
        self.mlir_dialect = mlir_dialect
-        self.extra_args = extra_args

        if check_device_drivers(self.device):
-            print(device_driver_info(self.device))
+            device_driver_info(self.device)
            sys.exit(1)

-        if compile_vmfb == True:
-            # Compile the module to get the .vmfb.
-            (
-                self.iree_compilation_module,
-                self.iree_config,
-            ) = get_iree_compiled_module(
-                self.mlir_module,
-                self.device,
-                self.mlir_dialect,
-                func_name=self.function_name,
-                extra_args=self.extra_args,
-            )
+        # Compile the module to get the .vmfb.
+        (
+            self.iree_compilation_module,
+            self.iree_config,
+        ) = get_iree_compiled_module(
+            self.mlir_module,
+            self.device,
+            self.mlir_dialect,
+            func_name=self.function_name,
+        )

    def run(self, inputs: tuple):
        return get_results(
@@ -98,3 +92,10 @@ class SharkRunner:
            self.iree_config,
            self.mlir_dialect,
        )
+
+    # Exports the module to `dir` via export_iree_module_to_vmfb and returns its result.
+    # TODO: Let the user choose the saved file name instead of only the directory.
+    def save_module(self, dir=os.getcwd()):
+        return export_iree_module_to_vmfb(
+            self.mlir_module, self.device, dir, self.mlir_dialect
+        )
--- a/shark/sharkdynamo/README.md
+++ b/shark/sharkdynamo/README.md
@@ -1,11 +0,0 @@
-1. Install torchdynamo
-   - `git clone https://github.com/pytorch/torchdynamo.git`
-   - `cd torchdynamo`
-   - `python -m pip install -r requirements.txt`
-   - `python setup.py develop`
-
-2. Install functorch
-   - `python -m pip install -v "git+https://github.com/pytorch/pytorch.git@$(python -c "import torch.version; print(torch.version.git_version)")#subdirectory=functorch"`
-
-3. Run examples.
-    - `python shark/examples/shark_dynamo/basic_examples.py`
--- a/shark/sharkdynamo/init.py
+++ b/shark/sharkdynamo/init.py
--- a/shark/sharkdynamo/utils.py
+++ b/shark/sharkdynamo/utils.py
@@ -1,157 +0,0 @@
-import functools
-import time
-from typing import List, Optional
-import torch
-from torch.fx.experimental.proxy_tensor import make_fx
-from functorch._src.compile_utils import strip_overloads
-from shark.shark_inference import SharkInference
-from torch._decomp import get_decompositions
-
-import torch_mlir
-
-# TODO: Control decompositions.
-def default_decompositions():
-    return get_decompositions(
-        [
-            torch.ops.aten.embedding_dense_backward,
-            torch.ops.aten.native_layer_norm_backward,
-            torch.ops.aten.slice_backward,
-            torch.ops.aten.select_backward,
-            torch.ops.aten.norm.ScalarOpt_dim,
-            torch.ops.aten.native_group_norm,
-            torch.ops.aten.upsample_bilinear2d.vec,
-            torch.ops.aten.split.Tensor,
-            torch.ops.aten.split_with_sizes,
-        ]
-    )
-
-
-def timeit(*, append_time_to: Optional[List] = None):
-    def decorator(func):
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):
-            start_time = time.time_ns()
-            result = func(*args, **kwargs)
-            end_time = time.time_ns()
-
-            if append_time_to is not None:
-                append_time_to.append(end_time - start_time)
-            return result
-
-        return wrapper
-
-    return decorator
-
-
-def _returns_nothing(fx_g: torch.fx.GraphModule) -> bool:
-    for node in fx_g.graph.nodes:
-        if node.op == "output":
-            assert (
-                len(node.args) == 1
-            ), "Output node must have a single argument"
-            node_arg = node.args[0]
-            if isinstance(node_arg, tuple):
-                return len(node_arg) == 0
-    return False
-
-
-def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule) -> bool:
-    """
-    Replace tuple with tuple element in functions that return one-element tuples.
-    Returns true if an unwrapping took place, and false otherwise.
-    """
-    unwrapped_tuple = False
-    for node in fx_g.graph.nodes:
-        if node.op == "output":
-            assert (
-                len(node.args) == 1
-            ), "Output node must have a single argument"
-            node_arg = node.args[0]
-            if isinstance(node_arg, tuple):
-                if len(node_arg) == 1:
-                    node.args = (node_arg[0],)
-                    unwrapped_tuple = True
-                    break
-
-    if unwrapped_tuple:
-        fx_g.graph.lint()
-        fx_g.recompile()
-    return unwrapped_tuple
-
-
-def make_shark_compiler(use_tracing: bool, device: str, verbose=False):
-    def compiler(
-        fx_graph: torch.fx.GraphModule,
-        example_inputs: List[torch.Tensor],
-    ):
-        """Compile GraphModule using torch-mlir + SHARK."""
-        if verbose:
-            print("Compiling graph...")
-
-        if _returns_nothing(fx_graph):
-            return fx_graph
-
-        was_unwrapped = _unwrap_single_tuple_return(fx_graph)
-        fx_graph = make_fx(
-            fx_graph, decomposition_table=default_decompositions()
-        )(*example_inputs)
-        strip_overloads(fx_graph)
-
-        if verbose:
-            print("torch.fx graph:")
-            print(fx_graph.graph)
-
-        ts_compiler = torch.jit.trace if use_tracing else torch.jit.script
-        ts_graph = ts_compiler(fx_graph, example_inputs)
-
-        if verbose:
-            torch_mlir_module = torch_mlir.compile(
-                ts_graph,
-                example_inputs,
-                output_type=torch_mlir.OutputType.TORCH,
-            )
-            print("\n\ntorch-mlir backend contract graph:")
-            print(torch_mlir_module)
-
-        linalg_module = torch_mlir.compile(
-            ts_graph,
-            example_inputs,
-            output_type=torch_mlir.OutputType.LINALG_ON_TENSORS,
-        )
-
-        shark_module = SharkInference(
-            linalg_module, "forward", mlir_dialect="linalg", device=device
-        )
-        shark_module.compile()
-
-        def forward(*inputs):
-            result = shark_module.forward(inputs)
-            result = tuple() if result is None else result
-            return (result,) if was_unwrapped else result
-
-        return forward
-
-    return compiler
-
-
-def check_results(compiled_results, eager_results):
-    for compiled_result, eager_result in zip(compiled_results, eager_results):
-        if not torch.allclose(
-            compiled_result.to("cpu"), eager_result.to("cpu"), atol=1e-5
-        ):
-            print("Compiled result does not match eager result")
-            return
-    print("Compiled result matches eager result!")
-
-
-def print_time_stats(times):
-    times_tensor = torch.tensor(times)
-
-    def quantile_ms(q):
-        return torch.quantile(times_tensor.to(float), q).item() / 1e6
-
-    print(f"Median: {quantile_ms(0.5)} ms")
-    print(f"10%ile: {quantile_ms(0.1)} ms")
-    print(f"90%ile: {quantile_ms(0.9)} ms")
-    print(f"Total: {torch.sum(times_tensor) / 1e6} ms")
-    print()
--- a/shark/stress_test.py
+++ b/shark/stress_test.py
@@ -1,296 +0,0 @@
-# Copyright 2022 The Nod Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from iree.runtime import query_available_drivers, get_driver
-from shark.shark_downloader import download_model
-from shark.shark_inference import SharkInference
-from typing import List, Optional, Tuple
-import numpy as np
-import argparse
-from shark.iree_utils._common import _IREE_DEVICE_MAP
-import multiprocessing
-from shark.shark_runner import supported_dialects
-import logging
-from concurrent.futures import ProcessPoolExecutor
-from concurrent.futures.thread import ThreadPoolExecutor
-import time
-import numpy as np
-
-IREE_TO_SHARK_DRIVER_MAP = {v: k for k, v in _IREE_DEVICE_MAP.items()}
-
-
-def stress_test_compiled_model(
-    shark_module_path: str,
-    function_name: str,
-    device: str,
-    inputs: List[np.ndarray],
-    golden_out: List[np.ndarray],
-    batch_size: int,
-    max_iterations: int,
-    max_duration_seconds: float,
-    inference_timeout_seconds: float,
-    tolerance_nulp: int,
-    stress_test_index: int,
-):
-    logging.info(
-        f"Running stress test {stress_test_index} on device {device}."
-    )
-    shark_module = SharkInference(
-        mlir_module=bytes(), function_name=function_name, device=device
-    )
-    shark_module.load_module(shark_module_path)
-    input_batches = [np.repeat(arr, batch_size, axis=0) for arr in inputs]
-    golden_output_batches = np.repeat(golden_out, batch_size, axis=0)
-    report_interval_seconds = 10
-    start_time = time.time()
-    previous_report_time = start_time
-    executor = ThreadPoolExecutor(1)
-    first_iteration_output = None
-    for i in range(max_iterations):
-        inference_task = executor.submit(shark_module.forward, input_batches)
-        output = inference_task.result(inference_timeout_seconds)
-        if first_iteration_output is None:
-            np.testing.assert_array_almost_equal_nulp(
-                golden_output_batches, output, nulp=tolerance_nulp
-            )
-            first_iteration_output = output
-        else:
-            np.testing.assert_array_equal(output, first_iteration_output)
-        current_time = time.time()
-        if report_interval_seconds < current_time - previous_report_time:
-            logging.info(
-                f"Stress test {stress_test_index} on device "
-                f"{device} at iteration {i+1}"
-            )
-            previous_report_time = current_time
-        if max_duration_seconds < current_time - start_time:
-            return
-    logging.info(f"Stress test {stress_test_index} on device {device} done.")
-
-
-def get_device_type(device_name: str):
-    return device_name.split("://", 1)[0]
-
-
-def get_device_types(device_names: str):
-    return [get_device_type(device_name) for device_name in device_names]
-
-
-def query_devices(device_types: Optional[List[str]] = None) -> List[str]:
-    devices = []
-    if device_types is None:
-        device_types = [
-            IREE_TO_SHARK_DRIVER_MAP[name]
-            for name in query_available_drivers()
-            if name in IREE_TO_SHARK_DRIVER_MAP
-        ]
-    for device_type in device_types:
-        driver = get_driver(_IREE_DEVICE_MAP[device_type])
-        device_infos = driver.query_available_devices()
-        for device_info in device_infos:
-            uri_path = (
-                device_info["path"]
-                if device_info["path"] != ""
-                else str(device_info["device_id"])
-            )
-            device_uri = f"{device_type}://{uri_path}"
-            devices.append(device_uri)
-    return devices
-
-
-def compile_stress_test_module(
-    device_types: List[str], mlir_model: str, func_name: str, mlir_dialect: str
-) -> List[str]:
-    shark_module_paths = []
-    for device_type in device_types:
-        logging.info(
-            f"Compiling stress test model for device type {device_type}."
-        )
-        shark_module = SharkInference(
-            mlir_model,
-            func_name,
-            mlir_dialect=mlir_dialect,
-            device=device_type,
-        )
-        shark_module_paths.append(shark_module.save_module())
-    return shark_module_paths
-
-
-def stress_test(
-    model_name: str,
-    dynamic_model: bool = False,
-    device_types: Optional[List[str]] = None,
-    device_names: Optional[List[str]] = None,
-    batch_size: int = 1,
-    max_iterations: int = 10**7,
-    max_duration_seconds: float = 3600,
-    inference_timeout_seconds: float = 60,
-    mlir_dialect: str = "linalg",
-    frontend: str = "torch",
-    oversubscription_factor: int = 1,
-    tolerance_nulp: int = 50000,
-):
-    logging.info(f"Downloading stress test model {model_name}.")
-    mlir_model, func_name, inputs, golden_out = download_model(
-        model_name=model_name, dynamic=dynamic_model, frontend=frontend
-    )
-
-    if device_names is None or device_types is not None:
-        device_names = [] if device_names is None else device_names
-        with ProcessPoolExecutor() as executor:
-            device_names.extend(
-                executor.submit(query_devices, device_types).result()
-            )
-
-    device_types_set = list(set(get_device_types(device_names)))
-    shark_module_paths_set = compile_stress_test_module(
-        device_types_set, mlir_model, func_name, mlir_dialect
-    )
-    device_type_shark_module_path_map = {
-        device_type: module_path
-        for device_type, module_path in zip(
-            device_types_set, shark_module_paths_set
-        )
-    }
-    device_name_shark_module_path_map = {
-        device_name: device_type_shark_module_path_map[
-            get_device_type(device_name)
-        ]
-        for device_name in device_names
-    }
-
-    # This needs to run in a spearate process, because it uses the drvier chache
-    # in IREE and a subsequent call to `iree.runtime.SystemContext.add_vm_module`
-    # in a forked process will hang.
-    with multiprocessing.Pool(
-        len(device_name_shark_module_path_map) * oversubscription_factor
-    ) as process_pool:
-        process_pool.starmap(
-            stress_test_compiled_model,
-            [
-                (
-                    module_path,
-                    func_name,
-                    device_name,
-                    inputs,
-                    golden_out,
-                    batch_size,
-                    max_iterations,
-                    max_duration_seconds,
-                    inference_timeout_seconds,
-                    tolerance_nulp,
-                    stress_test_index,
-                )
-                for stress_test_index, (device_name, module_path) in enumerate(
-                    list(device_name_shark_module_path_map.items())
-                    * oversubscription_factor
-                )
-            ],
-        )
-
-
-if __name__ == "__main__":
-    logging.basicConfig(encoding="utf-8", level=logging.INFO)
-    parser = argparse.ArgumentParser(
-        description="Downloads, compiles and runs a model from the tank to stress test the system."
-    )
-    parser.add_argument(
-        "--model", type=str, help="Model name in the tank.", default="alexnet"
-    )
-    parser.add_argument(
-        "--dynamic",
-        help="Use dynamic version of the model.",
-        action="store_true",
-        default=False,
-    )
-    parser.add_argument(
-        "--frontend", type=str, help="Frontend of the model.", default="torch"
-    )
-    parser.add_argument(
-        "--mlir-dialect",
-        type=str,
-        help="MLIR dialect of the model.",
-        default="linalg",
-        choices=supported_dialects,
-    )
-    parser.add_argument(
-        "--device-types",
-        type=str,
-        nargs="*",
-        choices=_IREE_DEVICE_MAP.keys(),
-        help="Runs the stress test on all devices with that type. "
-        "If absent and no deveices are specified "
-        "will run against all available devices.",
-    )
-    parser.add_argument(
-        "--devices",
-        type=str,
-        nargs="*",
-        help="List of devices to run the stress test on. "
-        "If device-types is specified will run against the union of the two.",
-    )
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        help="Number of inputs to feed into the model",
-        default=1,
-    )
-    parser.add_argument(
-        "--oversubscription",
-        type=int,
-        help="Oversubscrption factor. Each device will execute the model simultaneously "
-        "this many number of times.",
-        default=1,
-    )
-    parser.add_argument(
-        "--max-iterations",
-        type=int,
-        help="Maximum number of iterations to run the stress test per device.",
-        default=10**7,
-    )
-    parser.add_argument(
-        "--max-duration",
-        type=float,
-        help="Maximum number of seconds to run the stress test.",
-        default=3600,
-    )
-    parser.add_argument(
-        "--inference-timeout",
-        type=float,
-        help="Timeout in seconds for a single model inference operation.",
-        default=60,
-    )
-    parser.add_argument(
-        "--tolerance-nulp",
-        type=int,
-        help="The maximum number of unit in the last place for tolerance "
-        "when verifing results with the golden reference output.",
-        default=50000,
-    )
-
-    args = parser.parse_known_args()[0]
-    stress_test(
-        model_name=args.model,
-        dynamic_model=args.dynamic,
-        frontend=args.frontend,
-        mlir_dialect=args.mlir_dialect,
-        device_types=args.device_types,
-        device_names=args.devices,
-        batch_size=args.batch_size,
-        oversubscription_factor=args.oversubscription,
-        max_iterations=args.max_iterations,
-        max_duration_seconds=args.max_duration,
-        inference_timeout_seconds=args.inference_timeout,
-        tolerance_nulp=args.tolerance_nulp,
-    )
--- a/shark/tests/test_stress_test.py
+++ b/shark/tests/test_stress_test.py
@@ -1,31 +0,0 @@
-# Copyright 2022 The Nod Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-import subprocess
-import sys
-import importlib.util
-
-
-def test_stress_test():
-    subprocess.check_call(
-        [
-            sys.executable,
-            importlib.util.find_spec("shark.stress_test").origin,
-            "--model=squeezenet1_0",
-            "--devices",
-            "cpu",
-            "--max-iterations=1",
-        ]
-    )
--- a/shark/torch_mlir_lockstep_tensor.py
+++ b/shark/torch_mlir_lockstep_tensor.py
@@ -1,220 +0,0 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Also available under a BSD-style license. See LICENSE.
-import contextlib
-import re
-import traceback
-import warnings
-from typing import Any
-import numpy as np
-
-import torch
-from torch.utils._pytree import tree_map
-
-from torch_mlir.eager_mode.ir_building import build_mlir_module
-from torch_mlir.eager_mode.torch_mlir_dispatch import (
-    UnsupportedByTorchMlirEagerMode,
-    normalize_args_kwargs,
-    check_get_aliased_arg,
-)
-from torch_mlir.eager_mode import EAGER_MODE_DEBUG
-from torch_mlir.eager_mode.torch_mlir_tensor import (
-    TorchMLIRTensor,
-    check_requires_grad,
-    make_wrapper_subclass_from_torch_tensor,
-    make_bare_wrapper_subclass,
-    UNSUPPORTED_OPS,
-    no_dispatch,
-)
-from torch_mlir.eager_mode import torch_mlir_tensor
-from shark.iree_eager_backend import EagerModeIREELinalgOnTensorsBackend
-
-
-backend = EagerModeIREELinalgOnTensorsBackend("cpu")
-torch_mlir_tensor.backend = backend
-rtol = 1e-04
-atol = 1e-05
-
-
-class TorchMLIRLockstepTensor(TorchMLIRTensor):
-    """This class overrides the dispatching for TorchMLIRTensor to allow for an op-by-op numerical comparison between PyTorch and the Torch-MLIR -> IREE backend compilation pipeline. This only supports the IREE backend and focuses on op-by-op level verification.
-
-    TODO: Extend this to do a cumulative trace with summary statistics at the end. Possibly requires a wrapper environment to store full trace info.
-    """
-
-    def __new__(cls, elem, **kwargs):
-        if kwargs.get("constructing_from_device_tensor", False):
-            tensor_meta_data = backend.get_torch_metadata(elem, kwargs)
-            r = make_bare_wrapper_subclass(
-                cls=cls,
-                size=tensor_meta_data.size,
-                strides=tensor_meta_data.strides,
-                storage_offset=tensor_meta_data.storage_offset,
-                dtype=tensor_meta_data.dtype,
-                layout=tensor_meta_data.layout,
-                device=tensor_meta_data.device,
-                requires_grad=tensor_meta_data.requires_grad,
-            )
-            r.elem = elem
-        elif isinstance(elem, torch.nn.Parameter):
-            r = make_wrapper_subclass_from_torch_tensor(
-                cls, elem.data, **kwargs
-            )
-            # This is a hack to handle non-contiguous data through IREE-backend
-            nt = elem.detach().data.numpy()
-            if not nt.flags["C_CONTIGUOUS"]:
-                nt = np.ascontiguousarray(nt, dtype=nt.dtype)
-            r.elem = backend.transfer_from_torch_to_device(
-                torch.from_numpy(nt)
-            )
-        elif isinstance(elem, torch.Tensor):
-            r = make_wrapper_subclass_from_torch_tensor(cls, elem, **kwargs)
-            # Ditto TODO: Find a better way to handle this
-            nt = elem.numpy()
-            if not nt.flags["C_CONTIGUOUS"]:
-                nt = np.ascontiguousarray(nt, dtype=nt.dtype)
-            r.elem = backend.transfer_from_torch_to_device(
-                torch.from_numpy(nt)
-            )
-        # This branch handles the case when a python scalar is passed to some op
-        # or is returned from some aten op, such as _local_scalar_dense.
-        elif isinstance(elem, (int, float, bool)):
-            return elem
-        else:
-            raise ValueError(f"Unknown element type: {type(elem)}")
-        return r
-
-    def __repr__(self):
-        if self.grad_fn:
-            return f"TorchMLIRLockstepTensor({self.elem}, backend={backend.__class__.__name__}, grad_fn={self.grad_fn})"
-        else:
-            return f"TorchMLIRLockstepTensor({self.elem}, backend={backend.__class__.__name__})"
-
-    """This does essentially the same dispatch as TorchMLIRTensor but operates as if debug mode is enabled. The numeric verification happens after the Torch-MLIR result is obtained by comparing against the 
-    """
-
-    @classmethod
-    def __torch_dispatch__(cls, func, _types, args=(), kwargs=None):
-        requires_grad = check_requires_grad(*args, **kwargs)
-        try:
-            with no_dispatch():
-                if hasattr(func, "op_name"):
-                    op_name = func.op_name
-                elif hasattr(func, "__name__"):
-                    # Handle builtin_function_or_method.
-                    op_name = func.__name__
-                else:
-                    raise RuntimeError(f"op {func} has no name")
-
-                if UNSUPPORTED_OPS.match(op_name):
-                    raise UnsupportedByTorchMlirEagerMode(op_name)
-
-                if not hasattr(func, "_schema"):
-                    raise RuntimeError(f"op {func} has no schema.")
-
-                normalized_kwargs = normalize_args_kwargs(func, args, kwargs)
-
-                if "layout" in normalized_kwargs and normalized_kwargs[
-                    "layout"
-                ] not in {0, None}:
-                    raise UnsupportedByTorchMlirEagerMode(
-                        f"{normalized_kwargs['layout']} layout not supported."
-                    )
-                if "memory_format" in normalized_kwargs and normalized_kwargs[
-                    "memory_format"
-                ] not in {0, None}:
-                    raise UnsupportedByTorchMlirEagerMode(
-                        f"{normalized_kwargs['memory_format']} memory format not supported."
-                    )
-                eager_module = build_mlir_module(func, normalized_kwargs)
-            device_tensor_args = [
-                kwarg.elem
-                for _, kwarg in normalized_kwargs.items()
-                if isinstance(kwarg, cls)
-            ]
-            assert len(eager_module.body.operations[0].arguments) == len(
-                device_tensor_args
-            ), "Number of parameters and number of arguments differs."
-            op_mlir_backend_callable = backend.compile(eager_module)
-            out = op_mlir_backend_callable(*device_tensor_args)
-            out = tree_map(
-                lambda x: cls(
-                    x,
-                    requires_grad=requires_grad,
-                    constructing_from_device_tensor=True,
-                ),
-                out,
-            )
-
-            # Numeric verification; Value for comparison comes from PyTorch eager
-            with no_dispatch():
-                unwrapped_args = tree_map(cls.unwrap, args)
-                unwrapped_kwargs = tree_map(cls.unwrap, kwargs)
-                if "_reshape_alias" in op_name:
-                    native_out = torch.ops.aten.view(
-                        unwrapped_args[0], unwrapped_args[1]
-                    )
-                else:
-                    native_out = func(*unwrapped_args, **unwrapped_kwargs)
-
-            native_out = tree_map(
-                lambda x: cls(x, requires_grad=requires_grad), native_out
-            ).elem
-            tmp_out = out.elem
-
-            try:
-                np.testing.assert_allclose(
-                    native_out.to_host(),
-                    tmp_out.to_host(),
-                    rtol=rtol,
-                    atol=atol,
-                )
-            except Exception as e:
-                shaped_args = [
-                    arg.shape if torch.is_tensor(arg) else arg
-                    for arg in unwrapped_args
-                ]
-                shaped_kwargs = [
-                    kwarg.shape if torch.is_tensor(kwarg) else kwarg
-                    for kwarg in unwrapped_kwargs
-                ]
-                warnings.warn(
-                    f"Lockstep accuracy verification failed with error: *{str(e)}*; "
-                    f"Dispatched function name: *{str(func)}*; "
-                    f"Dispatched function args: *{str(shaped_args)}*; "
-                    f"Dispatched function kwargs: *{str(shaped_kwargs)}*; "
-                )
-        except Exception as e:
-            warnings.warn(traceback.format_exc())
-            if isinstance(e, UnsupportedByTorchMlirEagerMode):
-                warnings.warn(
-                    f"Couldn't use TorchMLIR eager because current incompatibility: *{str(e)}*; running through PyTorch eager."
-                )
-            else:
-                warnings.warn(
-                    f"Couldn't use TorchMLIR eager because of error: *{str(e)}*; "
-                    f"Running through PyTorch eager"
-                )
-
-            with no_dispatch():
-                unwrapped_args = tree_map(cls.unwrap, args)
-                unwrapped_kwargs = tree_map(cls.unwrap, kwargs)
-                if "_reshape_alias" in op_name:
-                    out = torch.ops.aten.view(
-                        unwrapped_args[0], unwrapped_args[1]
-                    )
-                else:
-                    out = func(*unwrapped_args, **unwrapped_kwargs)
-
-            out = tree_map(lambda x: cls(x, requires_grad=requires_grad), out)
-
-        maybe_aliased_arg_name = check_get_aliased_arg(func)
-        if maybe_aliased_arg_name is not None:
-            warnings.warn(
-                f"Found aliased arg, but didn't copy tensor contents. This could lead to incorrect results for E2E model execution but doesn't affect the validity of the lockstep op verification."
-            )
-            # TODO: Find a way to handle argument aliasing for IREE backend
-            # backend.copy_into(normalized_kwargs[maybe_aliased_arg_name].elem, out.elem)
-
-        return out
--- a/shark/torch_mlir_utils.py
+++ b/shark/torch_mlir_utils.py
@@ -12,12 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import torch
+import io
+import pickle
+
+from torch_mlir.dialects.torch.importer.jit_ir import (
+    ClassAnnotator,
+    ModuleBuilder,
+)
+from torch_mlir_e2e_test.torchscript.serialization import (
+    extract_serializable_annotations,
+    apply_serializable_annotations,
+    SerializableTest,
+)
+
+from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
+
+from torch_mlir.passmanager import PassManager
+from torch_mlir_e2e_test.torchscript.annotations import annotate_args, export
 from torch_mlir.ir import StringAttr
 import torch_mlir
-from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
-import tempfile
-from shark.parser import shark_args
-import io


 def get_module_name_for_asm_dump(module):
@@ -31,6 +45,22 @@ def get_module_name_for_asm_dump(module):
    ).value


+def get_input_annotations(inputs: tuple, dynamic: bool) -> list:
+    """Build the argument-annotation list for a module's `forward` method.
+
+    Args:
+        inputs: Tuple of example inputs; each element must expose `.shape`
+            and `.dtype` (e.g. a `torch.Tensor`).
+        dynamic: If True, every dimension of every input is recorded as -1;
+            otherwise the concrete sizes from `.shape` are recorded.
+
+    Returns:
+        A list whose first element is `None`, followed by one
+        `(shape_list, dtype, True)` tuple per input, in input order.
+    """
+    # NOTE(review): the leading None presumably stands for the `self`
+    # argument of `forward`, and the trailing True presumably marks the
+    # argument as having value semantics -- confirm against annotate_args.
+    annotations_list = [None]
+    for tensor in inputs:
+        if dynamic:
+            # One -1 per dimension: the rank is kept, the sizes are not.
+            shape = [-1] * len(tensor.shape)
+        else:
+            shape = list(tensor.shape)
+        annotations_list.append((shape, tensor.dtype, True))
+    return annotations_list
+
+
 def run_on_refbackend(torch_module, inputs):
    backend = refbackend.RefBackendLinalgOnTensorsBackend()
    compiled = backend.compile(torch_module)
@@ -39,16 +69,42 @@ def run_on_refbackend(torch_module, inputs):
    return jit_module.forward(np_inputs[0])


-# Creates dynamic dims for all dims.
-# TODO: Pass user specified dynamic dims.
-def create_dynamic_placeholders(inputs):
-    placeholders = []
-    for inp in inputs:
-        placeholder = torch_mlir.TensorPlaceholder.like(
-            inp, dynamic_axes=[i for i in range(len(inp.shape))]
-        )
-        placeholders.append(placeholder)
-    return tuple(placeholders)
+def shark_jit_trace(
+    module, input: tuple, dynamic: bool, tracing_required: bool
+):
+    """Convert `module` to TorchScript, optionally tracing and annotating it.
+
+    Args:
+        module: The module to convert; passed to `torch.jit.script` or
+            `torch.jit.trace_module`.
+        input: Example inputs used to trace `forward` and to derive the
+            argument annotations.
+        dynamic: Forwarded to `get_input_annotations`; only used when
+            `tracing_required` is True.
+        tracing_required: If False, the module is scripted directly and
+            returned without any annotations being applied.
+
+    Returns:
+        A TorchScript module. On the tracing path it is the module reloaded
+        from an in-memory buffer with its annotations re-applied.
+    """
+
+    if not tracing_required:
+        return torch.jit.script(module)
+
+    traced_module = torch.jit.trace_module(module, {"forward": input})
+    # NOTE(review): relies on the private `_actual_script_module` attribute
+    # of the traced module -- may break across PyTorch versions.
+    actual_script = traced_module._actual_script_module
+    # Apply the torch-mlir e2e `export` and `annotate_args` decorators to
+    # `forward` before scripting.
+    export(actual_script.forward)
+    annotate_args_decorator = annotate_args(
+        get_input_annotations(input, dynamic)
+    )
+    annotate_args_decorator(actual_script.forward)
+    module = torch.jit.script(actual_script)
+
+    # TODO: remove saved annotations.pickle
+    # Serialize the scripted module to bytes, storing the pickled
+    # annotations as the extra file "annotations.pkl".
+    torchscript_module_bytes = module.save_to_buffer(
+        {
+            "annotations.pkl": pickle.dumps(
+                extract_serializable_annotations(module)
+            )
+        }
+    )
+    # Within this function `SerializableTest` only carries the bytes;
+    # its `program` field is read back immediately below.
+    serializable_test = SerializableTest(
+        unique_name="", program=torchscript_module_bytes, trace=None
+    )
+    # The "annotations.pkl" entry is a placeholder that `torch.jit.load`
+    # overwrites with that extra file's contents from the archive.
+    _extra_files = {"annotations.pkl": ""}
+    module = torch.jit.load(
+        io.BytesIO(serializable_test.program), _extra_files=_extra_files
+    )
+    # Load the pickled annotations.
+    annotations = pickle.loads(_extra_files["annotations.pkl"])
+    apply_serializable_annotations(module, annotations)
+    return module


 def get_torch_mlir_module(
@@ -56,24 +112,41 @@ def get_torch_mlir_module(
    input: tuple,
    dynamic: bool,
    jit_trace: bool,
+    from_torchscript: bool = False,
 ):
-    """Get the MLIR's linalg-on-tensors module from the torchscipt module."""
-    ignore_traced_shapes = False
-    if dynamic:
-        input = create_dynamic_placeholders(input)
-    if jit_trace:
-        ignore_traced_shapes = True
+    """TODO: Include necessary documentation."""

-    tempfile.tempdir = shark_args.repro_dir
+    # Static modules compile well with the torch_mlir.compile API.
+    # We will always jit_trace = True with the API since we always
+    # want to propagate static shapes.
+    if not dynamic:
+        module = torch_mlir.compile(
+            module,
+            input,
+            output_type=torch_mlir.OutputType.LINALG_ON_TENSORS,
+            use_tracing=jit_trace,
+        )
+        return module

-    mlir_module = torch_mlir.compile(
-        module,
-        input,
-        output_type=torch_mlir.OutputType.LINALG_ON_TENSORS,
-        use_tracing=jit_trace,
-        ignore_traced_shapes=ignore_traced_shapes,
+    # Tracing is not required from the aot_module.
+    if not from_torchscript:
+        module = shark_jit_trace(module, input, dynamic, jit_trace)
+
+    mb = ModuleBuilder()
+    class_annotator = ClassAnnotator()
+    class_annotator.exportNone(module._c._type())
+    class_annotator.exportPath(module._c._type(), ["forward"])
+    class_annotator.annotateArgs(
+        module._c._type(),
+        ["forward"],
+        get_input_annotations(input, dynamic),
    )
-    bytecode_stream = io.BytesIO()
-    mlir_module.operation.write_bytecode(bytecode_stream)
-    bytecode = bytecode_stream.getvalue()
-    return bytecode
+    mb.import_module(module._c, class_annotator)
+
+    with mb.module.context:
+        pm = PassManager.parse(
+            "torchscript-module-to-torch-backend-pipeline,torch-backend-to-linalg-on-tensors-backend-pipeline"
+        )
+        pm.run(mb.module)
+
+    return mb.module
--- a/tank/README.md
+++ b/tank/README.md
@@ -1,223 +0,0 @@
-## Supported and Validated Models
-
-### PyTorch HuggingFace Models
-
-| PyTorch Language Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
-|---------------------|----------------------|----------|----------|-------------|
-| BERT                | :green_heart: (JIT)          | :green_heart:         | :green_heart:         | :green_heart:            |
-| Albert              | :green_heart: (JIT)            | :green_heart:         | :green_heart:         | :green_heart:            |
-| BigBird             | :green_heart: (AOT)            |          |          |             |
-| dbmdz/ConvBERT      | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
-| DistilBERT          | :broken_heart: (JIT)            |          |          |             |
-| GPT2                | :green_heart:            | :green_heart:         |  :green_heart:        | :green_heart:            |
-| MobileBert          | :green_heart: (JIT)            | :green_heart:         | :green_heart:         | :green_heart:            |
-| microsoft/beit      | :green_heart:                  | :green_heart:         | :broken_heart:         | :broken_heart:            |
-| facebook/deit       | :green_heart:          | :green_heart:         | :broken_heart:         | :broken_heart:            |
-| facebook/convnext   | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
-
-### Torchvision  Models
-
-| TORCHVISION Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
-|--------------------|----------------------|----------|----------|-------------|
-| AlexNet            | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
-| MobileNetV2        | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
-| MobileNetV3        | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
-| Unet               | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
-| Resnet18           | :green_heart: (Script)         | :green_heart:         |  :green_heart:        | :green_heart:            |
-| Resnet50           | :green_heart: (Script)         | :green_heart:         |   :green_heart:       | :green_heart:            |
-| Resnet101           | :green_heart: (Script)         | :green_heart:         |   :green_heart:       | :green_heart:            |
-| Resnext50_32x4d    | :green_heart: (Script)         |          |          |             |
-| SqueezeNet         | :green_heart: (Script)         | :green_heart:         |   :broken_heart:       | :broken_heart:            |
-| EfficientNet       | :green_heart: (Script)         |          |          |             |
-| Regnet             | :green_heart: (Script)         |          |          |             |
-| Resnest            | :broken_heart: (Script)         |          |          |             |
-| Vision Transformer | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
-| VGG 16             | :green_heart: (Script)         | :green_heart:         |   :green_heart:       |             |
-| Wide Resnet        | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
-| RAFT               | :broken_heart: (JIT)            |          |          |             |
-
-For more information refer to [MODEL TRACKING SHEET](https://docs.google.com/spreadsheets/d/15PcjKeHZIrB5LfDyuw7DGEEE8XnQEX2aX8lm8qbxV8A/edit#gid=0)
-
-### Tensorflow Models (Inference)
-
-| Hugging Face Models | tf-mhlo lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
-|---------------------|----------------------|----------|----------|-------------|
-| BERT                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
-| MiniLM                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
-| albert-base-v2              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
-| DistilBERT          | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
-| CamemBert                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
-| ConvBert              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
-| Deberta              |            |         |          |             |
-| electra          | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
-| funnel              |            |         |          |             |
-| layoutlm              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
-| longformer              |            |         |          |             |
-| mobile-bert                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
-| rembert              |            |         |          |             |
-| tapas              |            |         |          |             |
-| flaubert                | :broken_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
-| roberta                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
-| xlm-roberta              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
-| mpnet              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
-
-### PyTorch Training Models
-
-| Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
-|---------------------|----------------------|----------|----------|-------------|
-| BERT                | :green_heart:           | :green_heart:         |          |             |
-| FullyConnected                | :green_heart:           | :green_heart:         |          |             |
-
-### JAX  Models
-
-| Models | JAX-MHLO lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
-|---------------------|----------------------|----------|----------|-------------|
-| DALL-E                | :broken_heart:           | :broken_heart:         |          |             |
-| FullyConnected                | :green_heart:           | :green_heart:         |          |             |
-
-<details>
-  <summary>TFLite Models</summary>
-
-### TFLite Models
-
-| Models | TOSA/LinAlg  | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
-|---------------------|----------------------|----------|----------|-------------|
-| BERT                | :broken_heart:           | :broken_heart:         |          |             |
-| FullyConnected      | :green_heart:           | :green_heart:         |          |             |
-| albert | :green_heart:           | :green_heart:         |          |             |
-| asr_conformer | :green_heart:           | :green_heart:         |          |             |
-| bird_classifier | :green_heart:           | :green_heart:         |          |             |
-| cartoon_gan | :green_heart:           | :green_heart:         |          |             |
-| craft_text | :green_heart:           | :green_heart:         |          |             |
-| deeplab_v3 | :green_heart:           | :green_heart:         |          |             |
-| densenet | :green_heart:           | :green_heart:         |          |             |
-| east_text_detector | :green_heart:           | :green_heart:         |          |             |
-| efficientnet_lite0_int8 | :green_heart:           | :green_heart:         |          |             |
-| efficientnet | :green_heart:           | :green_heart:         |          |             |
-| gpt2 | :green_heart:           | :green_heart:         |          |             |
-| image_stylization | :green_heart:           | :green_heart:         |          |             |
-| inception_v4 | :green_heart:           | :green_heart:         |          |             |
-| inception_v4_uint8 | :green_heart:           | :green_heart:         |          |             |
-| lightning_fp16 | :green_heart:           | :green_heart:         |          |             |
-| lightning_i8 | :green_heart:           | :green_heart:         |          |             |
-| lightning | :green_heart:           | :green_heart:         |          |             |
-| magenta | :green_heart:           | :green_heart:         |          |             |
-| midas | :green_heart:           | :green_heart:         |          |             |
-| mirnet | :green_heart:           | :green_heart:         |          |             |
-| mnasnet | :green_heart:           | :green_heart:         |          |             |
-| mobilebert_edgetpu_s_float | :green_heart:           | :green_heart:         |          |             |
-| mobilebert_edgetpu_s_quant | :green_heart:           | :green_heart:         |          |             |
-| mobilebert | :green_heart:           | :green_heart:         |          |             |
-| mobilebert_tf2_float | :green_heart:           | :green_heart:         |          |             |
-| mobilebert_tf2_quant | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_ssd_quant | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_v1 | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_v1_uint8 | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_v2_int8 | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_v2 | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_v2_uint8 | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_v3-large | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_v3-large_uint8 | :green_heart:           | :green_heart:         |          |             |
-| mobilenet_v35-int8 | :green_heart:           | :green_heart:         |          |             |
-| nasnet | :green_heart:           | :green_heart:         |          |             |
-| person_detect | :green_heart:           | :green_heart:         |          |             |
-| posenet | :green_heart:           | :green_heart:         |          |             |
-| resnet_50_int8 | :green_heart:           | :green_heart:         |          |             |
-| rosetta | :green_heart:           | :green_heart:         |          |             |
-| spice | :green_heart:           | :green_heart:         |          |             |
-| squeezenet | :green_heart:           | :green_heart:         |          |             |
-| ssd_mobilenet_v1 | :green_heart:           | :green_heart:         |          |             |
-| ssd_mobilenet_v1_uint8 | :green_heart:           | :green_heart:         |          |             |
-| ssd_mobilenet_v2_fpnlite | :green_heart:           | :green_heart:         |          |             |
-| ssd_mobilenet_v2_fpnlite_uint8 | :green_heart:           | :green_heart:         |          |             |
-| ssd_mobilenet_v2_int8 | :green_heart:           | :green_heart:         |          |             |
-| ssd_mobilenet_v2 | :green_heart:           | :green_heart:         |          |             |
-| ssd_spaghettinet_large | :green_heart:           | :green_heart:         |          |             |
-| ssd_spaghettinet_large_uint8 | :green_heart:           | :green_heart:         |          |             |
-| visual_wake_words_i8 | :green_heart:           | :green_heart:         |          |             |
-
-</details>
-
-## Testing and Benchmarks
-
-### Run all model tests on CPU/GPU/VULKAN/Metal
-
-For a list of models included in our pytest model suite, see https://github.com/nod-ai/SHARK/blob/main/tank/all_models.csv
-
-```shell
-pytest tank/test_models.py
-
-# Models included in the pytest suite can be found listed in all_models.csv.
-
-# If on Linux for multithreading on CPU (faster results):
-pytest tank/test_models.py -n auto
-```
-
-### Running specific tests
-```shell
-
-# Search for test cases by including a keyword that matches all or part of the test case's name;
-pytest tank/test_models.py -k "keyword" 
-
-# Test cases are named uniformly by format test_module_<model_name_underscores_only>_<torch/tf>_<static/dynamic>_<device>.
-
-# Example: Test all models on nvidia gpu:
-pytest tank/test_models.py -k "cuda"
-
-# Example: Test all tensorflow resnet models on Vulkan backend:
-pytest tank/test_models.py -k "resnet and tf and vulkan"
-
-# Exclude a test case:
-pytest tank/test_models.py -k "not ..."
-
-### Run benchmarks on SHARK tank pytests and generate bench_results.csv with results.
-
-(the following requires source installation with `IMPORTER=1 ./setup_venv.sh`)
-
-```shell
-pytest --benchmark tank/test_models.py
-  
-# Just do static GPU benchmarks for PyTorch tests:
-pytest --benchmark tank/test_models.py -k "pytorch and static and cuda"
-
-```
-  
-### Benchmark Resnet50, MiniLM on CPU
-
-(requires source installation with `IMPORTER=1 ./setup_venv.sh`)  
-  
-```shell
-# We suggest running the following commands as root before running benchmarks on CPU:
-  
-cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | awk -F, '{print $2}' | sort -n | uniq | ( while read X ; do echo $X ; echo 0 > /sys/devices/system/cpu/cpu$X/online ; done )
-echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
-
-# Benchmark canonical Resnet50 on CPU via pytest
-pytest --benchmark tank/test_models.py -k "resnet50 and tf_static_cpu"
-
-# Benchmark canonical MiniLM on CPU via pytest
-pytest --benchmark tank/test_models.py -k "MiniLM and cpu"
-
-# Benchmark MiniLM on CPU via transformer-benchmarks:
-git clone --recursive https://github.com/nod-ai/transformer-benchmarks.git
-cd transformer-benchmarks
-./perf-ci.sh -n
-# Check detail.csv for MLIR/IREE results.
-
-```
-
-To run the fine tuning example, from the root SHARK directory, run:
-
-```shell
-IMPORTER=1 ./setup_venv.sh
-source shark.venv/bin/activate
-pip install jupyter tf-models-nightly tf-datasets
-jupyter-notebook
-```
-if running from a google vm, you can view jupyter notebooks on your local system with:
-```shell
-gcloud compute ssh <YOUR_INSTANCE_DETAILS> --ssh-flag="-N -L localhost:8888:localhost:8888"
-```
-
-
-
--- a/tank/albert-base-v2_tf/albert-base-v2_tf_test.py
+++ b/tank/albert-base-v2_tf/albert-base-v2_tf_test.py
@@ -0,0 +1,60 @@
+from shark.iree_utils._common import check_device_drivers, device_driver_info
+from shark.shark_inference import SharkInference
+from shark.shark_downloader import download_tf_model
+
+import iree.compiler as ireec
+import unittest
+import pytest
+import numpy as np
+
+
+class AlbertBaseModuleTester:
+    """Compiles the downloaded albert-base-v2 model with SHARK and checks it."""
+
+    def __init__(self, benchmark=False):
+        # Stored for callers; no method of this class reads it.
+        self.benchmark = benchmark
+
+    def create_and_check_module(self, dynamic, device):
+        """Run the model on `device` and compare against the golden output.
+
+        Args:
+            dynamic: Accepted for signature parity; not used by this method.
+            device: Device name forwarded to `SharkInference`.
+
+        Raises:
+            AssertionError: If the result differs from the golden output
+                beyond rtol=1e-02 / atol=1e-03.
+        """
+        artifacts = download_tf_model("albert-base-v2")
+        tf_model, entry_point, sample_inputs, expected = artifacts
+
+        runner = SharkInference(
+            tf_model, entry_point, device=device, mlir_dialect="mhlo"
+        )
+        runner.compile()
+        actual = runner.forward(sample_inputs)
+        np.testing.assert_allclose(expected, actual, rtol=1e-02, atol=1e-03)
+
+
+class AlbertBaseModuleTest(unittest.TestCase):
+    """Static-shape albert-base-v2 checks on the cpu, gpu and vulkan devices."""
+
+    @pytest.fixture(autouse=True)
+    def configure(self, pytestconfig):
+        """Create the tester from the value of pytest's `benchmark` option."""
+        # Pass the option by keyword so it binds to `benchmark`; a bare
+        # positional argument here would bind whatever is passed (e.g. the
+        # TestCase itself) to that parameter.
+        self.module_tester = AlbertBaseModuleTester(
+            benchmark=pytestconfig.getoption("benchmark")
+        )
+
+    def test_module_static_cpu(self):
+        self.module_tester.create_and_check_module(dynamic=False, device="cpu")
+
+    # Skipped when `check_device_drivers("gpu")` is truthy.
+    @pytest.mark.skipif(
+        check_device_drivers("gpu"), reason=device_driver_info("gpu")
+    )
+    def test_module_static_gpu(self):
+        self.module_tester.create_and_check_module(dynamic=False, device="gpu")
+
+    # Skipped when `check_device_drivers("vulkan")` is truthy.
+    @pytest.mark.skipif(
+        check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
+    )
+    def test_module_static_vulkan(self):
+        self.module_tester.create_and_check_module(
+            dynamic=False, device="vulkan"
+        )
+
+
+# Allow running this file directly with the unittest runner.
+# NOTE(review): `module_tester` is only assigned by the pytest fixture
+# `configure`; under plain unittest it is presumably never set -- confirm.
+if __name__ == "__main__":
+    unittest.main()
--- a/Show More
+++ b/Show More