Update vulkan_utils.py

2026-01-11 23:08:19 -05:00 · 2022-10-11 20:53:41 +05:30
196 changed files with 5682 additions and 17133 deletions
--- a/.flake8
+++ b/.flake8
@@ -1,5 +0,0 @@
-[flake8]
-count = 1
-show-source = 1
-select = E9,F63,F7,F82
-exclude = lit.cfg.py, apps/language_models/scripts/vicuna.py, apps/language_models/src/pipelines/minigpt4_pipeline.py, apps/language_models/langchain/h2oai_pipeline.py
--- a/.github/workflows/gh-pages-releases.yml
+++ b/.github/workflows/gh-pages-releases.yml
@@ -1,37 +0,0 @@
-# See: https://github.com/llvm/torch-mlir/issues/1374
-name: Publish releases page
-
-on:
-  workflow_dispatch:
-
-jobs:
-  scrape_and_publish_releases:
-    name: "Scrape and publish releases"
-    runs-on: ubuntu-latest
-
-    # Don't run this in everyone's forks.
-    if: github.repository == 'nod-ai/SHARK'
-
-    steps:
-      - name: Checking out repository
-        uses: actions/checkout@v2
-        with:
-          token: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      - name: Run scrape releases script
-        run: python ./build_tools/scrape_releases.py nod-ai SHARK > /tmp/index.html
-        shell: bash
-      - run: git fetch --all
-      - run: git switch github-pages
-      - run: git config --global user.email "none@none.com"
-      - run: git config --global user.name "nod-ai"
-      - run: mv /tmp/index.html package-index/index.html
-      - run: git add package-index/index.html
-
-      # Only try to make a commit if the file has changed.
-      - run: git diff --cached --exit-code || git commit -m "Update releases."
-
-      - name: GitHub Push
-        uses: ad-m/github-push-action@v0.6.0
-        with:
-          github_token: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-          branch: github-pages
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -9,79 +9,13 @@ on:
  workflow_dispatch:

 jobs:
-  windows-build:
-    runs-on: 7950X
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.11"]
-
-    steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
-      with:
-        python-version: ${{ matrix.python-version }}
-
-    - name: Compute version
-      shell: powershell
-      run: |
-        $package_version = $(Get-Date -UFormat "%Y%m%d")+"."+${{ github.run_number }}
-        $package_version_ = $(Get-Date -UFormat "%Y%m%d")+"_"+${{ github.run_number }}
-        $tag_name=$package_version
-        echo "package_version=$package_version" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
-        echo "package_version_=$package_version_" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
-        echo "tag_name=$tag_name" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
-
-    - name: Create Release
-      id: create_release
-      uses: actions/create-release@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      with:
-        tag_name: ${{ env.tag_name }}
-        release_name: nod.ai SHARK ${{ env.tag_name }}
-        body: |
-          Automatic snapshot release of nod.ai SHARK.
-        draft: true
-        prerelease: true
-
-    - name: Build Package 
-      shell: powershell
-      run: |
-        ./setup_venv.ps1
-        $env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
-        pip wheel -v -w dist . --pre -f https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html
-        python process_skipfiles.py
-        pyinstaller .\apps\stable_diffusion\shark_sd.spec
-        mv ./dist/nodai_shark_studio.exe ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe
-        signtool sign /f c:\g\shark_02152023.cer /fd certHash /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe
-  
-    - name: Upload Release Assets
-      id: upload-release-assets
-      uses: dwenegar/upload-release-assets@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      with:
-        release_id: ${{ steps.create_release.outputs.id }}
-        assets_path: ./dist/nodai*
-        #asset_content_type: application/vnd.microsoft.portable-executable 
-
-    - name: Publish Release
-      id: publish_release
-      uses: eregon/publish-release@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
-      with:
-        release_id: ${{ steps.create_release.outputs.id }}
-
-  linux-build:
+  build:

    runs-on: a100
    strategy:
      fail-fast: false
      matrix:
-        python-version: ["3.11"]
+        python-version: ["3.10"]
        backend: [IREE, SHARK]

    steps:
@@ -98,13 +32,36 @@ jobs:
        key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
        restore-keys: |
          ${{ runner.os }}-pip-
-
+    
+    - name: Compute version
+      run: |
+        package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
+        tag_name="${package_version}"
+        echo "package_version=${package_version}" >> $GITHUB_ENV
+        echo "tag_name=${tag_name}" >> $GITHUB_ENV    
+    - name: Create Release
+      id: create_release
+      uses: actions/create-release@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
+      with:
+        tag_name: ${{ env.tag_name }}
+        release_name: nod.ai SHARK ${{ env.tag_name }}
+        body: |
+          Automatic snapshot release of nod.ai SHARK.
+        draft: true
+        prerelease: false
+    - name: Find Torch-MLIR Release
+      run: |
+        TM_HTML_URL="$(python3 -c "import urllib.request, json, sys; u=json.loads(urllib.request.urlopen('https://api.github.com/repos/llvm/torch-mlir/releases/latest').read().decode()).get('html_url', False); print(u) if u else sys.exit(1);")"
+        TM_RELEASE_DIR=${TM_HTML_URL/"tag"/"expanded_assets"}
+        echo "TM_RELEASE_DIR=${TM_RELEASE_DIR}" >> $GITHUB_ENV
    - name: Install dependencies
      run: |
-        echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
+        echo "Torch-MLIR Release DIR is ${{ env.TM_RELEASE_DIR }}"
        python -m pip install --upgrade pip
        python -m pip install flake8 pytest toml
-        if [ -f requirements.txt ]; then pip install -r requirements.txt -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html; fi
+        if [ -f requirements.txt ]; then pip install -r requirements.txt -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/nod-ai/SHARK-Runtime/releases; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
@@ -113,26 +70,25 @@ jobs:
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude shark.venv,lit.cfg.py 
    - name: Build and validate the IREE package
      if: ${{ matrix.backend == 'IREE' }}
-      continue-on-error: true
      run: |
        cd $GITHUB_WORKSPACE
        USE_IREE=1 VENV_DIR=iree.venv ./setup_venv.sh
        source iree.venv/bin/activate
        package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
        SHARK_PACKAGE_VERSION=${package_version} \
-        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://openxla.github.io/iree/pip-release-links.html
+        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/iree-org/iree/releases
        # Install the built wheel
        pip install ./wheelhouse/nodai*
        # Validate the Models
        /bin/bash "$GITHUB_WORKSPACE/build_tools/populate_sharktank_ci.sh"
-        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" -k "not metal" |
+        pytest tank/test_models.py |
          tail -n 1 |
          tee -a pytest_results.txt
        if !(grep -Fxq " failed" pytest_results.txt) 
          then 
            export SHA=$(git log -1 --format='%h')
-            gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/${DATE}_$SHA
-            gsutil -m cp -r gs://shark_tank/${DATE}_$SHA/* gs://shark_tank/nightly/
+            gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/$SHA
+            gsutil -m cp -r gs://shark_tank/$SHA/* gs://shark_tank/latest/
        fi
        rm -rf ./wheelhouse/nodai*

@@ -144,10 +100,32 @@ jobs:
        source shark.venv/bin/activate
        package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
        SHARK_PACKAGE_VERSION=${package_version} \
-        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html
+        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/nod-ai/SHARK-Runtime/releases
        # Install the built wheel
        pip install ./wheelhouse/nodai*
        # Validate the Models
-        pytest --ci --ci_sha=${SHORT_SHA} -k "not metal" |
+        pytest tank/test_models.py |
          tail -n 1 |
          tee -a pytest_results.txt
+  publish:
+    runs-on: a100
+    needs: build 
+    steps:
+    - name: Upload Release Assets
+      if: ${{ matrix.backend == 'SHARK' }}
+      id: upload-release-assets
+      uses: dwenegar/upload-release-assets@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
+      with:
+        release_id: ${{ steps.create_release.outputs.id }}
+        assets_path: ${GITHUB_WORKSPACE}/wheelhouse/nodai_*.whl
+
+    - name: Publish Release
+      if: ${{ matrix.backend == 'SHARK' }}
+      id: publish_release
+      uses: eregon/publish-release@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
+      with:
+        release_id: ${{ steps.create_release.outputs.id }}
--- a/.github/workflows/test-models.yml
+++ b/.github/workflows/test-models.yml
@@ -0,0 +1,113 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Validate Models on Shark Runtime
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+  workflow_dispatch:
+
+jobs:
+  build-validate:
+    strategy:
+      fail-fast: true
+      matrix:
+        os: [icelake, a100, MacStudio, ubuntu-latest]
+        suite: [cpu,cuda,vulkan]
+        python-version: ["3.10"]
+        include:
+          - os: ubuntu-latest
+            suite: lint
+        exclude:
+          - os: ubuntu-latest
+            suite: vulkan
+          - os: ubuntu-latest
+            suite: cuda
+          - os: ubuntu-latest
+            suite: cpu
+          - os: MacStudio
+            suite: cuda
+          - os: MacStudio
+            suite: cpu
+          - os: MacStudio
+            suite: vulkan
+          - os: icelake
+            suite: vulkan
+          - os: icelake
+            suite: cuda
+          - os: a100
+            suite: cpu
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+    - uses: actions/checkout@v3
+    
+    - name: Set Environment Variables
+      run: |
+        echo "SHORT_SHA=`git rev-parse --short=4 HEAD`" >> $GITHUB_ENV
+        echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
+        
+    - name: Set up Python Version File ${{ matrix.python-version }}
+      if: matrix.os == 'a100' ||  matrix.os == 'ubuntu-latest' ||  matrix.os == 'icelake'
+      run: |
+        # See https://github.com/actions/setup-python/issues/433
+        echo ${{ matrix.python-version }} >> $GITHUB_WORKSPACE/.python-version
+    
+    - name: Set up Python ${{ matrix.python-version }}
+      if: matrix.os == 'a100' ||  matrix.os == 'ubuntu-latest' ||  matrix.os == 'icelake'
+      uses: actions/setup-python@v4
+      with:
+        python-version: '${{ matrix.python-version }}'
+        #cache: 'pip'
+        #cache-dependency-path: |
+        #  **/requirements-importer.txt
+        #  **/requirements.txt
+          
+    - name: Install dependencies
+      if: matrix.suite == 'lint'
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install flake8 pytest toml black
+        
+    - name: Lint with flake8
+      if: matrix.suite == 'lint'
+      run: |
+        # black format check
+        black --version
+        black --line-length 79 --check .
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude lit.cfg.py
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude lit.cfg.py
+
+    - name: Validate Models on CPU
+      if: matrix.suite == 'cpu'
+      run: |
+        cd $GITHUB_WORKSPACE
+        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
+        source shark.venv/bin/activate
+        pytest --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k cpu
+        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
+        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv
+
+    - name: Validate Models on NVIDIA GPU
+      if: matrix.suite == 'cuda'
+      run: |
+        cd $GITHUB_WORKSPACE
+        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
+        source shark.venv/bin/activate
+        pytest --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k cuda
+        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
+        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
+
+    - name: Validate Vulkan Models
+      if: matrix.suite == 'vulkan'
+      run: |
+        cd $GITHUB_WORKSPACE
+        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
+        source shark.venv/bin/activate
+        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k vulkan
--- a/.github/workflows/test-studio.yml
+++ b/.github/workflows/test-studio.yml
@@ -1,86 +0,0 @@
-# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
-
-name: Validate Shark Studio
-
-on:
-  push:
-    branches: [ main ]
-    paths-ignore:
-      - '**.md'
-      - 'shark/examples/**'
-  pull_request:
-    branches: [ main ]
-    paths-ignore:
-      - '**.md'
-      - 'shark/examples/**'
-  workflow_dispatch:
-
-# Ensure that only a single job or workflow using the same
-# concurrency group will run at a time. This would cancel
-# any in-progress jobs in the same github workflow and github
-# ref (e.g. refs/heads/main or refs/pull/<pr_number>/merge).
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  build-validate:
-    strategy:
-      fail-fast: true
-      matrix:
-        os: [nodai-ubuntu-builder-large]
-        suite: [cpu] #,cuda,vulkan]
-        python-version: ["3.11"]
-        include:
-          - os: nodai-ubuntu-builder-large
-            suite: lint
-
-    runs-on: ${{ matrix.os }}
-
-    steps:
-    - uses: actions/checkout@v3
-    
-    - name: Set Environment Variables
-      run: |
-        echo "SHORT_SHA=`git rev-parse --short=4 HEAD`" >> $GITHUB_ENV
-        echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
-        
-    - name: Set up Python Version File ${{ matrix.python-version }}
-      run: |
-        echo ${{ matrix.python-version }} >> $GITHUB_WORKSPACE/.python-version
-    
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
-      with:
-        python-version: '${{ matrix.python-version }}'
-          
-    - name: Install dependencies
-      if: matrix.suite == 'lint'
-      run: |
-        python -m pip install --upgrade pip
-        python -m pip install flake8 pytest toml black
-        
-    - name: Lint with flake8
-      if: matrix.suite == 'lint'
-      run: |
-        # black format check
-        black --version
-        black --check apps/shark_studio 
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --isolated --count --exit-zero --max-complexity=10 --max-line-length=127 \
-          --statistics --exclude lit.cfg.py
-
-    - name: Validate Models on CPU
-      if: matrix.suite == 'cpu'
-      run: |
-        cd $GITHUB_WORKSPACE
-        python${{ matrix.python-version }} -m venv shark.venv
-        source shark.venv/bin/activate
-        pip install -r requirements.txt --no-cache-dir
-        pip install -e .
-        pip uninstall -y torch
-        pip install torch==2.1.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
-        python apps/shark_studio/tests/api_test.py
--- a/.gitignore
+++ b/.gitignore
@@ -2,8 +2,6 @@
 __pycache__/
 *.py[cod]
 *$py.class
-*.mlir
-*.vmfb

 # C extensions
 *.so
@@ -33,6 +31,7 @@ MANIFEST
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
+*.spec

 # Installer logs
 pip-log.txt
@@ -159,46 +158,12 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-.idea/
-
-# vscode related
-.vscode
+#.idea/

 # Shark related artefacts
 *venv/
 shark_tmp/
-*.vmfb
-.use-iree
-tank/dict_configs.py
-*.csv
-reproducers/

 # ORT related artefacts
 cache_models/
 onnx_models/
-
-# Generated images
-generated_imgs/
-
-# Custom model related artefacts
-variants.json
-/models/
-
-# models folder
-apps/stable_diffusion/web/models/
-
-# Stencil annotators.
-stencil_annotator/
-
-# For DocuChat
-apps/language_models/langchain/user_path/
-db_dir_UserData
-
-# Embeded browser cache and other
-apps/stable_diffusion/web/EBWebView/
-
-# Llama2 tokenizer configs
-llama2_tokenizer_configs/
-
-# Webview2 runtime artefacts
-EBWebView/
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,4 +1,4 @@
 [submodule "inference/thirdparty/shark-runtime"]
 	path = inference/thirdparty/shark-runtime
-	url =https://github.com/nod-ai/SRT.git
+	url =https://github.com/nod-ai/SHARK-Runtime.git
 	branch = shark-06032022
--- a/.style.yapf
+++ b/.style.yapf
@@ -0,0 +1,3 @@
+[style]
+  based_on_style = google
+  column_limit = 80
--- a/README.md
+++ b/README.md
@@ -1,161 +1,29 @@
 # SHARK

-High Performance Machine Learning Distribution
+High Performance Machine Learning and Data Analytics for CPUs, GPUs, Accelerators and Heterogeneous Clusters

 [![Nightly Release](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml)
 [![Validate torch-models on Shark Runtime](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml)

+## Communication Channels
+
+*   [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real-time discussions with the SHARK team and other users
+*   [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bug reports, etc.
+
+
+## Installation

 <details>
-  <summary>Prerequisites - Drivers </summary>
-  
-#### Install your Windows hardware drivers
-* [AMD RDNA Users] Download the latest driver (23.2.1 is the oldest supported) [here](https://www.amd.com/en/support).
-* [macOS Users] Download and install the 1.3.216 Vulkan SDK from [here](https://sdk.lunarg.com/sdk/download/1.3.216.0/mac/vulkansdk-macos-1.3.216.0.dmg). Newer versions of the SDK will not work. 
-* [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)
-  
-#### Linux Drivers
-* MESA / RADV drivers wont work with FP16. Please use the latest AMGPU-PRO drivers (non-pro OSS drivers also wont work) or the latest NVidia Linux Drivers.
-
-Other users please ensure you have your latest vendor drivers and Vulkan SDK from [here](https://vulkan.lunarg.com/sdk/home) and if you are using vulkan check `vulkaninfo` works in a terminal window
-
-</details>
-
-
- 
-### Quick Start for SHARK Stable Diffusion for Windows 10/11 Users
-
-Install the Driver from [Prerequisites](https://github.com/nod-ai/SHARK#install-your-hardware-drivers) above 
-
-Download the [stable release](https://github.com/nod-ai/shark/releases/latest)
-
-Double click the .exe and you should have the [UI](http://localhost:8080/) in the browser. 
-
-If you have custom models put them in a `models/` directory where the .exe is. 
-
-Enjoy. 
-
-<details>
-  <summary>More installation notes</summary>
-* We recommend that you download EXE in a new folder, whenever you download a new EXE version. If you download it in the same folder as a previous install, you must delete the old `*.vmfb` files with `rm *.vmfb`. You can also use `--clear_all` flag once to clean all the old files. 
-* If you recently updated the driver or this binary (EXE file), we recommend you clear all the local artifacts with `--clear_all` 
-
-## Running
-
-* Open a Command Prompt or Powershell terminal, change folder (`cd`) to the .exe folder. Then run the EXE from the command prompt. That way, if an error occurs, you'll be able to cut-and-paste it to ask for help. (if it always works for you without error, you may simply double-click the EXE)
-* The first run may take few minutes when the models are downloaded and compiled. Your patience is appreciated. The download could be about 5GB.
-* You will likely see a Windows Defender message asking you to give permission to open a web server port. Accept it.
-* Open a browser to access the Stable Diffusion web server. By default, the port is 8080, so you can go to http://localhost:8080/.
-
-## Stopping
-
-* Select the command prompt that's running the EXE. Press CTRL-C and wait a moment or close the terminal. 
-</details>
-
-<details>
-  <summary>Advanced Installation (Only for developers)</summary>
-  
-## Advanced Installation (Windows, Linux and macOS) for developers
-
-## Check out the code
-
-```shell
-git clone https://github.com/nod-ai/SHARK.git
-cd SHARK
-```
-
-## Setup your Python VirtualEnvironment and Dependencies
-
-### Windows 10/11 Users
-
-* Install the latest Python 3.11.x version from [here](https://www.python.org/downloads/windows/)
-
-* Install Git for Windows from [here](https://git-scm.com/download/win)
-
-#### Allow the install script to run in Powershell
-```powershell
-set-executionpolicy remotesigned
-```
-
-#### Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...)
-```powershell
-./setup_venv.ps1 #You can re-run this script to get the latest version
-```
-
-### Linux / macOS Users
-
-```shell
-./setup_venv.sh
-source shark.venv/bin/activate
-```
-
-
-### Run Stable Diffusion on your device - WebUI
-
-#### Windows 10/11 Users
-```powershell
-(shark.venv) PS C:\g\shark> cd .\apps\stable_diffusion\web\
-(shark.venv) PS C:\g\shark\apps\stable_diffusion\web> python .\index.py
-```
-#### Linux / macOS Users
-```shell
-(shark.venv) > cd apps/stable_diffusion/web
-(shark.venv) > python index.py
-```
-
-#### Access Stable Diffusion on http://localhost:8080/?__theme=dark
-
-
-<img width="1607" alt="webui" src="https://user-images.githubusercontent.com/74956/204939260-b8308bc2-8dc4-47f6-9ac0-f60b66edab99.png">
-
-
-
-### Run Stable Diffusion on your device - Commandline
-
-#### Windows 10/11 Users
-```powershell
-(shark.venv) PS C:\g\shark> python .\apps\stable_diffusion\scripts\main.py --app="txt2img" --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
-```
-
-#### Linux / macOS Users
-```shell
-python3.11 apps/stable_diffusion/scripts/main.py --app=txt2img --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
-```
-
-You can replace `vulkan` with `cpu` to run on your CPU or with `cuda` to run on CUDA devices. If you have multiple vulkan devices you can address them with `--device=vulkan://1` etc
-</details>
-
-The output on a AMD 7900XTX would look something like:
-
-```shell
-Average step time: 47.19188690185547ms/it
-Clip Inference time (ms) = 109.531
-VAE Inference time (ms): 78.590
-
-Total image generation time: 2.5788655281066895sec
-```
-
-Here are some samples generated:
-
-![tajmahal, snow, sunflowers, oil on canvas_0](https://user-images.githubusercontent.com/74956/204934186-141f7e43-6eb2-4e89-a99c-4704d20444b3.jpg)
-
-![a photo of a crab playing a trumpet](https://user-images.githubusercontent.com/74956/204933258-252e7240-8548-45f7-8253-97647d38313d.jpg)
-
-
-Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware. 
-
-
-<details>
-  <summary>Binary Installation</summary>
+  <summary>Installation (Linux and macOS)</summary>

 ### Setup a new pip Virtual Environment

 This step sets up a new VirtualEnv for Python

 ```shell
-python --version #Check you have 3.11 on Linux, macOS or Windows Powershell
+python --version # Check that you have Python 3.7 to 3.10 on Linux, or 3.10 on macOS
 python -m venv shark_venv
-source shark_venv/bin/activate   # Use shark_venv/Scripts/activate on Windows
+source shark_venv/bin/activate

 # If you are using conda create and activate a new conda env

@@ -167,17 +35,12 @@ python -m pip install --upgrade pip

 ### Install SHARK

-This step pip installs SHARK and related packages on Linux Python 3.8, 3.10 and 3.11 and macOS / Windows Python 3.11
+This step pip installs SHARK and related packages on Linux Python 3.7, 3.8, 3.9, 3.10 and macOS Python 3.10

 ```shell
-pip install nodai-shark -f https://nod-ai.github.io/SHARK/package-index/ -f https://llvm.github.io/torch-mlir/package-index/ -f  https://nod-ai.github.io/SRT/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+pip install nodai-shark -f https://github.com/nod-ai/SHARK/releases -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/shark-runtime/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
 ```
-
-### Run shark tank model tests.
-```shell
-pytest tank/test_models.py
-```
-See tank/README.md for a more detailed walkthrough of our pytest suite and CLI.
+If you are on an Intel macOS machine, you need this [workaround](https://github.com/nod-ai/SHARK/issues/102) for an upstream issue.

 ### Download and run Resnet50 sample

@@ -198,31 +61,33 @@ python ./minilm_jit.py --device="cpu"  #use cuda or vulkan or metal
 </details>


-
 <details>
-  <summary>Development, Testing and Benchmarks</summary>
+  <summary>Source Installation</summary>

-If you want to use Python3.11 and with TF Import tools you can use the environment variables like:
-Set `USE_IREE=1` to use upstream IREE
-```
-# PYTHON=python3.11 VENV_DIR=0617_venv IMPORTER=1 ./setup_venv.sh 
-```
+## Check out the code

-### Run any of the hundreds of SHARK tank models via the test framework
 ```shell
-python -m  shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
-# Or a pytest
-pytest tank/test_models.py -k "MiniLM"
+git clone https://github.com/nod-ai/SHARK.git
 ```
-  
-### How to use your locally built IREE / Torch-MLIR with SHARK
+
+## Setup your Python VirtualEnvironment and Dependencies
+```shell
+# Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...).
+./setup_venv.sh
+source shark.venv/bin/activate
+```
+For example, if you want to use Python 3.10 and upstream IREE with the TF import tools, you can set environment variables like this:
+```
+# PYTHON=python3.10 VENV_DIR=0617_venv IMPORTER=1 USE_IREE=1 ./setup_venv.sh 
+```
+
 If you are a *Torch-mlir developer or an IREE developer* and want to test local changes you can uninstall
 the provided packages with `pip uninstall torch-mlir` and / or `pip uninstall iree-compiler iree-runtime` and build locally
-with Python bindings and set your PYTHONPATH as mentioned [here](https://github.com/iree-org/iree/tree/main/docs/api_docs/python#install-iree-binaries)
+with Python bindings and set your PYTHONPATH as mentioned [here](https://google.github.io/iree/bindings/python/)
 for IREE and [here](https://github.com/llvm/torch-mlir/blob/main/development.md#setup-python-environment-to-export-the-built-python-packages)
 for Torch-MLIR.

-How to use your locally built Torch-MLIR with SHARK:
+### How to use your locally built Torch-MLIR with SHARK
 ```shell
 1.) Run `./setup_venv.sh in SHARK` and activate `shark.venv` virtual env.
 2.) Run `pip uninstall torch-mlir`.
@@ -237,44 +102,82 @@ How to use your locally built Torch-MLIR with SHARK:
 ```
 Now the SHARK will use your locally build Torch-MLIR repo.

-
-## Benchmarking Dispatches
-
-To produce benchmarks of individual dispatches, you can add `--dispatch_benchmarks=All --dispatch_benchmarks_dir=<output_dir>` to your pytest command line argument.  
-If you only want to compile specific dispatches, you can specify them with a space seperated string instead of `"All"`.  E.G. `--dispatch_benchmarks="0 1 2 10"`
-
-For example, to generate and run dispatch benchmarks for MiniLM on CUDA:
+### Run a demo script
+```shell
+python -m  shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
+# Or a pytest
+pytest tank/test_models.py -k "MiniLM"
 ```
-pytest -k "MiniLM and torch and static and cuda" --benchmark_dispatches=All -s --dispatch_benchmarks_dir=./my_dispatch_benchmarks                                                                                
-```
-The given command will populate `<dispatch_benchmarks_dir>/<model_name>/` with an `ordered_dispatches.txt` that lists and orders the dispatches and their latencies, as well as folders for each dispatch that contain .mlir, .vmfb, and results of the benchmark for that dispatch.
-
-if you want to instead incorporate this into a python script, you can pass the `dispatch_benchmarks` and `dispatch_benchmarks_dir` commands when initializing `SharkInference`, and the benchmarks will be generated when compiled.  E.G:
-
-```
-shark_module = SharkInference(
-        mlir_model,
-        device=args.device,
-        mlir_dialect="tm_tensor",
-        dispatch_benchmarks="all",
-        dispatch_benchmarks_dir="results"
-    )
-```
-
-Output will include:
- An ordered list ordered-dispatches.txt of all the dispatches with their runtime
- Inside the specified directory, there will be a directory for each dispatch (there will be mlir files for all dispatches, but only compiled binaries and benchmark data for the specified dispatches)
- An .mlir file containing the dispatch benchmark 
- A compiled .vmfb file containing the dispatch benchmark
- An .mlir file containing just the hal executable
- A compiled .vmfb file of the hal executable
- A .txt file containing benchmark output
-
-
-See tank/README.md for further instructions on how to run model tests and benchmarks from the SHARK tank.

 </details>

+<details>
+  <summary>Testing and Benchmarks</summary>
+
+### Run all model tests on CPU/GPU/VULKAN/Metal
+```shell
+pytest tank/test_models.py
+
+# If on Linux for multithreading on CPU (faster results):
+pytest tank/test_models.py -n auto
+```
+
+### Running specific tests
+```shell
+
+# Search for test cases by including a keyword that matches all or part of the test case's name;
+pytest tank/test_models.py -k "keyword" 
+
+# Test cases are named uniformly by format test_module_<model_name_underscores_only>_<torch/tf>_<static/dynamic>_<device>.
+
+# Example: Test all models on nvidia gpu:
+pytest tank/test_models.py -k "cuda"
+
+# Example: Test all tensorflow resnet models on Vulkan backend:
+pytest tank/test_models.py -k "resnet and tf and vulkan"
+
+# Exclude a test case:
+pytest tank/test_models.py -k "not ..."
+```
+### Run benchmarks on SHARK tank pytests and generate bench_results.csv with results.
+
+(the following requires source installation with `IMPORTER=1 ./setup_venv.sh`)
+
+```shell
+pytest --benchmark tank/test_models.py
+  
+# Just do static GPU benchmarks for PyTorch tests:
+pytest --benchmark tank/test_models.py -k "pytorch and static and cuda"
+
+```
+  
+### Benchmark Resnet50, MiniLM on CPU
+
+(requires source installation with `IMPORTER=1 ./setup_venv.sh`)  
+  
+```shell
+# We suggest running the following commands as root before running benchmarks on CPU:
+  
+cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | awk -F, '{print $2}' | sort -n | uniq | ( while read X ; do echo $X ; echo 0 > /sys/devices/system/cpu/cpu$X/online ; done )
+echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
+
+# Benchmark canonical Resnet50 on CPU via pytest
+pytest --benchmark tank/test_models.py -k "resnet50 and tf_static_cpu"
+
+# Benchmark canonical MiniLM on CPU via pytest
+pytest --benchmark tank/test_models.py -k "MiniLM and cpu"
+
+# Benchmark MiniLM on CPU via transformer-benchmarks:
+git clone --recursive https://github.com/nod-ai/transformer-benchmarks.git
+cd transformer-benchmarks
+./perf-ci.sh -n
+# Check detail.csv for MLIR/IREE results.
+
+```
+
+</details>
+
+
 <details>
  <summary>API Reference</summary>

@@ -296,7 +199,7 @@ torch_mlir, func_name = mlir_importer.import_mlir(tracing_required=True)
 # SharkInference accepts mlir in linalg, mhlo, and tosa dialect.

 from shark.shark_inference import SharkInference
-shark_module = SharkInference(torch_mlir, device="cpu", mlir_dialect="linalg")
+shark_module = SharkInference(torch_mlir, func_name, device="cpu", mlir_dialect="linalg")
 shark_module.compile()
 result = shark_module.forward((input))

@@ -319,37 +222,166 @@ mhlo_ir = r"""builtin.module  {

 arg0 = np.ones((1, 4)).astype(np.float32)
 arg1 = np.ones((4, 1)).astype(np.float32)
-shark_module = SharkInference(mhlo_ir, device="cpu", mlir_dialect="mhlo")
+shark_module = SharkInference(mhlo_ir, func_name="forward", device="cpu", mlir_dialect="mhlo")
 shark_module.compile()
 result = shark_module.forward((arg0, arg1))
 ```
 </details>

-## Examples Using the REST API
-
-* [Setting up SHARK for use with Blender](./docs/shark_sd_blender.md)
-* [Setting up SHARK for use with Koboldcpp](./docs/shark_sd_koboldcpp.md)

 ## Supported and Validated Models

-SHARK is maintained to support the latest innovations in ML Models: 
+<details>
+  <summary>PyTorch Models</summary>

-| TF HuggingFace Models | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
-|---------------------|----------|----------|-------------|
-| BERT                | :green_heart:         | :green_heart:         | :green_heart:            |
-| DistilBERT         | :green_heart:         | :green_heart:         | :green_heart:            |
-| GPT2         | :green_heart:         | :green_heart:         | :green_heart:            |
-| BLOOM         | :green_heart:         | :green_heart:         | :green_heart:            |
-| Stable Diffusion         | :green_heart:         | :green_heart:         | :green_heart:            |
-| Vision Transformer       | :green_heart:         | :green_heart:         | :green_heart:            |
-| ResNet50         | :green_heart:         | :green_heart:         | :green_heart:            |
+### Hugging Face PyTorch Models

-For a complete list of the models supported in SHARK, please refer to [tank/README.md](https://github.com/nod-ai/SHARK/blob/main/tank/README.md).
+| Hugging Face Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
+|---------------------|----------------------|----------|----------|-------------|
+| BERT                | :green_heart: (JIT)          | :green_heart:         | :green_heart:         | :green_heart:            |
+| Albert              | :green_heart: (JIT)            | :green_heart:         | :green_heart:         | :green_heart:            |
+| BigBird             | :green_heart: (AOT)            |          |          |             |
+| DistilBERT          | :green_heart: (JIT)            | :green_heart:         | :green_heart:         | :green_heart:            |
+| GPT2                | :broken_heart: (AOT)            |          |          |             |
+| MobileBert          | :green_heart: (JIT)            | :green_heart:         | :green_heart:         | :green_heart:            |

-## Communication Channels
+### Torchvision  Models

-*   [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real time discussions with the SHARK team and other users
-*   [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bugs etc
+| TORCHVISION Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
+|--------------------|----------------------|----------|----------|-------------|
+| AlexNet            | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
+| DenseNet121        | :green_heart: (Script)         |          |          |             |
+| MNasNet1_0         | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
+| MobileNetV2        | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
+| MobileNetV3        | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
+| Unet               | :broken_heart: (Script)         |          |          |             |
+| Resnet18           | :green_heart: (Script)         | :green_heart:         |  :green_heart:        | :green_heart:            |
+| Resnet50           | :green_heart: (Script)         | :green_heart:         |   :green_heart:       | :green_heart:            |
+| Resnet101           | :green_heart: (Script)         | :green_heart:         |   :green_heart:       | :green_heart:            |
+| Resnext50_32x4d    | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
+| ShuffleNet_v2      | :broken_heart: (Script)         |          |          |             |
+| SqueezeNet         | :green_heart: (Script)         | :green_heart:         |   :green_heart:       | :green_heart:            |
+| EfficientNet       | :green_heart: (Script)         |          |          |             |
+| Regnet             | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
+| Resnest            | :broken_heart: (Script)         |          |          |             |
+| Vision Transformer | :green_heart: (Script)         |          |          |             |
+| VGG 16             | :green_heart: (Script)         | :green_heart:         |   :green_heart:       |             |
+| Wide Resnet        | :green_heart: (Script)         | :green_heart:         | :green_heart:         | :green_heart:            |
+| RAFT               | :broken_heart: (JIT)            |          |          |             |
+
+For more information refer to [MODEL TRACKING SHEET](https://docs.google.com/spreadsheets/d/15PcjKeHZIrB5LfDyuw7DGEEE8XnQEX2aX8lm8qbxV8A/edit#gid=0)
+
+### PyTorch Training Models
+
+| Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
+|---------------------|----------------------|----------|----------|-------------|
+| BERT                | :broken_heart:           | :broken_heart:         |          |             |
+| FullyConnected                | :green_heart:           | :green_heart:         |          |             |
+
+</details>
+
+<details>
+  <summary>JAX Models</summary>
+
+
+### JAX  Models
+
+| Models | JAX-MHLO lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
+|---------------------|----------------------|----------|----------|-------------|
+| DALL-E                | :broken_heart:           | :broken_heart:         |          |             |
+| FullyConnected                | :green_heart:           | :green_heart:         |          |             |
+
+</details>
+
+<details>
+  <summary>TFLite Models</summary>
+
+### TFLite Models
+
+| Models | TOSA/LinAlg  | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
+|---------------------|----------------------|----------|----------|-------------|
+| BERT                | :broken_heart:           | :broken_heart:         |          |             |
+| FullyConnected      | :green_heart:           | :green_heart:         |          |             |
+| albert | :green_heart:           | :green_heart:         |          |             |
+| asr_conformer | :green_heart:           | :green_heart:         |          |             |
+| bird_classifier | :green_heart:           | :green_heart:         |          |             |
+| cartoon_gan | :green_heart:           | :green_heart:         |          |             |
+| craft_text | :green_heart:           | :green_heart:         |          |             |
+| deeplab_v3 | :green_heart:           | :green_heart:         |          |             |
+| densenet | :green_heart:           | :green_heart:         |          |             |
+| east_text_detector | :green_heart:           | :green_heart:         |          |             |
+| efficientnet_lite0_int8 | :green_heart:           | :green_heart:         |          |             |
+| efficientnet | :green_heart:           | :green_heart:         |          |             |
+| gpt2 | :green_heart:           | :green_heart:         |          |             |
+| image_stylization | :green_heart:           | :green_heart:         |          |             |
+| inception_v4 | :green_heart:           | :green_heart:         |          |             |
+| inception_v4_uint8 | :green_heart:           | :green_heart:         |          |             |
+| lightning_fp16 | :green_heart:           | :green_heart:         |          |             |
+| lightning_i8 | :green_heart:           | :green_heart:         |          |             |
+| lightning | :green_heart:           | :green_heart:         |          |             |
+| magenta | :green_heart:           | :green_heart:         |          |             |
+| midas | :green_heart:           | :green_heart:         |          |             |
+| mirnet | :green_heart:           | :green_heart:         |          |             |
+| mnasnet | :green_heart:           | :green_heart:         |          |             |
+| mobilebert_edgetpu_s_float | :green_heart:           | :green_heart:         |          |             |
+| mobilebert_edgetpu_s_quant | :green_heart:           | :green_heart:         |          |             |
+| mobilebert | :green_heart:           | :green_heart:         |          |             |
+| mobilebert_tf2_float | :green_heart:           | :green_heart:         |          |             |
+| mobilebert_tf2_quant | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_ssd_quant | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_v1 | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_v1_uint8 | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_v2_int8 | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_v2 | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_v2_uint8 | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_v3-large | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_v3-large_uint8 | :green_heart:           | :green_heart:         |          |             |
+| mobilenet_v35-int8 | :green_heart:           | :green_heart:         |          |             |
+| nasnet | :green_heart:           | :green_heart:         |          |             |
+| person_detect | :green_heart:           | :green_heart:         |          |             |
+| posenet | :green_heart:           | :green_heart:         |          |             |
+| resnet_50_int8 | :green_heart:           | :green_heart:         |          |             |
+| rosetta | :green_heart:           | :green_heart:         |          |             |
+| spice | :green_heart:           | :green_heart:         |          |             |
+| squeezenet | :green_heart:           | :green_heart:         |          |             |
+| ssd_mobilenet_v1 | :green_heart:           | :green_heart:         |          |             |
+| ssd_mobilenet_v1_uint8 | :green_heart:           | :green_heart:         |          |             |
+| ssd_mobilenet_v2_fpnlite | :green_heart:           | :green_heart:         |          |             |
+| ssd_mobilenet_v2_fpnlite_uint8 | :green_heart:           | :green_heart:         |          |             |
+| ssd_mobilenet_v2_int8 | :green_heart:           | :green_heart:         |          |             |
+| ssd_mobilenet_v2 | :green_heart:           | :green_heart:         |          |             |
+| ssd_spaghettinet_large | :green_heart:           | :green_heart:         |          |             |
+| ssd_spaghettinet_large_uint8 | :green_heart:           | :green_heart:         |          |             |
+| visual_wake_words_i8 | :green_heart:           | :green_heart:         |          |             |
+
+</details>
+
+<details>
+  <summary>TF Models</summary>
+
+### Tensorflow Models (Inference)
+
+| Hugging Face Models | tf-mhlo lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
+|---------------------|----------------------|----------|----------|-------------|
+| BERT                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
+| albert-base-v2              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
+| DistilBERT          | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
+| CamemBert                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
+| ConvBert              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
+| Deberta              |            |         |          |             |
+| electra          | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
+| funnel              |            |         |          |             |
+| layoutlm              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
+| longformer              |            |         |          |             |
+| mobile-bert                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
+| rembert                |            |         |          |             |
+| tapas              |            |         |          |             |
+| flaubert                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
+| roberta                | :green_heart:          | :green_heart:         | :green_heart:         | :green_heart:            |
+| xlm-roberta              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
+| mpnet              | :green_heart:            | :green_heart:         | :green_heart:         | :green_heart:            |
+
+</details>

 ## Related Projects

--- a/apps/shark_studio/api/llm.py
+++ b/apps/shark_studio/api/llm.py
@@ -1,179 +0,0 @@
-from turbine_models.custom_models import stateless_llama
-import time
-from shark.iree_utils.compile_utils import (
-    get_iree_compiled_module,
-    load_vmfb_using_mmap,
-)
-from apps.shark_studio.api.utils import get_resource_path
-import iree.runtime as ireert
-from itertools import chain
-import gc
-import os
-import torch
-from transformers import AutoTokenizer
-
-llm_model_map = {
-    "llama2_7b": {
-        "initializer": stateless_llama.export_transformer_model,
-        "hf_model_name": "meta-llama/Llama-2-7b-chat-hf",
-        "stop_token": 2,
-        "max_tokens": 4096,
-        "system_prompt": """<s>[INST] <<SYS>>Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>>""",
-    },
-    "Trelis/Llama-2-7b-chat-hf-function-calling-v2": {
-        "initializer": stateless_llama.export_transformer_model,
-        "hf_model_name": "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
-        "stop_token": 2,
-        "max_tokens": 4096,
-        "system_prompt": """<s>[INST] <<SYS>>Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>>""",
-    },
-}
-
-
-class LanguageModel:
-    def __init__(
-        self,
-        model_name,
-        hf_auth_token=None,
-        device=None,
-        precision="fp32",
-        external_weights=None,
-        use_system_prompt=True,
-    ):
-        print(llm_model_map[model_name])
-        self.hf_model_name = llm_model_map[model_name]["hf_model_name"]
-        self.tempfile_name = get_resource_path("llm.torch.tempfile")
-        self.vmfb_name = get_resource_path("llm.vmfb.tempfile")
-        self.device = device
-        self.precision = precision
-        self.safe_name = self.hf_model_name.strip("/").replace("/", "_")
-        self.max_tokens = llm_model_map[model_name]["max_tokens"]
-        self.iree_module_dict = None
-        self.external_weight_file = None
-        if external_weights is not None:
-            self.external_weight_file = get_resource_path(
-                self.safe_name + "." + external_weights
-            )
-        self.use_system_prompt = use_system_prompt
-        self.global_iter = 0
-        if os.path.exists(self.vmfb_name) and (
-            external_weights is None or os.path.exists(str(self.external_weight_file))
-        ):
-            self.iree_module_dict = dict()
-            (
-                self.iree_module_dict["vmfb"],
-                self.iree_module_dict["config"],
-                self.iree_module_dict["temp_file_to_unlink"],
-            ) = load_vmfb_using_mmap(
-                self.vmfb_name,
-                device,
-                device_idx=0,
-                rt_flags=[],
-                external_weight_file=self.external_weight_file,
-            )
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                self.hf_model_name,
-                use_fast=False,
-                use_auth_token=hf_auth_token,
-            )
-        elif not os.path.exists(self.tempfile_name):
-            self.torch_ir, self.tokenizer = llm_model_map[model_name]["initializer"](
-                self.hf_model_name,
-                hf_auth_token,
-                compile_to="torch",
-                external_weights=external_weights,
-                external_weight_file=self.external_weight_file,
-            )
-            with open(self.tempfile_name, "w+") as f:
-                f.write(self.torch_ir)
-            del self.torch_ir
-            gc.collect()
-            self.compile()
-        else:
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                self.hf_model_name,
-                use_fast=False,
-                use_auth_token=hf_auth_token,
-            )
-            self.compile()
-
-    def compile(self) -> None:
-        # this comes with keys: "vmfb", "config", and "temp_file_to_unlink".
-        self.iree_module_dict = get_iree_compiled_module(
-            self.tempfile_name,
-            device=self.device,
-            mmap=True,
-            frontend="torch",
-            external_weight_file=self.external_weight_file,
-            write_to=self.vmfb_name,
-            extra_args=["--iree-global-opt-enable-quantized-matmul-reassociation"],
-        )
-        # TODO: delete the temp file
-
-    def sanitize_prompt(self, prompt):
-        print(prompt)
-        if isinstance(prompt, list):
-            prompt = list(chain.from_iterable(prompt))
-            prompt = " ".join([x for x in prompt if isinstance(x, str)])
-        prompt = prompt.replace("\n", " ")
-        prompt = prompt.replace("\t", " ")
-        prompt = prompt.replace("\r", " ")
-        if self.use_system_prompt and self.global_iter == 0:
-            prompt = llm_model_map["llama2_7b"]["system_prompt"] + prompt
-        prompt += " [/INST]"
-        print(prompt)
-        return prompt
-
-    def chat(self, prompt):
-        prompt = self.sanitize_prompt(prompt)
-
-        input_tensor = self.tokenizer(prompt, return_tensors="pt").input_ids
-
-        def format_out(results):
-            return torch.tensor(results.to_host()[0][0])
-
-        history = []
-        for iter in range(self.max_tokens):
-            st_time = time.time()
-            if iter == 0:
-                device_inputs = [
-                    ireert.asdevicearray(
-                        self.iree_module_dict["config"].device, input_tensor
-                    )
-                ]
-                token = self.iree_module_dict["vmfb"]["run_initialize"](*device_inputs)
-            else:
-                device_inputs = [
-                    ireert.asdevicearray(
-                        self.iree_module_dict["config"].device,
-                        token,
-                    )
-                ]
-                token = self.iree_module_dict["vmfb"]["run_forward"](*device_inputs)
-
-            total_time = time.time() - st_time
-            history.append(format_out(token))
-            yield self.tokenizer.decode(history), total_time
-
-            if format_out(token) == llm_model_map["llama2_7b"]["stop_token"]:
-                break
-
-        for i in range(len(history)):
-            if type(history[i]) != int:
-                history[i] = int(history[i])
-        result_output = self.tokenizer.decode(history)
-        self.global_iter += 1
-        return result_output, total_time
-
-
-if __name__ == "__main__":
-    lm = LanguageModel(
-        "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
-        hf_auth_token=None,
-        device="cpu-task",
-        external_weights="safetensors",
-    )
-
-    print("model loaded")
-    for i in lm.chat("hi, what are you?"):
-        print(i)
--- a/apps/shark_studio/api/utils.py
+++ b/apps/shark_studio/api/utils.py
@@ -1,12 +0,0 @@
-import os
-import sys
-
-
-def get_available_devices():
-    return ["cpu-task"]
-
-
-def get_resource_path(relative_path):
-    """Get absolute path to resource, works for dev and for PyInstaller"""
-    base_path = getattr(sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__)))
-    return os.path.join(base_path, relative_path)
--- a/apps/shark_studio/tests/api_test.py
+++ b/apps/shark_studio/tests/api_test.py
@@ -1,34 +0,0 @@
-# Copyright 2023 Nod Labs, Inc
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-import logging
-import unittest
-from apps.shark_studio.api.llm import LanguageModel
-
-
-class LLMAPITest(unittest.TestCase):
-    def testLLMSimple(self):
-        lm = LanguageModel(
-            "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
-            hf_auth_token=None,
-            device="cpu-task",
-            external_weights="safetensors",
-        )
-        count = 0
-        for msg, _ in lm.chat("hi, what are you?"):
-            # skip first token output
-            if count == 0:
-                count += 1
-                continue
-            assert (
-                msg.strip(" ") == "Hello"
-            ), f"LLM API failed to return correct response, expected 'Hello', received {msg}"
-            break
-
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.DEBUG)
-    unittest.main()
--- a/apps/shark_studio/web/index.py
+++ b/apps/shark_studio/web/index.py
@@ -1,426 +0,0 @@
-from multiprocessing import Process, freeze_support
-import os
-import sys
-import logging
-from ui.chat import chat_element
-
-if sys.platform == "darwin":
-    os.environ["DYLD_LIBRARY_PATH"] = "/usr/local/lib"
-    # import before IREE to avoid MLIR library issues
-    import torch_mlir
-
-# import PIL, transformers, sentencepiece  # ensures inclusion in pysintaller exe generation
-# from apps.stable_diffusion.src import args, clear_all
-# import apps.stable_diffusion.web.utils.global_obj as global_obj
-
-
-def launch_app(address):
-    from tkinter import Tk
-    import webview
-
-    window = Tk()
-
-    # get screen width and height of display and make it more reasonably
-    # sized as we aren't making it full-screen or maximized
-    width = int(window.winfo_screenwidth() * 0.81)
-    height = int(window.winfo_screenheight() * 0.91)
-    webview.create_window(
-        "SHARK AI Studio",
-        url=address,
-        width=width,
-        height=height,
-        text_select=True,
-    )
-    webview.start(private_mode=False, storage_path=os.getcwd())
-
-
-if __name__ == "__main__":
-    # if args.debug:
-    logging.basicConfig(level=logging.DEBUG)
-    # required to do multiprocessing in a pyinstaller freeze
-    freeze_support()
-    #    if args.api or "api" in args.ui.split(","):
-    #        from apps.stable_diffusion.web.ui import (
-    #            txt2img_api,
-    #            img2img_api,
-    #            upscaler_api,
-    #            inpaint_api,
-    #            outpaint_api,
-    #            llm_chat_api,
-    #        )
-    #
-    #        from fastapi import FastAPI, APIRouter
-    #        import uvicorn
-    #
-    #        # init global sd pipeline and config
-    #        global_obj._init()
-    #
-    #        app = FastAPI()
-    #        app.add_api_route("/sdapi/v1/txt2img", txt2img_api, methods=["post"])
-    #        app.add_api_route("/sdapi/v1/img2img", img2img_api, methods=["post"])
-    #        app.add_api_route("/sdapi/v1/inpaint", inpaint_api, methods=["post"])
-    #        app.add_api_route("/sdapi/v1/outpaint", outpaint_api, methods=["post"])
-    #        app.add_api_route("/sdapi/v1/upscaler", upscaler_api, methods=["post"])
-    #
-    #        # chat APIs needed for compatibility with multiple extensions using OpenAI API
-    #        app.add_api_route(
-    #            "/v1/chat/completions", llm_chat_api, methods=["post"]
-    #        )
-    #        app.add_api_route("/v1/completions", llm_chat_api, methods=["post"])
-    #        app.add_api_route("/chat/completions", llm_chat_api, methods=["post"])
-    #        app.add_api_route("/completions", llm_chat_api, methods=["post"])
-    #        app.add_api_route(
-    #            "/v1/engines/codegen/completions", llm_chat_api, methods=["post"]
-    #        )
-    #        app.include_router(APIRouter())
-    #        uvicorn.run(app, host="0.0.0.0", port=args.server_port)
-    #        sys.exit(0)
-    #
-    # Setup to use shark_tmp for gradio's temporary image files and clear any
-    # existing temporary images there if they exist. Then we can import gradio.
-    # It has to be in this order or gradio ignores what we've set up.
-    # from apps.stable_diffusion.web.utils.gradio_configs import (
-    #    config_gradio_tmp_imgs_folder,
-    # )
-
-    # config_gradio_tmp_imgs_folder()
-    import gradio as gr
-
-    # Create custom models folders if they don't exist
-    # from apps.stable_diffusion.web.ui.utils import create_custom_models_folders
-
-    # create_custom_models_folders()
-
-    def resource_path(relative_path):
-        """Get absolute path to resource, works for dev and for PyInstaller"""
-        base_path = getattr(sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__)))
-        return os.path.join(base_path, relative_path)
-
-    dark_theme = resource_path("ui/css/sd_dark_theme.css")
-
-    # from apps.stable_diffusion.web.ui import (
-    # txt2img_web,
-    # txt2img_custom_model,
-    # txt2img_gallery,
-    # txt2img_png_info_img,
-    # txt2img_status,
-    # txt2img_sendto_img2img,
-    # txt2img_sendto_inpaint,
-    # txt2img_sendto_outpaint,
-    # txt2img_sendto_upscaler,
-    ## h2ogpt_upload,
-    ## h2ogpt_web,
-    # img2img_web,
-    # img2img_custom_model,
-    # img2img_gallery,
-    # img2img_init_image,
-    # img2img_status,
-    # img2img_sendto_inpaint,
-    # img2img_sendto_outpaint,
-    # img2img_sendto_upscaler,
-    # inpaint_web,
-    # inpaint_custom_model,
-    # inpaint_gallery,
-    # inpaint_init_image,
-    # inpaint_status,
-    # inpaint_sendto_img2img,
-    # inpaint_sendto_outpaint,
-    # inpaint_sendto_upscaler,
-    # outpaint_web,
-    # outpaint_custom_model,
-    # outpaint_gallery,
-    # outpaint_init_image,
-    # outpaint_status,
-    # outpaint_sendto_img2img,
-    # outpaint_sendto_inpaint,
-    # outpaint_sendto_upscaler,
-    # upscaler_web,
-    # upscaler_custom_model,
-    # upscaler_gallery,
-    # upscaler_init_image,
-    # upscaler_status,
-    # upscaler_sendto_img2img,
-    # upscaler_sendto_inpaint,
-    # upscaler_sendto_outpaint,
-    ##  lora_train_web,
-    ##  model_web,
-    ##  model_config_web,
-    # hf_models,
-    # modelmanager_sendto_txt2img,
-    # modelmanager_sendto_img2img,
-    # modelmanager_sendto_inpaint,
-    # modelmanager_sendto_outpaint,
-    # modelmanager_sendto_upscaler,
-    # stablelm_chat,
-    # minigpt4_web,
-    # outputgallery_web,
-    # outputgallery_tab_select,
-    # outputgallery_watch,
-    # outputgallery_filename,
-    # outputgallery_sendto_txt2img,
-    # outputgallery_sendto_img2img,
-    # outputgallery_sendto_inpaint,
-    # outputgallery_sendto_outpaint,
-    # outputgallery_sendto_upscaler,
-    # )
-
-    # init global sd pipeline and config
-    # global_obj._init()
-
-    def register_button_click(button, selectedid, inputs, outputs):
-        button.click(
-            lambda x: (
-                x[0]["name"] if len(x) != 0 else None,
-                gr.Tabs.update(selected=selectedid),
-            ),
-            inputs,
-            outputs,
-        )
-
-    def register_modelmanager_button(button, selectedid, inputs, outputs):
-        button.click(
-            lambda x: (
-                "None",
-                x,
-                gr.Tabs.update(selected=selectedid),
-            ),
-            inputs,
-            outputs,
-        )
-
-    def register_outputgallery_button(button, selectedid, inputs, outputs):
-        button.click(
-            lambda x: (
-                x,
-                gr.Tabs.update(selected=selectedid),
-            ),
-            inputs,
-            outputs,
-        )
-
-    with gr.Blocks(
-        css=dark_theme, analytics_enabled=False, title="Shark Studio 2.0 Beta"
-    ) as sd_web:
-        with gr.Tabs() as tabs:
-            # NOTE: If adding, removing, or re-ordering tabs, make sure that they
-            # have a unique id that doesn't clash with any of the other tabs,
-            # and that the order in the code here is the order they should
-            # appear in the ui, as the id value doesn't determine the order.
-
-            # Where possible, avoid changing the id of any tab that is the
-            # destination of one of the 'send to' buttons. If you do have to change
-            # that id, make sure you update the relevant register_button_click calls
-            # further down with the new id.
-            # with gr.TabItem(label="Text-to-Image", id=0):
-            #    txt2img_web.render()
-            # with gr.TabItem(label="Image-to-Image", id=1):
-            #    img2img_web.render()
-            # with gr.TabItem(label="Inpainting", id=2):
-            #    inpaint_web.render()
-            # with gr.TabItem(label="Outpainting", id=3):
-            #    outpaint_web.render()
-            # with gr.TabItem(label="Upscaler", id=4):
-            #    upscaler_web.render()
-            # if args.output_gallery:
-            #    with gr.TabItem(label="Output Gallery", id=5) as og_tab:
-            #        outputgallery_web.render()
-
-            #    # extra output gallery configuration
-            #    outputgallery_tab_select(og_tab.select)
-            #    outputgallery_watch(
-            #        [
-            #            txt2img_status,
-            #            img2img_status,
-            #            inpaint_status,
-            #            outpaint_status,
-            #            upscaler_status,
-            #        ]
-            #    )
-            ##  with gr.TabItem(label="Model Manager", id=6):
-            ##      model_web.render()
-            ##  with gr.TabItem(label="LoRA Training (Experimental)", id=7):
-            ##      lora_train_web.render()
-            with gr.TabItem(label="Chat Bot", id=0):
-                chat_element.render()
-            ##  with gr.TabItem(
-            ##      label="Generate Sharding Config (Experimental)", id=9
-            ##  ):
-            ##      model_config_web.render()
-            # with gr.TabItem(label="MultiModal (Experimental)", id=10):
-            #    minigpt4_web.render()
-            # with gr.TabItem(label="DocuChat Upload", id=11):
-            #     h2ogpt_upload.render()
-            # with gr.TabItem(label="DocuChat(Experimental)", id=12):
-            #     h2ogpt_web.render()
-
-        # send to buttons
-        # register_button_click(
-        #    txt2img_sendto_img2img,
-        #    1,
-        #    [txt2img_gallery],
-        #    [img2img_init_image, tabs],
-        # )
-        # register_button_click(
-        #    txt2img_sendto_inpaint,
-        #    2,
-        #    [txt2img_gallery],
-        #    [inpaint_init_image, tabs],
-        # )
-        # register_button_click(
-        #    txt2img_sendto_outpaint,
-        #    3,
-        #    [txt2img_gallery],
-        #    [outpaint_init_image, tabs],
-        # )
-        # register_button_click(
-        #    txt2img_sendto_upscaler,
-        #    4,
-        #    [txt2img_gallery],
-        #    [upscaler_init_image, tabs],
-        # )
-        # register_button_click(
-        #    img2img_sendto_inpaint,
-        #    2,
-        #    [img2img_gallery],
-        #    [inpaint_init_image, tabs],
-        # )
-        # register_button_click(
-        #    img2img_sendto_outpaint,
-        #    3,
-        #    [img2img_gallery],
-        #    [outpaint_init_image, tabs],
-        # )
-        # register_button_click(
-        #    img2img_sendto_upscaler,
-        #    4,
-        #    [img2img_gallery],
-        #    [upscaler_init_image, tabs],
-        # )
-        # register_button_click(
-        #    inpaint_sendto_img2img,
-        #    1,
-        #    [inpaint_gallery],
-        #    [img2img_init_image, tabs],
-        # )
-        # register_button_click(
-        #    inpaint_sendto_outpaint,
-        #    3,
-        #    [inpaint_gallery],
-        #    [outpaint_init_image, tabs],
-        # )
-        # register_button_click(
-        #    inpaint_sendto_upscaler,
-        #    4,
-        #    [inpaint_gallery],
-        #    [upscaler_init_image, tabs],
-        # )
-        # register_button_click(
-        #    outpaint_sendto_img2img,
-        #    1,
-        #    [outpaint_gallery],
-        #    [img2img_init_image, tabs],
-        # )
-        # register_button_click(
-        #    outpaint_sendto_inpaint,
-        #    2,
-        #    [outpaint_gallery],
-        #    [inpaint_init_image, tabs],
-        # )
-        # register_button_click(
-        #    outpaint_sendto_upscaler,
-        #    4,
-        #    [outpaint_gallery],
-        #    [upscaler_init_image, tabs],
-        # )
-        # register_button_click(
-        #    upscaler_sendto_img2img,
-        #    1,
-        #    [upscaler_gallery],
-        #    [img2img_init_image, tabs],
-        # )
-        # register_button_click(
-        #    upscaler_sendto_inpaint,
-        #    2,
-        #    [upscaler_gallery],
-        #    [inpaint_init_image, tabs],
-        # )
-        # register_button_click(
-        #    upscaler_sendto_outpaint,
-        #    3,
-        #    [upscaler_gallery],
-        #    [outpaint_init_image, tabs],
-        # )
-        # if args.output_gallery:
-        #    register_outputgallery_button(
-        #        outputgallery_sendto_txt2img,
-        #        0,
-        #        [outputgallery_filename],
-        #        [txt2img_png_info_img, tabs],
-        #    )
-        #    register_outputgallery_button(
-        #        outputgallery_sendto_img2img,
-        #        1,
-        #        [outputgallery_filename],
-        #        [img2img_init_image, tabs],
-        #    )
-        #    register_outputgallery_button(
-        #        outputgallery_sendto_inpaint,
-        #        2,
-        #        [outputgallery_filename],
-        #        [inpaint_init_image, tabs],
-        #    )
-        #    register_outputgallery_button(
-        #        outputgallery_sendto_outpaint,
-        #        3,
-        #        [outputgallery_filename],
-        #        [outpaint_init_image, tabs],
-        #    )
-        #    register_outputgallery_button(
-        #        outputgallery_sendto_upscaler,
-        #        4,
-        #        [outputgallery_filename],
-        #        [upscaler_init_image, tabs],
-        #    )
-        # register_modelmanager_button(
-        #    modelmanager_sendto_txt2img,
-        #    0,
-        #    [hf_models],
-        #    [txt2img_custom_model, tabs],
-        # )
-        # register_modelmanager_button(
-        #    modelmanager_sendto_img2img,
-        #    1,
-        #    [hf_models],
-        #    [img2img_custom_model, tabs],
-        # )
-        # register_modelmanager_button(
-        #    modelmanager_sendto_inpaint,
-        #    2,
-        #    [hf_models],
-        #    [inpaint_custom_model, tabs],
-        # )
-        # register_modelmanager_button(
-        #    modelmanager_sendto_outpaint,
-        #    3,
-        #    [hf_models],
-        #    [outpaint_custom_model, tabs],
-        # )
-        # register_modelmanager_button(
-        #    modelmanager_sendto_upscaler,
-        #    4,
-        #    [hf_models],
-        #    [upscaler_custom_model, tabs],
-        # )
-
-    sd_web.queue()
-    # if args.ui == "app":
-    #    t = Process(
-    #        target=launch_app, args=[f"http://localhost:{args.server_port}"]
-    #    )
-    #    t.start()
-    sd_web.launch(
-        share=True,
-        inbrowser=True,
-        server_name="0.0.0.0",
-        server_port=11911,  # args.server_port,
-    )
--- a/apps/shark_studio/web/ui/chat.py
+++ b/apps/shark_studio/web/ui/chat.py
@@ -1,298 +0,0 @@
-import gradio as gr
-import time
-import os
-from pathlib import Path
-from datetime import datetime as dt
-import json
-import sys
-from apps.shark_studio.api.utils import (
-    get_available_devices,
-)
-from apps.shark_studio.api.llm import (
-    llm_model_map,
-    LanguageModel,
-)
-
-
-def user(message, history):
-    # Append the user's message to the conversation history
-    return "", history + [[message, ""]]
-
-
-language_model = None
-
-
-def create_prompt(model_name, history, prompt_prefix):
-    return ""
-
-
-def get_default_config():
-    return False
-
-
-# model_vmfb_key = ""
-
-
-def chat_fn(
-    prompt_prefix,
-    history,
-    model,
-    device,
-    precision,
-    download_vmfb,
-    config_file,
-    cli=False,
-):
-    global language_model
-    if language_model is None:
-        history[-1][-1] = "Getting the model ready..."
-        yield history, ""
-        language_model = LanguageModel(
-            model,
-            device=device,
-            precision=precision,
-            external_weights="safetensors",
-            external_weight_file="llama2_7b.safetensors",
-            use_system_prompt=prompt_prefix,
-        )
-        history[-1][-1] = "Getting the model ready... Done"
-        yield history, ""
-        history[-1][-1] = ""
-    token_count = 0
-    total_time = 0.001  # In order to avoid divide by zero error
-    prefill_time = 0
-    is_first = True
-    for text, exec_time in language_model.chat(history):
-        history[-1][-1] = text
-        if is_first:
-            prefill_time = exec_time
-            is_first = False
-            yield history, f"Prefill: {prefill_time:.2f}"
-        else:
-            total_time += exec_time
-            token_count += 1
-            tokens_per_sec = token_count / total_time
-            yield history, f"Prefill: {prefill_time:.2f} seconds\n Decode: {tokens_per_sec:.2f} tokens/sec"
-
-
-def llm_chat_api(InputData: dict):
-    return None
-    print(f"Input keys : {InputData.keys()}")
-    # print(f"model : {InputData['model']}")
-    is_chat_completion_api = (
-        "messages" in InputData.keys()
-    )  # else it is the legacy `completion` api
-    # For Debugging input data from API
-    # if is_chat_completion_api:
-    #     print(f"message -> role : {InputData['messages'][0]['role']}")
-    #     print(f"message -> content : {InputData['messages'][0]['content']}")
-    # else:
-    #     print(f"prompt : {InputData['prompt']}")
-    # print(f"max_tokens : {InputData['max_tokens']}") # Default to 128 for now
-    global vicuna_model
-    model_name = InputData["model"] if "model" in InputData.keys() else "codegen"
-    model_path = llm_model_map[model_name]
-    device = "cpu-task"
-    precision = "fp16"
-    max_toks = None if "max_tokens" not in InputData.keys() else InputData["max_tokens"]
-    if max_toks is None:
-        max_toks = 128 if model_name == "codegen" else 512
-
-    # make it working for codegen first
-    from apps.language_models.scripts.vicuna import (
-        UnshardedVicuna,
-    )
-
-    device_id = None
-    if vicuna_model == 0:
-        if "cuda" in device:
-            device = "cuda"
-        elif "sync" in device:
-            device = "cpu-sync"
-        elif "task" in device:
-            device = "cpu-task"
-        elif "vulkan" in device:
-            device_id = int(device.split("://")[1])
-            device = "vulkan"
-        else:
-            print("unrecognized device")
-
-        vicuna_model = UnshardedVicuna(
-            model_name,
-            hf_model_path=model_path,
-            device=device,
-            precision=precision,
-            max_num_tokens=max_toks,
-            download_vmfb=True,
-            load_mlir_from_shark_tank=True,
-            device_id=device_id,
-        )
-
-    # TODO: add role dict for different models
-    if is_chat_completion_api:
-        # TODO: add funtionality for multiple messages
-        prompt = create_prompt(model_name, [(InputData["messages"][0]["content"], "")])
-    else:
-        prompt = InputData["prompt"]
-    print("prompt = ", prompt)
-
-    res = vicuna_model.generate(prompt)
-    res_op = None
-    for op in res:
-        res_op = op
-
-    if is_chat_completion_api:
-        choices = [
-            {
-                "index": 0,
-                "message": {
-                    "role": "assistant",
-                    "content": res_op,  # since we are yeilding the result
-                },
-                "finish_reason": "stop",  # or length
-            }
-        ]
-    else:
-        choices = [
-            {
-                "text": res_op,
-                "index": 0,
-                "logprobs": None,
-                "finish_reason": "stop",  # or length
-            }
-        ]
-    end_time = dt.now().strftime("%Y%m%d%H%M%S%f")
-    return {
-        "id": end_time,
-        "object": "chat.completion" if is_chat_completion_api else "text_completion",
-        "created": int(end_time),
-        "choices": choices,
-    }
-
-
-def view_json_file(file_obj):
-    content = ""
-    with open(file_obj.name, "r") as fopen:
-        content = fopen.read()
-    return content
-
-
-with gr.Blocks(title="Chat") as chat_element:
-    with gr.Row():
-        model_choices = list(llm_model_map.keys())
-        model = gr.Dropdown(
-            label="Select Model",
-            value=model_choices[0],
-            choices=model_choices,
-            allow_custom_value=True,
-        )
-        supported_devices = get_available_devices()
-        enabled = True
-        if len(supported_devices) == 0:
-            supported_devices = ["cpu-task"]
-        supported_devices = [x for x in supported_devices if "sync" not in x]
-        device = gr.Dropdown(
-            label="Device",
-            value=supported_devices[0],
-            choices=supported_devices,
-            interactive=enabled,
-            allow_custom_value=True,
-        )
-        precision = gr.Radio(
-            label="Precision",
-            value="int4",
-            choices=[
-                # "int4",
-                # "int8",
-                # "fp16",
-                "fp32",
-            ],
-            visible=False,
-        )
-        tokens_time = gr.Textbox(label="Tokens generated per second")
-        with gr.Column():
-            download_vmfb = gr.Checkbox(
-                label="Download vmfb from Shark tank if available",
-                value=True,
-                interactive=True,
-            )
-            prompt_prefix = gr.Checkbox(
-                label="Add System Prompt",
-                value=False,
-                interactive=True,
-            )
-
-    chatbot = gr.Chatbot(height=500)
-    with gr.Row():
-        with gr.Column():
-            msg = gr.Textbox(
-                label="Chat Message Box",
-                placeholder="Chat Message Box",
-                show_label=False,
-                interactive=enabled,
-                container=False,
-            )
-        with gr.Column():
-            with gr.Row():
-                submit = gr.Button("Submit", interactive=enabled)
-                stop = gr.Button("Stop", interactive=enabled)
-                clear = gr.Button("Clear", interactive=enabled)
-
-    with gr.Row(visible=False):
-        with gr.Group():
-            config_file = gr.File(label="Upload sharding configuration", visible=False)
-            json_view_button = gr.Button(label="View as JSON", visible=False)
-        json_view = gr.JSON(interactive=True, visible=False)
-        json_view_button.click(
-            fn=view_json_file, inputs=[config_file], outputs=[json_view]
-        )
-    submit_event = msg.submit(
-        fn=user,
-        inputs=[msg, chatbot],
-        outputs=[msg, chatbot],
-        show_progress=False,
-        queue=False,
-    ).then(
-        fn=chat_fn,
-        inputs=[
-            prompt_prefix,
-            chatbot,
-            model,
-            device,
-            precision,
-            download_vmfb,
-            config_file,
-        ],
-        outputs=[chatbot, tokens_time],
-        show_progress=False,
-        queue=True,
-    )
-    submit_click_event = submit.click(
-        fn=user,
-        inputs=[msg, chatbot],
-        outputs=[msg, chatbot],
-        show_progress=False,
-        queue=False,
-    ).then(
-        fn=chat_fn,
-        inputs=[
-            prompt_prefix,
-            chatbot,
-            model,
-            device,
-            precision,
-            download_vmfb,
-            config_file,
-        ],
-        outputs=[chatbot, tokens_time],
-        show_progress=False,
-        queue=True,
-    )
-    stop.click(
-        fn=None,
-        inputs=None,
-        outputs=None,
-        cancels=[submit_event, submit_click_event],
-        queue=False,
-    )
-    clear.click(lambda: None, None, [chatbot], queue=False)
--- a/benchmarks/tests/test_benchmark.py
+++ b/benchmarks/tests/test_benchmark.py
@@ -42,7 +42,7 @@ class TFHuggingFaceLanguage(tf.Module):
            input_ids=x, attention_mask=y, token_type_ids=z, training=False
        )

-    @tf.function(input_signature=tf_bert_input, jit_compile=True)
+    @tf.function(input_signature=tf_bert_input)
    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.m.predict(input_ids, attention_mask, token_type_ids)

@@ -129,12 +129,12 @@ pytest_benchmark_param = pytest.mark.parametrize(
        pytest.param(True, "cpu", marks=pytest.mark.skip),
        pytest.param(
            False,
-            "cuda",
+            "gpu",
            marks=pytest.mark.skipif(
-                check_device_drivers("cuda"), reason="nvidia-smi not found"
+                check_device_drivers("gpu"), reason="nvidia-smi not found"
            ),
        ),
-        pytest.param(True, "cuda", marks=pytest.mark.skip),
+        pytest.param(True, "gpu", marks=pytest.mark.skip),
        pytest.param(
            False,
            "vulkan",
--- a/build_tools/docker/Dockerfile-ubuntu-22.04
+++ b/build_tools/docker/Dockerfile-ubuntu-22.04
@@ -1,88 +0,0 @@
-ARG IMAGE_NAME
-FROM ${IMAGE_NAME}:12.2.0-runtime-ubuntu22.04 as base
-
-ENV NV_CUDA_LIB_VERSION "12.2.0-1"
-
-FROM base as base-amd64
-
-ENV NV_CUDA_CUDART_DEV_VERSION 12.2.53-1
-ENV NV_NVML_DEV_VERSION 12.2.81-1
-ENV NV_LIBCUSPARSE_DEV_VERSION 12.1.1.53-1
-ENV NV_LIBNPP_DEV_VERSION 12.1.1.14-1
-ENV NV_LIBNPP_DEV_PACKAGE libnpp-dev-12-2=${NV_LIBNPP_DEV_VERSION}
-
-ENV NV_LIBCUBLAS_DEV_VERSION 12.2.1.16-1
-ENV NV_LIBCUBLAS_DEV_PACKAGE_NAME libcublas-dev-12-2
-ENV NV_LIBCUBLAS_DEV_PACKAGE ${NV_LIBCUBLAS_DEV_PACKAGE_NAME}=${NV_LIBCUBLAS_DEV_VERSION}
-
-ENV NV_CUDA_NSIGHT_COMPUTE_VERSION 12.2.0-1
-ENV NV_CUDA_NSIGHT_COMPUTE_DEV_PACKAGE cuda-nsight-compute-12-2=${NV_CUDA_NSIGHT_COMPUTE_VERSION}
-
-ENV NV_NVPROF_VERSION 12.2.60-1
-ENV NV_NVPROF_DEV_PACKAGE cuda-nvprof-12-2=${NV_NVPROF_VERSION}
-FROM base as base-arm64
-
-ENV NV_CUDA_CUDART_DEV_VERSION 12.2.53-1
-ENV NV_NVML_DEV_VERSION 12.2.81-1
-ENV NV_LIBCUSPARSE_DEV_VERSION 12.1.1.53-1
-ENV NV_LIBNPP_DEV_VERSION 12.1.1.14-1
-ENV NV_LIBNPP_DEV_PACKAGE libnpp-dev-12-2=${NV_LIBNPP_DEV_VERSION}
-
-ENV NV_LIBCUBLAS_DEV_PACKAGE_NAME libcublas-dev-12-2
-ENV NV_LIBCUBLAS_DEV_VERSION 12.2.1.16-1
-ENV NV_LIBCUBLAS_DEV_PACKAGE ${NV_LIBCUBLAS_DEV_PACKAGE_NAME}=${NV_LIBCUBLAS_DEV_VERSION}
-
-ENV NV_CUDA_NSIGHT_COMPUTE_VERSION 12.2.0-1
-ENV NV_CUDA_NSIGHT_COMPUTE_DEV_PACKAGE cuda-nsight-compute-12-2=${NV_CUDA_NSIGHT_COMPUTE_VERSION}
-
-FROM base-${TARGETARCH}
-
-ARG TARGETARCH
-
-LABEL maintainer "SHARK<stdin@nod.com>"
-
-# Register the ROCM package repository, and install rocm-dev package
-ARG ROCM_VERSION=5.6
-ARG AMDGPU_VERSION=5.6
-
-ARG APT_PREF
-RUN echo "$APT_PREF" > /etc/apt/preferences.d/rocm-pin-600
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates curl libnuma-dev gnupg \
-  && curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \
-  && printf "deb [arch=amd64] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list \
-  && printf "deb [arch=amd64] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list \
-  && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-  sudo \
-  libelf1 \
-  kmod \
-  file \
-  python3 \
-  python3-pip \
-  rocm-dev \
-  rocm-libs \
-  rocm-hip-libraries \
-  build-essential && \
-  apt-get clean && \
-  rm -rf /var/lib/apt/lists/*
-
-RUN  groupadd -g 109 render
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    cuda-cudart-dev-12-2=${NV_CUDA_CUDART_DEV_VERSION} \
-    cuda-command-line-tools-12-2=${NV_CUDA_LIB_VERSION} \
-    cuda-minimal-build-12-2=${NV_CUDA_LIB_VERSION} \
-    cuda-libraries-dev-12-2=${NV_CUDA_LIB_VERSION} \
-    cuda-nvml-dev-12-2=${NV_NVML_DEV_VERSION} \
-    ${NV_NVPROF_DEV_PACKAGE} \
-    ${NV_LIBNPP_DEV_PACKAGE} \
-    libcusparse-dev-12-2=${NV_LIBCUSPARSE_DEV_VERSION} \
-    ${NV_LIBCUBLAS_DEV_PACKAGE} \
-    ${NV_CUDA_NSIGHT_COMPUTE_DEV_PACKAGE} \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN apt install rocm-hip-libraries
-
-# Keep apt from auto upgrading the cublas and nccl packages. See https://gitlab.com/nvidia/container-images/cuda/-/issues/88
-RUN apt-mark hold ${NV_LIBCUBLAS_DEV_PACKAGE_NAME}
-ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs
-
--- a/build_tools/docker/README.md
+++ b/build_tools/docker/README.md
@@ -1,41 +0,0 @@
-On your host install your Nvidia or AMD gpu drivers. 
-
-**HOST Setup**
-
-*Ubuntu 23.04 Nvidia*
-```
-sudo ubuntu-drivers install
-```
-
-Install [docker](https://docs.docker.com/engine/install/ubuntu/) and the post-install to run as a [user](https://docs.docker.com/engine/install/linux-postinstall/)
-
-Install Nvidia [Container and register it](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). In Ubuntu 23.04 systems follow [this](https://github.com/NVIDIA/nvidia-container-toolkit/issues/72#issuecomment-1584574298)
-
-
-Build docker with :
-
-```
-docker build . -f Dockerfile-ubuntu-22.04 -t shark/dev-22.04:5.6 --build-arg=ROCM_VERSION=5.6 --build-arg=AMDGPU_VERSION=5.6 --build-arg=APT_PREF="Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" --build-arg=IMAGE_NAME=nvidia/cuda --build-arg=TARGETARCH=amd64
-```
-
-Run with:
-
-*CPU*
-
-```
-docker run  -it docker.io/shark/dev-22.04:5.6
-```
-
-*Nvidia GPU*
-
-```
-docker run --rm -it --gpus all docker.io/shark/dev-22.04:5.6
-```
-
-*AMD GPUs*
-
-```
-docker run --device /dev/kfd --device /dev/dri  docker.io/shark/dev-22.04:5.6
-```
-
-More AMD instructions are [here](https://docs.amd.com/en/latest/deploy/docker.html)
--- a/build_tools/image_comparison.py
+++ b/build_tools/image_comparison.py
@@ -1,51 +0,0 @@
-import argparse
-from PIL import Image
-import numpy as np
-
-import requests
-import shutil
-import os
-import subprocess
-
-parser = argparse.ArgumentParser()
-
-parser.add_argument("-n", "--newfile")
-parser.add_argument(
-    "-g",
-    "--golden_url",
-    default="https://storage.googleapis.com/shark_tank/testdata/cyberpunk_fores_42_0_230119_021148.png",
-)
-
-
-def get_image(url, local_filename):
-    res = requests.get(url, stream=True)
-    if res.status_code == 200:
-        with open(local_filename, "wb") as f:
-            shutil.copyfileobj(res.raw, f)
-
-
-def compare_images(new_filename, golden_filename, upload=False):
-    new = np.array(Image.open(new_filename)) / 255.0
-    golden = np.array(Image.open(golden_filename)) / 255.0
-    diff = np.abs(new - golden)
-    mean = np.mean(diff)
-    if mean > 0.1:
-        if os.name != "nt" and upload == True:
-            subprocess.run(
-                [
-                    "gsutil",
-                    "cp",
-                    new_filename,
-                    "gs://shark_tank/testdata/builder/",
-                ]
-            )
-        raise AssertionError("new and golden not close")
-    else:
-        print("SUCCESS")
-
-
-if __name__ == "__main__":
-    args = parser.parse_args()
-    tempfile_name = os.path.join(os.getcwd(), "golden.png")
-    get_image(args.golden_url, tempfile_name)
-    compare_images(args.newfile, tempfile_name)
--- a/build_tools/populate_sharktank_ci.sh
+++ b/build_tools/populate_sharktank_ci.sh
@@ -1,6 +1,5 @@
 #!/bin/bash

-IMPORTER=1 BENCHMARK=1 NO_BREVITAS=1 ./setup_venv.sh
+IMPORTER=1 ./setup_venv.sh
 source $GITHUB_WORKSPACE/shark.venv/bin/activate
-python build_tools/stable_diffusion_testing.py --gen
-python tank/generate_sharktank.py
+python generate_sharktank.py --upload=False --ci_tank_dir=True
--- a/build_tools/scrape_releases.py
+++ b/build_tools/scrape_releases.py
@@ -1,37 +0,0 @@
-"""Scrapes the github releases API to generate a static pip-install-able releases page.
-
-See https://github.com/llvm/torch-mlir/issues/1374
-"""
-import argparse
-import json
-
-import requests
-
-# Parse arguments
-parser = argparse.ArgumentParser()
-parser.add_argument("owner", type=str)
-parser.add_argument("repo", type=str)
-args = parser.parse_args()
-
-# Get releases
-response = requests.get(
-    f"https://api.github.com/repos/{args.owner}/{args.repo}/releases"
-)
-body = json.loads(response.content)
-
-# Parse releases
-releases = []
-for row in body:
-    for asset in row["assets"]:
-        releases.append((asset["name"], asset["browser_download_url"]))
-
-# Output HTML
-html = """<!DOCTYPE html>
-<html>
-  <body>
-"""
-for name, url in releases:
-    html += f"    <a href='{url}'>{name}</a><br />\n"
-html += """  </body>
-</html>"""
-print(html)
--- a/build_tools/stable_diffusion_testing.py
+++ b/build_tools/stable_diffusion_testing.py
@@ -1,284 +0,0 @@
-import os
-from sys import executable
-import subprocess
-from apps.stable_diffusion.src.utils.resources import (
-    get_json_file,
-)
-from datetime import datetime as dt
-from shark.shark_downloader import download_public_file
-from image_comparison import compare_images
-import argparse
-from glob import glob
-import shutil
-import requests
-
-model_config_dicts = get_json_file(
-    os.path.join(
-        os.getcwd(),
-        "apps/stable_diffusion/src/utils/resources/model_config.json",
-    )
-)
-
-
-def parse_sd_out(filename, command, device, use_tune, model_name, import_mlir):
-    with open(filename, "r+") as f:
-        lines = f.readlines()
-    metrics = {}
-    vals_to_read = [
-        "Clip Inference time",
-        "Average step",
-        "VAE Inference time",
-        "Total image generation",
-    ]
-    for line in lines:
-        for val in vals_to_read:
-            if val in line:
-                metrics[val] = line.split(" ")[-1].strip("\n")
-
-    metrics["Average step"] = metrics["Average step"].strip("ms/it")
-    metrics["Total image generation"] = metrics["Total image generation"].strip("sec")
-    metrics["device"] = device
-    metrics["use_tune"] = use_tune
-    metrics["model_name"] = model_name
-    metrics["import_mlir"] = import_mlir
-    metrics["command"] = command
-    return metrics
-
-
-def get_inpaint_inputs():
-    os.mkdir("./test_images/inputs")
-    img_url = (
-        "https://huggingface.co/datasets/diffusers/test-arrays/resolve"
-        "/main/stable_diffusion_inpaint/input_bench_image.png"
-    )
-    mask_url = (
-        "https://huggingface.co/datasets/diffusers/test-arrays/resolve"
-        "/main/stable_diffusion_inpaint/input_bench_mask.png"
-    )
-    img = requests.get(img_url)
-    mask = requests.get(mask_url)
-    open("./test_images/inputs/image.png", "wb").write(img.content)
-    open("./test_images/inputs/mask.png", "wb").write(mask.content)
-
-
-def test_loop(
-    device="vulkan",
-    beta=False,
-    extra_flags=[],
-    upload_bool=True,
-    exit_on_fail=True,
-    do_gen=False,
-):
-    # Get golden values from tank
-    shutil.rmtree("./test_images", ignore_errors=True)
-    model_metrics = []
-    os.mkdir("./test_images")
-    os.mkdir("./test_images/golden")
-    get_inpaint_inputs()
-    hf_model_names = model_config_dicts[0].values()
-    tuned_options = [
-        "--no-use_tuned",
-        "--use_tuned",
-    ]
-    import_options = ["--import_mlir", "--no-import_mlir"]
-    prompt_text = "--prompt=cyberpunk forest by Salvador Dali"
-    inpaint_prompt_text = (
-        "--prompt=Face of a yellow cat, high resolution, sitting on a park bench"
-    )
-    if os.name == "nt":
-        prompt_text = '--prompt="cyberpunk forest by Salvador Dali"'
-        inpaint_prompt_text = (
-            '--prompt="Face of a yellow cat, high resolution, sitting on a park bench"'
-        )
-    if beta:
-        extra_flags.append("--beta_models=True")
-    extra_flags.append("--no-progress_bar")
-    if do_gen:
-        extra_flags.append("--import_debug")
-    to_skip = [
-        "Linaqruf/anything-v3.0",
-        "prompthero/openjourney",
-        "wavymulder/Analog-Diffusion",
-        "dreamlike-art/dreamlike-diffusion-1.0",
-    ]
-    counter = 0
-    for import_opt in import_options:
-        for model_name in hf_model_names:
-            if model_name in to_skip:
-                continue
-            for use_tune in tuned_options:
-                if (
-                    model_name == "stabilityai/stable-diffusion-2-1"
-                    and use_tune == tuned_options[0]
-                ):
-                    continue
-                elif (
-                    model_name == "stabilityai/stable-diffusion-2-1-base"
-                    and use_tune == tuned_options[1]
-                ):
-                    continue
-                elif use_tune == tuned_options[1]:
-                    continue
-                command = (
-                    [
-                        executable,  # executable is the python from the venv used to run this
-                        "apps/stable_diffusion/scripts/txt2img.py",
-                        "--device=" + device,
-                        prompt_text,
-                        "--negative_prompts=" + '""',
-                        "--seed=42",
-                        import_opt,
-                        "--output_dir="
-                        + os.path.join(os.getcwd(), "test_images", model_name),
-                        "--hf_model_id=" + model_name,
-                        use_tune,
-                    ]
-                    if "inpainting" not in model_name
-                    else [
-                        executable,
-                        "apps/stable_diffusion/scripts/inpaint.py",
-                        "--device=" + device,
-                        inpaint_prompt_text,
-                        "--negative_prompts=" + '""',
-                        "--img_path=./test_images/inputs/image.png",
-                        "--mask_path=./test_images/inputs/mask.png",
-                        "--seed=42",
-                        "--import_mlir",
-                        "--output_dir="
-                        + os.path.join(os.getcwd(), "test_images", model_name),
-                        "--hf_model_id=" + model_name,
-                        use_tune,
-                    ]
-                )
-                command += extra_flags
-                if os.name == "nt":
-                    command = " ".join(command)
-                dumpfile_name = "_".join(model_name.split("/")) + ".txt"
-                dumpfile_name = os.path.join(os.getcwd(), dumpfile_name)
-                with open(dumpfile_name, "w+") as f:
-                    generated_image = not subprocess.call(
-                        command,
-                        stdout=f,
-                        stderr=f,
-                    )
-                if os.name != "nt":
-                    command = " ".join(command)
-                if generated_image:
-                    model_metrics.append(
-                        parse_sd_out(
-                            dumpfile_name,
-                            command,
-                            device,
-                            use_tune,
-                            model_name,
-                            import_opt,
-                        )
-                    )
-                    print(command)
-                    print("Successfully generated image")
-                    os.makedirs("./test_images/golden/" + model_name, exist_ok=True)
-                    download_public_file(
-                        "gs://shark_tank/testdata/golden/" + model_name,
-                        "./test_images/golden/" + model_name,
-                    )
-                    test_file_path = os.path.join(
-                        os.getcwd(),
-                        "test_images",
-                        model_name,
-                        "generated_imgs",
-                        dt.now().strftime("%Y%m%d"),
-                        "*.png",
-                    )
-                    test_file = glob(test_file_path)[0]
-
-                    golden_path = "./test_images/golden/" + model_name + "/*.png"
-                    golden_file = glob(golden_path)[0]
-                    try:
-                        compare_images(test_file, golden_file, upload=upload_bool)
-                    except AssertionError as e:
-                        print(e)
-                        if exit_on_fail == True:
-                            raise
-                else:
-                    print(command)
-                    print("failed to generate image for this configuration")
-                    with open(dumpfile_name, "r+") as f:
-                        output = f.readlines()
-                        print("\n".join(output))
-                    exit(1)
-                if os.name == "nt":
-                    counter += 1
-                    if counter % 2 == 0:
-                        extra_flags.append(
-                            "--iree_vulkan_target_triple=rdna2-unknown-windows"
-                        )
-                    else:
-                        if counter != 1:
-                            extra_flags.remove(
-                                "--iree_vulkan_target_triple=rdna2-unknown-windows"
-                            )
-            if do_gen:
-                prepare_artifacts()
-
-    with open(os.path.join(os.getcwd(), "sd_testing_metrics.csv"), "w+") as f:
-        header = "model_name;device;use_tune;import_opt;Clip Inference time(ms);Average Step (ms/it);VAE Inference time(ms);total image generation(s);command\n"
-        f.write(header)
-        for metric in model_metrics:
-            output = [
-                metric["model_name"],
-                metric["device"],
-                metric["use_tune"],
-                metric["import_mlir"],
-                metric["Clip Inference time"],
-                metric["Average step"],
-                metric["VAE Inference time"],
-                metric["Total image generation"],
-                metric["command"],
-            ]
-            f.write(";".join(output) + "\n")
-
-
-def prepare_artifacts():
-    gen_path = os.path.join(os.getcwd(), "gen_shark_tank")
-    if not os.path.isdir(gen_path):
-        os.mkdir(gen_path)
-    for dirname in os.listdir(os.getcwd()):
-        for modelname in ["clip", "unet", "vae"]:
-            if modelname in dirname and "vmfb" not in dirname:
-                if not os.path.isdir(os.path.join(gen_path, dirname)):
-                    shutil.move(os.path.join(os.getcwd(), dirname), gen_path)
-                    print(f"Moved dir: {dirname} to {gen_path}.")
-
-
-parser = argparse.ArgumentParser()
-
-parser.add_argument("-d", "--device", default="vulkan")
-parser.add_argument(
-    "-b", "--beta", action=argparse.BooleanOptionalAction, default=False
-)
-parser.add_argument("-e", "--extra_args", type=str, default=None)
-parser.add_argument(
-    "-u", "--upload", action=argparse.BooleanOptionalAction, default=True
-)
-parser.add_argument(
-    "-x", "--exit_on_fail", action=argparse.BooleanOptionalAction, default=True
-)
-parser.add_argument("-g", "--gen", action=argparse.BooleanOptionalAction, default=False)
-
-if __name__ == "__main__":
-    args = parser.parse_args()
-    print(args)
-    extra_args = []
-    if args.extra_args:
-        for arg in args.extra_args.split(","):
-            extra_args.append(arg)
-    test_loop(
-        args.device,
-        args.beta,
-        extra_args,
-        args.upload,
-        args.exit_on_fail,
-        args.gen,
-    )
-    if args.gen:
-        prepare_artifacts()
--- a/build_tools/vicuna_testing.py
+++ b/build_tools/vicuna_testing.py
@@ -1,14 +0,0 @@
-import os
-from sys import executable
-import subprocess
-from apps.language_models.scripts import vicuna
-
-
-def test_loop():
-    precisions = ["fp16", "int8", "int4"]
-    devices = ["cpu"]
-    for precision in precisions:
-        for device in devices:
-            model = vicuna.UnshardedVicuna(device=device, precision=precision)
-            model.compile()
-            del model
--- a/conftest.py
+++ b/conftest.py
@@ -2,11 +2,9 @@ def pytest_addoption(parser):
    # Attaches SHARK command-line arguments to the pytest machinery.
    parser.addoption(
        "--benchmark",
-        action="store",
-        type=str,
-        default=None,
-        choices=("baseline", "native", "all"),
-        help="Benchmarks specified engine(s) and writes bench_results.csv.",
+        action="store_true",
+        default="False",
+        help="Pass option to benchmark and write results.csv",
    )
    parser.addoption(
        "--onnx_bench",
@@ -38,18 +36,6 @@ def pytest_addoption(parser):
        default="False",
        help="Enables uploading of reproduction artifacts upon test case failure during iree-compile or validation. Must be passed with --ci_sha option ",
    )
-    parser.addoption(
-        "--update_tank",
-        action="store_true",
-        default="False",
-        help="Update local shark tank with latest artifacts if model artifact hash mismatched.",
-    )
-    parser.addoption(
-        "--force_update_tank",
-        action="store_true",
-        default="False",
-        help="Force-update local shark tank with artifacts from specified shark_tank URL (defaults to nightly).",
-    )
    parser.addoption(
        "--ci_sha",
        action="store",
@@ -59,34 +45,12 @@ def pytest_addoption(parser):
    parser.addoption(
        "--local_tank_cache",
        action="store",
-        default=None,
+        default="",
        help="Specify the directory in which all downloaded shark_tank artifacts will be cached.",
    )
    parser.addoption(
        "--tank_url",
        type=str,
-        default="gs://shark_tank/nightly",
+        default="gs://shark_tank/latest",
        help="URL to bucket from which to download SHARK tank artifacts. Default is gs://shark_tank/latest",
    )
-    parser.addoption(
-        "--tank_prefix",
-        type=str,
-        default=None,
-        help="Prefix to gs://shark_tank/ model directories from which to download SHARK tank artifacts. Default is nightly.",
-    )
-    parser.addoption(
-        "--benchmark_dispatches",
-        default=None,
-        help="Benchmark individual dispatch kernels produced by IREE compiler. Use 'All' for all, or specific dispatches e.g. '0 1 2 10'",
-    )
-    parser.addoption(
-        "--dispatch_benchmarks_dir",
-        default="./temp_dispatch_benchmarks",
-        help="Directory in which dispatch benchmarks are saved.",
-    )
-    parser.addoption(
-        "--batchsize",
-        default=1,
-        type=int,
-        help="Batch size for the tested model.",
-    )
--- a/cpp/.gitignore
+++ b/cpp/.gitignore
@@ -1,3 +0,0 @@
-*.mlir
-*.vmfb
-*.ini
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -27,7 +27,7 @@ include(FetchContent)

 FetchContent_Declare(
  iree
-  GIT_REPOSITORY https://github.com/nod-ai/srt.git
+  GIT_REPOSITORY https://github.com/nod-ai/shark-runtime.git
  GIT_TAG shark 
  GIT_SUBMODULES_RECURSE OFF
  GIT_SHALLOW OFF
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -40,7 +40,7 @@ cmake --build build/
 *Prepare the model*
 ```bash
 wget https://storage.googleapis.com/shark_tank/latest/resnet50_tf/resnet50_tf.mlir
-iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvmcpu-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux  resnet50_tf.mlir -o resnet50_tf.vmfb
+iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvm-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 resnet50_tf.mlir -o resnet50_tf.vmfb
 ```
 *Prepare the input*

@@ -54,29 +54,5 @@ python -m pip install tensorflow

 *Run the vulkan_gui*
 ```bash
-./build/vulkan_gui/iree-samples-resnet-vulkan-gui
-```
-
-## Other models
-A tool for benchmarking other models is built and can be invoked with a command like the following
-```bash
-./build/vulkan_gui/iree-vulkan-gui --module-file=path/to/.vmfb --function_input=...
-```
-see `./build/vulkan_gui/iree-vulkan-gui --help` for an explanation on the function input. For example, stable diffusion unet can be tested with the following commands:
-```bash
-wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/stable_diff_tf.mlir
-iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux  stable_diff_tf.mlir -o stable_diff_tf.vmfb
-./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=2x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32
-```
-VAE and Autoencoder are also available
-```bash
-# VAE
-wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/vae_tf/vae.mlir
-iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux  vae.mlir -o vae.vmfb
-./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=1x4x64x64xf32
-
-# CLIP Autoencoder
-wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/clip_tf/clip_autoencoder.mlir
-iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux  clip_autoencoder.mlir -o clip_autoencoder.vmfb
-./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=1x77xi32 --function_input=1x77xi32
+./build/vulkan_gui/iree-samples-vulkan-gui
 ```
--- a/cpp/save_img.py
+++ b/cpp/save_img.py
@@ -1,6 +1,7 @@
 import numpy as np
 import tensorflow as tf
 from shark.shark_inference import SharkInference
+from shark.shark_downloader import download_tf_model


 def load_and_preprocess_image(fname: str):
--- a/cpp/vision_inference/CMakeLists.txt
+++ b/cpp/vision_inference/CMakeLists.txt
@@ -21,7 +21,7 @@ endif()
 # Compile mnist.mlir to mnist.vmfb.
 set(_COMPILE_TOOL_EXECUTABLE $<TARGET_FILE:iree-compile>)
 set(_COMPILE_ARGS)
-list(APPEND _COMPILE_ARGS "--iree-input-type=auto")
+list(APPEND _COMPILE_ARGS "--iree-input-type=mhlo")
 list(APPEND _COMPILE_ARGS "--iree-hal-target-backends=llvm-cpu")
 list(APPEND _COMPILE_ARGS "${IREE_SOURCE_DIR}/samples/models/mnist.mlir")
 list(APPEND _COMPILE_ARGS "-o")
--- a/cpp/vulkan_gui/CMakeLists.txt
+++ b/cpp/vulkan_gui/CMakeLists.txt
@@ -40,77 +40,45 @@ set(IMGUI_DIR ${CMAKE_BINARY_DIR}/_deps/imgui-src)
 message("Looking for Imgui in ${IMGUI_DIR}")
 include_directories(${IMGUI_DIR} ${IMGUI_DIR}/backends ..)

-
-function(iree_vulkan_sample)
-
-  cmake_parse_arguments(
-    _RULE
-    ""
-    "NAME"
-    "SRCS"
-    ${ARGN}
-  )
-
-
-  # Define the sample executable.
-  set(_NAME "${_RULE_NAME}")
-  set(SRCS "${_RULE_SRCS}")
-  add_executable(${_NAME} "")
-  target_sources(${_NAME}
-    PRIVATE
-      ${SRCS}
-      "${IMGUI_DIR}/backends/imgui_impl_sdl.cpp"
-      "${IMGUI_DIR}/backends/imgui_impl_vulkan.cpp"
-      "${IMGUI_DIR}/imgui.cpp"
-      "${IMGUI_DIR}/imgui_draw.cpp"
-      "${IMGUI_DIR}/imgui_demo.cpp"
-      "${IMGUI_DIR}/imgui_tables.cpp"
-      "${IMGUI_DIR}/imgui_widgets.cpp"
-  )
-  set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "${_NAME}")
-  target_include_directories(${_NAME} PUBLIC
-      $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
-  )
-  target_link_libraries(${_NAME}
-    SDL2::SDL2
-    Vulkan::Vulkan
-    iree_runtime_runtime
-    iree_base_internal_main
-    iree_hal_drivers_vulkan_registration_registration
-    iree_modules_hal_hal
-    iree_vm_vm
-    iree_vm_bytecode_module
-    iree_vm_cc
-    iree_tooling_vm_util_cc
-    iree_tooling_context_util
-  )
-
-  if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
-    set(_GUI_LINKOPTS "-SUBSYSTEM:CONSOLE")
-  else()
-    set(_GUI_LINKOPTS "")
-  endif()
-
-  target_link_options(${_NAME}
-    PRIVATE
-      ${_GUI_LINKOPTS}
-  )
-endfunction()
-
-iree_vulkan_sample(
-    NAME
-      iree-samples-resnet-vulkan-gui
-
-    SRCS
-      vulkan_resnet_inference_gui.cc
+# Define the sample executable.
+set(_NAME "iree-samples-vulkan-gui")
+add_executable(${_NAME} "")
+target_sources(${_NAME}
+  PRIVATE
+    vulkan_inference_gui.cc
+    "${IMGUI_DIR}/backends/imgui_impl_sdl.cpp"
+    "${IMGUI_DIR}/backends/imgui_impl_vulkan.cpp"
+    "${IMGUI_DIR}/imgui.cpp"
+    "${IMGUI_DIR}/imgui_draw.cpp"
+    "${IMGUI_DIR}/imgui_demo.cpp"
+    "${IMGUI_DIR}/imgui_tables.cpp"
+    "${IMGUI_DIR}/imgui_widgets.cpp"
+)
+set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "iree-samples-vulkan-gui")
+target_include_directories(${_NAME} PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
+)
+target_link_libraries(${_NAME}
+  SDL2::SDL2
+  Vulkan::Vulkan
+  iree_runtime_runtime
+  iree_base_internal_main
+  iree_hal_drivers_vulkan_registration_registration
+  iree_modules_hal_hal
+  iree_vm_vm
+  iree_vm_bytecode_module
+  iree_vm_cc
 )

-iree_vulkan_sample(
-    NAME
-      iree-vulkan-gui
+if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
+  set(_GUI_LINKOPTS "-SUBSYSTEM:CONSOLE")
+else()
+  set(_GUI_LINKOPTS "")
+endif()

-    SRCS
-      vulkan_inference_gui.cc
+target_link_options(${_NAME}
+  PRIVATE
+    ${_GUI_LINKOPTS}
 )

 message(STATUS "Configured vulkan_gui sample successfully")
--- a/cpp/vulkan_gui/vulkan_inference_gui.cc
+++ b/cpp/vulkan_gui/vulkan_inference_gui.cc
@@ -18,12 +18,6 @@
 #include <set>
 #include <vector>
 #include <fstream>
-#include <array>
-#include <cstdio>
-#include <cstdlib>
-#include <iterator>
-#include <string>
-#include <utility>

 #include "iree/hal/drivers/vulkan/api.h"

@@ -36,15 +30,6 @@
 #include "iree/vm/bytecode_module.h"
 #include "iree/vm/ref_cc.h"

-// iree-run-module
-#include "iree/base/internal/flags.h"
-#include "iree/base/status_cc.h"
-#include "iree/base/tracing.h"
-#include "iree/modules/hal/types.h"
-#include "iree/tooling/comparison.h"
-#include "iree/tooling/context_util.h"
-#include "iree/tooling/vm_util_cc.h"
-
 // Other dependencies (helpers, etc.)
 #include "iree/base/internal/main.h"

@@ -53,49 +38,6 @@
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"

-IREE_FLAG(string, entry_function, "",
-          "Name of a function contained in the module specified by module_file "
-          "to run.");
-
-// TODO(benvanik): move --function_input= flag into a util.
-static iree_status_t parse_function_io(iree_string_view_t flag_name,
-                                       void* storage,
-                                       iree_string_view_t value) {
-  auto* list = (std::vector<std::string>*)storage;
-  list->push_back(std::string(value.data, value.size));
-  return iree_ok_status();
-}
-static void print_function_io(iree_string_view_t flag_name, void* storage,
-                              FILE* file) {
-  auto* list = (std::vector<std::string>*)storage;
-  if (list->empty()) {
-    fprintf(file, "# --%.*s=\n", (int)flag_name.size, flag_name.data);
-  } else {
-    for (size_t i = 0; i < list->size(); ++i) {
-      fprintf(file, "--%.*s=\"%s\"\n", (int)flag_name.size, flag_name.data,
-              list->at(i).c_str());
-    }
-  }
-}
-static std::vector<std::string> FLAG_function_inputs;
-IREE_FLAG_CALLBACK(
-    parse_function_io, print_function_io, &FLAG_function_inputs, function_input,
-    "An input (a) value or (b) buffer of the format:\n"
-    "  (a) scalar value\n"
-    "     value\n"
-    "     e.g.: --function_input=\"3.14\"\n"
-    "  (b) buffer:\n"
-    "     [shape]xtype=[value]\n"
-    "     e.g.: --function_input=\"2x2xi32=1 2 3 4\"\n"
-    "Optionally, brackets may be used to separate the element values:\n"
-    "  2x2xi32=[[1 2][3 4]]\n"
-    "Raw binary files can be read to provide buffer contents:\n"
-    "  2x2xi32=@some/file.bin\n"
-    "numpy npy files (from numpy.save) can be read to provide 1+ values:\n"
-    "  @some.npy\n"
-    "Each occurrence of the flag indicates an input in the order they were\n"
-    "specified on the command line.");
-
 typedef struct iree_file_toc_t {
  const char* name;             // the file's original name
  char* data;             // beginning of the file
@@ -145,6 +87,225 @@ static void check_vk_result(VkResult err) {
  abort();
 }

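+// Helper function to find the index of a Vulkan memory type that is allowed by type_filter and has all the requested property flags (returns 0xFFFFFFFF if none matches). See ImGui_ImplVulkan_MemoryType() in imgui_impl_vulkan.cpp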
+uint32_t findMemoryType(uint32_t type_filter, VkMemoryPropertyFlags properties)
+{
+  VkPhysicalDeviceMemoryProperties mem_properties;
+  vkGetPhysicalDeviceMemoryProperties(g_PhysicalDevice, &mem_properties);
+
+  for (uint32_t i = 0; i < mem_properties.memoryTypeCount; i++)
+  {
+    if ((type_filter & (1 << i)) && (mem_properties.memoryTypes[i].propertyFlags & properties) == properties)
+    {
+      return i;
+    }
+  }
+
+  return 0xFFFFFFFF; // Unable to find memoryType
+}
+
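+// Helper function to load an image file with common settings into a Vulkan texture. On success it stores the texture's VkDescriptorSet (a sort of Vulkan pointer) in *img_ds and returns true; it returns false if the file cannot be loaded.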
+bool LoadTextureFromFile(const char* filename, VkDescriptorSet* img_ds, int* image_width, int* image_height)
+{
+  // Specifying 4 channels forces stb to load the image in RGBA which is an easy format for Vulkan
+  int image_channels = 4;
+  unsigned char* image_data = stbi_load(filename, image_width, image_height, 0, image_channels);
+
+  if (image_data == NULL)
+  {
+    return false;
+  }
+
+  // Calculate allocation size (in number of bytes)
+  size_t image_size = (*image_width)*(*image_height)*image_channels;
+
+  VkResult err;
+
+  // Create the Vulkan image.
+  VkImage texture_image;
+  VkDeviceMemory texture_image_memory;
+  {
+    VkImageCreateInfo info = {};
+    info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
+    info.imageType = VK_IMAGE_TYPE_2D;
+    info.format = VK_FORMAT_R8G8B8A8_UNORM;
+    info.extent.width = *image_width;
+    info.extent.height = *image_height;
+    info.extent.depth = 1;
+    info.mipLevels = 1;
+    info.arrayLayers = 1;
+    info.samples = VK_SAMPLE_COUNT_1_BIT;
+    info.tiling = VK_IMAGE_TILING_OPTIMAL;
+    info.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+    info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+    info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+    err = vkCreateImage(g_Device, &info, g_Allocator, &texture_image);
+    check_vk_result(err);
+    VkMemoryRequirements req;
+    vkGetImageMemoryRequirements(g_Device, texture_image, &req);
+    VkMemoryAllocateInfo alloc_info = {};
+    alloc_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+    alloc_info.allocationSize = req.size;
+    alloc_info.memoryTypeIndex = findMemoryType(req.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+    err = vkAllocateMemory(g_Device, &alloc_info, g_Allocator, &texture_image_memory);
+    check_vk_result(err);
+    err = vkBindImageMemory(g_Device, texture_image, texture_image_memory, 0);
+    check_vk_result(err);
+  }
+
+  // Create the Image View
+  VkImageView image_view;
+  {
+    VkImageViewCreateInfo info = {};
+    info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
+    info.image = texture_image;
+    info.viewType = VK_IMAGE_VIEW_TYPE_2D;
+    info.format = VK_FORMAT_R8G8B8A8_UNORM;
+    info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+    info.subresourceRange.levelCount = 1;
+    info.subresourceRange.layerCount = 1;
+    err = vkCreateImageView(g_Device, &info, g_Allocator, &image_view);
+    check_vk_result(err);
+  }
+
+  // Create Sampler
+  VkSampler sampler;
+  {
+    VkSamplerCreateInfo sampler_info{};
+    sampler_info.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
+    sampler_info.magFilter = VK_FILTER_LINEAR;
+    sampler_info.minFilter = VK_FILTER_LINEAR;
+    sampler_info.mipmapMode  = VK_SAMPLER_MIPMAP_MODE_LINEAR;
+    sampler_info.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT; // outside image bounds the texture repeats (tiles)
+    sampler_info.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT;
+    sampler_info.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT;
+    sampler_info.minLod = -1000;
+    sampler_info.maxLod = 1000;
+    sampler_info.maxAnisotropy = 1.0f;
+    err = vkCreateSampler(g_Device, &sampler_info, g_Allocator, &sampler);
+    check_vk_result(err);
+  }
+
+  // Create Descriptor Set using ImGUI's implementation
+  *img_ds = ImGui_ImplVulkan_AddTexture(sampler, image_view, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
+
+  // Create Upload Buffer
+  VkBuffer upload_buffer;
+  VkDeviceMemory upload_buffer_memory;
+  {
+    VkBufferCreateInfo buffer_info = {};
+    buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+    buffer_info.size = image_size;
+    buffer_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
+    buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+    err = vkCreateBuffer(g_Device, &buffer_info, g_Allocator, &upload_buffer);
+    check_vk_result(err);
+    VkMemoryRequirements req;
+    vkGetBufferMemoryRequirements(g_Device, upload_buffer, &req);
+    VkMemoryAllocateInfo alloc_info = {};
+    alloc_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+    alloc_info.allocationSize = req.size;
+    alloc_info.memoryTypeIndex = findMemoryType(req.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
+    err = vkAllocateMemory(g_Device, &alloc_info, g_Allocator, &upload_buffer_memory);
+    check_vk_result(err);
+    err = vkBindBufferMemory(g_Device, upload_buffer, upload_buffer_memory, 0);
+    check_vk_result(err);
+  }
+
+  // Upload to Buffer:
+  {
+    void* map = NULL;
+    err = vkMapMemory(g_Device, upload_buffer_memory, 0, image_size, 0, &map);
+    check_vk_result(err);
+    memcpy(map, image_data, image_size);
+    VkMappedMemoryRange range[1] = {};
+    range[0].sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
+    range[0].memory = upload_buffer_memory;
+    range[0].size = image_size;
+    err = vkFlushMappedMemoryRanges(g_Device, 1, range);
+    check_vk_result(err);
+    vkUnmapMemory(g_Device, upload_buffer_memory);
+  }
+
+  // Release image memory using stb
+  stbi_image_free(image_data);
+
+  // Create a command buffer that will perform the following steps when it is executed from the command queue.
+  // TODO: this works in the example, but it needs review to confirm this is an acceptable way to access the pool and create the command buffer.
+  VkCommandPool command_pool = g_MainWindowData.Frames[g_MainWindowData.FrameIndex].CommandPool;
+  VkCommandBuffer command_buffer;
+  {
+    VkCommandBufferAllocateInfo alloc_info{};
+    alloc_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+    alloc_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+    alloc_info.commandPool = command_pool;
+    alloc_info.commandBufferCount = 1;
+
+    err = vkAllocateCommandBuffers(g_Device, &alloc_info, &command_buffer);
+    check_vk_result(err);
+
+    VkCommandBufferBeginInfo begin_info = {};
+    begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+    begin_info.flags |= VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+    err = vkBeginCommandBuffer(command_buffer, &begin_info);
+    check_vk_result(err);
+  }
+
+  // Copy to Image
+  {
+    VkImageMemoryBarrier copy_barrier[1] = {};
+    copy_barrier[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+    copy_barrier[0].dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+    copy_barrier[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+    copy_barrier[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+    copy_barrier[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    copy_barrier[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    copy_barrier[0].image = texture_image;
+    copy_barrier[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+    copy_barrier[0].subresourceRange.levelCount = 1;
+    copy_barrier[0].subresourceRange.layerCount = 1;
+    vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, NULL, 0, NULL, 1, copy_barrier);
+
+    VkBufferImageCopy region = {};
+    region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+    region.imageSubresource.layerCount = 1;
+    region.imageExtent.width = *image_width;
+    region.imageExtent.height = *image_height;
+    region.imageExtent.depth = 1;
+    vkCmdCopyBufferToImage(command_buffer, upload_buffer, texture_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region);
+
+    VkImageMemoryBarrier use_barrier[1] = {};
+    use_barrier[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+    use_barrier[0].srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+    use_barrier[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
+    use_barrier[0].oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+    use_barrier[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+    use_barrier[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    use_barrier[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    use_barrier[0].image = texture_image;
+    use_barrier[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+    use_barrier[0].subresourceRange.levelCount = 1;
+    use_barrier[0].subresourceRange.layerCount = 1;
+    vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, 0, NULL, 0, NULL, 1, use_barrier);
+  }
+
+  // End command buffer
+  {
+    VkSubmitInfo end_info = {};
+    end_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    end_info.commandBufferCount = 1;
+    end_info.pCommandBuffers = &command_buffer;
+    err = vkEndCommandBuffer(command_buffer);
+    check_vk_result(err);
+    err = vkQueueSubmit(g_Queue, 1, &end_info, VK_NULL_HANDLE);
+    check_vk_result(err);
+    err = vkDeviceWaitIdle(g_Device);
+    check_vk_result(err);
+  }
+
+  return true;
+}
+
 // Returns the names of the Vulkan layers used for the given IREE
 // |extensibility_set| and |features|.
 std::vector<const char*> GetIreeLayers(
@@ -562,16 +723,7 @@ namespace iree {

 extern "C" int iree_main(int argc, char** argv) {

-  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv);
-  if (argc > 1) {
-    // Avoid iree-run-module spinning endlessly on stdin if the user uses single
-    // dashes for flags.
-    printf(
-        "[ERROR] unexpected positional argument (expected none)."
-        " Did you use pass a flag with a single dash ('-')?"
-        " Use '--' instead.\n");
-    return 1;
-  }
+  fprintf(stdout, "starting yo\n");

  // --------------------------------------------------------------------------
  // Create a window.
@@ -683,6 +835,8 @@ extern "C" int iree_main(int argc, char** argv) {

  // Demo state.
  bool show_iree_window = true;
+  // --------------------------------------------------------------------------
+
  // --------------------------------------------------------------------------
  // Setup IREE.

@@ -746,44 +900,69 @@ extern "C" int iree_main(int argc, char** argv) {


  // Load bytecode module
-  //iree_file_toc_t module_file_toc;
-  //const char network_model[] = "resnet50_tf.vmfb";
-  //fprintf(stdout, "Loading: %s\n", network_model);
-  //if (load_file(network_model, &module_file_toc.data, &module_file_toc.size) == false)
-  //{
-  //    abort();
-  //    return 1;
-  //}
-  //fprintf(stdout, "module size: %zu\n", module_file_toc.size);
+  iree_file_toc_t module_file_toc;
+  const char network_model[] = "resnet50_tf.vmfb";
+  fprintf(stdout, "Loading: %s\n", network_model);
+  if (load_file(network_model, &module_file_toc.data, &module_file_toc.size) == false)
+  {
+      abort();
+      return 1;
+  }
+  fprintf(stdout, "module size: %zu\n", module_file_toc.size);
+
+  static float input_res50[224*224*3];
+  static float output_res50[1000];
+
+  char filename[] = "dog_imagenet.jpg";
+  fprintf(stdout, "loading: %s\n", filename);
+  int x = 0, y = 0, n = 0;  // stay zero (not garbage) if the decode fails
+  //unsigned char *image_raw = stbi_load(filename, &x, &y, &n, 3);
+  stbi_image_free(stbi_load(filename, &x, &y, &n, 3));  // size probe only: release the decoded pixels instead of leaking them
+  fprintf(stdout, "res: %i x %i x %i\n", x, y, n);
+
+  /* Preprocessing needs to go here. For now use a buffer preprocessed in python.
+
+  //convert image into floating point format
+  for(int i=0;i<224*224*3;i++)
+  {
+    input_res50[i]= ((float)image_raw[i])/255.0f;
+  }*/
+
+  std::ifstream fin("dog.bin", std::ifstream::in | std::ifstream::binary);
+  fin.read((char*)input_res50, 224*224*3*sizeof(float));
+
+  // load image again so imgui can display it
+  int my_image_width = 0;
+  int my_image_height = 0;
+  VkDescriptorSet my_image_texture = 0;
+  bool ret = LoadTextureFromFile(filename, &my_image_texture, &my_image_width, &my_image_height);
+  fprintf(stdout, "creating vulkan image: %s\n", ret ?"OK":"FAIL");
+  IM_ASSERT(ret);

  iree_vm_module_t* bytecode_module = nullptr;
-  iree_status_t module_status = iree_tooling_load_module_from_flags(
-      iree_instance, iree_allocator_system(), &bytecode_module);
-  if (!iree_status_is_ok(module_status))
-    return -1;
-  //IREE_CHECK_OK(iree_vm_bytecode_module_create(
-  //    iree_instance,
-  //    iree_const_byte_span_t{
-  //        reinterpret_cast<const uint8_t*>(module_file_toc.data),
-  //        module_file_toc.size},
-  //    iree_allocator_null(), iree_allocator_system(), &bytecode_module));
-  //// Query for details about what is in the loaded module.
-  //iree_vm_module_signature_t bytecode_module_signature =
-  //    iree_vm_module_signature(bytecode_module);
-  //fprintf(stdout, "Module loaded, have <%" PRIhsz "> exported functions:\n",
-  //        bytecode_module_signature.export_function_count);
-  //for (int i = 0; i < bytecode_module_signature.export_function_count; ++i) {
-  //  iree_vm_function_t function;
-  //  IREE_CHECK_OK(iree_vm_module_lookup_function_by_ordinal(
-  //      bytecode_module, IREE_VM_FUNCTION_LINKAGE_EXPORT, i, &function));
-  //  auto function_name = iree_vm_function_name(&function);
-  //  auto function_signature = iree_vm_function_signature(&function);
+  IREE_CHECK_OK(iree_vm_bytecode_module_create(
+      iree_instance,
+      iree_const_byte_span_t{
+          reinterpret_cast<const uint8_t*>(module_file_toc.data),
+          module_file_toc.size},
+      iree_allocator_null(), iree_allocator_system(), &bytecode_module));
+  // Query for details about what is in the loaded module.
+  iree_vm_module_signature_t bytecode_module_signature =
+      iree_vm_module_signature(bytecode_module);
+  fprintf(stdout, "Module loaded, have <%" PRIhsz "> exported functions:\n",
+          bytecode_module_signature.export_function_count);
+  for (int i = 0; i < bytecode_module_signature.export_function_count; ++i) {
+    iree_vm_function_t function;
+    IREE_CHECK_OK(iree_vm_module_lookup_function_by_ordinal(
+        bytecode_module, IREE_VM_FUNCTION_LINKAGE_EXPORT, i, &function));
+    auto function_name = iree_vm_function_name(&function);
+    auto function_signature = iree_vm_function_signature(&function);

-  //  fprintf(stdout, "  %d: '%.*s' with calling convention '%.*s'\n", i,
-  //          (int)function_name.size, function_name.data,
-  //          (int)function_signature.calling_convention.size,
-  //          function_signature.calling_convention.data);
-  //}
+    fprintf(stdout, "  %d: '%.*s' with calling convention '%.*s'\n", i,
+            (int)function_name.size, function_name.data,
+            (int)function_signature.calling_convention.size,
+            function_signature.calling_convention.data);
+  }

  // Allocate a context that will hold the module state across invocations.
  iree_vm_context_t* iree_context = nullptr;
@@ -809,42 +988,33 @@ extern "C" int iree_main(int argc, char** argv) {
        // Write inputs into mappable buffers.
        iree_hal_allocator_t* allocator =
            iree_hal_device_allocator(iree_vk_device);
-        //iree_hal_memory_type_t input_memory_type =
-        //    static_cast<iree_hal_memory_type_t>(
-        //        IREE_HAL_MEMORY_TYPE_HOST_LOCAL |
-        //        IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE);
-        //iree_hal_buffer_usage_t input_buffer_usage =
-        //    static_cast<iree_hal_buffer_usage_t>(IREE_HAL_BUFFER_USAGE_DEFAULT);
-        //iree_hal_buffer_params_t buffer_params;
-        //buffer_params.type = input_memory_type;
-        //buffer_params.usage = input_buffer_usage;
-        //buffer_params.access = IREE_HAL_MEMORY_ACCESS_READ | IREE_HAL_MEMORY_ACCESS_WRITE;
+        iree_hal_memory_type_t input_memory_type =  // host-local, device-visible memory for the input
+            static_cast<iree_hal_memory_type_t>(
+                IREE_HAL_MEMORY_TYPE_HOST_LOCAL |
+                IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE);
+        iree_hal_buffer_usage_t input_buffer_usage =
+            static_cast<iree_hal_buffer_usage_t>(IREE_HAL_BUFFER_USAGE_DEFAULT);
+        iree_hal_buffer_params_t buffer_params = {};  // value-initialize so fields not assigned below are zero, not indeterminate
+        buffer_params.type = input_memory_type;
+        buffer_params.usage = input_buffer_usage;
+        buffer_params.access = IREE_HAL_MEMORY_ACCESS_READ | IREE_HAL_MEMORY_ACCESS_WRITE;

       // Wrap input buffers in buffer views.

-        vm::ref<iree_vm_list_t> inputs;
-        iree_status_t input_status = ParseToVariantList(
+        iree_hal_buffer_view_t* input0_buffer_view = nullptr;
+        constexpr iree_hal_dim_t input_buffer_shape[] = {1, 224, 224, 3};
+        IREE_CHECK_OK(iree_hal_buffer_view_allocate_buffer(
            allocator,
-            iree::span<const std::string>{FLAG_function_inputs.data(),
-                                          FLAG_function_inputs.size()},
-            iree_allocator_system(), &inputs);
-        if (!iree_status_is_ok(input_status))
-            return -1;
-        //vm::ref<iree_vm_list_t> inputs;
-        //IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, 6, iree_allocator_system(), &inputs));
+            /*shape_rank=*/4, /*shape=*/input_buffer_shape,
+            IREE_HAL_ELEMENT_TYPE_FLOAT_32,
+            IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params,
+            iree_make_const_byte_span(&input_res50, sizeof(input_res50)),
+            &input0_buffer_view));

-        //iree_hal_buffer_view_t* input0_buffer_view = nullptr;
-        //constexpr iree_hal_dim_t input_buffer_shape[] = {1, 224, 224, 3};
-        //IREE_CHECK_OK(iree_hal_buffer_view_allocate_buffer(
-        //    allocator,
-        //    /*shape_rank=*/4, /*shape=*/input_buffer_shape,
-        //    IREE_HAL_ELEMENT_TYPE_FLOAT_32,
-        //    IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params,
-        //    iree_make_const_byte_span(&input_res50, sizeof(input_res50)),
-        //    &input0_buffer_view));
-
-        //auto input0_buffer_view_ref = iree_hal_buffer_view_move_ref(input0_buffer_view);
-        //IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs.get(), &input0_buffer_view_ref));
+        vm::ref<iree_vm_list_t> inputs;
+        IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, 6, iree_allocator_system(), &inputs));
+        auto input0_buffer_view_ref = iree_hal_buffer_view_move_ref(input0_buffer_view);
+        IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs.get(), &input0_buffer_view_ref));

        // Prepare outputs list to accept results from the invocation.

@@ -853,7 +1023,6 @@ extern "C" int iree_main(int argc, char** argv) {
        IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, kOutputCount * sizeof(float), iree_allocator_system(), &outputs));

  // --------------------------------------------------------------------------
-
  // Main loop.
  bool done = false;
  while (!done) {
@@ -907,11 +1076,46 @@ extern "C" int iree_main(int argc, char** argv) {
                                     /*policy=*/nullptr, inputs.get(),
                                     outputs.get(), iree_allocator_system()));

+        // Read back the results.
+        auto* output_buffer_view = reinterpret_cast<iree_hal_buffer_view_t*>(
+            iree_vm_list_get_ref_deref(outputs.get(),
+            0,
+            iree_hal_buffer_view_get_descriptor()));
+        IREE_CHECK_OK(iree_hal_device_transfer_d2h(
+            iree_vk_device,
+            iree_hal_buffer_view_buffer(output_buffer_view),
+            0,
+            output_res50, sizeof(output_res50),
+            IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()));

        // we want to run continuously so we can use tools like RenderDoc, RGP, etc...
        dirty = true;
      }

+      // Find the argmax of output_res50; max_idx stays -1 when no score is > 0.
+      float max = 0.0f;  // best score seen so far (only values above 0 can win)
+      int max_idx = -1;  // index of the best score, -1 = none found
+      for(int i=0;i<1000;i++)  // 1000 == element count of output_res50
+      {
+        if (output_res50[i] > max)
+        {
+          max = output_res50[i];
+          max_idx = i;
+        }
+      }
+
+      ImGui::Text("pointer = %p", my_image_texture);
+      ImGui::Text("size = %d x %d", my_image_width, my_image_height);
+      ImGui::Image((ImTextureID)my_image_texture, ImVec2(my_image_width, my_image_height));
+
+      // Display the latest computation output.
+      ImGui::Text("Max   idx = [%i]", max_idx);
+      ImGui::Text("Max value = [%f]", max);
+
+      ImGui::Text("Resnet50 categories:");
+      ImGui::PlotHistogram("Histogram", output_res50, IM_ARRAYSIZE(output_res50), 0, NULL, 0.0f, 1.0f, ImVec2(0,80));
+      ImGui::Separator();
+
      // Framerate counter.
      ImGui::Text("Application average %.3f ms/frame (%.1f FPS)",
                  1000.0f / ImGui::GetIO().Framerate, ImGui::GetIO().Framerate);
@@ -933,7 +1137,6 @@ extern "C" int iree_main(int argc, char** argv) {
  iree_vm_module_release(bytecode_module);
  iree_vm_context_release(iree_context);
  iree_hal_device_release(iree_vk_device);
-  iree_hal_allocator_release(allocator);
  iree_hal_driver_release(iree_vk_driver);
  iree_hal_vulkan_syms_release(iree_vk_syms);
  iree_vm_instance_release(iree_instance);
--- a/cpp/vulkan_gui/vulkan_resnet_inference_gui.cc
+++ b/cpp/vulkan_gui/vulkan_resnet_inference_gui.cc
--- a/dataset/README.md
+++ b/dataset/README.md
@@ -1,27 +0,0 @@
-# Dataset annotation tool
-
-SHARK annotator for adding or modifying prompts of dataset images
-
-## Set up
-
-Activate SHARK Python virtual environment and install additional packages
-```shell
-source ../shark.venv/bin/activate
-pip install -r requirements.txt
-```
-
-## Run annotator
-
-```shell
-python annotation_tool.py
-```
-
-<img width="1280" alt="annotator" src="https://user-images.githubusercontent.com/49575973/214521137-7ef6ae10-7cd8-46e6-b270-b6c0445157f1.png">
-
-* Select a dataset from `Dataset` dropdown list
-* Select an image from `Image` dropdown list
-* Image and the existing prompt will be loaded
-* Select a prompt from `Prompt` dropdown list to modify or "Add new" to add a prompt
-* Click `Save` to save changes, click `Delete` to delete prompt
-* Click `Back` or `Next` to switch image, you could also select other images from `Image`
-* Click `Finish` when finishing annotation or before switching dataset
--- a/dataset/annotation_tool.py
+++ b/dataset/annotation_tool.py
@@ -1,233 +0,0 @@
-import gradio as gr
-import json
-import jsonlines
-import os
-from args import args
-from pathlib import Path
-from PIL import Image
-from utils import get_datasets
-
-
-shark_root = Path(__file__).parent.parent
-demo_css = shark_root.joinpath("web/demo.css").resolve()
-nodlogo_loc = shark_root.joinpath("web/models/stable_diffusion/logos/nod-logo.png")
-
-
-with gr.Blocks(title="Dataset Annotation Tool", css=demo_css) as shark_web:
-    with gr.Row(elem_id="ui_title"):
-        nod_logo = Image.open(nodlogo_loc)
-        with gr.Column(scale=1, elem_id="demo_title_outer"):
-            gr.Image(
-                value=nod_logo,
-                show_label=False,
-                interactive=False,
-                show_download_button=False,
-                elem_id="top_logo",
-                width=150,
-                height=100,
-            )
-
-    datasets, images, ds_w_prompts = get_datasets(args.gs_url)
-    prompt_data = dict()
-
-    with gr.Row(elem_id="ui_body"):
-        # TODO: add multiselect dataset, there is a gradio version conflict
-        dataset = gr.Dropdown(label="Dataset", choices=datasets)
-        image_name = gr.Dropdown(label="Image", choices=[])
-
-    with gr.Row(elem_id="ui_body"):
-        # TODO: add ability to search image by typing
-        with gr.Column(scale=1, min_width=600):
-            image = gr.Image(type="filepath", height=512)
-
-        with gr.Column(scale=1, min_width=600):
-            prompts = gr.Dropdown(
-                label="Prompts",
-                choices=[],
-            )
-            prompt = gr.Textbox(
-                label="Editor",
-                lines=3,
-            )
-            with gr.Row():
-                save = gr.Button("Save")
-                delete = gr.Button("Delete")
-            with gr.Row():
-                back_image = gr.Button("Back")
-                next_image = gr.Button("Next")
-            finish = gr.Button("Finish")
-
-    def filter_datasets(dataset):
-        if dataset is None:
-            return gr.Dropdown.update(value=None, choices=[])
-
-        # create the dataset dir if doesn't exist and download prompt file
-        dataset_path = str(shark_root) + "/dataset/" + dataset
-        if not os.path.exists(dataset_path):
-            os.mkdir(dataset_path)
-
-        # read prompt jsonlines file
-        prompt_data.clear()
-        if dataset in ds_w_prompts:
-            prompt_gs_path = args.gs_url + "/" + dataset + "/metadata.jsonl"
-            os.system(f'gsutil cp "{prompt_gs_path}" "{dataset_path}"/')
-            with jsonlines.open(dataset_path + "/metadata.jsonl") as reader:
-                for line in reader.iter(type=dict, skip_invalid=True):
-                    prompt_data[line["file_name"]] = (
-                        [line["text"]] if type(line["text"]) is str else line["text"]
-                    )
-
-        return gr.Dropdown.update(choices=images[dataset])
-
-    dataset.change(fn=filter_datasets, inputs=dataset, outputs=image_name)
-
-    def display_image(dataset, image_name):
-        if dataset is None or image_name is None:
-            return gr.Image.update(value=None), gr.Dropdown.update(value=None)
-
-        # download and load the image
-        img_gs_path = args.gs_url + "/" + dataset + "/" + image_name
-        img_sub_path = "/".join(image_name.split("/")[:-1])
-        img_dst_path = (
-            str(shark_root) + "/dataset/" + dataset + "/" + img_sub_path + "/"
-        )
-        if not os.path.exists(img_dst_path):
-            os.mkdir(img_dst_path)
-        os.system(f'gsutil cp "{img_gs_path}" "{img_dst_path}"')
-        img = Image.open(img_dst_path + image_name.split("/")[-1])
-
-        if image_name not in prompt_data.keys():
-            prompt_data[image_name] = []
-        prompt_choices = ["Add new"]
-        prompt_choices += prompt_data[image_name]
-        return gr.Image.update(value=img), gr.Dropdown.update(choices=prompt_choices)
-
-    image_name.change(
-        fn=display_image,
-        inputs=[dataset, image_name],
-        outputs=[image, prompts],
-    )
-
-    def edit_prompt(prompts):
-        if prompts == "Add new":
-            return gr.Textbox.update(value=None)
-
-        return gr.Textbox.update(value=prompts)
-
-    prompts.change(fn=edit_prompt, inputs=prompts, outputs=prompt)
-
-    def save_prompt(dataset, image_name, prompts, prompt):
-        if dataset is None or image_name is None or prompts is None or prompt is None:
-            return
-
-        if prompts == "Add new":
-            prompt_data[image_name].append(prompt)
-        else:
-            idx = prompt_data[image_name].index(prompts)
-            prompt_data[image_name][idx] = prompt
-
-        prompt_path = str(shark_root) + "/dataset/" + dataset + "/metadata.jsonl"
-        # write prompt jsonlines file
-        with open(prompt_path, "w") as f:
-            for key, value in prompt_data.items():
-                if not value:
-                    continue
-                v = value if len(value) > 1 else value[0]
-                f.write(json.dumps({"file_name": key, "text": v}))
-                f.write("\n")
-
-        prompt_choices = ["Add new"]
-        prompt_choices += prompt_data[image_name]
-        return gr.Dropdown.update(choices=prompt_choices, value=None)
-
-    save.click(
-        fn=save_prompt,
-        inputs=[dataset, image_name, prompts, prompt],
-        outputs=prompts,
-    )
-
-    def delete_prompt(dataset, image_name, prompts):
-        if dataset is None or image_name is None or prompts is None:
-            return
-        if prompts == "Add new":
-            return
-
-        prompt_data[image_name].remove(prompts)
-        prompt_path = str(shark_root) + "/dataset/" + dataset + "/metadata.jsonl"
-        # write prompt jsonlines file
-        with open(prompt_path, "w") as f:
-            for key, value in prompt_data.items():
-                if not value:
-                    continue
-                v = value if len(value) > 1 else value[0]
-                f.write(json.dumps({"file_name": key, "text": v}))
-                f.write("\n")
-
-        prompt_choices = ["Add new"]
-        prompt_choices += prompt_data[image_name]
-        return gr.Dropdown.update(choices=prompt_choices, value=None)
-
-    delete.click(
-        fn=delete_prompt,
-        inputs=[dataset, image_name, prompts],
-        outputs=prompts,
-    )
-
-    def get_back_image(dataset, image_name):
-        if dataset is None or image_name is None:
-            return
-
-        # remove local image
-        img_path = str(shark_root) + "/dataset/" + dataset + "/" + image_name
-        os.system(f'rm "{img_path}"')
-        # get the index for the back image
-        idx = images[dataset].index(image_name)
-        if idx == 0:
-            return gr.Dropdown.update(value=None)
-
-        return gr.Dropdown.update(value=images[dataset][idx - 1])
-
-    back_image.click(
-        fn=get_back_image, inputs=[dataset, image_name], outputs=image_name
-    )
-
-    def get_next_image(dataset, image_name):
-        if dataset is None or image_name is None:
-            return
-
-        # remove local image
-        img_path = str(shark_root) + "/dataset/" + dataset + "/" + image_name
-        os.system(f'rm "{img_path}"')
-        # get the index for the next image
-        idx = images[dataset].index(image_name)
-        if idx == len(images[dataset]) - 1:
-            return gr.Dropdown.update(value=None)
-
-        return gr.Dropdown.update(value=images[dataset][idx + 1])
-
-    next_image.click(
-        fn=get_next_image, inputs=[dataset, image_name], outputs=image_name
-    )
-
-    def finish_annotation(dataset):
-        if dataset is None:
-            return
-
-        # upload prompt and remove local data
-        dataset_path = str(shark_root) + "/dataset/" + dataset
-        dataset_gs_path = args.gs_url + "/" + dataset + "/"
-        os.system(f'gsutil cp "{dataset_path}/metadata.jsonl" "{dataset_gs_path}"')
-        os.system(f'rm -rf "{dataset_path}"')
-
-        return gr.Dropdown.update(value=None)
-
-    finish.click(fn=finish_annotation, inputs=dataset, outputs=dataset)
-
-
-if __name__ == "__main__":
-    shark_web.launch(
-        share=args.share,
-        inbrowser=True,
-        server_name="0.0.0.0",
-        server_port=args.server_port,
-    )
--- a/dataset/args.py
+++ b/dataset/args.py
@@ -1,34 +0,0 @@
-import argparse
-
-p = argparse.ArgumentParser(
-    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
-)
-
-##############################################################################
-### Dataset Annotator flags
-##############################################################################
-
-p.add_argument(
-    "--gs_url",
-    type=str,
-    required=True,
-    help="URL to datasets in GS bucket",
-)
-
-p.add_argument(
-    "--share",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="flag for generating a public URL",
-)
-
-p.add_argument(
-    "--server_port",
-    type=int,
-    default=8080,
-    help="flag for setting server port",
-)
-
-##############################################################################
-
-args = p.parse_args()
--- a/dataset/requirements.txt
+++ b/dataset/requirements.txt
@@ -1,3 +0,0 @@
-# SHARK Annotator
-gradio==3.34.0
-jsonlines
--- a/dataset/utils.py
+++ b/dataset/utils.py
@@ -1,29 +0,0 @@
-from google.cloud import storage
-
-
-def get_datasets(gs_url):
-    datasets = set()
-    images = dict()
-    ds_w_prompts = []
-
-    storage_client = storage.Client()
-    bucket_name = gs_url.split("/")[2]
-    source_blob_name = "/".join(gs_url.split("/")[3:])
-    blobs = storage_client.list_blobs(bucket_name, prefix=source_blob_name)
-
-    for blob in blobs:
-        dataset_name = blob.name.split("/")[1]
-        if dataset_name == "":
-            continue
-        datasets.add(dataset_name)
-        if dataset_name not in images.keys():
-            images[dataset_name] = []
-
-        # check if image or jsonl
-        file_sub_path = "/".join(blob.name.split("/")[2:])
-        if "/" in file_sub_path:
-            images[dataset_name] += [file_sub_path]
-        elif "metadata.jsonl" in file_sub_path:
-            ds_w_prompts.append(dataset_name)
-
-    return list(datasets), images, ds_w_prompts
--- a/docs/shark_iree_profiling.md
+++ b/docs/shark_iree_profiling.md
@@ -1,118 +0,0 @@
-# Overview
-
-This document is intended to provide a starting point for profiling with SHARK/IREE. At it's core
-[SHARK](https://github.com/nod-ai/SHARK/tree/main/tank) is a python API that links the MLIR lowerings from various
-frameworks + frontends (e.g. PyTorch -> Torch-MLIR) with the compiler + runtime offered by IREE. More information
-on model coverage and framework support can be found [here](https://github.com/nod-ai/SHARK/tree/main/tank). The intended
-use case for SHARK is for compilation and deployment of performant state of the art AI models.
-
-![image](https://user-images.githubusercontent.com/22101546/217151219-9bb184a3-cfb9-4788-bb7e-5b502953525c.png)
-
-## Benchmarking with SHARK
-
-TODO: Expand this section.
-
-SHARK offers native benchmarking support, although because it is model focused, fine grain profiling is
-hidden when compared against the common "model benchmarking suite" use case SHARK is good at.
-
-### SharkBenchmarkRunner
-
-SharkBenchmarkRunner is a class designed for benchmarking models against other runtimes.
-TODO: List supported runtimes for comparison + example on how to benchmark with it.
-
-## Directly profiling IREE
-
-A number of excellent developer resources on profiling with IREE can be
-found [here](https://github.com/iree-org/iree/tree/main/docs/developers/developing_iree). As a result this section will
-focus on the bridging the gap between the two.
- - https://github.com/iree-org/iree/blob/main/docs/developers/developing_iree/profiling.md
- - https://github.com/iree-org/iree/blob/main/docs/developers/developing_iree/profiling_with_tracy.md
- - https://github.com/iree-org/iree/blob/main/docs/developers/developing_iree/profiling_vulkan_gpu.md
- - https://github.com/iree-org/iree/blob/main/docs/developers/developing_iree/profiling_cpu_events.md
-
-Internally, SHARK builds a pair of IREE commands to compile + run a model. At a high level the flow starts with the
-model represented with a high level dialect (commonly Linalg) and is compiled to a flatbuffer (.vmfb) that
-the runtime is capable of ingesting. At this point (with potentially a few runtime flags) the compiled model is then run
-through the IREE runtime. This is all facilitated with the IREE python bindings, which offers a convenient method
-to capture the compile command SHARK comes up with. This is done by setting the environment variable
-`IREE_SAVE_TEMPS` to point to a directory of choice, e.g. for stable diffusion
-```
-# Linux
-$ export IREE_SAVE_TEMPS=/path/to/some/directory
-# Windows
-$ $env:IREE_SAVE_TEMPS="C:\path\to\some\directory"
-$ python apps/stable_diffusion/scripts/txt2img.py -p "a photograph of an astronaut riding a horse" --save_vmfb
-```
-NOTE: Currently this will only save the compile command + input MLIR for a single model if run in a pipeline.
-In the case of stable diffusion this (should) be UNet so to get examples for other models in the pipeline they
-need to be extracted and tested individually.
-
-The save temps directory should contain three files: `core-command-line.txt`, `core-input.mlir`, and `core-output.bin`.
-The command line for compilation will start something like this, where the `-` needs to be replaced with the path to `core-input.mlir`.
-```
-/home/quinn/nod/iree-build/compiler/bindings/python/iree/compiler/tools/../_mlir_libs/iree-compile - --iree-input-type=none ...
-```
-The `-o output_filename.vmfb` flag can be used to specify the location to save the compiled vmfb. Note that a dump of the
-dispatches that can be compiled + run in isolation can be generated by adding `--iree-hal-dump-executable-benchmarks-to=/some/directory`. Say, if they are in the `benchmarks` directory, the following compile/run commands would work for Vulkan on RDNA3.
-```
-iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna3-unknown-linux  benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.mlir -o benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb
-
-iree-benchmark-module --module=benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb --function=forward --device=vulkan
-```
-Where `${NUM}` is the dispatch number that you want to benchmark/profile in isolation.
-
-### Enabling Tracy for Vulkan profiling
-
-To begin profiling with Tracy, a build of IREE runtime with tracing enabled is needed. SHARK-Runtime (SRT) builds an
-instrumented version alongside the normal version nightly (.whls typically found [here](https://github.com/nod-ai/SRT/releases)), however this is only available for Linux. For Windows, tracing can be enabled by enabling a CMake flag.
-```
-$env:IREE_ENABLE_RUNTIME_TRACING="ON"
-```
-Getting a trace can then be done by setting environment variable `TRACY_NO_EXIT=1` and running the program that is to be
-traced. Then, to actually capture the trace, use the `iree-tracy-capture` tool in a different terminal. Note that to get
-the capture and profiler tools the `IREE_BUILD_TRACY=ON` CMake flag needs to be set.
-```
-TRACY_NO_EXIT=1 python apps/stable_diffusion/scripts/txt2img.py -p "a photograph of an astronaut riding a horse"
-
-# (in another terminal, either on the same machine or through ssh with a tunnel through port 8086)
-iree-tracy-capture -o trace_filename.tracy
-```
-To do it over ssh, the flow looks like this
-```
-# From terminal 1 on local machine
-ssh -L 8086:localhost:8086 <remote_server_name>
-TRACY_NO_EXIT=1 python apps/stable_diffusion/scripts/txt2img.py -p "a photograph of an astronaut riding a horse"
-
-# From terminal 2 on local machine. Requires having built IREE with the CMake flag `IREE_BUILD_TRACY=ON` to build the required tooling.
-iree-tracy-capture -o /path/to/trace.tracy
-```
-
-The trace can then be viewed with
-```
-iree-tracy-profiler /path/to/trace.tracy
-```
-Capturing a runtime trace will work with any IREE tooling that uses the runtime. For example, `iree-benchmark-module`
-can be used for benchmarking an individual module. Importantly this means that any SHARK script can be profiled with tracy.
-
-NOTE: Not all backends have the same tracy support. This writeup is focused on CPU/Vulkan backends but there is recently added support for tracing on CUDA (requires the `--cuda_tracing` flag).
-
-## Experimental RGP support
-
-TODO: This section is temporary until proper RGP support is added.
-
-Currently, for stable diffusion there is a flag for enabling UNet to be visible to RGP with `--enable_rgp`. To get a proper capture though, the `DevModeSqttPrepareFrameCount=1` flag needs to be set for the driver (done with `VkPanel` on Windows).
-With these two settings, a single iteration of UNet can be captured.
-
-(AMD only) To get a dump of the pipelines (result of compiled SPIR-V) the `EnablePipelineDump=1` driver flag can be set. The
-files will typically be dumped to a directory called `spvPipeline` (on Linux `/var/tmp/spvPipeline`. The dumped files will
-include header information that can be used to map back to the source dispatch/SPIR-V, e.g.
-```
-[Version]
-version = 57 
-
-[CsSpvFile]
-fileName = Shader_0x946C08DFD0C10D9A.spv
-
-[CsInfo]
-entryPoint = forward_dispatch_193_matmul_256x65536x2304
-```
--- a/docs/shark_sd_blender.md
+++ b/docs/shark_sd_blender.md
@@ -1,75 +0,0 @@
-# Overview
-
-This document is intended to provide a starting point for using SHARK stable diffusion with Blender. 
-
-We currently make use of the [AI-Render Plugin](https://github.com/benrugg/AI-Render) to integrate with Blender.
-
-## Setup SHARK and prerequisites:
-
- * Download the latest SHARK SD webui .exe from [here](https://github.com/nod-ai/SHARK/releases) or follow instructions on the [README](https://github.com/nod-ai/SHARK#readme)
- * Once you have the .exe where you would like SHARK to install, run the .exe from terminal/PowerShell with the `--api` flag:
-```
-## Run the .exe in API mode:
-.\shark_sd_<date>_<ver>.exe --api
-
-## For example:
-.\shark_sd_20230411_671.exe --api --server_port=8082
-
-## From a the base directory of a source clone of SHARK:
-./setup_venv.ps1
-python apps\stable_diffusion\web\index.py --api
-
-```
-
-Your local SD server should start and look something like this:
-![image](https://user-images.githubusercontent.com/87458719/231369758-e2c3c45a-eccc-4fe5-a788-4a3bf1ace1d1.png)
-
- * Note: When running in api mode with `--api`, the .exe will not function as a webUI. Thus, the address in the terminal output will only be useful for API requests.
-
-### Install AI Render
-
- Get AI Render on [Blender Market](https://blendermarket.com/products/ai-render) or [Gumroad](https://airender.gumroad.com/l/ai-render)
- Open Blender, then go to Edit > Preferences > Add-ons > Install and then find the zip file
- We will be using the Automatic1111 SD backend for the AI-Render plugin. Follow instructions [here](https://github.com/benrugg/AI-Render/wiki/Local-Installation) to setup local SD backend.
-
-Your AI-Render preferences should be configured as shown; the highlighted part should match your terminal output:
-![image](https://user-images.githubusercontent.com/87458719/231390322-59a54a09-520a-4a08-b658-6e37bd63e932.png)
-
-
-The [AI-Render README](https://github.com/benrugg/AI-Render/blob/main/README.md) has more details on installation and usage, as well as video tutorials.
-
-## Using AI-Render + SHARK in your Blender project
-
- In the Render Properties tab, in the AI-Render dropdown, enable AI-Render.
-
-![image](https://user-images.githubusercontent.com/87458719/231392843-9bd51744-3ce2-464e-843a-0c4d4c96df0c.png)
-
- Select an image size (it's usually better to upscale later than go high on the img2img resolution here.)
-
-![image](https://user-images.githubusercontent.com/87458719/231394288-0c4ab8c5-dc30-4dbe-8bc1-7520ded5efe8.png)
-
- From here, you can enter a prompt and configure img2img Stable Diffusion parameters, and AI-Render will run SHARK SD img2img on the rendered scene.
- AI-Render has useful presets for aesthetic styles, so you should be able to keep your subject prompt simple and focus on creating a decent Blender scene to start from.
-
-![image](https://user-images.githubusercontent.com/87458719/231440729-2fe69586-41cb-4274-9ce7-f6c08def600b.png)
-
-## Examples:
-Scene (Input image):
-
-![blender-sample-2](https://user-images.githubusercontent.com/87458719/231450408-0e680086-3e52-4962-a5c1-c703a94d1583.png)
-
-Prompt:
-"A bowl of tangerines in front of rocks, masterpiece, oil on canvas, by Georgia O'Keefe, trending on artstation, landscape painting by Caspar David Friedrich"
-
-Negative Prompt (default):
-"ugly, bad art, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, blurry, bad anatomy, blurred, watermark, grainy, tiling, signature, cut off, draft"
-
-Example output:
-
-![blender-sample-2_out](https://user-images.githubusercontent.com/87458719/231451145-a0b56897-a7d0-4add-bbed-7e8af21a65df.png)
-
-
-
-
-
-
--- a/docs/shark_sd_koboldcpp.md
+++ b/docs/shark_sd_koboldcpp.md
@@ -1,140 +0,0 @@
-# Overview
-
-In [1.47.2](https://github.com/LostRuins/koboldcpp/releases/tag/v1.47.2) [Koboldcpp](https://github.com/LostRuins/koboldcpp) added AUTOMATIC1111 integration for image generation. Since SHARK implements a small subset of the A1111 REST api, you can also use SHARK for this. This document gives a starting point for how to get this working.
-
-## In Action
-
-![preview](https://user-images.githubusercontent.com/121311569/280557602-bb97bad0-fdf5-4922-a2cc-4f327f2760db.jpg)
-
-## Memory considerations
-
-Since both Koboldcpp and SHARK will use VRAM on your graphic card(s) running both at the same time using the same card will impose extra limitations on the model size you can fully offload to the video card in Koboldcpp. For me, on a RX 7900 XTX on Windows with 24 GiB of VRAM, the limit was about a 13 Billion parameter model with Q5_K_M quantisation.
-
-## Performance Considerations
-
-When using SHARK for image generation, especially with Koboldcpp, you need to be aware that it is currently designed to pay a large upfront cost in time compiling and tuning the model you select, to get an optimal individual image generation time. You need to be the judge as to whether this trade-off is going to be worth it for your OS and hardware combination.
-
-It means that the first time you run a particular Stable Diffusion model for a particular combination of image size, LoRA, and VAE, SHARK will spend *many minutes* - even on a beefy machaine with very fast graphics card with lots of memory - building that model combination just so it can save it to disk. It may even have to go away and download the model if it doesn't already have it locally. Once it has done its build of a model combination for your hardware once, it shouldn't need to do it again until you upgrade to a newer SHARK version, install different drivers or change your graphics hardware. It will just upload the files it generated the first time to your graphics card and proceed from there.
-
-This does mean however, that on a brand new fresh install of SHARK that has not generated any images on a model you haven't selected before, the first image Koboldcpp requests may look like it is *never* going finish and that the whole process has broken. Be forewarned, make yourself a cup of coffee, and expect a lot of messages about compilation and tuning from SHARK in the terminal you ran it from.
-
-## Setup SHARK and prerequisites:
-
- * Make sure you have suitable drivers for your graphics card installed. See the prerequisties section of the [README](https://github.com/nod-ai/SHARK#readme).
- * Download the latest SHARK studio .exe from [here](https://github.com/nod-ai/SHARK/releases) or follow the instructions in the [README](https://github.com/nod-ai/SHARK#readme) for an advanced, Linux or Mac install.
- * Run SHARK from terminal/PowerShell with the `--api` flag. Since koboldcpp also expects both CORS support and the image generator to be running on port `7860` rather than SHARK default of `8080`, also include both the `--api_accept_origin` flag with a suitable origin (use `="*"` to enable all origins) and `--server_port=7860` on the command line. (See the if you want to run SHARK on a different port)
-
-```powershell
-## Run the .exe in API mode, with CORS support, on the A1111 endpoint port:
-.\node_ai_shark_studio_<date>_<ver>.exe --api --api_accept_origin="*"  --server_port=7860
-
-## Run trom the base directory of a source clone of SHARK on Windows:
-.\setup_venv.ps1
-python .\apps\stable_diffusion\web\index.py --api --api_accept_origin="*"  --server_port=7860
-
-## Run a the base directory of a source clone of SHARK on Linux:
-./setup_venv.sh
-source shark.venv/bin/activate
-python ./apps/stable_diffusion/web/index.py --api --api_accept_origin="*"  --server_port=7860
-
-## An example giving improved performance on AMD cards using vulkan, that runs on the same port as A1111
-.\node_ai_shark_studio_20320901_2525.exe --api --api_accept_origin="*" --device_allocator="caching" --server_port=7860
-
-## Since the api respects most applicable SHARK command line arguments for options not specified,
-## or currently unimplemented by API, there might be some you want to set, as listed in `--help`
-.\node_ai_shark_studio_20320901_2525.exe --help
-
-## For instance, the example above, but with a a custom VAE specified
-.\node_ai_shark_studio_20320901_2525.exe --api --api_accept_origin="*" --device_allocator="caching" --server_port=7860 --custom_vae="clearvae_v23.safetensors"
-
-## An example with multiple specific CORS origins
-python apps/stable_diffusion/web/index.py --api --api_accept_origin="koboldcpp.example.com:7001" --api_accept_origin="koboldcpp.example.com:7002" --server_port=7860
-```
-
-SHARK should start in server mode, and you should see something like this:
-
-![SHARK API startup](https://user-images.githubusercontent.com/121311569/280556294-c3f7fc1a-c8e2-467d-afe6-365638d6823a.png)
-
-* Note: When running in api mode with `--api`, the .exe will not function as a webUI. Thus, the address or port shown in the terminal output will only be useful for API requests.
-
-
-## Configure Koboldcpp for local image generation:
-
-* Get the latest [Koboldcpp](https://github.com/LostRuins/koboldcpp/releases) if you don't already have it. If you have a recent AMD card that has ROCm HIP [support for Windows](https://rocmdocs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus) or [support for Linux](https://rocmdocs.amd.com/en/latest/release/gpu_os_support.html#linux-supported-gpus), you'll likely prefer [YellowRosecx's ROCm fork](https://github.com/YellowRoseCx/koboldcpp-rocm).
-* Start Koboldcpp in another terminal/Powershell and setup your model configuration. Refer to the [Koboldcpp README](https://github.com/YellowRoseCx/koboldcpp-rocm) for more details on how to do this if this is your first time using Koboldcpp.
-* Once the main UI has loaded into your browser click the settings button, go to the advanced tab, and then choose *Local A1111* from the generate images dropdown:
-
-  ![Settings button location](https://user-images.githubusercontent.com/121311569/280556246-10692d79-e89f-4fdf-87ba-82f3d78ed49d.png)
-
-  ![Advanced Settings with 'Local A1111' location](https://user-images.githubusercontent.com/121311569/280556234-6ebc8ba7-1469-442a-93a7-5626a094ddf1.png)
-
-  *if you get an error here, see the next section [below](#connecting-to-shark-on-a-different-address-or-port)*
-
-* A list of Stable Diffusion models available to your SHARK instance should now be listed in the box below *generate images*. The default value will usually be set to `stabilityai/stable-diffusion-2-1-base`. Choose the model you want to use for image generation from the list (but see [performance considerations](#performance-considerations)).
-* You should now be ready to generate images, either by clicking the 'Add Img' button above the text entry box:
-
-  ![Add Image Button](https://user-images.githubusercontent.com/121311569/280556161-846c7883-4a83-4458-a56a-bd9f93ca354c.png)
-
-  ...or by selecting the 'Autogenerate' option in the settings:
-
-  ![Setting the autogenerate images option](https://user-images.githubusercontent.com/121311569/280556230-ae221a46-ba68-499b-a519-c8f290bbbeae.png)
-
-  *I often find that even if I have selected autogenerate I have to do an 'add img' to get things started off*
-
-* There is one final piece of image generation configuration within Koboldcpp you might want to do. This is also in the generate images section of advanced settings. Here there is, not very obviously, a 'style' button:
-
-  ![Selecting the 'styles' button](https://user-images.githubusercontent.com/121311569/280556694-55cd1c55-a059-4b54-9293-63d66a32368e.png)
-
-  This will bring up a dialog box where you can enter a short text that will sent as a prefix to the Prompt sent to SHARK:
-
-  ![Entering extra image styles](https://user-images.githubusercontent.com/121311569/280556172-4aab9794-7a77-46d7-bdda-43df570ad19a.png)
-
-
-## Connecting to SHARK on a different address or port
-
-If you didn't set the port to `--server_port=7860` when starting SHARK, or you are running it on different machine on your network than you are running Koboldcpp, or to where you are running the koboldcpp's kdlite client frontend, then you very likely got the following error:
-
-  ![Can't find the A1111 endpoint error](https://user-images.githubusercontent.com/121311569/280555857-601f53dc-35e9-4027-9180-baa61d2393ba.png)
-
-As long as SHARK is running correctly, this means you need to set the url and port to the correct values in Koboldcpp. For instance. to set the port that Koboldcpp looks for an image generator to SHARK's default port of 8080:
-
-* Select the cog icon the Generate Images section of Advanced settings:
-
-     ![Selecting the endpoint cog](https://user-images.githubusercontent.com/121311569/280555866-4287ecc5-f29f-4c03-8f5a-abeaf31b0442.png)
-
-* Then edit the port number at the end of the url in the 'A1111 Endpoint Selection' dialog box to read 8080:
-
-     ![Changing the endpoint port](https://user-images.githubusercontent.com/121311569/280556170-f8848b7b-6fc9-4cf7-80eb-5c312f332fd9.png)
-
-* Similarly, when running SHARK on a different machine you will need to change host part of the endpoint url to the hostname or ip address where SHARK is running, similarly:
-
-    ![Changing the endpoint hostname](https://user-images.githubusercontent.com/121311569/280556167-c6541dea-0f85-417a-b661-fdf4dc40d05f.png)
-
-## Examples
-
-Here's how Koboldcpp shows an image being requested:
-
-  ![An image being generated]((https://user-images.githubusercontent.com/121311569/280556210-bb1c9efd-79ac-478e-b726-b25b82ef2186.png)
-
-The generated image in context in story mode:
-
- ![A generated image](https://user-images.githubusercontent.com/121311569/280556179-4e9f3752-f349-4cba-bc6a-f85f8dc79b10.jpg)
-
-And the same image when clicked on:
-
- ![A selected image](https://user-images.githubusercontent.com/121311569/280556216-2ca4c0a4-3889-4ef5-8a09-30084fb34081.jpg)
-
-
-## Where to find the images in SHARK
-
-Even though Koboldcpp requests images at a size of 512x512, it resizes then to 256x256, converts them to `.jpeg`, and only shows them at 200x200 in the main text window. It does this so it can save them compactly embedded in your story as a `data://` uri.
-
-However the images at the original size are saved by SHARK in its `output_dir` which is usually a folder named for the current date. inside `generated_imgs` folder in the SHARK installation directory.
-
-You can browse these, either using the Output Gallery tab from within the SHARK web ui:
-
-  ![SHARK web ui output gallery tab](https://user-images.githubusercontent.com/121311569/280556582-9303ca85-2594-4a8c-97a2-fbd72337980b.jpg)
-
-...or by browsing to the `output_dir` in your operating system's file manager:
-
-  ![SHARK output directory subfolder in Windows File Explorer](https://user-images.githubusercontent.com/121311569/280556297-66173030-2324-415c-a236-ef3fcd73e6ed.jpg)
--- a/generate_sharktank.py
+++ b/generate_sharktank.py
@@ -0,0 +1,251 @@
+# Lint as: python3
+"""SHARK Tank"""
+# python generate_sharktank.py, you have to give a csv file with [model_name, model_download_url]
+# will generate local shark tank folder like this:
+#   HOME
+#     /.local
+#       /shark_tank
+#           /albert_lite_base
+#           /...model_name...
+#
+
+import os
+import csv
+import argparse
+from shark.shark_importer import SharkImporter
+from shark.parser import shark_args
+import tensorflow as tf
+import subprocess as sp
+import hashlib
+import numpy as np
+from pathlib import Path
+
+visible_default = tf.config.list_physical_devices("GPU")
+try:
+    tf.config.set_visible_devices([], "GPU")
+    visible_devices = tf.config.get_visible_devices()
+    for device in visible_devices:
+        assert device.device_type != "GPU"
+except Exception:
+    # Best effort: invalid device, or devices cannot be modified once initialized.
+    pass
+
+
+def create_hash(file_name):
+    """Return the hex BLAKE2b digest of a file, read in 1 MiB chunks."""
+    digest = hashlib.blake2b()
+    with open(file_name, "rb") as stream:
+        for block in iter(lambda: stream.read(2**20), b""):
+            digest.update(block)
+    return digest.hexdigest()
+
+
+def save_torch_model(torch_model_list):
+    """Import each torch model listed in a CSV file through SharkImporter.
+
+    Rows after the header hold: model name, tracing_required, model type
+    and is_dynamic. Work happens in WORKDIR/<model name>_torch, where the
+    BLAKE2b hash of the imported .mlir file is also saved.
+    """
+    from tank.model_utils import get_hf_model
+    from tank.model_utils import get_vision_model
+    from tank.model_utils import get_hf_img_cls_model
+
+    loaders = {
+        "vision": get_vision_model,
+        "hf": get_hf_model,
+        "hf_img_cls": get_hf_img_cls_model,
+    }
+    with open(torch_model_list) as csvfile:
+        torch_reader = csv.reader(csvfile, delimiter=",")
+        next(torch_reader)  # Skip the header row.
+        for row in torch_reader:
+            torch_model_name, model_type = row[0], row[2]
+            # Any value other than the literal "False" counts as True.
+            tracing_required = row[1] != "False"
+            is_dynamic = row[3] != "False"
+
+            model, model_input = None, None
+            if model_type in loaders:
+                model, model_input, _ = loaders[model_type](torch_model_name)
+
+            torch_model_name = torch_model_name.replace("/", "_")
+            torch_model_dir = os.path.join(
+                WORKDIR, str(torch_model_name) + "_torch"
+            )
+            os.makedirs(torch_model_dir, exist_ok=True)
+
+            mlir_importer = SharkImporter(
+                model, (model_input,), frontend="torch"
+            )
+            mlir_importer.import_debug(
+                is_dynamic=False,
+                tracing_required=tracing_required,
+                dir=torch_model_dir,
+                model_name=torch_model_name,
+            )
+            mlir_path = os.path.join(
+                torch_model_dir, torch_model_name + "_torch" + ".mlir"
+            )
+            mlir_hash = create_hash(mlir_path)
+            np.save(os.path.join(torch_model_dir, "hash"), np.array(mlir_hash))
+            # Generate torch dynamic models.
+            if is_dynamic:
+                mlir_importer.import_debug(
+                    is_dynamic=True,
+                    tracing_required=tracing_required,
+                    dir=torch_model_dir,
+                    model_name=torch_model_name + "_dynamic",
+                )
+
+
+def save_tf_model(tf_model_list):
+    """Import each TensorFlow model listed in a CSV file via SharkImporter.
+
+    Rows after the header hold the model name and its type ("hf", "img",
+    "keras" or "TFhf"). The BLAKE2b hash of the imported .mlir file is
+    saved in WORKDIR/<model name>_tf.
+    """
+    from tank.model_utils_tf import (
+        get_causal_image_model,
+        get_causal_lm_model,
+        get_keras_model,
+        get_TFhf_model,
+    )
+
+    loaders = {
+        "hf": get_causal_lm_model,
+        "img": get_causal_image_model,
+        "keras": get_keras_model,
+        "TFhf": get_TFhf_model,
+    }
+    with open(tf_model_list) as csvfile:
+        tf_reader = csv.reader(csvfile, delimiter=",")
+        next(tf_reader)  # Skip the header row.
+        for row in tf_reader:
+            tf_model_name, model_type = row[0], row[1]
+
+            model, model_input = None, None
+            print(f"Generating artifacts for model {tf_model_name}")
+            if model_type in loaders:
+                model, model_input, _ = loaders[model_type](tf_model_name)
+
+            tf_model_name = tf_model_name.replace("/", "_")
+            tf_model_dir = os.path.join(WORKDIR, str(tf_model_name) + "_tf")
+            os.makedirs(tf_model_dir, exist_ok=True)
+
+            mlir_importer = SharkImporter(model, model_input, frontend="tf")
+            mlir_importer.import_debug(
+                dir=tf_model_dir,
+                model_name=tf_model_name,
+            )
+            mlir_hash = create_hash(
+                os.path.join(tf_model_dir, tf_model_name + "_tf" + ".mlir")
+            )
+            np.save(os.path.join(tf_model_dir, "hash"), np.array(mlir_hash))
+
+
+def save_tflite_model(tflite_model_list):
+    """Import each TFLite model listed in a CSV file via SharkImporter.
+
+    Every row (no header row is skipped) holds a model name and a link;
+    the link is only printed here. The BLAKE2b hash of the imported .mlir
+    file is saved in WORKDIR/<model name>_tflite.
+    """
+    from shark.tflite_utils import TFLitePreprocessor
+
+    with open(tflite_model_list) as csvfile:
+        for row in csv.reader(csvfile, delimiter=","):
+            print("\n")
+            model_name, model_link = row[0], row[1]
+            print("tflite_model_name", model_name)
+            print("tflite_model_link", model_link)
+            out_dir = os.path.join(WORKDIR, str(model_name) + "_tflite")
+            os.makedirs(out_dir, exist_ok=True)
+            print(f"TMP_TFLITE_MODELNAME_DIR = {out_dir}")
+
+            # Preprocess to get SharkImporter input args
+            preprocessor = TFLitePreprocessor(str(model_name))
+            raw_model_file_path = preprocessor.get_raw_model_file()
+            inputs = preprocessor.get_inputs()
+            interpreter = preprocessor.get_interpreter()
+
+            # Use SharkImporter to get SharkInference input args
+            importer = SharkImporter(
+                module=interpreter,
+                inputs=inputs,
+                frontend="tflite",
+                raw_model_file=raw_model_file_path,
+            )
+            importer.import_debug(
+                dir=out_dir,
+                model_name=model_name,
+                func_name="main",
+            )
+            mlir_path = os.path.join(
+                out_dir, model_name + "_tflite" + ".mlir"
+            )
+            mlir_hash = create_hash(mlir_path)
+            np.save(
+                os.path.join(out_dir, "hash"),
+                np.array(mlir_hash),
+            )
+
+
+# Validates whether the file is present or not.
+def is_valid_file(arg):
+    """Return `arg` unchanged when that path exists, otherwise None."""
+    if os.path.exists(arg):
+        return arg
+    return None
+
+
+if __name__ == "__main__":
+    def _str_to_bool(value):
+        """Parse a flag value; argparse's type=bool treats "False" as True."""
+        return str(value).strip().lower() in ("1", "true", "t", "yes", "y", "on")
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--torch_model_csv",
+        type=is_valid_file,
+        default="./tank/pytorch/torch_model_list.csv",
+        help="""Contains the file with torch_model name and args.
+             Please see: https://github.com/nod-ai/SHARK/blob/main/tank/pytorch/torch_model_list.csv""",
+    )
+    parser.add_argument(
+        "--tf_model_csv",
+        type=is_valid_file,
+        default="./tank/tf/tf_model_list.csv",
+        help="Contains the file with tf model name and args.",
+    )
+    parser.add_argument(
+        "--tflite_model_csv",
+        type=is_valid_file,
+        default="./tank/tflite/tflite_model_list.csv",
+        help="Contains the file with tflite model name and link.",
+    )
+    # Boolean flags take an explicit value, e.g. `--upload True`.
+    parser.add_argument("--ci_tank_dir", type=_str_to_bool, default=False)
+    parser.add_argument("--upload", type=_str_to_bool, default=False)
+
+    args = parser.parse_args()
+
+    home = str(Path.home())
+    # WORKDIR is a module global read by the save_*_model functions.
+    if args.ci_tank_dir:
+        WORKDIR = os.path.join(os.path.dirname(__file__), "gen_shark_tank")
+    else:
+        WORKDIR = os.path.join(home, ".local/shark_tank/")
+
+    if args.torch_model_csv:
+        save_torch_model(args.torch_model_csv)
+    if args.tf_model_csv:
+        save_tf_model(args.tf_model_csv)
+    if args.tflite_model_csv:
+        save_tflite_model(args.tflite_model_csv)
+
+    if args.upload:
+        git_hash = sp.getoutput("git log -1 --format='%h'") + "/"
+        print("uploading files to gs://shark_tank/" + git_hash)
+        os.system(f"gsutil cp -r {WORKDIR}* gs://shark_tank/" + git_hash)
--- a/inference/CMakeLists.txt
+++ b/inference/CMakeLists.txt
@@ -0,0 +1,192 @@
+# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+cmake_minimum_required(VERSION 3.17)
+
+project(sharkbackend LANGUAGES C CXX)
+
+#
+# Options
+#
+
+option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON)
+option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)
+
+set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
+set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
+set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")
+
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+#
+# Dependencies
+#
+# FetchContent requires us to include the transitive closure of all
+# repos that we depend on so that we can override the tags.
+#
+include(FetchContent)
+
+FetchContent_Declare(
+  repo-common
+  GIT_REPOSITORY https://github.com/triton-inference-server/common.git
+  GIT_TAG ${TRITON_COMMON_REPO_TAG}
+  GIT_SHALLOW ON
+)
+FetchContent_Declare(
+  repo-core
+  GIT_REPOSITORY https://github.com/triton-inference-server/core.git
+  GIT_TAG ${TRITON_CORE_REPO_TAG}
+  GIT_SHALLOW ON
+)
+FetchContent_Declare(
+  repo-backend
+  GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
+  GIT_TAG ${TRITON_BACKEND_REPO_TAG}
+  GIT_SHALLOW ON
+)
+FetchContent_MakeAvailable(repo-common repo-core repo-backend)
+
+#
+# The backend must be built into a shared library. Use an ldscript to
+# hide all symbols except for the TRITONBACKEND API.
+#
+configure_file(src/libtriton_dshark.ldscript libtriton_dshark.ldscript COPYONLY)
+
+add_library(
+  triton-dshark-backend SHARED
+  src/dshark.cc
+  #src/dshark_driver_module.c
+)
+
+add_library(
+  SharkBackend::triton-dshark-backend ALIAS triton-dshark-backend
+)
+
+target_include_directories(
+  triton-dshark-backend
+  PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
+list(APPEND CMAKE_MODULE_PATH "${PROJECT_BINARY_DIR}/lib/cmake/mlir")
+# Add the vendored runtime tree; EXCLUDE_FROM_ALL keeps its targets out of the default build unless depended on.
+add_subdirectory(thirdparty/shark-runtime EXCLUDE_FROM_ALL)
+# Link the iree_* libraries the backend uses (presumably defined by thirdparty/shark-runtime -- confirm).
+target_link_libraries(triton-dshark-backend PRIVATE iree_base_base
+  iree_hal_hal
+  iree_hal_cuda_cuda
+  iree_hal_cuda_registration_registration
+  iree_hal_vmvx_registration_registration
+  iree_hal_dylib_registration_registration
+  iree_modules_hal_hal
+  iree_vm_vm
+  iree_vm_bytecode_module
+  iree_hal_local_loaders_system_library_loader
+  iree_hal_local_loaders_vmvx_module_loader
+  )
+# Compile the backend sources with at least C++11.
+target_compile_features(triton-dshark-backend PRIVATE cxx_std_11)
+
+
+target_link_libraries(
+  triton-dshark-backend
+  PRIVATE
+    triton-core-serverapi   # from repo-core
+    triton-core-backendapi  # from repo-core
+    triton-core-serverstub  # from repo-core
+    triton-backend-utils    # from repo-backend
+)
+
+if(WIN32)
+  set_target_properties(
+    triton-dshark-backend PROPERTIES
+    POSITION_INDEPENDENT_CODE ON
+    OUTPUT_NAME triton_dshark
+  )
+else()
+  set_target_properties(
+    triton-dshark-backend PROPERTIES
+    POSITION_INDEPENDENT_CODE ON
+    OUTPUT_NAME triton_dshark
+    LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_dshark.ldscript
+    LINK_FLAGS "-Wl,--version-script libtriton_dshark.ldscript"
+  )
+endif()
+
+
+
+#
+# Install
+#
+include(GNUInstallDirs)
+set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/SharkBackend)
+
+install(
+  TARGETS
+    triton-dshark-backend
+  EXPORT
+    triton-dshark-backend-targets
+  LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/dshark
+  RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/dshark
+)
+
+install(
+  EXPORT
+    triton-dshark-backend-targets
+  FILE
+    SharkBackendTargets.cmake
+  NAMESPACE
+    SharkBackend::
+  DESTINATION
+    ${INSTALL_CONFIGDIR}
+)
+
+include(CMakePackageConfigHelpers)
+configure_package_config_file(
+  ${CMAKE_CURRENT_LIST_DIR}/cmake/SharkBackendConfig.cmake.in
+  ${CMAKE_CURRENT_BINARY_DIR}/SharkBackendConfig.cmake
+  INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
+)
+
+install(
+  FILES
+  ${CMAKE_CURRENT_BINARY_DIR}/SharkBackendConfig.cmake
+  DESTINATION ${INSTALL_CONFIGDIR}
+)
+
+#
+# Export from build tree
+#
+export(
+  EXPORT triton-dshark-backend-targets
+  FILE ${CMAKE_CURRENT_BINARY_DIR}/SharkBackendTargets.cmake
+  NAMESPACE SharkBackend::
+)
+
+export(PACKAGE SharkBackend)
+
--- a/inference/README.md
+++ b/inference/README.md
@@ -0,0 +1,100 @@
+# SHARK Triton Backend
+
+The triton backend for shark.
+
+# Build
+
+Install SHARK
+
+```
+git clone https://github.com/nod-ai/SHARK.git
+# skip above step if dshark is already installed
+cd SHARK/inference
+```
+
+Install dependencies
+
+```
+apt-get install patchelf rapidjson-dev python3-dev
+git submodule update --init
+```
+
+update the submodules of iree
+
+```
+cd thirdparty/shark-runtime
+git submodule update --init
+```
+
+Next, make the backend and install it
+
+```
+cd ../..
+mkdir build && cd build
+cmake -DTRITON_ENABLE_GPU=ON \
+-DIREE_HAL_DRIVER_CUDA=ON \
+-DIREE_TARGET_BACKEND_CUDA=ON \
+-DMLIR_ENABLE_CUDA_RUNNER=ON \
+-DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \
+-DTRITON_BACKEND_REPO_TAG=r22.02 \
+-DTRITON_CORE_REPO_TAG=r22.02 \
+-DTRITON_COMMON_REPO_TAG=r22.02 ..
+make install
+```
+
+# Incorporating into Triton
+
+There are much more in-depth explanations of the following steps in Triton's documentation:
+https://github.com/triton-inference-server/server/blob/main/docs/compose.md#triton-with-unsupported-and-custom-backends
+
+There should be a file at /build/install/backends/dshark/libtriton_dshark.so.  You will need to copy it into your triton server image.  
+More documentation is in the link above, but to create the docker image, you need to run the compose.py command in the triton-backend server repo
+
+
+To first build your image, clone the tritonserver repo.
+
+```
+git clone https://github.com/triton-inference-server/server.git
+```
+
+then run `compose.py` to build a docker compose file 
+```
+cd server
+python3 compose.py --repoagent checksum --dry-run
+```
+
+Because dshark is a third party backend, you will need to manually modify the `Dockerfile.compose` to include the dshark backend.  To do this, in the Dockerfile.compose file produced, copy this line.
+the dshark backend will be located in the build folder from earlier under `/build/install/backends`
+
+```
+COPY /path/to/build/install/backends/dshark /opt/tritonserver/backends/dshark
+```
+
+Next run 
+```
+docker build -t tritonserver_custom -f Dockerfile.compose .
+docker run -it --gpus=1 --net=host -v/path/to/model_repos:/models  tritonserver_custom:latest tritonserver --model-repository=/models
+```
+
+where `path/to/model_repos` is where you are storing the models you want to run
+
+If you're not using GPUs, omit `--gpus=1`
+
+```
+docker run -it  --net=host -v/path/to/model_repos:/models  tritonserver_custom:latest tritonserver --model-repository=/models
+```
+
+# Setting up a model
+
+to include a model in your backend, add a directory with your model name to your model repository directory.  examples of models can be seen here: https://github.com/triton-inference-server/backend/tree/main/examples/model_repos/minimal_models
+
+make sure to adjust the input correctly in the config.pbtxt file, and save a vmfb file under 1/model.vmfb
+
+# CUDA
+
+If you're having issues with CUDA, make sure the correct drivers are installed and that `nvidia-smi` works, and also make sure that the nvcc compiler is on the path.
+
+
+
+
+
--- a/inference/cmake/SharkBackendConfig.cmake.in
+++ b/inference/cmake/SharkBackendConfig.cmake.in
@@ -0,0 +1,39 @@
+# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include(CMakeFindDependencyMacro)
+
+get_filename_component(
+  SHARKBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
+)
+
+list(APPEND CMAKE_MODULE_PATH ${SHARKBACKEND_CMAKE_DIR})
+
+if(NOT TARGET SharkBackend::triton-dshark-backend)
+  include("${SHARKBACKEND_CMAKE_DIR}/SharkBackendTargets.cmake")
+endif()
+
+set(SHARKBACKEND_LIBRARIES SharkBackend::triton-dshark-backend)
--- a/inference/src/dshark.cc
+++ b/inference/src/dshark.cc
--- a/inference/src/libtriton_dshark.ldscript
+++ b/inference/src/libtriton_dshark.ldscript
@@ -0,0 +1,30 @@
+# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+{
+  global:
+    TRITONBACKEND_*;
+  local: *;
+};
--- a/inference/thirdparty/shark-runtime
+++ b/inference/thirdparty/shark-runtime
--- a/process_skipfiles.py
+++ b/process_skipfiles.py
@@ -1,66 +0,0 @@
-# This script will toggle the comment/uncommenting aspect for dealing
-# with __file__ AttributeError arising in case of a few modules in
-# `torch/_dynamo/skipfiles.py` (within shark.venv)
-
-from distutils.sysconfig import get_python_lib
-import fileinput
-from pathlib import Path
-
-# Temporary workaround for transformers/__init__.py.
-path_to_transformers_hook = Path(
-    get_python_lib() + "/_pyinstaller_hooks_contrib/hooks/stdhooks/hook-transformers.py"
-)
-if path_to_transformers_hook.is_file():
-    pass
-else:
-    with open(path_to_transformers_hook, "w") as f:
-        f.write("module_collection_mode = 'pyz+py'")
-
-path_to_skipfiles = Path(get_python_lib() + "/torch/_dynamo/skipfiles.py")
-
-modules_to_comment = ["abc,", "os,", "posixpath,", "_collections_abc,"]
-startMonitoring = 0
-for line in fileinput.input(path_to_skipfiles, inplace=True):
-    if "SKIP_DIRS = " in line:
-        startMonitoring = 1
-        print(line, end="")
-    elif startMonitoring in [1, 2]:
-        if "]" in line:
-            startMonitoring += 1
-            print(line, end="")
-        else:
-            flag = True
-            for module in modules_to_comment:
-                if module in line:
-                    if not line.startswith("#"):
-                        print(f"#{line}", end="")
-                    else:
-                        print(f"{line[1:]}", end="")
-                    flag = False
-                    break
-            if flag:
-                print(line, end="")
-    else:
-        print(line, end="")
-
-# For getting around scikit-image's packaging, laze_loader has had a patch merged but yet to be released.
-# Refer: https://github.com/scientific-python/lazy_loader
-path_to_lazy_loader = Path(get_python_lib() + "/lazy_loader/__init__.py")
-
-for line in fileinput.input(path_to_lazy_loader, inplace=True):
-    if 'stubfile = filename if filename.endswith("i")' in line:
-        print(
-            '    stubfile = (filename if filename.endswith("i") else f"{os.path.splitext(filename)[0]}.pyi")',
-            end="",
-        )
-    else:
-        print(line, end="")
-
-# For getting around timm's packaging.
-# Refer: https://github.com/pyinstaller/pyinstaller/issues/5673#issuecomment-808731505
-path_to_timm_activations = Path(get_python_lib() + "/timm/layers/activations_jit.py")
-for line in fileinput.input(path_to_timm_activations, inplace=True):
-    if "@torch.jit.script" in line:
-        print("@torch.jit._script_if_tracing", end="\n")
-    else:
-        print(line, end="")
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,26 +4,9 @@ requires = [
    "wheel",
    "packaging",

-    "numpy>=1.22.4",
-    "iree-compiler>=20221022.190",
-    "iree-runtime>=20221022.190",
+    "numpy==1.22.4",
+    "torch-mlir>=20220428.420",
+    "iree-compiler>=20220427.13",
+    "iree-runtime>=20220427.13",
 ]
 build-backend = "setuptools.build_meta"
-
-[tool.black]
-include = '\.pyi?$'
-exclude = '''
-(
-  /(
-    | apps/stable_diffusion
-    | apps/language_models
-    | shark
-    | benchmarks
-    | tank
-    | build
-    | generated_imgs
-    | shark.venv
-  )/
-  | setup.py
-)
-'''
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,3 +1,3 @@
 [pytest]
-addopts = --verbose -s -p no:warnings
-norecursedirs = inference tank/tflite examples benchmarks shark apps/shark_studio
+addopts = --verbose -p no:warnings
+norecursedirs = inference tank/tflite 
--- a/requirements-importer-macos.txt
+++ b/requirements-importer-macos.txt
@@ -1,4 +1,4 @@
-f https://download.pytorch.org/whl/nightly/cpu/
+-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
 --pre

 numpy
@@ -8,8 +8,19 @@ torchvision
 tqdm

 #iree-compiler  | iree-runtime should already be installed
+# these don't work on macOS
+#iree-tools-tflite
+#iree-tools-xla
+#iree-tools-tf

+# TensorFlow and JAX.
+gin-config
+tensorflow-macos
+tensorflow-metal
+#tf-models-nightly
+#tensorflow-text-nightly
 transformers
+tensorflow-probability
 #jax[cpu]

 # tflitehub dependencies.
@@ -17,7 +28,6 @@ Pillow

 # web dependecies.
 gradio
-altair

 # Testing and support.
 #lit
--- a/requirements-importer.txt
+++ b/requirements-importer.txt
@@ -1,21 +1,29 @@
 -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
 --pre

-numpy>1.22.4
-pytorch-triton
-torchvision 
-tabulate
+numpy==1.22.4
+torch
+torchvision

 tqdm

 #iree-compiler  | iree-runtime should already be installed
+iree-tools-tflite
 iree-tools-xla
+iree-tools-tf

-# Modelling and JAX.
+# TensorFlow and JAX.
 gin-config
+tensorflow
+#tf-models-nightly
+#tensorflow-text-nightly
 transformers
 diffusers
+#tensorflow-probability
 #jax[cpu]
+
+
+# tflitehub dependencies.
 Pillow

 # Testing and support.
@@ -23,11 +31,9 @@ lit
 pyyaml
 python-dateutil
 sacremoses
-sentencepiece

 # web dependecies.
-gradio==3.44.3
-altair
+gradio
 scipy

 #ONNX and ORT for benchmarking
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,54 +1,14 @@
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
-f https://openxla.github.io/iree/pip-release-links.html
--pre
-
 setuptools
 wheel

-shark-turbine @ git+https://github.com/nod-ai/SHARK-Turbine.git@main
-turbine-models @ git+https://github.com/nod-ai/SHARK-Turbine#egg=turbine-models&subdirectory=python/turbine_models
-
 # SHARK Runner
 tqdm

 # SHARK Downloader
-google-cloud-storage
+gsutil

 # Testing
 pytest
 pytest-xdist
-pytest-forked
 Pillow
 parameterized
-
-# Add transformers, diffusers and scipy since it most commonly used
-#accelerate is now required for diffusers import from ckpt.
-accelerate
-scipy
-ftfy
-gradio==4.8.0
-altair
-omegaconf
-# 0.3.2 doesn't have binaries for arm64
-safetensors==0.3.1
-opencv-python
-scikit-image
-pytorch_lightning # for runwayml models
-tk
-pywebview
-sentencepiece
-py-cpuinfo
-tiktoken # for codegen
-joblib # for langchain
-timm # for MiniGPT4
-langchain
-einops # for zoedepth
-pydantic==2.4.1 # pin until pyinstaller-hooks-contrib works with beta versions
-
-# Keep PyInstaller at the end. Sometimes Windows Defender flags it but most folks can continue even if it errors
-pefile
-pyinstaller
-
-# For quantized GPTQ models
-optimum
-auto_gptq
--- a/rest_api_tests/api_test.py
+++ b/rest_api_tests/api_test.py
@@ -1,348 +0,0 @@
-import requests
-from PIL import Image
-import base64
-from io import BytesIO
-
-
-def upscaler_test(verbose=False):
-    # Define values here
-    prompt = ""
-    negative_prompt = ""
-    seed = 2121991605
-    height = 512
-    width = 512
-    steps = 50
-    noise_level = 10
-    cfg_scale = 7
-    image_path = r"./rest_api_tests/dog.png"
-
-    # Converting Image to base64
-    img_file = open(image_path, "rb")
-    init_images = [
-        "data:image/png;base64," + base64.b64encode(img_file.read()).decode()
-    ]
-
-    url = "http://127.0.0.1:8080/sdapi/v1/upscaler"
-
-    headers = {
-        "User-Agent": "PythonTest",
-        "Accept": "*/*",
-        "Accept-Encoding": "gzip, deflate, br",
-    }
-
-    data = {
-        "prompt": prompt,
-        "negative_prompt": negative_prompt,
-        "seed": seed,
-        "height": height,
-        "width": width,
-        "steps": steps,
-        "noise_level": noise_level,
-        "cfg_scale": cfg_scale,
-        "init_images": init_images,
-    }
-
-    res = requests.post(url=url, json=data, headers=headers, timeout=1000)
-
-    print(f"[upscaler] response from server was : {res.status_code} {res.reason}")
-
-    if verbose or res.status_code != 200:
-        print(f"\n{res.json()['info'] if res.status_code == 200 else res.content}\n")
-
-
-def img2img_test(verbose=False):
-    # Define values here
-    prompt = "Paint a rabbit riding on the dog"
-    negative_prompt = "ugly, bad art, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, blurry, bad anatomy, blurred, watermark, grainy, tiling, signature, cut off, draft"
-    seed = 2121991605
-    height = 512
-    width = 512
-    steps = 50
-    denoising_strength = 0.75
-    cfg_scale = 7
-    image_path = r"./rest_api_tests/dog.png"
-
-    # Converting Image to Base64
-    img_file = open(image_path, "rb")
-    init_images = [
-        "data:image/png;base64," + base64.b64encode(img_file.read()).decode()
-    ]
-
-    url = "http://127.0.0.1:8080/sdapi/v1/img2img"
-
-    headers = {
-        "User-Agent": "PythonTest",
-        "Accept": "*/*",
-        "Accept-Encoding": "gzip, deflate, br",
-    }
-
-    data = {
-        "prompt": prompt,
-        "negative_prompt": negative_prompt,
-        "init_images": init_images,
-        "height": height,
-        "width": width,
-        "steps": steps,
-        "denoising_strength": denoising_strength,
-        "cfg_scale": cfg_scale,
-        "seed": seed,
-    }
-
-    res = requests.post(url=url, json=data, headers=headers, timeout=1000)
-
-    res = requests.post(url=url, json=data, headers=headers, timeout=1000)
-
-    print(f"[img2img] response from server was : {res.status_code} {res.reason}")
-
-    if verbose or res.status_code != 200:
-        print(f"\n{res.json()['info'] if res.status_code == 200 else res.content}\n")
-
-    # NOTE Uncomment below to save the picture
-
-    # print("Extracting response object")
-    # response_obj = res.json()
-    # img_b64 = response_obj.get("images", [False])[0] or response_obj.get(
-    #     "image"
-    # )
-    # img_b2 = base64.b64decode(img_b64.replace("data:image/png;base64,", ""))
-    # im_file = BytesIO(img_b2)
-    # response_img = Image.open(im_file)
-    # print("Saving Response Image to: response_img")
-    # response_img.save(r"rest_api_tests/response_img.png")
-
-
-def inpainting_test(verbose=False):
-    prompt = "Paint a rabbit riding on the dog"
-    negative_prompt = "ugly, bad art, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, blurry, bad anatomy, blurred, watermark, grainy, tiling, signature, cut off, draft"
-    seed = 2121991605
-    height = 512
-    width = 512
-    steps = 50
-    noise_level = 10
-    cfg_scale = 7
-    is_full_res = False
-    full_res_padding = 32
-    image_path = r"./rest_api_tests/dog.png"
-
-    img_file = open(image_path, "rb")
-    image = "data:image/png;base64," + base64.b64encode(img_file.read()).decode()
-    img_file = open(image_path, "rb")
-    mask = "data:image/png;base64," + base64.b64encode(img_file.read()).decode()
-
-    url = "http://127.0.0.1:8080/sdapi/v1/inpaint"
-
-    headers = {
-        "User-Agent": "PythonTest",
-        "Accept": "*/*",
-        "Accept-Encoding": "gzip, deflate, br",
-    }
-
-    data = {
-        "prompt": prompt,
-        "negative_prompt": negative_prompt,
-        "image": image,
-        "mask": mask,
-        "height": height,
-        "width": width,
-        "steps": steps,
-        "noise_level": noise_level,
-        "cfg_scale": cfg_scale,
-        "seed": seed,
-        "is_full_res": is_full_res,
-        "full_res_padding": full_res_padding,
-    }
-
-    res = requests.post(url=url, json=data, headers=headers, timeout=1000)
-
-    print(f"[inpaint] response from server was : {res.status_code} {res.reason}")
-
-    if verbose or res.status_code != 200:
-        print(f"\n{res.json()['info'] if res.status_code == 200 else res.content}\n")
-
-
-def outpainting_test(verbose=False):
-    prompt = "Paint a rabbit riding on the dog"
-    negative_prompt = "ugly, bad art, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, blurry, bad anatomy, blurred, watermark, grainy, tiling, signature, cut off, draft"
-    seed = 2121991605
-    height = 512
-    width = 512
-    steps = 50
-    cfg_scale = 7
-    color_variation = 0.2
-    noise_q = 0.2
-    directions = ["up", "down", "right", "left"]
-    pixels = 32
-    mask_blur = 64
-    image_path = r"./rest_api_tests/dog.png"
-
-    # Converting Image to Base64
-    img_file = open(image_path, "rb")
-    init_images = [
-        "data:image/png;base64," + base64.b64encode(img_file.read()).decode()
-    ]
-
-    url = "http://127.0.0.1:8080/sdapi/v1/outpaint"
-
-    headers = {
-        "User-Agent": "PythonTest",
-        "Accept": "*/*",
-        "Accept-Encoding": "gzip, deflate, br",
-    }
-
-    data = {
-        "prompt": prompt,
-        "negative_prompt": negative_prompt,
-        "seed": seed,
-        "height": height,
-        "width": width,
-        "steps": steps,
-        "cfg_scale": cfg_scale,
-        "color_variation": color_variation,
-        "noise_q": noise_q,
-        "directions": directions,
-        "pixels": pixels,
-        "mask_blur": mask_blur,
-        "init_images": init_images,
-    }
-
-    res = requests.post(url=url, json=data, headers=headers, timeout=1000)
-
-    print(f"[outpaint] response from server was : {res.status_code} {res.reason}")
-
-    if verbose or res.status_code != 200:
-        print(f"\n{res.json()['info'] if res.status_code == 200 else res.content}\n")
-
-
-def txt2img_test(verbose=False):
-    prompt = "Paint a rabbit in a top hate"
-    negative_prompt = "ugly, bad art, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, blurry, bad anatomy, blurred, watermark, grainy, tiling, signature, cut off, draft"
-    seed = 2121991605
-    height = 512
-    width = 512
-    steps = 50
-    cfg_scale = 7
-
-    url = "http://127.0.0.1:8080/sdapi/v1/txt2img"
-
-    headers = {
-        "User-Agent": "PythonTest",
-        "Accept": "*/*",
-        "Accept-Encoding": "gzip, deflate, br",
-    }
-
-    data = {
-        "prompt": prompt,
-        "negative_prompt": negative_prompt,
-        "seed": seed,
-        "height": height,
-        "width": width,
-        "steps": steps,
-        "cfg_scale": cfg_scale,
-    }
-
-    res = requests.post(url=url, json=data, headers=headers, timeout=1000)
-
-    print(f"[txt2img] response from server was : {res.status_code} {res.reason}")
-
-    if verbose or res.status_code != 200:
-        print(f"\n{res.json()['info'] if res.status_code == 200 else res.content}\n")
-
-
-def sd_models_test(verbose=False):
-    url = "http://127.0.0.1:8080/sdapi/v1/sd-models"
-
-    headers = {
-        "User-Agent": "PythonTest",
-        "Accept": "*/*",
-        "Accept-Encoding": "gzip, deflate, br",
-    }
-
-    res = requests.get(url=url, headers=headers, timeout=1000)
-
-    print(f"[sd_models] response from server was : {res.status_code} {res.reason}")
-
-    if verbose or res.status_code != 200:
-        print(f"\n{res.json() if res.status_code == 200 else res.content}\n")
-
-
-def sd_samplers_test(verbose=False):
-    url = "http://127.0.0.1:8080/sdapi/v1/samplers"
-
-    headers = {
-        "User-Agent": "PythonTest",
-        "Accept": "*/*",
-        "Accept-Encoding": "gzip, deflate, br",
-    }
-
-    res = requests.get(url=url, headers=headers, timeout=1000)
-
-    print(f"[sd_samplers] response from server was : {res.status_code} {res.reason}")
-
-    if verbose or res.status_code != 200:
-        print(f"\n{res.json() if res.status_code == 200 else res.content}\n")
-
-
-def options_test(verbose=False):
-    url = "http://127.0.0.1:8080/sdapi/v1/options"
-
-    headers = {
-        "User-Agent": "PythonTest",
-        "Accept": "*/*",
-        "Accept-Encoding": "gzip, deflate, br",
-    }
-
-    res = requests.get(url=url, headers=headers, timeout=1000)
-
-    print(f"[options] response from server was : {res.status_code} {res.reason}")
-
-    if verbose or res.status_code != 200:
-        print(f"\n{res.json() if res.status_code == 200 else res.content}\n")
-
-
-def cmd_flags_test(verbose=False):
-    url = "http://127.0.0.1:8080/sdapi/v1/cmd-flags"
-
-    headers = {
-        "User-Agent": "PythonTest",
-        "Accept": "*/*",
-        "Accept-Encoding": "gzip, deflate, br",
-    }
-
-    res = requests.get(url=url, headers=headers, timeout=1000)
-
-    print(f"[cmd-flags] response from server was : {res.status_code} {res.reason}")
-
-    if verbose or res.status_code != 200:
-        print(f"\n{res.json() if res.status_code == 200 else res.content}\n")
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(
-        description=(
-            "Exercises the Stable Diffusion REST API of Shark. Make sure "
-            "Shark is running in API mode on 127.0.0.1:8080 before running"
-            "this script."
-        ),
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-        help=(
-            "also display selected info from the JSON response for "
-            "successful requests"
-        ),
-    )
-    args = parser.parse_args()
-
-    sd_models_test(args.verbose)
-    sd_samplers_test(args.verbose)
-    options_test(args.verbose)
-    cmd_flags_test(args.verbose)
-    txt2img_test(args.verbose)
-    img2img_test(args.verbose)
-    upscaler_test(args.verbose)
-    inpainting_test(args.verbose)
-    outpainting_test(args.verbose)
--- a/rest_api_tests/dog.png
+++ b/rest_api_tests/dog.png
--- a/setup.py
+++ b/setup.py
@@ -2,13 +2,17 @@ from setuptools import find_packages
 from setuptools import setup

 import os
-import glob

 with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

-PACKAGE_VERSION = os.environ.get("SHARK_PACKAGE_VERSION") or "0.0.5"
+PACKAGE_VERSION = os.environ.get("SHARK_PACKAGE_VERSION") or "0.0.4"
 backend_deps = []
+if "NO_BACKEND" in os.environ.keys():
+    backend_deps = [
+        "iree-compiler>=20220427.13",
+        "iree-runtime>=20220427.13",
+    ]

 setup(
    name="nodai-SHARK",
@@ -29,10 +33,11 @@ setup(
        "Operating System :: OS Independent",
    ],
    packages=find_packages(exclude=("examples")),
-    python_requires=">=3.9",
-    data_files=glob.glob("apps/stable_diffusion/resources/**"),
+    python_requires=">=3.7",
    install_requires=[
        "numpy",
        "PyYAML",
+        "torch-mlir>=20220428.420",
    ]
+    + backend_deps,
 )
--- a/setup_venv.ps1
+++ b/setup_venv.ps1
@@ -1,97 +0,0 @@
-<#
-.SYNOPSIS
-  A script to update and install the SHARK runtime and its dependencies.
-
-.DESCRIPTION
-  This script updates and installs the SHARK runtime and its dependencies.
-  It checks the Python version installed and installs any required build
-  dependencies into a Python virtual environment.
-  If that environment does not exist, it creates it.
-  
-.PARAMETER update-src
-  git pulls latest version
-
-.PARAMETER force
-  removes and recreates venv to force update of all dependencies
-  
-.EXAMPLE
-  .\setup_venv.ps1 --force
-
-.EXAMPLE
-  .\setup_venv.ps1 --update-src
-
-.INPUTS
-  None
-
-.OUTPUTS
-  None
-
-#>
-
-param([string]$arguments)
-
-if ($arguments -eq "--update-src"){
-	git pull
-}
-
-if ($arguments -eq "--force"){
-	if (Test-Path env:VIRTUAL_ENV) {
-        Write-Host "deactivating..."
-        Deactivate
-    }
-    
-    if (Test-Path .\shark.venv\) {
-        Write-Host "removing and recreating venv..."
-        Remove-Item .\shark.venv -Force -Recurse
-        if (Test-Path .\shark.venv\) {
-            Write-Host 'could not remove .\shark-venv - please try running ".\setup_venv.ps1 --force" again!'
-            exit 1
-        }
-    }
-}
-
-# redirect stderr into stdout
-$p = &{python -V} 2>&1
-# check if an ErrorRecord was returned
-$version = if($p -is [System.Management.Automation.ErrorRecord])
-{
-    # grab the version string from the error message
-    $p.Exception.Message
-}
-else
-{
-    # otherwise return complete Python list
-    $ErrorActionPreference = 'SilentlyContinue'
-    $PyVer = py --list
-}
-
-# deactivate any activated venvs
-if ($PyVer -like "*venv*")
-{
-  deactivate # make sure we don't update the wrong venv
-  $PyVer = py --list # update list
-}
-
-Write-Host "Python versions found are"
-Write-Host ($PyVer | Out-String) # formatted output with line breaks
-if (!($PyVer.length -ne 0)) {$p} # return Python --version String if py.exe is unavailable
-if (!($PyVer -like "*3.11*") -and !($p -like "*3.11*")) # if 3.11 is not in any list
-{
-    Write-Host "Please install Python 3.11 and try again"
-    exit 34
-}
-
-Write-Host "Installing Build Dependencies"
-# make sure we really use 3.11 from list, even if it's not the default.
-if ($NULL -ne $PyVer) {py -3.11 -m venv .\shark.venv\}
-else {python -m venv .\shark.venv\}
-.\shark.venv\Scripts\activate
-python -m pip install --upgrade pip
-pip install wheel
-pip install -r requirements.txt
-pip install --pre torch-mlir torchvision torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/
-pip install --upgrade -f https://nod-ai.github.io/SRT/pip-release-links.html iree-compiler iree-runtime
-Write-Host "Building SHARK..."
-pip install -e . -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html
-Write-Host "Build and installation completed successfully"
-Write-Host "Source your venv with ./shark.venv/Scripts/activate"
--- a/setup_venv.sh
+++ b/setup_venv.sh
@@ -2,10 +2,9 @@
 # Sets up a venv suitable for running samples.
 # e.g:
 # ./setup_venv.sh  #setup a default $PYTHON3 shark.venv
-# Environment variables used by the script.
+# Environment variables used by the script.
 # PYTHON=$PYTHON3.10 ./setup_venv.sh  #pass a version of $PYTHON to use
 # VENV_DIR=myshark.venv #create a venv called myshark.venv
-# SKIP_VENV=1 #Don't create and activate a Python venv. Use the current environment. 
 # USE_IREE=1 #use stock IREE instead of Nod.ai's SHARK build
 # IMPORTER=1 #Install importer deps
 # BENCHMARK=1 #Install benchmark deps
@@ -27,22 +26,15 @@ PYTHON_VERSION_X_Y=`${PYTHON} -c 'import sys; version=sys.version_info[:2]; prin
 echo "Python: $PYTHON"
 echo "Python version: $PYTHON_VERSION_X_Y"

-if [ "$PYTHON_VERSION_X_Y" != "3.11" ]; then
-    echo "Error: Python version 3.11 is required."
-    exit 1
-fi
-
-if [[ "$SKIP_VENV" != "1" ]]; then
-  if [[ -z "${CONDA_PREFIX}" ]]; then
-    # Not a conda env. So create a new VENV dir
-    VENV_DIR=${VENV_DIR:-shark.venv}
-    echo "Using pip venv.. Setting up venv dir: $VENV_DIR"
-    $PYTHON -m venv "$VENV_DIR" || die "Could not create venv."
-    source "$VENV_DIR/bin/activate" || die "Could not activate venv"
-    PYTHON="$(which python3)"
-  else
-    echo "Found conda env $CONDA_DEFAULT_ENV. Running pip install inside the conda env"
-  fi
+if [[ -z "${CONDA_PREFIX}" ]]; then
+  # Not a conda env. So create a new VENV dir
+  VENV_DIR=${VENV_DIR:-shark.venv}
+  echo "Using pip venv.. Setting up venv dir: $VENV_DIR"
+  $PYTHON -m venv "$VENV_DIR" || die "Could not create venv."
+  source "$VENV_DIR/bin/activate" || die "Could not activate venv"
+  PYTHON="$(which python3)"
+else
+  echo "Found conda env $CONDA_DEFAULT_ENV. Running pip install inside the conda env"
 fi

 Red=`tput setaf 1`
@@ -50,7 +42,7 @@ Green=`tput setaf 2`
 Yellow=`tput setaf 3`

 # Assume no binary torch-mlir.
-# Currently available for macOS m1&intel (3.11) and Linux(3.8,3.10,3.11)
+# Currently available for macOS m1&intel (3.10) and Linux(3.7,3.8,3.9,3.10)
 torch_mlir_bin=false
 if [[ $(uname -s) = 'Darwin' ]]; then
  echo "${Yellow}Apple macOS detected"
@@ -68,12 +60,12 @@ if [[ $(uname -s) = 'Darwin' ]]; then
  fi
  echo "${Yellow}Run the following commands to setup your SSL certs for your Python version if you see SSL errors with tests"
  echo "${Yellow}/Applications/Python\ 3.XX/Install\ Certificates.command"
-  if [ "$PYTHON_VERSION_X_Y" == "3.11" ]; then
+  if [ "$PYTHON_VERSION_X_Y" == "3.10" ]; then
    torch_mlir_bin=true
  fi
 elif [[ $(uname -s) = 'Linux' ]]; then
  echo "${Yellow}Linux detected"
-  if [ "$PYTHON_VERSION_X_Y" == "3.8" ]  || [ "$PYTHON_VERSION_X_Y" == "3.10" ] || [ "$PYTHON_VERSION_X_Y" == "3.11" ] ; then
+  if [ "$PYTHON_VERSION_X_Y" == "3.7" ] || [ "$PYTHON_VERSION_X_Y" == "3.8" ]  || [ "$PYTHON_VERSION_X_Y" == "3.9" ] || [ "$PYTHON_VERSION_X_Y" == "3.10" ] ; then
    torch_mlir_bin=true
  fi
 else
@@ -84,78 +76,65 @@ fi
 $PYTHON -m pip install --upgrade pip || die "Could not upgrade pip"
 $PYTHON -m pip install --upgrade -r "$TD/requirements.txt"
 if [ "$torch_mlir_bin" = true ]; then
-  if [[ $(uname -s) = 'Darwin' ]]; then
-    echo "MacOS detected. Installing torch-mlir from .whl, to avoid dependency problems with torch."
-    $PYTHON -m pip uninstall -y timm #TEMP FIX FOR MAC
-    $PYTHON -m pip install --pre --no-cache-dir torch-mlir -f https://llvm.github.io/torch-mlir/package-index/ -f https://download.pytorch.org/whl/nightly/torch/
+  $PYTHON -m pip install --pre torch-mlir -f https://llvm.github.io/torch-mlir/package-index/
+  if [ $? -eq 0 ];then
+    echo "Successfully Installed torch-mlir"
  else
-    $PYTHON -m pip install --pre torch-mlir -f https://llvm.github.io/torch-mlir/package-index/
-    if [ $? -eq 0 ];then
-      echo "Successfully Installed torch-mlir"
-    else
-      echo "Could not install torch-mlir" >&2
-    fi
+    echo "Could not install torch-mlir" >&2
  fi
 else
  echo "${Red}No binaries found for Python $PYTHON_VERSION_X_Y on $(uname -s)"
-  echo "${Yello}Python 3.11 supported on macOS and 3.8,3.10 and 3.11 on Linux"
+  echo "${Yello}Python 3.10 supported on macOS and 3.7,3.8,3.9 and 3.10 on Linux"
  echo "${Red}Please build torch-mlir from source in your environment"
  exit 1
 fi
 if [[ -z "${USE_IREE}" ]]; then
-  rm .use-iree
-  RUNTIME="https://nod-ai.github.io/SRT/pip-release-links.html"
+  RUNTIME="nod-ai/SHARK-Runtime"
 else
-  touch ./.use-iree
-  RUNTIME="https://openxla.github.io/iree/pip-release-links.html"
+  RUNTIME="google/iree"
 fi
 if [[ -z "${NO_BACKEND}" ]]; then
  echo "Installing ${RUNTIME}..."
-  $PYTHON -m pip install --pre --upgrade --no-index --find-links ${RUNTIME} iree-compiler iree-runtime
+  $PYTHON -m pip install --find-links https://github.com/${RUNTIME}/releases iree-compiler iree-runtime
 else
  echo "Not installing a backend, please make sure to add your backend to PYTHONPATH"
 fi
-
 if [[ ! -z "${IMPORTER}" ]]; then
  echo "${Yellow}Installing importer tools.."
  if [[ $(uname -s) = 'Linux' ]]; then
    echo "${Yellow}Linux detected.. installing Linux importer tools"
-    #Always get the importer tools from upstream IREE
-    $PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer.txt" -f https://openxla.github.io/iree/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+    $PYTHON -m pip install --upgrade -r "$TD/requirements-importer.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
  elif [[ $(uname -s) = 'Darwin' ]]; then
    echo "${Yellow}macOS detected.. installing macOS importer tools"
    #Conda seems to have some problems installing these packages and hope they get resolved upstream.
-    $PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer-macos.txt" -f ${RUNTIME} --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+    $PYTHON -m pip install --upgrade -r "$TD/requirements-importer-macos.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
  fi
 fi

-if [[ $(uname -s) = 'Darwin' ]]; then
-  PYTORCH_URL=https://download.pytorch.org/whl/nightly/torch/
-else
-  PYTORCH_URL=https://download.pytorch.org/whl/nightly/cpu/
-fi
+$PYTHON -m pip install -e . -f https://llvm.github.io/torch-mlir/package-index/ -f https://github.com/${RUNTIME}/releases

-$PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f ${PYTORCH_URL}
-
-if [[ $(uname -s) = 'Linux' && ! -z "${IMPORTER}" ]]; then
-  T_VER=$($PYTHON -m pip show torch | grep Version)
-  T_VER_MIN=${T_VER:14:12}
-  TV_VER=$($PYTHON -m pip show torchvision | grep Version)
-  TV_VER_MAJ=${TV_VER:9:6}
-  $PYTHON -m pip uninstall -y torchvision
-  $PYTHON -m pip install torchvision==${TV_VER_MAJ}${T_VER_MIN} --no-deps -f https://download.pytorch.org/whl/nightly/cpu/torchvision/
+if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
+  $PYTHON -m pip uninstall -y torch torchvision
+  $PYTHON -m pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116
  if [ $? -eq 0 ];then
-    echo "Successfully Installed torch + cu118."
+    echo "Successfully Installed torch + cu116."
  else
-    echo "Could not install torch + cu118." >&2
+    echo "Could not install torch + cu116." >&2
  fi
 fi

-if [[ -z "${NO_BREVITAS}" ]]; then
-  $PYTHON -m pip install git+https://github.com/Xilinx/brevitas.git@dev
+if [[ ! -z "${ONNX}" ]]; then
+  echo "${Yellow}Installing ONNX and onnxruntime for benchmarks..."
+  $PYTHON -m pip install onnx onnxruntime psutil
+  if [ $? -eq 0 ];then
+    echo "Successfully installed ONNX and ONNX runtime."
+  else
+    echo "Could not install ONNX." >&2
+  fi
 fi

-if [[ -z "${CONDA_PREFIX}" && "$SKIP_VENV" != "1" ]]; then
+if [[ -z "${CONDA_PREFIX}" ]]; then
  echo "${Green}Before running examples activate venv with:"
  echo "  ${Green}source $VENV_DIR/bin/activate"
 fi
+
--- a/shark/init.py
+++ b/shark/init.py
@@ -1,28 +0,0 @@
-import importlib
-import logging
-
-from torch._dynamo import register_backend
-
-log = logging.getLogger(__name__)
-
-
-@register_backend
-def shark(model, inputs, *, options):
-    try:
-        from shark.dynamo_backend.utils import SharkBackend
-    except ImportError:
-        log.exception(
-            "Unable to import SHARK - High Performance Machine Learning Distribution"
-            "Please install the right version of SHARK that matches the PyTorch version being used. "
-            "Refer to https://github.com/nod-ai/SHARK/ for details."
-        )
-        raise
-    return SharkBackend(model, inputs, options)
-
-
-def has_shark():
-    try:
-        importlib.import_module("shark")
-        return True
-    except ImportError:
-        return False
--- a/shark/backward_makefx.py
+++ b/shark/backward_makefx.py
@@ -15,7 +15,7 @@
 import torch
 from torch._decomp import get_decompositions
 from torch.fx.experimental.proxy_tensor import make_fx
-from torch.nn.utils import stateless
+from torch.nn.utils import _stateless

 from torch import fx
 import tempfile
--- a/shark/dynamo_backend/utils.py
+++ b/shark/dynamo_backend/utils.py
@@ -1,154 +0,0 @@
-import functools
-from typing import List, Optional
-import torch
-from torch.fx.experimental.proxy_tensor import make_fx
-from torch._functorch.compile_utils import strip_overloads
-from shark.shark_inference import SharkInference
-from torch._decomp import get_decompositions
-from torch.func import functionalize
-import io
-import torch_mlir
-
-
-# TODO: Control decompositions.
-def default_decompositions():
-    return get_decompositions(
-        [
-            torch.ops.aten.embedding_dense_backward,
-            torch.ops.aten.native_layer_norm_backward,
-            torch.ops.aten.slice_backward,
-            torch.ops.aten.select_backward,
-            torch.ops.aten.norm.ScalarOpt_dim,
-            torch.ops.aten.native_group_norm,
-            torch.ops.aten.upsample_bilinear2d.vec,
-            torch.ops.aten.split.Tensor,
-            torch.ops.aten.split_with_sizes,
-            torch.ops.aten.native_layer_norm,
-            torch.ops.aten.masked_fill.Tensor,
-            torch.ops.aten.masked_fill.Scalar,
-        ]
-    )
-
-
-def _remove_nones(fx_g: torch.fx.GraphModule) -> List[int]:
-    removed_indexes = []
-    for node in fx_g.graph.nodes:
-        if node.op == "output":
-            assert (
-                len(node.args) == 1
-            ), "Output node must have a single argument"
-            node_arg = node.args[0]
-            if isinstance(node_arg, (list, tuple)):
-                node_arg = list(node_arg)
-                node_args_len = len(node_arg)
-                for i in range(node_args_len):
-                    curr_index = node_args_len - (i + 1)
-                    if node_arg[curr_index] is None:
-                        removed_indexes.append(curr_index)
-                        node_arg.pop(curr_index)
-                node.args = (tuple(node_arg),)
-                break
-
-    if len(removed_indexes) > 0:
-        fx_g.graph.lint()
-        fx_g.graph.eliminate_dead_code()
-        fx_g.recompile()
-    removed_indexes.sort()
-    return removed_indexes
-
-
-def _returns_nothing(fx_g: torch.fx.GraphModule) -> bool:
-    for node in fx_g.graph.nodes:
-        if node.op == "output":
-            assert (
-                len(node.args) == 1
-            ), "Output node must have a single argument"
-            node_arg = node.args[0]
-            if isinstance(node_arg, tuple):
-                return len(node_arg) == 0
-    return False
-
-
-def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule) -> bool:
-    """
-    Replace tuple with tuple element in functions that return one-element tuples.
-    Returns true if an unwrapping took place, and false otherwise.
-    """
-    unwrapped_tuple = False
-    for node in fx_g.graph.nodes:
-        if node.op == "output":
-            assert (
-                len(node.args) == 1
-            ), "Output node must have a single argument"
-            node_arg = node.args[0]
-            if isinstance(node_arg, tuple):
-                if len(node_arg) == 1:
-                    node.args = (node_arg[0],)
-                    unwrapped_tuple = True
-                    break
-
-    if unwrapped_tuple:
-        fx_g.graph.lint()
-        fx_g.recompile()
-    return unwrapped_tuple
-
-
-class SharkBackend:
-    def __init__(
-        self, fx_g: torch.fx.GraphModule, inputs: tuple, options: dict
-    ):
-        self.fx_g = fx_g
-        self.inputs = inputs
-        self.shark_module = None
-        self.device: str = options.get("device", "cpu")
-        self.was_unwrapped: bool = False
-        self.none_indices: list = []
-        self._modify_fx_g()
-        self.compile()
-
-    def _modify_fx_g(self):
-        self.none_indices = _remove_nones(self.fx_g)
-        self.was_unwrapped = _unwrap_single_tuple_return(self.fx_g)
-
-    def compile(self):
-        gm = make_fx(
-            functionalize(self.fx_g),
-            decomposition_table=default_decompositions(),
-        )(*self.inputs)
-        gm.graph.set_codegen(torch.fx.graph.CodeGen())
-        gm.recompile()
-        strip_overloads(gm)
-        ts_g = torch.jit.script(gm)
-        mlir_module = torch_mlir.compile(
-            ts_g, self.inputs, output_type="linalg-on-tensors"
-        )
-        bytecode_stream = io.BytesIO()
-        mlir_module.operation.write_bytecode(bytecode_stream)
-        bytecode = bytecode_stream.getvalue()
-        from shark.shark_inference import SharkInference
-
-        shark_module = SharkInference(
-            mlir_module=bytecode,
-            device=self.device,
-            mlir_dialect="tm_tensor",
-        )
-        shark_module.compile(extra_args=[])
-        self.shark_module = shark_module
-
-    def __call__(self, *inputs):
-        np_inputs = [x.contiguous().detach().cpu().numpy() for x in inputs]
-        np_outs = self.shark_module("forward", np_inputs)
-        if self.was_unwrapped:
-            np_outs = [
-                np_outs,
-            ]
-
-        if not isinstance(np_outs, list):
-            res = torch.from_numpy(np_outs)
-            return res
-
-        result = [torch.from_numpy(x) for x in np_outs]
-        for r_in in self.none_indices:
-            result.insert(r_in, None)
-        result = tuple(result)
-        return result
--- a/shark/examples/shark_dynamo/basic_examples.py
+++ b/shark/examples/shark_dynamo/basic_examples.py
@@ -1,25 +1,70 @@
+import torchdynamo
 import torch
-import shark
+import torch_mlir
+from shark.sharkdynamo.utils import make_shark_compiler


-def foo(x, a):
-    if x.shape[0] > 3:
-        return x + a
-    else:
-        return x + 3
+import warnings, logging
+
+warnings.simplefilter("ignore")
+torchdynamo.config.log_level = logging.ERROR


-shark_options = {"device": "cpu"}
-compiled = torch.compile(foo, backend="shark", options=shark_options)
+torchdynamo.reset()

-input = torch.ones(4)

-x = compiled(input, input)
+@torchdynamo.optimize(
+    make_shark_compiler(use_tracing=False, device="cuda", verbose=False)
+)
+def foo(t):
+    return 2 * t

+
+example_input = torch.rand((2, 3))
+x = foo(example_input)
 print(x)

-input = torch.ones(3)

-x = compiled(input, input)
+torchdynamo.reset()

-print(x)
+
+@torchdynamo.optimize(
+    make_shark_compiler(use_tracing=False, device="cuda", verbose=False)
+)
+def foo(a, b):
+    x = a / (a + 1)
+    if b.sum() < 0:
+        b = b * -1
+    return x * b
+
+
+print(foo(torch.rand((2, 3)), -torch.rand((2, 3))))
+
+
+torchdynamo.reset()
+
+
+@torchdynamo.optimize(
+    make_shark_compiler(use_tracing=False, device="cuda", verbose=True)
+)
+def foo(a):
+    for i in range(10):
+        a += 1.0
+    return a
+
+
+print(foo(torch.rand((1, 2))))
+
+torchdynamo.reset()
+
+
+@torchdynamo.optimize(
+    make_shark_compiler(use_tracing=False, device="cuda", verbose=True)
+)
+def test_unsupported_types(t, y):
+    return t, 2 * y
+
+
+str_input = "hello"
+tensor_input = torch.randn(2)
+print(test_unsupported_types(str_input, tensor_input))
--- a/shark/examples/shark_eager/dynamo_demo.ipynb
+++ b/shark/examples/shark_eager/dynamo_demo.ipynb
@@ -36,9 +36,7 @@
    "    from torchdynamo.optimizations.backends import create_backend\n",
    "    from torchdynamo.optimizations.subgraph import SubGraph\n",
    "except ModuleNotFoundError:\n",
-    "    print(\n",
-    "        \"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo\"\n",
-    "    )\n",
+    "    print(\"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo\")\n",
    "    exit()\n",
    "\n",
    "# torch-mlir imports for compiling\n",
@@ -99,9 +97,7 @@
    "\n",
    "        for node in fx_g.graph.nodes:\n",
    "            if node.op == \"output\":\n",
-    "                assert (\n",
-    "                    len(node.args) == 1\n",
-    "                ), \"Output node must have a single argument\"\n",
+    "                assert len(node.args) == 1, \"Output node must have a single argument\"\n",
    "                node_arg = node.args[0]\n",
    "                if isinstance(node_arg, tuple) and len(node_arg) == 1:\n",
    "                    node.args = (node_arg[0],)\n",
@@ -120,12 +116,8 @@
    "    if len(args) == 1 and isinstance(args[0], list):\n",
    "        args = args[0]\n",
    "\n",
-    "    linalg_module = compile(\n",
-    "        ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS\n",
-    "    )\n",
-    "    callable, _ = get_iree_compiled_module(\n",
-    "        linalg_module, \"cuda\", func_name=\"forward\"\n",
-    "    )\n",
+    "    linalg_module = compile(ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS)\n",
+    "    callable, _ = get_iree_compiled_module(linalg_module, \"cuda\", func_name=\"forward\")\n",
    "\n",
    "    def forward(*inputs):\n",
    "        return callable(*inputs)\n",
@@ -220,7 +212,6 @@
    "    assert isinstance(subgraph, SubGraph), \"Model must be a dynamo SubGraph.\"\n",
    "    return __torch_mlir(subgraph.model, *list(subgraph.example_inputs))\n",
    "\n",
-    "\n",
    "@torchdynamo.optimize(\"torch_mlir\")\n",
    "def toy_example2(*args):\n",
    "    a, b = args\n",
--- a/shark/examples/shark_inference/CLIPModel_tf.py
+++ b/shark/examples/shark_inference/CLIPModel_tf.py
@@ -22,7 +22,7 @@ class CLIPModule(tf.Module):
            input_ids=x, attention_mask=y, pixel_values=z
        )

-    @tf.function(input_signature=clip_vit_inputs, jit_compile=True)
+    @tf.function(input_signature=clip_vit_inputs)
    def forward(self, input_ids, attention_mask, pixel_values):
        return self.m.predict(
            input_ids, attention_mask, pixel_values
--- a/shark/examples/shark_inference/ESRGAN/README.md
+++ b/shark/examples/shark_inference/ESRGAN/README.md
@@ -1,15 +0,0 @@
-## Running ESRGAN
-
-```
-1. pip install numpy opencv-python
-2. mkdir InputImages
-   (this is where all the input images will reside in)
-3. mkdir OutputImages
-   (this is where the model will generate all the images)
-4. mkdir models
-   (save the .pth checkpoint file here)
-5. python esrgan.py
-```
-
- Download [RRDB_ESRGAN_x4.pth](https://drive.google.com/drive/u/0/folders/17VYV_SoZZesU6mbxz2dMAIccSSlqLecY) and place it in the `models` directory as mentioned above in step 4.
- Credits : [ESRGAN](https://github.com/xinntao/ESRGAN)
--- a/shark/examples/shark_inference/ESRGAN/esrgan.py
+++ b/shark/examples/shark_inference/ESRGAN/esrgan.py
@@ -1,239 +0,0 @@
-from ast import arg
-import os.path as osp
-import glob
-import cv2
-import numpy as np
-import torch
-
-from torch.fx.experimental.proxy_tensor import make_fx
-from torch._decomp import get_decompositions
-from shark.shark_inference import SharkInference
-import torch_mlir
-import tempfile
-import functools
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-def make_layer(block, n_layers):
-    layers = []
-    for _ in range(n_layers):
-        layers.append(block())
-    return nn.Sequential(*layers)
-
-
-class ResidualDenseBlock_5C(nn.Module):
-    def __init__(self, nf=64, gc=32, bias=True):
-        super(ResidualDenseBlock_5C, self).__init__()
-        # gc: growth channel, i.e. intermediate channels
-        self.conv1 = nn.Conv2d(nf, gc, 3, 1, 1, bias=bias)
-        self.conv2 = nn.Conv2d(nf + gc, gc, 3, 1, 1, bias=bias)
-        self.conv3 = nn.Conv2d(nf + 2 * gc, gc, 3, 1, 1, bias=bias)
-        self.conv4 = nn.Conv2d(nf + 3 * gc, gc, 3, 1, 1, bias=bias)
-        self.conv5 = nn.Conv2d(nf + 4 * gc, nf, 3, 1, 1, bias=bias)
-        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
-
-        # initialization
-        # mutil.initialize_weights([self.conv1, self.conv2, self.conv3, self.conv4, self.conv5], 0.1)
-
-    def forward(self, x):
-        x1 = self.lrelu(self.conv1(x))
-        x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
-        x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
-        x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
-        x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
-        return x5 * 0.2 + x
-
-
-class RRDB(nn.Module):
-    """Residual in Residual Dense Block"""
-
-    def __init__(self, nf, gc=32):
-        super(RRDB, self).__init__()
-        self.RDB1 = ResidualDenseBlock_5C(nf, gc)
-        self.RDB2 = ResidualDenseBlock_5C(nf, gc)
-        self.RDB3 = ResidualDenseBlock_5C(nf, gc)
-
-    def forward(self, x):
-        out = self.RDB1(x)
-        out = self.RDB2(out)
-        out = self.RDB3(out)
-        return out * 0.2 + x
-
-
-class RRDBNet(nn.Module):
-    def __init__(self, in_nc, out_nc, nf, nb, gc=32):
-        super(RRDBNet, self).__init__()
-        RRDB_block_f = functools.partial(RRDB, nf=nf, gc=gc)
-
-        self.conv_first = nn.Conv2d(in_nc, nf, 3, 1, 1, bias=True)
-        self.RRDB_trunk = make_layer(RRDB_block_f, nb)
-        self.trunk_conv = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
-        #### upsampling
-        self.upconv1 = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
-        self.upconv2 = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
-        self.HRconv = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
-        self.conv_last = nn.Conv2d(nf, out_nc, 3, 1, 1, bias=True)
-
-        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
-
-    def forward(self, x):
-        fea = self.conv_first(x)
-        trunk = self.trunk_conv(self.RRDB_trunk(fea))
-        fea = fea + trunk
-
-        fea = self.lrelu(
-            self.upconv1(F.interpolate(fea, scale_factor=2, mode="nearest"))
-        )
-        fea = self.lrelu(
-            self.upconv2(F.interpolate(fea, scale_factor=2, mode="nearest"))
-        )
-        out = self.conv_last(self.lrelu(self.HRconv(fea)))
-
-        return out
-
-
-############### Parsing args #####################
-import argparse
-
-p = argparse.ArgumentParser(
-    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
-)
-
-p.add_argument("--device", type=str, default="cpu", help="the device to use")
-p.add_argument(
-    "--mlir_loc",
-    type=str,
-    default=None,
-    help="location of the model's mlir file",
-)
-args = p.parse_args()
-###################################################
-
-
-def inference(input_m):
-    return model(input_m)
-
-
-def load_mlir(mlir_loc):
-    import os
-
-    if mlir_loc == None:
-        return None
-    print(f"Trying to load the model from {mlir_loc}.")
-    with open(os.path.join(mlir_loc)) as f:
-        mlir_module = f.read()
-    return mlir_module
-
-
-def compile_through_fx(model, inputs, mlir_loc=None):
-    module = load_mlir(mlir_loc)
-    if module == None:
-        fx_g = make_fx(
-            model,
-            decomposition_table=get_decompositions(
-                [
-                    torch.ops.aten.embedding_dense_backward,
-                    torch.ops.aten.native_layer_norm_backward,
-                    torch.ops.aten.slice_backward,
-                    torch.ops.aten.select_backward,
-                    torch.ops.aten.norm.ScalarOpt_dim,
-                    torch.ops.aten.native_group_norm,
-                    torch.ops.aten.upsample_bilinear2d.vec,
-                    torch.ops.aten.split.Tensor,
-                    torch.ops.aten.split_with_sizes,
-                ]
-            ),
-        )(inputs)
-
-        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
-        fx_g.recompile()
-
-        def strip_overloads(gm):
-            """
-            Modifies the target of graph nodes in :attr:`gm` to strip overloads.
-            Args:
-                gm(fx.GraphModule): The input Fx graph module to be modified
-            """
-            for node in gm.graph.nodes:
-                if isinstance(node.target, torch._ops.OpOverload):
-                    node.target = node.target.overloadpacket
-            gm.recompile()
-
-        strip_overloads(fx_g)
-
-        ts_g = torch.jit.script(fx_g)
-
-        print("Torchscript graph generated successfully")
-        module = torch_mlir.compile(
-            ts_g,
-            inputs,
-            torch_mlir.OutputType.LINALG_ON_TENSORS,
-            use_tracing=False,
-            verbose=False,
-        )
-
-    mlir_model = str(module)
-    func_name = "forward"
-    shark_module = SharkInference(
-        mlir_model, device=args.device, mlir_dialect="linalg"
-    )
-    shark_module.compile()
-
-    return shark_module
-
-
-model_path = "models/RRDB_ESRGAN_x4.pth"  # models/RRDB_ESRGAN_x4.pth OR models/RRDB_PSNR_x4.pth
-# device = torch.device('cuda')  # if you want to run on CPU, change 'cuda' -> cpu
-device = torch.device("cpu")
-
-test_img_folder = "InputImages/*"
-
-model = RRDBNet(3, 3, 64, 23, gc=32)
-model.load_state_dict(torch.load(model_path), strict=True)
-model.eval()
-model = model.to(device)
-
-print("Model path {:s}. \nTesting...".format(model_path))
-
-if __name__ == "__main__":
-    idx = 0
-    for path in glob.glob(test_img_folder):
-        idx += 1
-        base = osp.splitext(osp.basename(path))[0]
-        print(idx, base)
-        # read images
-        img = cv2.imread(path, cv2.IMREAD_COLOR)
-        img = img * 1.0 / 255
-        img = torch.from_numpy(
-            np.transpose(img[:, :, [2, 1, 0]], (2, 0, 1))
-        ).float()
-        img_LR = img.unsqueeze(0)
-        img_LR = img_LR.to(device)
-
-        with torch.no_grad():
-            shark_module = compile_through_fx(inference, img_LR)
-            shark_output = shark_module.forward((img_LR,))
-            shark_output = torch.from_numpy(shark_output)
-            shark_output = (
-                shark_output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
-            )
-            esrgan_output = (
-                model(img_LR).data.squeeze().float().cpu().clamp_(0, 1).numpy()
-            )
-        # SHARK OUTPUT
-        shark_output = np.transpose(shark_output[[2, 1, 0], :, :], (1, 2, 0))
-        shark_output = (shark_output * 255.0).round()
-        cv2.imwrite(
-            "OutputImages/{:s}_rlt_shark_output.png".format(base), shark_output
-        )
-        print("Generated SHARK's output")
-        # ESRGAN OUTPUT
-        esrgan_output = np.transpose(esrgan_output[[2, 1, 0], :, :], (1, 2, 0))
-        esrgan_output = (esrgan_output * 255.0).round()
-        cv2.imwrite(
-            "OutputImages/{:s}_rlt_esrgan_output.png".format(base),
-            esrgan_output,
-        )
-        print("Generated ESRGAN's output")
--- a/shark/examples/shark_inference/albert_maskfill_pt.py
+++ b/shark/examples/shark_inference/albert_maskfill_pt.py
@@ -43,7 +43,9 @@ if __name__ == "__main__":
    minilm_mlir, func_name = mlir_importer.import_mlir(
        is_dynamic=False, tracing_required=True
    )
-    shark_module = SharkInference(minilm_mlir)
+    shark_module = SharkInference(
+        minilm_mlir, func_name, mlir_dialect="linalg"
+    )
    shark_module.compile()
    token_logits = torch.tensor(shark_module.forward(inputs))
    mask_id = torch.where(
--- a/shark/examples/shark_inference/albert_maskfill_tf.py
+++ b/shark/examples/shark_inference/albert_maskfill_tf.py
@@ -28,7 +28,7 @@ class AlbertModule(tf.Module):
        self.m = TFAutoModelForMaskedLM.from_pretrained("albert-base-v2")
        self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)

-    @tf.function(input_signature=t5_inputs, jit_compile=True)
+    @tf.function(input_signature=t5_inputs)
    def forward(self, input_ids, attention_mask):
        return self.m.predict(input_ids, attention_mask)

@@ -54,7 +54,7 @@ if __name__ == "__main__":
    minilm_mlir, func_name = mlir_importer.import_mlir(
        is_dynamic=False, tracing_required=False
    )
-    shark_module = SharkInference(minilm_mlir, mlir_dialect="mhlo")
+    shark_module = SharkInference(minilm_mlir, func_name, mlir_dialect="mhlo")
    shark_module.compile()
    output_idx = 0
    data_idx = 1
--- a/shark/examples/shark_inference/bloom_tank.py
+++ b/shark/examples/shark_inference/bloom_tank.py
@@ -1,12 +1,10 @@
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_model
+from shark.shark_downloader import download_torch_model

-mlir_model, func_name, inputs, golden_out = download_model(
-    "bloom", frontend="torch"
-)
+mlir_model, func_name, inputs, golden_out = download_torch_model("bloom")

 shark_module = SharkInference(
-    mlir_model, device="cpu", mlir_dialect="tm_tensor"
+    mlir_model, func_name, device="cpu", mlir_dialect="tm_tensor"
 )
 shark_module.compile()
 result = shark_module.forward(inputs)
--- a/shark/examples/shark_inference/gpt2_tf.py
+++ b/shark/examples/shark_inference/gpt2_tf.py
@@ -19,7 +19,7 @@ class GPT2Module(tf.Module):

        self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)

-    @tf.function(input_signature=gpt2_inputs, jit_compile=True)
+    @tf.function(input_signature=gpt2_inputs)
    def forward(self, input_ids, attention_mask):
        return self.m.predict(input_ids, attention_mask)

--- a/shark/examples/shark_inference/llama/README.md
+++ b/shark/examples/shark_inference/llama/README.md
@@ -1,18 +0,0 @@
-# SHARK LLaMA
-
-## TORCH-MLIR Version
-
-```
-https://github.com/nod-ai/torch-mlir.git
-```
-Then check out the `complex` branch and `git submodule update --init` and then build with `.\build_tools\python_deploy\build_windows.ps1`
-
-### Setup & Run
-```
-git clone https://github.com/nod-ai/llama.git
-```
-Then in this repository
-```
-pip install -e .
-python llama/shark_model.py
-```
--- a/shark/examples/shark_inference/mega_test.py
+++ b/shark/examples/shark_inference/mega_test.py
@@ -1,72 +0,0 @@
-import torch
-import torch_mlir
-from shark.shark_inference import SharkInference
-from shark.shark_compile import shark_compile_through_fx
-from MEGABYTE_pytorch import MEGABYTE
-
-import os
-
-
-class MegaModel(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.model = MEGABYTE(
-            num_tokens=16000,  # number of tokens
-            dim=(
-                512,
-                256,
-            ),  # transformer model dimension (512 for coarsest, 256 for fine in this example)
-            max_seq_len=(
-                1024,
-                4,
-            ),  # sequence length for global and then local. this can be more than 2
-            depth=(
-                6,
-                4,
-            ),  # number of layers for global and then local. this can be more than 2, but length must match the max_seq_len's
-            dim_head=64,  # dimension per head
-            heads=8,  # number of attention heads
-            flash_attn=True,  # use flash attention
-        )
-
-    def forward(self, input):
-        return self.model(input)
-
-
-megaModel = MegaModel()
-inputs = [torch.randint(0, 16000, (1, 1024, 4))]
-
-# CURRENTLY IT BAILS OUT HERE BECAUSE OF MISSING OP LOWERINGS :-
-# 1. aten.alias
-shark_module, _ = shark_compile_through_fx(
-    model=megaModel,
-    inputs=inputs,
-    extended_model_name="mega_shark",
-    is_f16=False,
-    f16_input_mask=None,
-    save_dir=os.getcwd(),
-    debug=False,
-    generate_or_load_vmfb=True,
-    extra_args=[],
-    device="cuda",
-    mlir_dialect="tm_tensor",
-)
-# logits = model(x)
-
-
-def print_output_info(output, msg):
-    print("\n", msg)
-    print("\n\t", output.shape)
-
-
-ans = shark_module("forward", inputs)
-print_output_info(torch.from_numpy(ans), "SHARK's output")
-
-ans = megaModel.forward(*inputs)
-print_output_info(ans, "ORIGINAL Model's output")
-
-# and sample from the logits accordingly
-# or you can use the generate function
-
-# NEED TO LOOK AT THIS LATER IF REQUIRED IN SHARK.
-# sampled = model.generate(temperature = 0.9, filter_thres = 0.9) # (1, 1024, 4)
--- a/shark/examples/shark_inference/mhlo_example.py
+++ b/shark/examples/shark_inference/mhlo_example.py
@@ -13,7 +13,9 @@ arg0 = np.ones((1, 4)).astype(np.float32)
 arg1 = np.ones((4, 1)).astype(np.float32)

 print("Running shark on cpu backend")
-shark_module = SharkInference(mhlo_ir, device="cpu", mlir_dialect="mhlo")
+shark_module = SharkInference(
+    mhlo_ir, function_name="forward", device="cpu", mlir_dialect="mhlo"
+)

 # Generate the random inputs and feed into the graph.
 x = shark_module.generate_random_inputs()
@@ -21,11 +23,15 @@ shark_module.compile()
 print(shark_module.forward(x))

 print("Running shark on cuda backend")
-shark_module = SharkInference(mhlo_ir, device="cuda", mlir_dialect="mhlo")
+shark_module = SharkInference(
+    mhlo_ir, function_name="forward", device="cuda", mlir_dialect="mhlo"
+)
 shark_module.compile()
 print(shark_module.forward(x))

 print("Running shark on vulkan backend")
-shark_module = SharkInference(mhlo_ir, device="vulkan", mlir_dialect="mhlo")
+shark_module = SharkInference(
+    mhlo_ir, function_name="forward", device="vulkan", mlir_dialect="mhlo"
+)
 shark_module.compile()
 print(shark_module.forward(x))
--- a/shark/examples/shark_inference/minilm_benchmark_tf.py
+++ b/shark/examples/shark_inference/minilm_benchmark_tf.py
@@ -26,7 +26,7 @@ class BertModule(tf.Module):
            input_ids=x, attention_mask=y, token_type_ids=z, training=False
        )

-    @tf.function(input_signature=bert_input, jit_compile=True)
+    @tf.function(input_signature=bert_input)
    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.m.predict(input_ids, attention_mask, token_type_ids)

--- a/shark/examples/shark_inference/minilm_jax.py
+++ b/shark/examples/shark_inference/minilm_jax.py
@@ -1,73 +0,0 @@
-from transformers import AutoTokenizer, FlaxAutoModel
-import torch
-import jax
-from typing import Union, Dict, List, Any
-import numpy as np
-from shark.shark_inference import SharkInference
-import io
-
-NumpyTree = Union[np.ndarray, Dict[str, np.ndarray], List[np.ndarray]]
-
-
-def convert_torch_tensor_tree_to_numpy(
-    tree: Union[torch.tensor, Dict[str, torch.tensor], List[torch.tensor]]
-) -> NumpyTree:
-    return jax.tree_util.tree_map(
-        lambda torch_tensor: torch_tensor.cpu().detach().numpy(), tree
-    )
-
-
-def convert_int64_to_int32(tree: NumpyTree) -> NumpyTree:
-    return jax.tree_util.tree_map(
-        lambda tensor: np.array(tensor, dtype=np.int32)
-        if tensor.dtype == np.int64
-        else tensor,
-        tree,
-    )
-
-
-def get_sample_input():
-    tokenizer = AutoTokenizer.from_pretrained(
-        "microsoft/MiniLM-L12-H384-uncased"
-    )
-    inputs_torch = tokenizer("Hello, World!", return_tensors="pt")
-    return convert_int64_to_int32(
-        convert_torch_tensor_tree_to_numpy(inputs_torch.data)
-    )
-
-
-def get_jax_model():
-    return FlaxAutoModel.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
-
-
-def export_jax_to_mlir(jax_model: Any, sample_input: NumpyTree):
-    model_mlir = jax.jit(jax_model).lower(**sample_input).compiler_ir()
-    byte_stream = io.BytesIO()
-    model_mlir.operation.write_bytecode(file=byte_stream)
-    return byte_stream.getvalue()
-
-
-def assert_array_list_allclose(x, y, *args, **kwargs):
-    assert len(x) == len(y)
-    for a, b in zip(x, y):
-        np.testing.assert_allclose(
-            np.asarray(a), np.asarray(b), *args, **kwargs
-        )
-
-
-sample_input = get_sample_input()
-jax_model = get_jax_model()
-mlir = export_jax_to_mlir(jax_model, sample_input)
-
-# Compile and load module.
-shark_inference = SharkInference(mlir_module=mlir, mlir_dialect="mhlo")
-shark_inference.compile()
-
-# Run main function.
-result = shark_inference("main", jax.tree_util.tree_flatten(sample_input)[0])
-
-# Run JAX model.
-reference_result = jax.tree_util.tree_flatten(jax_model(**sample_input))[0]
-
-# Verify result.
-assert_array_list_allclose(result, reference_result, atol=1e-5)
--- a/shark/examples/shark_inference/minilm_jax_requirements.txt
+++ b/shark/examples/shark_inference/minilm_jax_requirements.txt
@@ -1,6 +0,0 @@
-flax
-jax[cpu]
-nodai-SHARK
-orbax
-transformers
-torch
--- a/shark/examples/shark_inference/minilm_jit.py
+++ b/shark/examples/shark_inference/minilm_jit.py
@@ -1,14 +1,15 @@
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_model
+from shark.shark_downloader import download_torch_model


-mlir_model, func_name, inputs, golden_out = download_model(
-    "microsoft/MiniLM-L12-H384-uncased",
-    frontend="torch",
+mlir_model, func_name, inputs, golden_out = download_torch_model(
+    "microsoft/MiniLM-L12-H384-uncased"
 )


-shark_module = SharkInference(mlir_model, device="cpu", mlir_dialect="linalg")
+shark_module = SharkInference(
+    mlir_model, func_name, device="cpu", mlir_dialect="linalg"
+)
 shark_module.compile()
 result = shark_module.forward(inputs)
 print("The obtained result via shark is: ", result)
--- a/shark/examples/shark_inference/minilm_tf.py
+++ b/shark/examples/shark_inference/minilm_tf.py
@@ -26,7 +26,7 @@ class BertModule(tf.Module):
            input_ids=x, attention_mask=y, token_type_ids=z, training=False
        )

-    @tf.function(input_signature=bert_input, jit_compile=True)
+    @tf.function(input_signature=bert_input)
    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.m.predict(input_ids, attention_mask, token_type_ids)

--- a/shark/examples/shark_inference/resnest.py
+++ b/shark/examples/shark_inference/resnest.py
@@ -33,7 +33,7 @@ mlir_importer = SharkImporter(

 print(golden_out)

-shark_module = SharkInference(vision_mlir, mlir_dialect="linalg")
+shark_module = SharkInference(vision_mlir, func_name, mlir_dialect="linalg")
 shark_module.compile()
 result = shark_module.forward((input,))
 print("Obtained result", result)
--- a/shark/examples/shark_inference/resnet50_fp16.py
+++ b/shark/examples/shark_inference/resnet50_fp16.py
@@ -49,7 +49,9 @@ module = torch_mlir.compile(
 mlir_model = module
 func_name = "forward"

-shark_module = SharkInference(mlir_model, device="cuda", mlir_dialect="linalg")
+shark_module = SharkInference(
+    mlir_model, func_name, device="cuda", mlir_dialect="linalg"
+)
 shark_module.compile()


--- a/shark/examples/shark_inference/resnet50_script.py
+++ b/shark/examples/shark_inference/resnet50_script.py
@@ -5,7 +5,7 @@ import torchvision.models as models
 from torchvision import transforms
 import sys
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_model
+from shark.shark_downloader import download_torch_model


 ################################## Preprocessing inputs and model ############
@@ -66,15 +66,13 @@ labels = load_labels()


 ## Can pass any img or input to the forward module.
-mlir_model, func_name, inputs, golden_out = download_model(
-    "resnet50", frontend="torch"
-)
+mlir_model, func_name, inputs, golden_out = download_torch_model("resnet50")

-shark_module = SharkInference(mlir_model, mlir_dialect="linalg")
-shark_module.compile()
+shark_module = SharkInference(mlir_model, func_name, mlir_dialect="linalg")
+# shark_module.compile()
 path = shark_module.save_module()
 shark_module.load_module(path)
-result = shark_module("forward", (img.detach().numpy(),))
+result = shark_module.forward((img.detach().numpy(),))

 print("The top 3 results obtained via shark_runner is:")
 print(top3_possibilities(torch.from_numpy(result)))
--- a/shark/examples/shark_inference/sharded_bloom.py
+++ b/shark/examples/shark_inference/sharded_bloom.py
@@ -1,842 +0,0 @@
-####################################################################################
-# Please make sure you have transformers 4.21.2 installed before running this demo
-#
-# -p --model_path: the directory in which you want to store the bloom files.
-# -dl --device_list: the list of device indices you want to use.  if you want to only use the first device, or you are running on cpu leave this blank.
-#                     Otherwise, please give this argument in this format: "[0, 1, 2]"
-# -de --device: the device you want to run bloom on.  E.G. cpu, cuda
-# -c, --recompile: set to true if you want to recompile to vmfb.
-# -d, --download: set to true if you want to redownload the mlir files
-# -cm, --create_mlirs: set to true if you want to create the mlir files from scratch.  please make sure you have transformers 4.21.2 before using this option
-# -t --token_count: the number of tokens you want to generate
-# -pr --prompt: the prompt you want to feed to the model
-# -m --model_name: the name of the model, e.g. bloom-560m
-#
-# If you don't specify a prompt when you run this example, you will be able to give prompts through the terminal.  Run the
-# example in this way if you want to run multiple examples without reinitializing the model
-#####################################################################################
-
-import os
-import io
-import torch
-import torch.nn as nn
-from collections import OrderedDict
-import torch_mlir
-from torch_mlir import TensorPlaceholder
-import re
-from transformers.models.bloom.configuration_bloom import BloomConfig
-import json
-import sys
-import argparse
-import json
-import urllib.request
-import subprocess
-
-from torch.fx.experimental.proxy_tensor import make_fx
-from torch._decomp import get_decompositions
-from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_public_file
-from transformers import (
-    BloomTokenizerFast,
-    BloomForSequenceClassification,
-    BloomForCausalLM,
-)
-from transformers.models.bloom.modeling_bloom import (
-    BloomBlock,
-    build_alibi_tensor,
-)
-
-IS_CUDA = False
-
-
-class ShardedBloom:
-    def __init__(self, src_folder):
-        f = open(f"{src_folder}/config.json")
-        config = json.load(f)
-        f.close()
-
-        self.layers_initialized = False
-
-        self.src_folder = src_folder
-        try:
-            self.n_embed = config["n_embed"]
-        except KeyError:
-            self.n_embed = config["hidden_size"]
-        self.vocab_size = config["vocab_size"]
-        self.n_layer = config["n_layer"]
-        try:
-            self.n_head = config["num_attention_heads"]
-        except KeyError:
-            self.n_head = config["n_head"]
-
-    def _init_layer(self, layer_name, device, replace, device_idx):
-        if replace or not os.path.exists(
-            f"{self.src_folder}/{layer_name}.vmfb"
-        ):
-            f_ = open(f"{self.src_folder}/{layer_name}.mlir", encoding="utf-8")
-            module = f_.read()
-            f_.close()
-            module = bytes(module, "utf-8")
-            shark_module = SharkInference(
-                module,
-                device=device,
-                mlir_dialect="tm_tensor",
-                device_idx=device_idx,
-            )
-            shark_module.save_module(
-                module_name=f"{self.src_folder}/{layer_name}",
-                extra_args=[
-                    "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
-                    "--iree-stream-resource-max-allocation-size=1000000000",
-                    "--iree-codegen-check-ir-before-llvm-conversion=false",
-                ],
-            )
-        else:
-            shark_module = SharkInference(
-                "",
-                device=device,
-                mlir_dialect="tm_tensor",
-                device_idx=device_idx,
-            )
-
-        return shark_module
-
-    def init_layers(self, device, replace=False, device_idx=[0]):
-        if device_idx is not None:
-            n_devices = len(device_idx)
-
-        self.word_embeddings_module = self._init_layer(
-            "word_embeddings",
-            device,
-            replace,
-            device_idx if device_idx is None else device_idx[0 % n_devices],
-        )
-        self.word_embeddings_layernorm_module = self._init_layer(
-            "word_embeddings_layernorm",
-            device,
-            replace,
-            device_idx if device_idx is None else device_idx[1 % n_devices],
-        )
-        self.ln_f_module = self._init_layer(
-            "ln_f",
-            device,
-            replace,
-            device_idx if device_idx is None else device_idx[2 % n_devices],
-        )
-        self.lm_head_module = self._init_layer(
-            "lm_head",
-            device,
-            replace,
-            device_idx if device_idx is None else device_idx[3 % n_devices],
-        )
-        self.block_modules = [
-            self._init_layer(
-                f"bloom_block_{i}",
-                device,
-                replace,
-                device_idx
-                if device_idx is None
-                else device_idx[(i + 4) % n_devices],
-            )
-            for i in range(self.n_layer)
-        ]
-
-        self.layers_initialized = True
-
-    def load_layers(self):
-        assert self.layers_initialized
-
-        self.word_embeddings_module.load_module(
-            f"{self.src_folder}/word_embeddings.vmfb"
-        )
-        self.word_embeddings_layernorm_module.load_module(
-            f"{self.src_folder}/word_embeddings_layernorm.vmfb"
-        )
-        for block_module, i in zip(self.block_modules, range(self.n_layer)):
-            block_module.load_module(f"{self.src_folder}/bloom_block_{i}.vmfb")
-        self.ln_f_module.load_module(f"{self.src_folder}/ln_f.vmfb")
-        self.lm_head_module.load_module(f"{self.src_folder}/lm_head.vmfb")
-
-    def forward_pass(self, input_ids, device):
-        if IS_CUDA:
-            cudaSetDevice(self.word_embeddings_module.device_idx)
-
-        input_embeds = self.word_embeddings_module(
-            inputs=(input_ids,), function_name="forward"
-        )
-
-        input_embeds = torch.tensor(input_embeds).float()
-        if IS_CUDA:
-            cudaSetDevice(self.word_embeddings_layernorm_module.device_idx)
-        hidden_states = self.word_embeddings_layernorm_module(
-            inputs=(input_embeds,), function_name="forward"
-        )
-
-        hidden_states = torch.tensor(hidden_states).float()
-
-        attention_mask = torch.ones(
-            [hidden_states.shape[0], len(input_ids[0])]
-        )
-        alibi = build_alibi_tensor(
-            attention_mask,
-            self.n_head,
-            hidden_states.dtype,
-            hidden_states.device,
-        )
-
-        causal_mask = _prepare_attn_mask(
-            attention_mask, input_ids.size(), input_embeds, 0
-        )
-        causal_mask = torch.tensor(causal_mask).float()
-
-        presents = ()
-        all_hidden_states = tuple(hidden_states)
-
-        for block_module, i in zip(self.block_modules, range(self.n_layer)):
-            if IS_CUDA:
-                cudaSetDevice(block_module.device_idx)
-
-            output = block_module(
-                inputs=(
-                    hidden_states.detach().numpy(),
-                    alibi.detach().numpy(),
-                    causal_mask.detach().numpy(),
-                ),
-                function_name="forward",
-            )
-            hidden_states = torch.tensor(output[0]).float()
-            all_hidden_states = all_hidden_states + (hidden_states,)
-            presents = presents + (
-                tuple(
-                    (
-                        output[1],
-                        output[2],
-                    )
-                ),
-            )
-        if IS_CUDA:
-            cudaSetDevice(self.ln_f_module.device_idx)
-
-        hidden_states = self.ln_f_module(
-            inputs=(hidden_states,), function_name="forward"
-        )
-        if IS_CUDA:
-            cudaSetDevice(self.lm_head_module.device_idx)
-
-        logits = self.lm_head_module(
-            inputs=(hidden_states,), function_name="forward"
-        )
-        logits = torch.tensor(logits).float()
-
-        return torch.argmax(logits[:, -1, :], dim=-1)
-
-
-def _make_causal_mask(
-    input_ids_shape: torch.Size,
-    dtype: torch.dtype,
-    past_key_values_length: int = 0,
-):
-    """
-    Make causal mask used for bi-directional self-attention.
-    """
-    batch_size, target_length = input_ids_shape
-    mask = torch.full((target_length, target_length), torch.finfo(dtype).min)
-    mask_cond = torch.arange(mask.size(-1))
-    intermediate_mask = mask_cond < (mask_cond + 1).view(mask.size(-1), 1)
-    mask.masked_fill_(intermediate_mask, 0)
-    mask = mask.to(dtype)
-
-    if past_key_values_length > 0:
-        mask = torch.cat(
-            [
-                torch.zeros(
-                    target_length, past_key_values_length, dtype=dtype
-                ),
-                mask,
-            ],
-            dim=-1,
-        )
-    expanded_mask = mask[None, None, :, :].expand(
-        batch_size, 1, target_length, target_length + past_key_values_length
-    )
-    return expanded_mask
-
-
-def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int = None):
-    """
-    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
-    """
-    batch_size, source_length = mask.size()
-    tgt_len = tgt_len if tgt_len is not None else source_length
-
-    expanded_mask = (
-        mask[:, None, None, :]
-        .expand(batch_size, 1, tgt_len, source_length)
-        .to(dtype)
-    )
-
-    inverted_mask = 1.0 - expanded_mask
-
-    return inverted_mask.masked_fill(
-        inverted_mask.to(torch.bool), torch.finfo(dtype).min
-    )
-
-
-def _prepare_attn_mask(
-    attention_mask, input_shape, inputs_embeds, past_key_values_length
-):
-    # create causal mask
-    # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-    combined_attention_mask = None
-    if input_shape[-1] > 1:
-        combined_attention_mask = _make_causal_mask(
-            input_shape,
-            inputs_embeds.dtype,
-            past_key_values_length=past_key_values_length,
-        ).to(attention_mask.device)
-
-    if attention_mask is not None:
-        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-        expanded_attn_mask = _expand_mask(
-            attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
-        )
-        combined_attention_mask = (
-            expanded_attn_mask
-            if combined_attention_mask is None
-            else expanded_attn_mask + combined_attention_mask
-        )
-
-    return combined_attention_mask
-
-
-def download_model(destination_folder, model_name):
-    download_public_file(
-        f"gs://shark_tank/sharded_bloom/{model_name}/", destination_folder
-    )
-
-
-def compile_embeddings(embeddings_layer, input_ids, path):
-    input_ids_placeholder = torch_mlir.TensorPlaceholder.like(
-        input_ids, dynamic_axes=[1]
-    )
-    module = torch_mlir.compile(
-        embeddings_layer,
-        (input_ids_placeholder),
-        torch_mlir.OutputType.LINALG_ON_TENSORS,
-        use_tracing=False,
-        verbose=False,
-    )
-
-    bytecode_stream = io.BytesIO()
-    module.operation.write_bytecode(bytecode_stream)
-    bytecode = bytecode_stream.getvalue()
-
-    f_ = open(path, "w+")
-    f_.write(str(module))
-    f_.close()
-    return
-
-
-def compile_word_embeddings_layernorm(
-    embeddings_layer_layernorm, embeds, path
-):
-    embeds_placeholder = torch_mlir.TensorPlaceholder.like(
-        embeds, dynamic_axes=[1]
-    )
-    module = torch_mlir.compile(
-        embeddings_layer_layernorm,
-        (embeds_placeholder),
-        torch_mlir.OutputType.LINALG_ON_TENSORS,
-        use_tracing=False,
-        verbose=False,
-    )
-
-    bytecode_stream = io.BytesIO()
-    module.operation.write_bytecode(bytecode_stream)
-    bytecode = bytecode_stream.getvalue()
-
-    f_ = open(path, "w+")
-    f_.write(str(module))
-    f_.close()
-    return
-
-
-def strip_overloads(gm):
-    """
-    Modifies the target of graph nodes in :attr:`gm` to strip overloads.
-    Args:
-        gm(fx.GraphModule): The input Fx graph module to be modified
-    """
-    for node in gm.graph.nodes:
-        if isinstance(node.target, torch._ops.OpOverload):
-            node.target = node.target.overloadpacket
-    gm.recompile()
-
-
-def compile_to_mlir(
-    bblock,
-    hidden_states,
-    layer_past=None,
-    attention_mask=None,
-    head_mask=None,
-    use_cache=None,
-    output_attentions=False,
-    alibi=None,
-    block_index=0,
-    path=".",
-):
-    fx_g = make_fx(
-        bblock,
-        decomposition_table=get_decompositions(
-            [
-                torch.ops.aten.split.Tensor,
-                torch.ops.aten.split_with_sizes,
-            ]
-        ),
-        tracing_mode="real",
-        _allow_non_fake_inputs=False,
-    )(hidden_states, alibi, attention_mask)
-
-    fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
-    fx_g.recompile()
-
-    strip_overloads(fx_g)
-
-    hidden_states_placeholder = TensorPlaceholder.like(
-        hidden_states, dynamic_axes=[1]
-    )
-    attention_mask_placeholder = TensorPlaceholder.like(
-        attention_mask, dynamic_axes=[2, 3]
-    )
-    alibi_placeholder = TensorPlaceholder.like(alibi, dynamic_axes=[2])
-
-    ts_g = torch.jit.script(fx_g)
-
-    module = torch_mlir.compile(
-        ts_g,
-        (
-            hidden_states_placeholder,
-            alibi_placeholder,
-            attention_mask_placeholder,
-        ),
-        torch_mlir.OutputType.LINALG_ON_TENSORS,
-        use_tracing=False,
-        verbose=False,
-    )
-
-    module_placeholder = module
-    module_context = module_placeholder.context
-
-    def check_valid_line(line, line_n, mlir_file_len):
-        if "private" in line:
-            return False
-        if "attributes" in line:
-            return False
-        if mlir_file_len - line_n == 2:
-            return False
-
-        return True
-
-    mlir_file_len = len(str(module).split("\n"))
-
-    def remove_constant_dim(line):
-        if "17x" in line:
-            line = re.sub("17x", "?x", line)
-            line = re.sub("tensor.empty\(\)", "tensor.empty(%dim)", line)
-        if "tensor.empty" in line and "?x?" in line:
-            line = re.sub(
-                "tensor.empty\(%dim\)", "tensor.empty(%dim, %dim)", line
-            )
-        if "arith.cmpi eq" in line:
-            line = re.sub("c17", "dim", line)
-        if " 17," in line:
-            line = re.sub(" 17,", " %dim,", line)
-        return line
-
-    module = "\n".join(
-        [
-            remove_constant_dim(line)
-            for line, line_n in zip(
-                str(module).split("\n"), range(mlir_file_len)
-            )
-            if check_valid_line(line, line_n, mlir_file_len)
-        ]
-    )
-
-    module = module_placeholder.parse(module, context=module_context)
-    bytecode_stream = io.BytesIO()
-    module.operation.write_bytecode(bytecode_stream)
-    bytecode = bytecode_stream.getvalue()
-
-    f_ = open(path, "w+")
-    f_.write(str(module))
-    f_.close()
-    return
-
-
-def compile_ln_f(ln_f, hidden_layers, path):
-    hidden_layers_placeholder = torch_mlir.TensorPlaceholder.like(
-        hidden_layers, dynamic_axes=[1]
-    )
-    module = torch_mlir.compile(
-        ln_f,
-        (hidden_layers_placeholder),
-        torch_mlir.OutputType.LINALG_ON_TENSORS,
-        use_tracing=False,
-        verbose=False,
-    )
-
-    bytecode_stream = io.BytesIO()
-    module.operation.write_bytecode(bytecode_stream)
-    bytecode = bytecode_stream.getvalue()
-
-    f_ = open(path, "w+")
-    f_.write(str(module))
-    f_.close()
-    return
-
-
-def compile_lm_head(lm_head, hidden_layers, path):
-    hidden_layers_placeholder = torch_mlir.TensorPlaceholder.like(
-        hidden_layers, dynamic_axes=[1]
-    )
-    module = torch_mlir.compile(
-        lm_head,
-        (hidden_layers_placeholder),
-        torch_mlir.OutputType.LINALG_ON_TENSORS,
-        use_tracing=False,
-        verbose=False,
-    )
-
-    bytecode_stream = io.BytesIO()
-    module.operation.write_bytecode(bytecode_stream)
-    bytecode = bytecode_stream.getvalue()
-
-    f_ = open(path, "w+")
-    f_.write(str(module))
-    f_.close()
-    return
-
-
-def create_mlirs(destination_folder, model_name):
-    model_config = "bigscience/" + model_name
-    sample_input_ids = torch.ones([1, 17], dtype=torch.int64)
-
-    urllib.request.urlretrieve(
-        f"https://huggingface.co/bigscience/{model_name}/resolve/main/config.json",
-        filename=f"{destination_folder}/config.json",
-    )
-    urllib.request.urlretrieve(
-        f"https://huggingface.co/bigscience/bloom/resolve/main/tokenizer.json",
-        filename=f"{destination_folder}/tokenizer.json",
-    )
-
-    class HuggingFaceLanguage(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.model = BloomForCausalLM.from_pretrained(model_config)
-
-        def forward(self, tokens):
-            return self.model.forward(tokens)[0]
-
-    class HuggingFaceBlock(torch.nn.Module):
-        def __init__(self, block):
-            super().__init__()
-            self.model = block
-
-        def forward(self, tokens, alibi, attention_mask):
-            output = self.model(
-                hidden_states=tokens,
-                alibi=alibi,
-                attention_mask=attention_mask,
-                use_cache=True,
-                output_attentions=False,
-            )
-            return (output[0], output[1][0], output[1][1])
-
-    model = HuggingFaceLanguage()
-
-    compile_embeddings(
-        model.model.transformer.word_embeddings,
-        sample_input_ids,
-        f"{destination_folder}/word_embeddings.mlir",
-    )
-
-    inputs_embeds = model.model.transformer.word_embeddings(sample_input_ids)
-
-    compile_word_embeddings_layernorm(
-        model.model.transformer.word_embeddings_layernorm,
-        inputs_embeds,
-        f"{destination_folder}/word_embeddings_layernorm.mlir",
-    )
-
-    hidden_states = model.model.transformer.word_embeddings_layernorm(
-        inputs_embeds
-    )
-
-    input_shape = sample_input_ids.size()
-
-    current_sequence_length = hidden_states.shape[1]
-    past_key_values_length = 0
-    past_key_values = tuple([None] * len(model.model.transformer.h))
-
-    attention_mask = torch.ones(
-        (hidden_states.shape[0], current_sequence_length), device="cpu"
-    )
-
-    alibi = build_alibi_tensor(
-        attention_mask,
-        model.model.transformer.n_head,
-        hidden_states.dtype,
-        "cpu",
-    )
-
-    causal_mask = _prepare_attn_mask(
-        attention_mask, input_shape, inputs_embeds, past_key_values_length
-    )
-
-    head_mask = model.model.transformer.get_head_mask(
-        None, model.model.transformer.config.n_layer
-    )
-    output_attentions = model.model.transformer.config.output_attentions
-
-    all_hidden_states = ()
-
-    for i, (block, layer_past) in enumerate(
-        zip(model.model.transformer.h, past_key_values)
-    ):
-        all_hidden_states = all_hidden_states + (hidden_states,)
-
-        proxy_model = HuggingFaceBlock(block)
-
-        compile_to_mlir(
-            proxy_model,
-            hidden_states,
-            layer_past=layer_past,
-            attention_mask=causal_mask,
-            head_mask=head_mask[i],
-            use_cache=True,
-            output_attentions=output_attentions,
-            alibi=alibi,
-            block_index=i,
-            path=f"{destination_folder}/bloom_block_{i}.mlir",
-        )
-
-    compile_ln_f(
-        model.model.transformer.ln_f,
-        hidden_states,
-        f"{destination_folder}/ln_f.mlir",
-    )
-    hidden_states = model.model.transformer.ln_f(hidden_states)
-    compile_lm_head(
-        model.model.lm_head,
-        hidden_states,
-        f"{destination_folder}/lm_head.mlir",
-    )
-
-
-def run_large_model(
-    token_count,
-    recompile,
-    model_path,
-    prompt,
-    device_list,
-    script_path,
-    device,
-):
-    f = open(f"{model_path}/prompt.txt", "w+")
-    f.write(prompt)
-    f.close()
-    for i in range(token_count):
-        if i == 0:
-            will_compile = recompile
-        else:
-            will_compile = False
-            f = open(f"{model_path}/prompt.txt", "r")
-            prompt = f.read()
-            f.close()
-
-        subprocess.run(
-            [
-                "python",
-                script_path,
-                model_path,
-                "start",
-                str(will_compile),
-                "cpu",
-                "None",
-                prompt,
-            ]
-        )
-        for i in range(config["n_layer"]):
-            if device_list is not None:
-                device_idx = str(device_list[i % len(device_list)])
-            else:
-                device_idx = "None"
-            subprocess.run(
-                [
-                    "python",
-                    script_path,
-                    model_path,
-                    str(i),
-                    str(will_compile),
-                    device,
-                    device_idx,
-                    prompt,
-                ]
-            )
-        subprocess.run(
-            [
-                "python",
-                script_path,
-                model_path,
-                "end",
-                str(will_compile),
-                "cpu",
-                "None",
-                prompt,
-            ]
-        )
-
-    f = open(f"{model_path}/prompt.txt", "r")
-    output = f.read()
-    f.close()
-    print(output)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(prog="Bloom-560m")
-    parser.add_argument("-p", "--model_path")
-    parser.add_argument("-dl", "--device_list", default=None)
-    parser.add_argument("-de", "--device", default="cpu")
-    parser.add_argument("-c", "--recompile", default=False, type=bool)
-    parser.add_argument("-d", "--download", default=False, type=bool)
-    parser.add_argument("-t", "--token_count", default=10, type=int)
-    parser.add_argument("-m", "--model_name", default="bloom-560m")
-    parser.add_argument("-cm", "--create_mlirs", default=False, type=bool)
-
-    parser.add_argument(
-        "-lm", "--large_model_memory_efficient", default=False, type=bool
-    )
-
-    parser.add_argument(
-        "-pr",
-        "--prompt",
-        default=None,
-    )
-    args = parser.parse_args()
-
-    if args.create_mlirs and args.large_model_memory_efficient:
-        print(
-            "Warning: If you need to use memory efficient mode, you probably want to use 'download' instead"
-        )
-
-    if not os.path.isdir(args.model_path):
-        os.mkdir(args.model_path)
-
-    if args.device_list is not None:
-        args.device_list = json.loads(args.device_list)
-
-    if args.device == "cuda" and args.device_list is not None:
-        IS_CUDA = True
-        from cuda.cudart import cudaSetDevice
-    if args.download and args.create_mlirs:
-        print(
-            "WARNING: It is not advised to turn on both download and create_mlirs"
-        )
-    if args.download:
-        download_model(args.model_path, args.model_name)
-    if args.create_mlirs:
-        create_mlirs(args.model_path, args.model_name)
-    from transformers import AutoTokenizer, AutoModelForCausalLM, BloomConfig
-
-    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
-    if args.prompt is not None:
-        input_ids = tokenizer.encode(args.prompt, return_tensors="pt")
-
-    if args.large_model_memory_efficient:
-        f = open(f"{args.model_path}/config.json")
-        config = json.load(f)
-        f.close()
-
-        self_path = os.path.dirname(os.path.abspath(__file__))
-        script_path = os.path.join(self_path, "sharded_bloom_large_models.py")
-
-        if args.prompt is not None:
-            run_large_model(
-                args.token_count,
-                args.recompile,
-                args.model_path,
-                args.prompt,
-                args.device_list,
-                script_path,
-                args.device,
-            )
-
-        else:
-            while True:
-                prompt = input("Enter Prompt: ")
-                try:
-                    token_count = int(
-                        input("Enter number of tokens you want to generate: ")
-                    )
-                except:
-                    print(
-                        "Invalid integer entered.  Using default value of 10"
-                    )
-                    token_count = 10
-
-                run_large_model(
-                    token_count,
-                    args.recompile,
-                    args.model_path,
-                    prompt,
-                    args.device_list,
-                    script_path,
-                    args.device,
-                )
-
-    else:
-        shardedbloom = ShardedBloom(args.model_path)
-        shardedbloom.init_layers(
-            device=args.device,
-            replace=args.recompile,
-            device_idx=args.device_list,
-        )
-        shardedbloom.load_layers()
-
-        if args.prompt is not None:
-            for _ in range(args.token_count):
-                next_token = shardedbloom.forward_pass(
-                    torch.tensor(input_ids), device=args.device
-                )
-                input_ids = torch.cat(
-                    [input_ids, next_token.unsqueeze(-1)], dim=-1
-                )
-
-            print(tokenizer.decode(input_ids.squeeze()))
-
-        else:
-            while True:
-                prompt = input("Enter Prompt: ")
-                try:
-                    token_count = int(
-                        input("Enter number of tokens you want to generate: ")
-                    )
-                except:
-                    print(
-                        "Invalid integer entered.  Using default value of 10"
-                    )
-                    token_count = 10
-
-                input_ids = tokenizer.encode(prompt, return_tensors="pt")
-
-                for _ in range(token_count):
-                    next_token = shardedbloom.forward_pass(
-                        torch.tensor(input_ids), device=args.device
-                    )
-                    input_ids = torch.cat(
-                        [input_ids, next_token.unsqueeze(-1)], dim=-1
-                    )
-
-                print(tokenizer.decode(input_ids.squeeze()))
--- a/shark/examples/shark_inference/sharded_bloom_large_models.py
+++ b/shark/examples/shark_inference/sharded_bloom_large_models.py
@@ -1,381 +0,0 @@
-import sys
-import os
-from transformers import AutoTokenizer, AutoModelForCausalLM, BloomConfig
-import re
-from shark.shark_inference import SharkInference
-import torch
-import torch.nn as nn
-from collections import OrderedDict
-from transformers.models.bloom.modeling_bloom import (
-    BloomBlock,
-    build_alibi_tensor,
-)
-import time
-import json
-
-
-def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int = None):
-    """
-    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
-    """
-    batch_size, source_length = mask.size()
-    tgt_len = tgt_len if tgt_len is not None else source_length
-
-    expanded_mask = (
-        mask[:, None, None, :]
-        .expand(batch_size, 1, tgt_len, source_length)
-        .to(dtype)
-    )
-
-    inverted_mask = 1.0 - expanded_mask
-
-    return inverted_mask.masked_fill(
-        inverted_mask.to(torch.bool), torch.finfo(dtype).min
-    )
-
-
-def _prepare_attn_mask(
-    attention_mask, input_shape, inputs_embeds, past_key_values_length
-):
-    # create causal mask
-    # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-    combined_attention_mask = None
-    if input_shape[-1] > 1:
-        combined_attention_mask = _make_causal_mask(
-            input_shape,
-            inputs_embeds.dtype,
-            past_key_values_length=past_key_values_length,
-        ).to(attention_mask.device)
-
-    if attention_mask is not None:
-        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-        expanded_attn_mask = _expand_mask(
-            attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
-        )
-        combined_attention_mask = (
-            expanded_attn_mask
-            if combined_attention_mask is None
-            else expanded_attn_mask + combined_attention_mask
-        )
-
-    return combined_attention_mask
-
-
-def _make_causal_mask(
-    input_ids_shape: torch.Size,
-    dtype: torch.dtype,
-    past_key_values_length: int = 0,
-):
-    """
-    Make causal mask used for bi-directional self-attention.
-    """
-    batch_size, target_length = input_ids_shape
-    mask = torch.full((target_length, target_length), torch.finfo(dtype).min)
-    mask_cond = torch.arange(mask.size(-1))
-    intermediate_mask = mask_cond < (mask_cond + 1).view(mask.size(-1), 1)
-    mask.masked_fill_(intermediate_mask, 0)
-    mask = mask.to(dtype)
-
-    if past_key_values_length > 0:
-        mask = torch.cat(
-            [
-                torch.zeros(
-                    target_length, past_key_values_length, dtype=dtype
-                ),
-                mask,
-            ],
-            dim=-1,
-        )
-    expanded_mask = mask[None, None, :, :].expand(
-        batch_size, 1, target_length, target_length + past_key_values_length
-    )
-    return expanded_mask
-
-
-if __name__ == "__main__":
-    working_dir = sys.argv[1]
-    layer_name = sys.argv[2]
-    will_compile = sys.argv[3]
-    device = sys.argv[4]
-    device_idx = sys.argv[5]
-    prompt = sys.argv[6]
-
-    if device_idx.lower().strip() == "none":
-        device_idx = None
-    else:
-        device_idx = int(device_idx)
-
-    if will_compile.lower().strip() == "true":
-        will_compile = True
-    else:
-        will_compile = False
-
-    f = open(f"{working_dir}/config.json")
-    config = json.load(f)
-    f.close()
-
-    layers_initialized = False
-    try:
-        n_embed = config["n_embed"]
-    except KeyError:
-        n_embed = config["hidden_size"]
-    vocab_size = config["vocab_size"]
-    n_layer = config["n_layer"]
-    try:
-        n_head = config["num_attention_heads"]
-    except KeyError:
-        n_head = config["n_head"]
-
-    if not os.path.isdir(working_dir):
-        os.mkdir(working_dir)
-
-    if layer_name == "start":
-        tokenizer = AutoTokenizer.from_pretrained(working_dir)
-        input_ids = tokenizer.encode(prompt, return_tensors="pt")
-
-        mlir_str = ""
-
-        if will_compile:
-            f = open(f"{working_dir}/word_embeddings.mlir", encoding="utf-8")
-            mlir_str = f.read()
-            f.close()
-
-            mlir_str = bytes(mlir_str, "utf-8")
-
-        shark_module = SharkInference(
-            mlir_str,
-            device="cpu",
-            mlir_dialect="tm_tensor",
-            device_idx=None,
-        )
-
-        if will_compile:
-            shark_module.save_module(
-                module_name=f"{working_dir}/word_embeddings",
-                extra_args=[
-                    "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
-                    "--iree-stream-resource-max-allocation-size=1000000000",
-                    "--iree-codegen-check-ir-before-llvm-conversion=false",
-                ],
-            )
-
-        shark_module.load_module(f"{working_dir}/word_embeddings.vmfb")
-        input_embeds = shark_module(
-            inputs=(input_ids,), function_name="forward"
-        )
-        input_embeds = torch.tensor(input_embeds).float()
-
-        mlir_str = ""
-
-        if will_compile:
-            f = open(
-                f"{working_dir}/word_embeddings_layernorm.mlir",
-                encoding="utf-8",
-            )
-            mlir_str = f.read()
-            f.close()
-
-        shark_module = SharkInference(
-            mlir_str,
-            device="cpu",
-            mlir_dialect="tm_tensor",
-            device_idx=None,
-        )
-
-        if will_compile:
-            shark_module.save_module(
-                module_name=f"{working_dir}/word_embeddings_layernorm",
-                extra_args=[
-                    "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
-                    "--iree-stream-resource-max-allocation-size=1000000000",
-                    "--iree-codegen-check-ir-before-llvm-conversion=false",
-                ],
-            )
-
-        shark_module.load_module(
-            f"{working_dir}/word_embeddings_layernorm.vmfb"
-        )
-        hidden_states = shark_module(
-            inputs=(input_embeds,), function_name="forward"
-        )
-        hidden_states = torch.tensor(hidden_states).float()
-
-        torch.save(hidden_states, f"{working_dir}/hidden_states_0.pt")
-
-        attention_mask = torch.ones(
-            [hidden_states.shape[0], len(input_ids[0])]
-        )
-
-        attention_mask = torch.tensor(attention_mask).float()
-
-        alibi = build_alibi_tensor(
-            attention_mask,
-            n_head,
-            hidden_states.dtype,
-            device="cpu",
-        )
-
-        torch.save(alibi, f"{working_dir}/alibi.pt")
-
-        causal_mask = _prepare_attn_mask(
-            attention_mask, input_ids.size(), input_embeds, 0
-        )
-        causal_mask = torch.tensor(causal_mask).float()
-
-        torch.save(causal_mask, f"{working_dir}/causal_mask.pt")
-
-    elif layer_name in [str(x) for x in range(n_layer)]:
-        hidden_states = torch.load(
-            f"{working_dir}/hidden_states_{layer_name}.pt"
-        )
-        alibi = torch.load(f"{working_dir}/alibi.pt")
-        causal_mask = torch.load(f"{working_dir}/causal_mask.pt")
-
-        mlir_str = ""
-
-        if will_compile:
-            f = open(
-                f"{working_dir}/bloom_block_{layer_name}.mlir",
-                encoding="utf-8",
-            )
-            mlir_str = f.read()
-            f.close()
-
-            mlir_str = bytes(mlir_str, "utf-8")
-
-        shark_module = SharkInference(
-            mlir_str,
-            device=device,
-            mlir_dialect="tm_tensor",
-            device_idx=device_idx,
-        )
-
-        if will_compile:
-            shark_module.save_module(
-                module_name=f"{working_dir}/bloom_block_{layer_name}",
-                extra_args=[
-                    "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
-                    "--iree-stream-resource-max-allocation-size=1000000000",
-                    "--iree-codegen-check-ir-before-llvm-conversion=false",
-                ],
-            )
-
-        shark_module.load_module(
-            f"{working_dir}/bloom_block_{layer_name}.vmfb"
-        )
-
-        output = shark_module(
-            inputs=(
-                hidden_states.detach().numpy(),
-                alibi.detach().numpy(),
-                causal_mask.detach().numpy(),
-            ),
-            function_name="forward",
-        )
-
-        hidden_states = torch.tensor(output[0]).float()
-
-        torch.save(
-            hidden_states,
-            f"{working_dir}/hidden_states_{int(layer_name) + 1}.pt",
-        )
-
-    elif layer_name == "end":
-        mlir_str = ""
-
-        if will_compile:
-            f = open(f"{working_dir}/ln_f.mlir", encoding="utf-8")
-            mlir_str = f.read()
-            f.close()
-
-            mlir_str = bytes(mlir_str, "utf-8")
-
-        shark_module = SharkInference(
-            mlir_str,
-            device="cpu",
-            mlir_dialect="tm_tensor",
-            device_idx=None,
-        )
-
-        if will_compile:
-            shark_module.save_module(
-                module_name=f"{working_dir}/ln_f",
-                extra_args=[
-                    "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
-                    "--iree-stream-resource-max-allocation-size=1000000000",
-                    "--iree-codegen-check-ir-before-llvm-conversion=false",
-                ],
-            )
-
-        shark_module.load_module(f"{working_dir}/ln_f.vmfb")
-
-        hidden_states = torch.load(f"{working_dir}/hidden_states_{n_layer}.pt")
-
-        hidden_states = shark_module(
-            inputs=(hidden_states,), function_name="forward"
-        )
-
-        mlir_str = ""
-
-        if will_compile:
-            f = open(f"{working_dir}/lm_head.mlir", encoding="utf-8")
-            mlir_str = f.read()
-            f.close()
-
-            mlir_str = bytes(mlir_str, "utf-8")
-
-        if config["n_embed"] == 14336:
-
-            def get_state_dict():
-                d = torch.load(
-                    f"{working_dir}/pytorch_model_00001-of-00072.bin"
-                )
-                return OrderedDict(
-                    (k.replace("word_embeddings.", ""), v)
-                    for k, v in d.items()
-                )
-
-            def load_causal_lm_head():
-                linear = nn.utils.skip_init(
-                    nn.Linear, 14336, 250880, bias=False, dtype=torch.float
-                )
-                linear.load_state_dict(get_state_dict(), strict=False)
-                return linear.float()
-
-            lm_head = load_causal_lm_head()
-
-            logits = lm_head(torch.tensor(hidden_states).float())
-
-        else:
-            shark_module = SharkInference(
-                mlir_str,
-                device="cpu",
-                mlir_dialect="tm_tensor",
-                device_idx=None,
-            )
-
-            if will_compile:
-                shark_module.save_module(
-                    module_name=f"{working_dir}/lm_head",
-                    extra_args=[
-                        "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
-                        "--iree-stream-resource-max-allocation-size=1000000000",
-                        "--iree-codegen-check-ir-before-llvm-conversion=false",
-                    ],
-                )
-
-            shark_module.load_module(f"{working_dir}/lm_head.vmfb")
-
-            logits = shark_module(
-                inputs=(hidden_states,), function_name="forward"
-            )
-
-        logits = torch.tensor(logits).float()
-
-        tokenizer = AutoTokenizer.from_pretrained(working_dir)
-
-        next_token = tokenizer.decode(torch.argmax(logits[:, -1, :], dim=-1))
-
-        f = open(f"{working_dir}/prompt.txt", "w+")
-        f.write(prompt + next_token)
-        f.close()
--- a/shark/examples/shark_inference/simple_dlrm.py
+++ b/shark/examples/shark_inference/simple_dlrm.py
@@ -151,6 +151,7 @@ class DLRM_Net(nn.Module):
            and (ln_top is not None)
            and (arch_interaction_op is not None)
        ):
+
            # save arguments
            self.output_d = 0
            self.arch_interaction_op = arch_interaction_op
@@ -215,6 +216,7 @@ class DLRM_Net(nn.Module):
        return ly

    def interact_features(self, x, ly):
+
        if self.arch_interaction_op == "dot":
            # concatenate dense and sparse features
            (batch_size, d) = x.shape
@@ -360,7 +362,7 @@ mlir_importer = SharkImporter(
 )

 shark_module = SharkInference(
-    dlrm_mlir, device="vulkan", mlir_dialect="linalg"
+    dlrm_mlir, func_name, device="vulkan", mlir_dialect="linalg"
 )
 shark_module.compile()
 result = shark_module.forward(input_dlrm)
--- a/shark/examples/shark_inference/sparse_arch.py
+++ b/shark/examples/shark_inference/sparse_arch.py
@@ -99,6 +99,7 @@ class SparseArchShark(nn.Module):
        )

    def forward(self, *batched_inputs):
+
        concatenated_list = []
        input_enum, embedding_enum = 0, 0

@@ -120,6 +121,7 @@ class SparseArchShark(nn.Module):


 def test_sparse_arch() -> None:
+
    D = 3
    eb1_config = EmbeddingBagConfig(
        name="t1",
@@ -209,6 +211,7 @@ class DLRMShark(nn.Module):
    def forward(
        self, dense_features: torch.Tensor, *sparse_features
    ) -> torch.Tensor:
+
        embedded_dense = self.dense_arch(dense_features)
        embedded_sparse = self.sparse_arch(*sparse_features)
        concatenated_dense = self.inter_arch(
@@ -294,7 +297,7 @@ def test_dlrm() -> None:
    )

    shark_module = SharkInference(
-        dlrm_mlir, device="cpu", mlir_dialect="linalg"
+        dlrm_mlir, func_name, device="cpu", mlir_dialect="linalg"
    )
    shark_module.compile()
    result = shark_module.forward(inputs)
--- a/shark/examples/shark_inference/stable_diff.py
+++ b/shark/examples/shark_inference/stable_diff.py
@@ -0,0 +1,268 @@
+from transformers import CLIPTextModel, CLIPTokenizer
+from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
+import torch
+from PIL import Image
+from diffusers import LMSDiscreteScheduler
+from tqdm.auto import tqdm
+from shark.shark_inference import SharkInference
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch._decomp import get_decompositions
+import torch_mlir
+import tempfile
+import numpy as np
+
+# pip install diffusers
+# pip install scipy
+
+############### Parsing args #####################
+import argparse
+
+p = argparse.ArgumentParser(
+    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+
+p.add_argument(
+    "--prompt",
+    type=str,
+    default="a photograph of an astronaut riding a horse",
+    help="the text prompt to use",
+)
+p.add_argument("--device", type=str, default="cpu", help="the device to use")
+p.add_argument("--steps", type=int, default=10, help="denoising steps to run")
+p.add_argument("--mlir_loc", type=str, default=None, help="path to UNet MLIR")
+p.add_argument("--vae_loc", type=str, default=None, help="path to VAE MLIR")
+args = p.parse_args()
+
+#####################################################
+
+
+def load_mlir(mlir_loc):
+    """Read the MLIR text at `mlir_loc`; return None when no path is given."""
+    if mlir_loc is None:
+        return None
+    print(f"Trying to load the model from {mlir_loc}.")
+    # mlir_loc is already a complete path; no joining is needed.
+    with open(mlir_loc) as f:
+        mlir_module = f.read()
+    return mlir_module
+
+
+def compile_through_fx(model, inputs, mlir_loc=None):
+    """Compile `model` (or the MLIR at `mlir_loc`) into a SHARK module."""
+    module = load_mlir(mlir_loc)
+    if mlir_loc is None:
+        fx_g = make_fx(
+            model,
+            decomposition_table=get_decompositions(
+                [
+                    torch.ops.aten.embedding_dense_backward,
+                    torch.ops.aten.native_layer_norm_backward,
+                    torch.ops.aten.slice_backward,
+                    torch.ops.aten.select_backward,
+                    torch.ops.aten.norm.ScalarOpt_dim,
+                    torch.ops.aten.native_group_norm,
+                    torch.ops.aten.upsample_bilinear2d.vec,
+                    torch.ops.aten.split.Tensor,
+                    torch.ops.aten.split_with_sizes,
+                ]
+            ),
+        )(*inputs)
+
+        fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
+        fx_g.recompile()
+
+        def strip_overloads(gm):
+            """
+            Modifies the target of graph nodes in :attr:`gm` to strip overloads.
+            Args:
+                gm(fx.GraphModule): The input Fx graph module to be modified
+            """
+            for node in gm.graph.nodes:
+                if isinstance(node.target, torch._ops.OpOverload):
+                    node.target = node.target.overloadpacket
+            gm.recompile()
+
+        strip_overloads(fx_g)
+
+        ts_g = torch.jit.script(fx_g)
+
+        module = torch_mlir.compile(
+            ts_g,
+            inputs,
+            torch_mlir.OutputType.LINALG_ON_TENSORS,
+            use_tracing=False,
+            verbose=False,
+        )
+
+    mlir_model = module
+    func_name = "forward"
+
+    shark_module = SharkInference(
+        mlir_model, func_name, device=args.device, mlir_dialect="tm_tensor"
+    )
+    shark_module.compile()
+
+    return shark_module
+
+
+if __name__ == "__main__":
+    # No hard-coded secret: True uses the `huggingface-cli login` token.
+    YOUR_TOKEN = True
+
+    # 1. Load the autoencoder model which will be used to decode the latents into image space.
+    vae = AutoencoderKL.from_pretrained(
+        "CompVis/stable-diffusion-v1-4",
+        subfolder="vae",
+        use_auth_token=YOUR_TOKEN,
+    )
+
+    # 2. Load the tokenizer and text encoder to tokenize and encode the text.
+    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+    text_encoder = CLIPTextModel.from_pretrained(
+        "openai/clip-vit-large-patch14"
+    )
+
+    class VaeModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.vae = AutoencoderKL.from_pretrained(
+                "CompVis/stable-diffusion-v1-4",
+                subfolder="vae",
+                use_auth_token=YOUR_TOKEN,
+            )
+
+        def forward(self, input):
+            return self.vae.decode(input, return_dict=False)[0]
+
+    vae = VaeModel()
+    vae_input = torch.rand(1, 4, 64, 64)
+    shark_vae = compile_through_fx(vae, (vae_input,), args.vae_loc)
+
+    # Wrap the unet model to return tuples.
+    class UnetModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.unet = UNet2DConditionModel.from_pretrained(
+                "CompVis/stable-diffusion-v1-4",
+                subfolder="unet",
+                use_auth_token=YOUR_TOKEN,
+            )
+            self.in_channels = self.unet.in_channels
+            self.train(False)
+
+        def forward(self, x, y, z):
+            return self.unet.forward(x, y, z, return_dict=False)[0]
+
+    # 3. The UNet model for generating the latents.
+    unet = UnetModel()
+    latent_model_input = torch.rand([2, 4, 64, 64])
+    text_embeddings = torch.rand([2, 77, 768])
+    shark_unet = compile_through_fx(
+        unet,
+        (latent_model_input, torch.tensor([1.0]), text_embeddings),
+        args.mlir_loc,
+    )
+
+    # torch.jit.script(unet)
+
+    scheduler = LMSDiscreteScheduler(
+        beta_start=0.00085,
+        beta_end=0.012,
+        beta_schedule="scaled_linear",
+        num_train_timesteps=1000,
+    )
+
+    prompt = [args.prompt]
+
+    height = 512  # default height of Stable Diffusion
+    width = 512  # default width of Stable Diffusion
+
+    num_inference_steps = args.steps  # Number of denoising steps
+
+    guidance_scale = 7.5  # Scale for classifier-free guidance
+
+    generator = torch.manual_seed(
+        42
+    )  # Seed generator to create the inital latent noise
+
+    batch_size = len(prompt)
+
+    text_input = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=tokenizer.model_max_length,
+        truncation=True,
+        return_tensors="pt",
+    )
+
+    text_embeddings = text_encoder(text_input.input_ids)[0]
+
+    max_length = text_input.input_ids.shape[-1]
+    uncond_input = tokenizer(
+        [""] * batch_size,
+        padding="max_length",
+        max_length=max_length,
+        return_tensors="pt",
+    )
+    uncond_embeddings = text_encoder(uncond_input.input_ids)[0]
+
+    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+    latents = torch.randn(
+        (batch_size, unet.in_channels, height // 8, width // 8),
+        generator=generator,
+    )
+    # latents = latents.to(torch_device)
+
+    scheduler.set_timesteps(num_inference_steps)
+
+    latents = latents * scheduler.sigmas[0]
+    # print(latents, latents.shape)
+
+    for i, t in tqdm(enumerate(scheduler.timesteps)):
+
+        print(f"i = {i} t = {t}")
+        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+        latent_model_input = torch.cat([latents] * 2)
+        sigma = scheduler.sigmas[i]
+        latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
+
+        # predict the noise residual
+
+        # with torch.no_grad():
+        # noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)
+
+        latent_model_input_numpy = latent_model_input.detach().numpy()
+        text_embeddings_numpy = text_embeddings.detach().numpy()
+
+        noise_pred = shark_unet.forward(
+            (
+                latent_model_input_numpy,
+                np.array([t]).astype(np.float32),
+                text_embeddings_numpy,
+            )
+        )
+        noise_pred = torch.from_numpy(noise_pred)
+
+        # perform guidance
+        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+        noise_pred = noise_pred_uncond + guidance_scale * (
+            noise_pred_text - noise_pred_uncond
+        )
+
+        # compute the previous noisy sample x_t -> x_t-1
+        latents = scheduler.step(noise_pred, i, latents)["prev_sample"]
+
+    # print("Latents shape : ", latents.shape)
+
+    # scale and decode the image latents with vae
+    latents = 1 / 0.18215 * latents
+    latents_numpy = latents.detach().numpy()
+    image = shark_vae.forward((latents_numpy,))
+    image = torch.from_numpy(image)
+
+    image = (image / 2 + 0.5).clamp(0, 1)
+    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
+    images = (image * 255).round().astype("uint8")
+    pil_images = [Image.fromarray(image) for image in images]
+    pil_images[0].save("astro.jpg")
--- a/shark/examples/shark_inference/stable_diff_tf.py
+++ b/shark/examples/shark_inference/stable_diff_tf.py
@@ -0,0 +1,313 @@
+import math
+import numpy as np
+import tensorflow as tf
+from tensorflow import keras
+from keras_cv.models.generative.stable_diffusion.clip_tokenizer import (
+    SimpleTokenizer,
+)
+from keras_cv.models.generative.stable_diffusion.constants import (
+    _ALPHAS_CUMPROD,
+)
+from keras_cv.models.generative.stable_diffusion.constants import (
+    _UNCONDITIONAL_TOKENS,
+)
+from keras_cv.models.generative.stable_diffusion.decoder import Decoder
+from keras_cv.models.generative.stable_diffusion.text_encoder import (
+    TextEncoder,
+)
+
+from shark.shark_inference import SharkInference
+from shark.shark_downloader import download_tf_model
+from PIL import Image
+
+# pip install "git+https://github.com/keras-team/keras-cv.git"
+# pip install tensorflow_dataset
+
+############### Parsing args #####################
+import argparse
+
+p = argparse.ArgumentParser(
+    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+
+p.add_argument(
+    "--prompt",
+    type=str,
+    default="a photograph of an astronaut riding a horse",
+    help="the text prompt to use",
+)
+p.add_argument("--device", type=str, default="cpu", help="the device to use")
+p.add_argument(
+    "--steps", type=int, default=10, help="the number of steps to use"
+)
+p.add_argument(
+    "--save_path",
+    type=str,
+    default=None,
+    help="the file to save the resulting image to. (default to <input prompt>.jpg)",
+)
+args = p.parse_args()
+
+#####################################################
+
+MAX_PROMPT_LENGTH = 77
+
+
+class SharkStableDiffusion:
+    """Shark implementation of Stable Diffusion based on model from keras_cv.
+    Stable Diffusion is a powerful image generation model that can be used,
+    among other things, to generate pictures according to a short text description
+    (called a "prompt").
+    Arguments:
+        device: Device to use with SHARK. Default: cpu
+        jit_compile: Whether to compile the underlying models to XLA.
+            This can lead to a significant speedup on some systems. Default: True.
+    References:
+    - [About Stable Diffusion](https://stability.ai/blog/stable-diffusion-announcement)
+    - [Original implementation](https://github.com/CompVis/stable-diffusion)
+    """
+
+    def __init__(self, device="cpu", jit_compile=True):
+        self.img_height = 512
+        self.img_width = 512
+        self.tokenizer = SimpleTokenizer()
+
+        # Create models
+        self.text_encoder = TextEncoder(MAX_PROMPT_LENGTH)
+
+        mlir_model, func_name, inputs, golden_out = download_tf_model(
+            "stable_diff", tank_url="gs://shark_tank/quinn"
+        )
+        shark_module = SharkInference(
+            mlir_model, func_name, device=device, mlir_dialect="mhlo"
+        )
+        shark_module.compile()
+        self.diffusion_model = shark_module
+        self.decoder = Decoder(self.img_height, self.img_width)
+        if jit_compile:
+            self.text_encoder.compile(jit_compile=True)
+            self.decoder.compile(jit_compile=True)
+
+        print(
+            "By using this model checkpoint, you acknowledge that its usage is "
+            "subject to the terms of the CreativeML Open RAIL-M license at "
+            "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/LICENSE"
+        )
+        # Load weights
+        text_encoder_weights_fpath = keras.utils.get_file(
+            origin="https://huggingface.co/fchollet/stable-diffusion/resolve/main/kcv_encoder.h5",
+            file_hash="4789e63e07c0e54d6a34a29b45ce81ece27060c499a709d556c7755b42bb0dc4",
+        )
+        decoder_weights_fpath = keras.utils.get_file(
+            origin="https://huggingface.co/fchollet/stable-diffusion/resolve/main/kcv_decoder.h5",
+            file_hash="ad350a65cc8bc4a80c8103367e039a3329b4231c2469a1093869a345f55b1962",
+        )
+        self.text_encoder.load_weights(text_encoder_weights_fpath)
+        self.decoder.load_weights(decoder_weights_fpath)
+
+    def text_to_image(
+        self,
+        prompt,
+        batch_size=1,
+        num_steps=25,
+        unconditional_guidance_scale=7.5,
+        seed=None,
+    ):
+        encoded_text = self.encode_text(prompt)
+
+        return self.generate_image(
+            encoded_text,
+            batch_size=batch_size,
+            num_steps=num_steps,
+            unconditional_guidance_scale=unconditional_guidance_scale,
+            seed=seed,
+        )
+
+    def encode_text(self, prompt):
+        """Encodes a prompt into a latent text encoding.
+        The encoding produced by this method should be used as the
+        `encoded_text` parameter of `StableDiffusion.generate_image`. Encoding
+        text separately from generating an image can be used to arbitrarily
+        modify the text encoding prior to image generation, e.g. for walking
+        between two prompts.
+        Args:
+            prompt: a string to encode, must be 77 tokens or shorter.
+        Example:
+        ```python
+        from keras_cv.models import StableDiffusion
+        model = StableDiffusion(img_height=512, img_width=512, jit_compile=True)
+        encoded_text  = model.encode_text("Tacos at dawn")
+        img = model.generate_image(encoded_text)
+        ```
+        """
+        # Tokenize prompt (i.e. starting context)
+        inputs = self.tokenizer.encode(prompt)
+        if len(inputs) > MAX_PROMPT_LENGTH:
+            raise ValueError(
+                f"Prompt is too long (should be <= {MAX_PROMPT_LENGTH} tokens)"
+            )
+        phrase = inputs + [49407] * (MAX_PROMPT_LENGTH - len(inputs))
+        phrase = tf.convert_to_tensor([phrase], dtype=tf.int32)
+
+        context = self.text_encoder.predict_on_batch(
+            [phrase, self._get_pos_ids()]
+        )
+
+        return context
+
+    def generate_image(
+        self,
+        encoded_text,
+        batch_size=1,
+        num_steps=25,
+        unconditional_guidance_scale=7.5,
+        diffusion_noise=None,
+        seed=None,
+    ):
+        """Generates an image based on encoded text.
+        The encoding passed to this method should be derived from
+        `StableDiffusion.encode_text`.
+        Args:
+            encoded_text: Tensor of shape (`batch_size`, 77, 768), or a Tensor
+            of shape (77, 768). When the batch axis is omitted, the same encoded
+            text will be used to produce every generated image.
+            batch_size: number of images to generate. Default: 1.
+            num_steps: number of diffusion steps (controls image quality).
+                Default: 25.
+            unconditional_guidance_scale: float controlling how closely the image
+                should adhere to the prompt. Larger values result in more
+                closely adhering to the prompt, but will make the image noisier.
+                Default: 7.5.
+            diffusion_noise: Tensor of shape (`batch_size`, img_height // 8,
+                img_width // 8, 4), or a Tensor of shape (img_height // 8,
+                img_width // 8, 4). Optional custom noise to seed the diffusion
+                process. When the batch axis is omitted, the same noise will be
+                used to seed diffusion for every generated image.
+            seed: integer which is used to seed the random generation of
+                diffusion noise, only to be specified if `diffusion_noise` is
+                None.
+        Example:
+        ```python
+        from keras_cv.models import StableDiffusion
+        batch_size = 8
+        model = StableDiffusion(img_height=512, img_width=512, jit_compile=True)
+        e_tacos = model.encode_text("Tacos at dawn")
+        e_watermelons = model.encode_text("Watermelons at dusk")
+        e_interpolated = tf.linspace(e_tacos, e_watermelons, batch_size)
+        images = model.generate_image(e_interpolated, batch_size=batch_size)
+        ```
+        """
+        if diffusion_noise is not None and seed is not None:
+            raise ValueError(
+                "`diffusion_noise` and `seed` should not both be passed to "
+                "`generate_image`. `seed` is only used to generate diffusion "
+                "noise when it's not already user-specified."
+            )
+
+        encoded_text = tf.squeeze(encoded_text)
+        if encoded_text.shape.rank == 2:
+            encoded_text = tf.repeat(
+                tf.expand_dims(encoded_text, axis=0), batch_size, axis=0
+            )
+
+        context = encoded_text
+        unconditional_context = tf.repeat(
+            self._get_unconditional_context(), batch_size, axis=0
+        )
+        context = tf.concat([context, unconditional_context], 0)
+
+        if diffusion_noise is not None:
+            diffusion_noise = tf.squeeze(diffusion_noise)
+            if diffusion_noise.shape.rank == 3:
+                diffusion_noise = tf.repeat(
+                    tf.expand_dims(diffusion_noise, axis=0), batch_size, axis=0
+                )
+            latent = diffusion_noise
+        else:
+            latent = self._get_initial_diffusion_noise(batch_size, seed)
+
+        # Iterative reverse diffusion stage
+        timesteps = tf.range(1, 1000, 1000 // num_steps)
+        alphas, alphas_prev = self._get_initial_alphas(timesteps)
+        progbar = keras.utils.Progbar(len(timesteps))
+        iteration = 0
+        for index, timestep in list(enumerate(timesteps))[::-1]:
+            latent_prev = latent  # Set aside the previous latent vector
+            t_emb = self._get_timestep_embedding(timestep, batch_size)
+
+            # Prepare the latent and unconditional latent to be run with a single forward call
+            latent = tf.concat([latent, latent], 0)
+            t_emb = tf.concat([t_emb, t_emb], 0)
+            latent_numpy = self.diffusion_model.forward(
+                [latent.numpy(), t_emb.numpy(), context.numpy()]
+            )
+            latent = tf.convert_to_tensor(latent_numpy, dtype=tf.float32)
+            latent, unconditional_latent = tf.split(latent, 2)
+
+            latent = unconditional_latent + unconditional_guidance_scale * (
+                latent - unconditional_latent
+            )
+            a_t, a_prev = alphas[index], alphas_prev[index]
+            pred_x0 = (latent_prev - math.sqrt(1 - a_t) * latent) / math.sqrt(
+                a_t
+            )
+            latent = (
+                latent * math.sqrt(1.0 - a_prev) + math.sqrt(a_prev) * pred_x0
+            )
+            iteration += 1
+            progbar.update(iteration)
+
+        # Decoding stage
+        decoded = self.decoder.predict_on_batch(latent)
+        decoded = ((decoded + 1) / 2) * 255
+        return np.clip(decoded, 0, 255).astype("uint8")
+
+    def _get_unconditional_context(self):
+        unconditional_tokens = tf.convert_to_tensor(
+            [_UNCONDITIONAL_TOKENS], dtype=tf.int32
+        )
+        unconditional_context = self.text_encoder.predict_on_batch(
+            [unconditional_tokens, self._get_pos_ids()]
+        )
+
+        return unconditional_context
+
+    def _get_timestep_embedding(
+        self, timestep, batch_size, dim=320, max_period=10000
+    ):
+        half = dim // 2
+        freqs = tf.math.exp(
+            -math.log(max_period) * tf.range(0, half, dtype=tf.float32) / half
+        )
+        args = tf.convert_to_tensor([timestep], dtype=tf.float32) * freqs
+        embedding = tf.concat([tf.math.cos(args), tf.math.sin(args)], 0)
+        embedding = tf.reshape(embedding, [1, -1])
+        return tf.repeat(embedding, batch_size, axis=0)
+
+    def _get_initial_alphas(self, timesteps):
+        alphas = [_ALPHAS_CUMPROD[t] for t in timesteps]
+        alphas_prev = [1.0] + alphas[:-1]
+
+        return alphas, alphas_prev
+
+    def _get_initial_diffusion_noise(self, batch_size, seed):
+        return tf.random.normal(
+            (batch_size, self.img_height // 8, self.img_width // 8, 4),
+            seed=seed,
+        )
+
+    @staticmethod
+    def _get_pos_ids():
+        return tf.convert_to_tensor(
+            [list(range(MAX_PROMPT_LENGTH))], dtype=tf.int32
+        )
+
+
+if __name__ == "__main__":
+    SD = SharkStableDiffusion(device=args.device)
+    images = SD.text_to_image(args.prompt, num_steps=args.steps)
+    pil_images = [Image.fromarray(image) for image in images]
+    save_fname = args.prompt + ".jpg"
+    if args.save_path is not None:
+        save_fname = args.save_path
+    pil_images[0].save(save_fname)
--- a/shark/examples/shark_inference/t5_tf.py
+++ b/shark/examples/shark_inference/t5_tf.py
@@ -18,7 +18,7 @@ class T5Module(tf.Module):
        self.m = TFT5Model.from_pretrained("t5-small")
        self.m.predict = lambda x, y: self.m(input_ids=x, decoder_input_ids=y)

-    @tf.function(input_signature=t5_inputs, jit_compile=True)
+    @tf.function(input_signature=t5_inputs)
    def forward(self, input_ids, decoder_input_ids):
        return self.m.predict(input_ids, decoder_input_ids)

--- a/shark/examples/shark_inference/unet_script.py
+++ b/shark/examples/shark_inference/unet_script.py
@@ -33,7 +33,7 @@ mlir_importer = SharkImporter(
    tracing_required=False
 )

-shark_module = SharkInference(vision_mlir, mlir_dialect="linalg")
+shark_module = SharkInference(vision_mlir, func_name, mlir_dialect="linalg")
 shark_module.compile()
 result = shark_module.forward((input,))
 np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
--- a/shark/examples/shark_inference/upscaler/main.py
+++ b/shark/examples/shark_inference/upscaler/main.py
@@ -1,21 +0,0 @@
-import requests
-from PIL import Image
-from io import BytesIO
-from pipeline_shark_stable_diffusion_upscale import (
-    SharkStableDiffusionUpscalePipeline,
-)
-import torch
-
-model_id = "stabilityai/stable-diffusion-x4-upscaler"
-pipeline = SharkStableDiffusionUpscalePipeline(model_id)
-
-# let's download an  image
-url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/low_res_cat.png"
-response = requests.get(url)
-low_res_img = Image.open(BytesIO(response.content)).convert("RGB")
-low_res_img = low_res_img.resize((128, 128))
-
-prompt = "a white cat"
-
-upscaled_image = pipeline(prompt=prompt, image=low_res_img).images[0]
-upscaled_image.save("upsampled_cat.png")
--- a/shark/examples/shark_inference/upscaler/model_wrappers.py
+++ b/shark/examples/shark_inference/upscaler/model_wrappers.py
@@ -1,98 +0,0 @@
-from diffusers import AutoencoderKL, UNet2DConditionModel
-from transformers import CLIPTextModel
-from utils import compile_through_fx
-import torch
-
-model_id = "stabilityai/stable-diffusion-x4-upscaler"
-
-model_input = {
-    "clip": (torch.randint(1, 2, (1, 77)),),
-    "vae": (torch.randn(1, 4, 128, 128),),
-    "unet": (
-        torch.randn(2, 7, 128, 128),  # latents
-        torch.tensor([1]).to(torch.float32),  # timestep
-        torch.randn(2, 77, 1024),  # embedding
-        torch.randn(2).to(torch.int64),  # noise_level
-    ),
-}
-
-
-def get_clip_mlir(model_name="clip_text", extra_args=[]):
-    text_encoder = CLIPTextModel.from_pretrained(
-        model_id,
-        subfolder="text_encoder",
-    )
-
-    class CLIPText(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.text_encoder = text_encoder
-
-        def forward(self, input):
-            return self.text_encoder(input)[0]
-
-    clip_model = CLIPText()
-    shark_clip = compile_through_fx(
-        clip_model,
-        model_input["clip"],
-        model_name=model_name,
-        extra_args=extra_args,
-    )
-    return shark_clip
-
-
-def get_vae_mlir(model_name="vae", extra_args=[]):
-    class VaeModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.vae = AutoencoderKL.from_pretrained(
-                model_id,
-                subfolder="vae",
-            )
-
-        def forward(self, input):
-            x = self.vae.decode(input, return_dict=False)[0]
-            return x
-
-    vae = VaeModel()
-    shark_vae = compile_through_fx(
-        vae,
-        model_input["vae"],
-        model_name=model_name,
-        extra_args=extra_args,
-    )
-    return shark_vae
-
-
-def get_unet_mlir(model_name="unet", extra_args=[]):
-    class UnetModel(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.unet = UNet2DConditionModel.from_pretrained(
-                model_id,
-                subfolder="unet",
-            )
-            self.in_channels = self.unet.in_channels
-            self.train(False)
-
-        def forward(self, latent, timestep, text_embedding, noise_level):
-            unet_out = self.unet.forward(
-                latent,
-                timestep,
-                text_embedding,
-                noise_level,
-                return_dict=False,
-            )[0]
-            return unet_out
-
-    unet = UnetModel()
-    f16_input_mask = (True, True, True, False)
-    shark_unet = compile_through_fx(
-        unet,
-        model_input["unet"],
-        model_name=model_name,
-        is_f16=True,
-        f16_input_mask=f16_input_mask,
-        extra_args=extra_args,
-    )
-    return shark_unet
--- a/shark/examples/shark_inference/upscaler/opt_params.py
+++ b/shark/examples/shark_inference/upscaler/opt_params.py
@@ -1,48 +0,0 @@
-import sys
-from model_wrappers import (
-    get_vae_mlir,
-    get_unet_mlir,
-    get_clip_mlir,
-)
-from upscaler_args import args
-from utils import get_shark_model
-
-BATCH_SIZE = len(args.prompts)
-if BATCH_SIZE != 1:
-    sys.exit("Only batch size 1 is supported.")
-
-
-unet_flag = [
-    "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32}))"
-]
-
-vae_flag = [
-    "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-convert-conv-nchw-to-nhwc,iree-preprocessing-pad-linalg-ops{pad-size=16}))"
-]
-
-clip_flag = [
-    "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-preprocessing-pad-linalg-ops{pad-size=16}))"
-]
-
-bucket = "gs://shark_tank/stable_diffusion/"
-
-
-def get_unet():
-    model_name = "upscaler_unet"
-    if args.import_mlir:
-        return get_unet_mlir(model_name, unet_flag)
-    return get_shark_model(bucket, model_name, unet_flag)
-
-
-def get_vae():
-    model_name = "upscaler_vae"
-    if args.import_mlir:
-        return get_vae_mlir(model_name, vae_flag)
-    return get_shark_model(bucket, model_name, vae_flag)
-
-
-def get_clip():
-    model_name = "upscaler_clip"
-    if args.import_mlir:
-        return get_clip_mlir(model_name, clip_flag)
-    return get_shark_model(bucket, model_name, clip_flag)
--- a/shark/examples/shark_inference/upscaler/pipeline_shark_stable_diffusion_upscale.py
+++ b/shark/examples/shark_inference/upscaler/pipeline_shark_stable_diffusion_upscale.py
@@ -1,489 +0,0 @@
-import inspect
-from typing import Callable, List, Optional, Union
-
-import numpy as np
-import torch
-
-import PIL
-from PIL import Image
-from diffusers.utils import is_accelerate_available
-from transformers import CLIPTextModel, CLIPTokenizer
-from diffusers import AutoencoderKL, UNet2DConditionModel
-from diffusers import (
-    DDIMScheduler,
-    DDPMScheduler,
-    LMSDiscreteScheduler,
-    PNDMScheduler,
-)
-from diffusers import logging
-from diffusers.pipeline_utils import ImagePipelineOutput
-from opt_params import get_unet, get_vae, get_clip
-from tqdm.auto import tqdm
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-def preprocess(image):
-    if isinstance(image, torch.Tensor):
-        return image
-    elif isinstance(image, PIL.Image.Image):
-        image = [image]
-
-    if isinstance(image[0], PIL.Image.Image):
-        w, h = image[0].size
-        w, h = map(
-            lambda x: x - x % 64, (w, h)
-        )  # resize to integer multiple of 64
-
-        image = [np.array(i.resize((w, h)))[None, :] for i in image]
-        image = np.concatenate(image, axis=0)
-        image = np.array(image).astype(np.float32) / 255.0
-        image = image.transpose(0, 3, 1, 2)
-        image = 2.0 * image - 1.0
-        image = torch.from_numpy(image)
-    elif isinstance(image[0], torch.Tensor):
-        image = torch.cat(image, dim=0)
-    return image
-
-
-def shark_run_wrapper(model, *args):
-    np_inputs = tuple([x.detach().numpy() for x in args])
-    outputs = model("forward", np_inputs)
-    return torch.from_numpy(outputs)
-
-
-class SharkStableDiffusionUpscalePipeline:
-    def __init__(
-        self,
-        model_id,
-    ):
-        self.tokenizer = CLIPTokenizer.from_pretrained(
-            model_id, subfolder="tokenizer"
-        )
-        self.low_res_scheduler = DDPMScheduler.from_pretrained(
-            model_id,
-            subfolder="scheduler",
-        )
-        self.scheduler = DDIMScheduler.from_pretrained(
-            model_id,
-            subfolder="scheduler",
-        )
-        self.vae = get_vae()
-        self.unet = get_unet()
-        self.text_encoder = get_clip()
-        self.max_noise_level = (350,)
-        self._execution_device = "cpu"
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt,
-    ):
-        r"""
-        Encodes the prompt into text encoder hidden states.
-        Args:
-            prompt (`str` or `list(int)`):
-                prompt to be encoded
-            device: (`torch.device`):
-                torch device
-            num_images_per_prompt (`int`):
-                number of images that should be generated per prompt
-            do_classifier_free_guidance (`bool`):
-                whether to use classifier free guidance or not
-            negative_prompt (`str` or `List[str]`):
-                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
-                if `guidance_scale` is less than `1`).
-        """
-        batch_size = len(prompt) if isinstance(prompt, list) else 1
-
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-        text_input_ids = text_inputs.input_ids
-        untruncated_ids = self.tokenizer(
-            prompt, padding="longest", return_tensors="pt"
-        ).input_ids
-
-        if untruncated_ids.shape[-1] >= text_input_ids.shape[
-            -1
-        ] and not torch.equal(text_input_ids, untruncated_ids):
-            removed_text = self.tokenizer.batch_decode(
-                untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-            )
-            logger.warning(
-                "The following part of your input was truncated because CLIP can only handle sequences up to"
-                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-            )
-
-        # if (
-        # hasattr(self.text_encoder.config, "use_attention_mask")
-        # and self.text_encoder.config.use_attention_mask
-        # ):
-        # attention_mask = text_inputs.attention_mask.to(device)
-        # else:
-        # attention_mask = None
-
-        text_embeddings = shark_run_wrapper(
-            self.text_encoder, text_input_ids.to(device)
-        )
-
-        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        bs_embed, seq_len, _ = text_embeddings.shape
-        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
-        text_embeddings = text_embeddings.view(
-            bs_embed * num_images_per_prompt, seq_len, -1
-        )
-
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
-            elif type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
-
-            max_length = text_input_ids.shape[-1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-
-            # if (
-            # hasattr(self.text_encoder.config, "use_attention_mask")
-            # and self.text_encoder.config.use_attention_mask
-            # ):
-            # attention_mask = uncond_input.attention_mask.to(device)
-            # else:
-            # attention_mask = None
-
-            uncond_embeddings = shark_run_wrapper(
-                self.text_encoder,
-                uncond_input.input_ids.to(device),
-            )
-            uncond_embeddings = uncond_embeddings
-
-            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(
-                1, num_images_per_prompt, 1
-            )
-            uncond_embeddings = uncond_embeddings.view(
-                batch_size * num_images_per_prompt, seq_len, -1
-            )
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-        return text_embeddings
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
-    def prepare_extra_step_kwargs(self, generator, eta):
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-
-        accepts_eta = "eta" in set(
-            inspect.signature(self.scheduler.step).parameters.keys()
-        )
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(
-            inspect.signature(self.scheduler.step).parameters.keys()
-        )
-        if accepts_generator:
-            extra_step_kwargs["generator"] = generator
-        return extra_step_kwargs
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents with 0.18215->0.08333
-    def decode_latents(self, latents):
-        latents = 1 / 0.08333 * latents
-        image = shark_run_wrapper(self.vae, latents)
-        image = (image / 2 + 0.5).clamp(0, 1)
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-        return image
-
-    def check_inputs(self, prompt, image, noise_level, callback_steps):
-        if not isinstance(prompt, str) and not isinstance(prompt, list):
-            raise ValueError(
-                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
-            )
-
-        if (
-            not isinstance(image, torch.Tensor)
-            and not isinstance(image, PIL.Image.Image)
-            and not isinstance(image, list)
-        ):
-            raise ValueError(
-                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}"
-            )
-
-        # verify batch size of prompt and image are same if image is a list or tensor
-        if isinstance(image, list) or isinstance(image, torch.Tensor):
-            if isinstance(prompt, str):
-                batch_size = 1
-            else:
-                batch_size = len(prompt)
-            if isinstance(image, list):
-                image_batch_size = len(image)
-            else:
-                image_batch_size = image.shape[0]
-            if batch_size != image_batch_size:
-                raise ValueError(
-                    f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}."
-                    " Please make sure that passed `prompt` matches the batch size of `image`."
-                )
-
-    @staticmethod
-    def numpy_to_pil(images):
-        """
-        Convert a numpy image or a batch of images to a PIL image.
-        """
-        if images.ndim == 3:
-            images = images[None, ...]
-        images = (images * 255).round().astype("uint8")
-        if images.shape[-1] == 1:
-            # special case for grayscale (single channel) images
-            pil_images = [
-                Image.fromarray(image.squeeze(), mode="L") for image in images
-            ]
-        else:
-            pil_images = [Image.fromarray(image) for image in images]
-
-        return pil_images
-
-    def prepare_latents(
-        self,
-        batch_size,
-        num_channels_latents,
-        height,
-        width,
-        dtype,
-        device,
-        generator,
-        latents=None,
-    ):
-        shape = (batch_size, num_channels_latents, height, width)
-        if latents is None:
-            if device == "mps":
-                # randn does not work reproducibly on mps
-                latents = torch.randn(
-                    shape, generator=generator, device="cpu", dtype=dtype
-                ).to(device)
-            else:
-                latents = torch.randn(
-                    shape, generator=generator, device=device, dtype=dtype
-                )
-        else:
-            if latents.shape != shape:
-                raise ValueError(
-                    f"Unexpected latents shape, got {latents.shape}, expected {shape}"
-                )
-            latents = latents.to(device)
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
-        return latents
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]],
-        image: Union[
-            torch.FloatTensor, PIL.Image.Image, List[PIL.Image.Image]
-        ],
-        num_inference_steps: int = 75,
-        guidance_scale: float = 9.0,
-        noise_level: int = 20,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        eta: float = 0.0,
-        generator: Optional[
-            Union[torch.Generator, List[torch.Generator]]
-        ] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback: Optional[
-            Callable[[int, int, torch.FloatTensor], None]
-        ] = None,
-        callback_steps: Optional[int] = 1,
-    ):
-        # 1. Check inputs
-        self.check_inputs(prompt, image, noise_level, callback_steps)
-
-        # 2. Define call parameters
-        batch_size = 1 if isinstance(prompt, str) else len(prompt)
-        device = self._execution_device
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-
-        # 3. Encode input prompt
-        text_embeddings = self._encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-        )
-
-        # 4. Preprocess image
-        image = preprocess(image)
-        image = image.to(dtype=text_embeddings.dtype, device=device)
-
-        # 5. set timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps = self.scheduler.timesteps
-
-        # 5. Add noise to image
-        noise_level = torch.tensor(
-            [noise_level], dtype=torch.long, device=device
-        )
-        if device == "mps":
-            # randn does not work reproducibly on mps
-            noise = torch.randn(
-                image.shape,
-                generator=generator,
-                device="cpu",
-                dtype=text_embeddings.dtype,
-            ).to(device)
-        else:
-            noise = torch.randn(
-                image.shape,
-                generator=generator,
-                device=device,
-                dtype=text_embeddings.dtype,
-            )
-        image = self.low_res_scheduler.add_noise(image, noise, noise_level)
-
-        batch_multiplier = 2 if do_classifier_free_guidance else 1
-        image = torch.cat([image] * batch_multiplier * num_images_per_prompt)
-        noise_level = torch.cat([noise_level] * image.shape[0])
-
-        # 6. Prepare latent variables
-        height, width = image.shape[2:]
-        # num_channels_latents = self.vae.config.latent_channels
-        num_channels_latents = 4
-        latents = self.prepare_latents(
-            batch_size * num_images_per_prompt,
-            num_channels_latents,
-            height,
-            width,
-            text_embeddings.dtype,
-            device,
-            generator,
-            latents,
-        )
-
-        # 7. Check that sizes of image and latents match
-        num_channels_image = image.shape[1]
-        # if (
-        # num_channels_latents + num_channels_image
-        # != self.unet.config.in_channels
-        # ):
-        # raise ValueError(
-        # f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
-        # f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
-        # f" `num_channels_image`: {num_channels_image} "
-        # f" = {num_channels_latents+num_channels_image}. Please verify the config of"
-        # " `pipeline.unet` or your `image` input."
-        # )
-
-        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
-        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
-
-        # 9. Denoising loop
-        num_warmup_steps = (
-            len(timesteps) - num_inference_steps * self.scheduler.order
-        )
-        for i, t in tqdm(enumerate(timesteps)):
-            # expand the latents if we are doing classifier free guidance
-            latent_model_input = (
-                torch.cat([latents] * 2)
-                if do_classifier_free_guidance
-                else latents
-            )
-
-            # concat latents, mask, masked_image_latents in the channel dimension
-            latent_model_input = self.scheduler.scale_model_input(
-                latent_model_input, t
-            )
-            latent_model_input = torch.cat([latent_model_input, image], dim=1)
-
-            timestep = torch.tensor([t]).to(torch.float32)
-
-            # predict the noise residual
-            noise_pred = shark_run_wrapper(
-                self.unet,
-                latent_model_input.half(),
-                timestep,
-                text_embeddings.half(),
-                noise_level,
-            )
-
-            # perform guidance
-            if do_classifier_free_guidance:
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-            noise_pred = noise_pred_uncond + guidance_scale * (
-                noise_pred_text - noise_pred_uncond
-            )
-
-            # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(
-                noise_pred, t, latents, **extra_step_kwargs
-            ).prev_sample
-
-            # # call the callback, if provided
-            # if i == len(timesteps) - 1 or (
-            # (i + 1) > num_warmup_steps
-            # and (i + 1) % self.scheduler.order == 0
-            # ):
-            # progress_bar.update()
-            # if callback is not None and i % callback_steps == 0:
-            # callback(i, t, latents)
-
-        # 10. Post-processing
-        # make sure the VAE is in float32 mode, as it overflows in float16
-        # self.vae.to(dtype=torch.float32)
-        image = self.decode_latents(latents.float())
-
-        # 11. Convert to PIL
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
-
-        if not return_dict:
-            return (image,)
-
-        return ImagePipelineOutput(images=image)
--- a/shark/examples/shark_inference/upscaler/upscaler_args.py
+++ b/shark/examples/shark_inference/upscaler/upscaler_args.py
@@ -1,98 +0,0 @@
-import argparse
-
-p = argparse.ArgumentParser(
-    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
-)
-
-##############################################################################
-### Stable Diffusion Params
-##############################################################################
-
-p.add_argument(
-    "--prompts",
-    nargs="+",
-    default=["cyberpunk forest by Salvador Dali"],
-    help="text of which images to be generated.",
-)
-
-p.add_argument(
-    "--negative-prompts",
-    nargs="+",
-    default=[""],
-    help="text you don't want to see in the generated image.",
-)
-
-p.add_argument(
-    "--steps",
-    type=int,
-    default=50,
-    help="the no. of steps to do the sampling.",
-)
-
-p.add_argument(
-    "--seed",
-    type=int,
-    default=42,
-    help="the seed to use.",
-)
-
-p.add_argument(
-    "--guidance_scale",
-    type=float,
-    default=7.5,
-    help="the value to be used for guidance scaling.",
-)
-
-##############################################################################
-### Model Config and Usage Params
-##############################################################################
-
-p.add_argument(
-    "--device", type=str, default="vulkan", help="device to run the model."
-)
-
-p.add_argument(
-    "--precision", type=str, default="fp16", help="precision to run the model."
-)
-
-p.add_argument(
-    "--import_mlir",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="imports the model from torch module to shark_module otherwise downloads the model from shark_tank.",
-)
-
-p.add_argument(
-    "--load_vmfb",
-    default=True,
-    action=argparse.BooleanOptionalAction,
-    help="attempts to load the model from a precompiled flatbuffer and compiles + saves it if not found.",
-)
-
-p.add_argument(
-    "--save_vmfb",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="saves the compiled flatbuffer to the local directory",
-)
-
-##############################################################################
-### IREE - Vulkan supported flags
-##############################################################################
-
-p.add_argument(
-    "--iree-vulkan-target-triple",
-    type=str,
-    default="",
-    help="Specify target triple for vulkan",
-)
-
-p.add_argument(
-    "--vulkan_debug_utils",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="Profiles vulkan device and collects the .rdc info",
-)
-
-
-args = p.parse_args()
--- a/shark/examples/shark_inference/upscaler/utils.py
+++ b/shark/examples/shark_inference/upscaler/utils.py
@@ -1,230 +0,0 @@
-import os
-import torch
-from shark.shark_inference import SharkInference
-from upscaler_args import args
-from shark.shark_importer import import_with_fx
-from shark.iree_utils.vulkan_utils import (
-    set_iree_vulkan_runtime_flags,
-    get_vulkan_target_triple,
-    get_iree_vulkan_runtime_flags,
-)
-
-
-def _compile_module(shark_module, model_name, extra_args=[]):
-    if args.load_vmfb or args.save_vmfb:
-        device = (
-            args.device
-            if "://" not in args.device
-            else "-".join(args.device.split("://"))
-        )
-        extended_name = "{}_{}".format(model_name, device)
-        vmfb_path = os.path.join(os.getcwd(), extended_name + ".vmfb")
-        if args.load_vmfb and os.path.isfile(vmfb_path) and not args.save_vmfb:
-            print(f"loading existing vmfb from: {vmfb_path}")
-            shark_module.load_module(vmfb_path, extra_args=extra_args)
-        else:
-            if args.save_vmfb:
-                print("Saving to {}".format(vmfb_path))
-            else:
-                print(
-                    "No vmfb found. Compiling and saving to {}".format(
-                        vmfb_path
-                    )
-                )
-            path = shark_module.save_module(
-                os.getcwd(), extended_name, extra_args
-            )
-            shark_module.load_module(path, extra_args=extra_args)
-    else:
-        shark_module.compile(extra_args)
-    return shark_module
-
-
-# Downloads the model from shark_tank and returns the shark_module.
-def get_shark_model(tank_url, model_name, extra_args=[]):
-    from shark.shark_downloader import download_model
-    from shark.parser import shark_args
-
-    # Set local shark_tank cache directory.
-    # shark_args.local_tank_cache = args.local_tank_cache
-
-    mlir_model, func_name, inputs, golden_out = download_model(
-        model_name,
-        tank_url=tank_url,
-        frontend="torch",
-    )
-    shark_module = SharkInference(
-        mlir_model, device=args.device, mlir_dialect="linalg"
-    )
-    return _compile_module(shark_module, model_name, extra_args)
-
-
-# Converts the torch-module into a shark_module.
-def compile_through_fx(
-    model, inputs, model_name, is_f16=False, f16_input_mask=None, extra_args=[]
-):
-    mlir_module, func_name = import_with_fx(
-        model, inputs, is_f16, f16_input_mask
-    )
-    shark_module = SharkInference(
-        mlir_module,
-        device=args.device,
-        mlir_dialect="linalg",
-    )
-
-    return _compile_module(shark_module, model_name, extra_args)
-
-
-def set_iree_runtime_flags():
-    vulkan_runtime_flags = get_iree_vulkan_runtime_flags()
-    if args.enable_rgp:
-        vulkan_runtime_flags += [
-            f"--enable_rgp=true",
-            f"--vulkan_debug_utils=true",
-        ]
-    set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)
-
-
-def get_all_devices(driver_name):
-    """
-    Inputs: driver_name
-    Returns a list of all the available devices for a given driver sorted by
-    the iree path names of the device as in --list_devices option in iree.
-    """
-    from iree.runtime import get_driver
-
-    driver = get_driver(driver_name)
-    device_list_src = driver.query_available_devices()
-    device_list_src.sort(key=lambda d: d["path"])
-    return device_list_src
-
-
-def get_device_mapping(driver, key_combination=3):
-    """This method ensures consistent device ordering when choosing
-    specific devices for execution
-    Args:
-        driver (str): execution driver (vulkan, cuda, rocm, etc)
-        key_combination (int, optional): choice for mapping value for device name.
-        1 : path
-        2 : name
-        3 : (name, path)
-        Defaults to 3.
-    Returns:
-        dict: map to possible device names user can input mapped to desired combination of name/path.
-    """
-    from shark.iree_utils._common import iree_device_map
-
-    driver = iree_device_map(driver)
-    device_list = get_all_devices(driver)
-    device_map = dict()
-
-    def get_output_value(dev_dict):
-        if key_combination == 1:
-            return f"{driver}://{dev_dict['path']}"
-        if key_combination == 2:
-            return dev_dict["name"]
-        if key_combination == 3:
-            return (dev_dict["name"], f"{driver}://{dev_dict['path']}")
-
-    # mapping driver name to default device (driver://0)
-    device_map[f"{driver}"] = get_output_value(device_list[0])
-    for i, device in enumerate(device_list):
-        # mapping with index
-        device_map[f"{driver}://{i}"] = get_output_value(device)
-        # mapping with full path
-        device_map[f"{driver}://{device['path']}"] = get_output_value(device)
-    return device_map
-
-
-def map_device_to_name_path(device, key_combination=3):
-    """Gives the appropriate device data (supported name/path) for user selected execution device
-    Args:
-        device (str): user
-        key_combination (int, optional): choice for mapping value for device name.
-        1 : path
-        2 : name
-        3 : (name, path)
-        Defaults to 3.
-    Raises:
-        ValueError:
-    Returns:
-        str / tuple: returns the mapping str or tuple of mapping str for the device depending on key_combination value
-    """
-    driver = device.split("://")[0]
-    device_map = get_device_mapping(driver, key_combination)
-    try:
-        device_mapping = device_map[device]
-    except KeyError:
-        raise ValueError(f"Device '{device}' is not a valid device.")
-    return device_mapping
-
-
-def set_init_device_flags():
-    if "vulkan" in args.device:
-        # set runtime flags for vulkan.
-        set_iree_runtime_flags()
-
-        # set triple flag to avoid multiple calls to get_vulkan_triple_flag
-        device_name, args.device = map_device_to_name_path(args.device)
-        if not args.iree_vulkan_target_triple:
-            triple = get_vulkan_target_triple(device_name)
-            if triple is not None:
-                args.iree_vulkan_target_triple = triple
-        print(
-            f"Found device {device_name}. Using target triple {args.iree_vulkan_target_triple}."
-        )
-    elif "cuda" in args.device:
-        args.device = "cuda"
-    elif "cpu" in args.device:
-        args.device = "cpu"
-
-    # set max_length based on availability.
-    if args.variant in ["anythingv3", "analogdiffusion", "dreamlike"]:
-        args.max_length = 77
-    elif args.variant == "openjourney":
-        args.max_length = 64
-
-    # use tuned models only in the case of stablediffusion/fp16 and rdna3 cards.
-    if (
-        args.variant in ["openjourney", "dreamlike"]
-        or args.precision != "fp16"
-        or "vulkan" not in args.device
-        or "rdna3" not in args.iree_vulkan_target_triple
-    ):
-        args.use_tuned = False
-        print("Tuned models are currently not supported for this setting.")
-
-    elif args.use_base_vae and args.variant != "stablediffusion":
-        args.use_tuned = False
-        print("Tuned models are currently not supported for this setting.")
-
-    if args.use_tuned:
-        print("Using tuned models for stablediffusion/fp16 and rdna3 card.")
-
-
-# Utility to get list of devices available.
-def get_available_devices():
-    def get_devices_by_name(driver_name):
-        from shark.iree_utils._common import iree_device_map
-
-        device_list = []
-        try:
-            driver_name = iree_device_map(driver_name)
-            device_list_dict = get_all_devices(driver_name)
-            print(f"{driver_name} devices are available.")
-        except:
-            print(f"{driver_name} devices are not available.")
-        else:
-            for i, device in enumerate(device_list_dict):
-                device_list.append(f"{driver_name}://{i} => {device['name']}")
-        return device_list
-
-    set_iree_runtime_flags()
-
-    available_devices = []
-    vulkan_devices = get_devices_by_name("vulkan")
-    available_devices.extend(vulkan_devices)
-    cuda_devices = get_devices_by_name("cuda")
-    available_devices.extend(cuda_devices)
-    available_devices.append("cpu")
-    return available_devices
--- a/shark/examples/shark_inference/v_diffusion.py
+++ b/shark/examples/shark_inference/v_diffusion.py
@@ -1,13 +1,11 @@
 from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_model
+from shark.shark_downloader import download_torch_model


-mlir_model, func_name, inputs, golden_out = download_model(
-    "v_diffusion", frontend="torch"
-)
+mlir_model, func_name, inputs, golden_out = download_torch_model("v_diffusion")

 shark_module = SharkInference(
-    mlir_model, device="vulkan", mlir_dialect="linalg"
+    mlir_model, func_name, device="vulkan", mlir_dialect="linalg"
 )
 shark_module.compile()
 result = shark_module.forward(inputs)
--- a/shark/examples/shark_training/bert_training.py
+++ b/shark/examples/shark_training/bert_training.py
@@ -1,7 +1,7 @@
 import torch
-from torch.nn.utils import stateless
+from torch.nn.utils import _stateless
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
-from shark.shark_trainer import SharkTrainer
+from shark.shark_runner import SharkTrainer


 class MiniLMSequenceClassification(torch.nn.Module):
@@ -33,7 +33,7 @@ inp = (torch.randint(2, (1, 128)),)

 def forward(params, buffers, args):
    params_and_buffers = {**params, **buffers}
-    stateless.functional_call(
+    _stateless.functional_call(
        mod, params_and_buffers, args, {}
    ).sum().backward()
    optim = torch.optim.SGD(get_sorted_params(params), lr=0.01)
@@ -42,7 +42,6 @@ def forward(params, buffers, args):
    return params, buffers


-shark_module = SharkTrainer(mod, inp)
-shark_module.compile(forward)
-shark_module.train(num_iters=2)
-print("training done")
+shark_module = SharkTrainer(mod, inp, custom_inference_fn=forward)
+
+print(shark_module.forward())
--- a/shark/examples/shark_training/bert_training_tf.py
+++ b/shark/examples/shark_training/bert_training_tf.py
@@ -52,8 +52,7 @@ class BertModule(tf.Module):
        input_signature=[
            bert_input,  # inputs
            tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32),  # labels
-        ],
-        jit_compile=True,
+        ]
    )
    def forward(self, inputs, labels):
        with tf.GradientTape() as tape:
--- a/shark/examples/shark_training/stable-diffusion-img2img/README.md
+++ b/shark/examples/shark_training/stable-diffusion-img2img/README.md
@@ -1,41 +0,0 @@
-# Stable Diffusion Img2Img model
-
-## Installation
-
-<details>
-  <summary>Installation (Linux)</summary>
-
-### Activate shark.venv Virtual Environment
-
-```shell
-source shark.venv/bin/activate
-
-# Some older pip installs may not be able to handle the recent PyTorch deps
-python -m pip install --upgrade pip
-```
-
-### Install dependencies
-
-# Run the setup.sh script
-
-```shell
-./setup.sh
-```
-
-### Run the Stable diffusion Img2Img model
-
-To run the model with the default set of images and params, run:
-```shell
-python stable_diffusion_img2img.py
-```
-To run the model with your set of images, and parameters you need to specify the following params:
-1.) Input images directory with the arg `--input_dir` containing 3-5 images.
-2.) What to teach the model? Using the arg `--what_to_teach`, allowed values are `object` or `style`.
-3.) Placeholder token using the arg `--placeholder_token`, that represents your new concept. It should be passed with the opening and closing angle brackets. For ex: token is `cat-toy`, it should be passed as `<cat-toy>`.
-4.) Initializer token using the arg `--initializer_token`, which summarise what is your new concept.
-
-For the result, you need to pass the text prompt with the arg: `--prompt`. The prompt string should contain a "*s" in it, which will be replaced by the placeholder token during the inference.
-
-By default the result images will go into the `sd_result` dir. To specify your output dir use the arg: `--output_dir`.
-
-The default value of max_training_steps is `3000`, which takes some hours to complete. You can pass the smaller value with the arg `--training_steps`. Specify the number of images to be sampled for the result with the `--num_inference_samples` arg.
--- a/shark/examples/shark_training/stable-diffusion-img2img/setup.sh
+++ b/shark/examples/shark_training/stable-diffusion-img2img/setup.sh
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-TD="$(cd $(dirname $0) && pwd)"
-if [ -z "$PYTHON" ]; then
-  PYTHON="$(which python3)"
-fi
-
-function die() {
-  echo "Error executing command: $*"
-  exit 1
-}
-
-PYTHON_VERSION_X_Y=`${PYTHON} -c 'import sys; version=sys.version_info[:2]; print("{0}.{1}".format(*version))'`
-
-echo "Python: $PYTHON"
-echo "Python version: $PYTHON_VERSION_X_Y"
-
-mkdir input_images
-
-wget https://huggingface.co/datasets/valhalla/images/resolve/main/2.jpeg -P input_images/
-wget https://huggingface.co/datasets/valhalla/images/resolve/main/3.jpeg -P input_images/
-wget https://huggingface.co/datasets/valhalla/images/resolve/main/5.jpeg -P input_images/
-wget https://huggingface.co/datasets/valhalla/images/resolve/main/6.jpeg -P input_images/
-
-pip install diffusers["training"]==0.4.1 transformers ftfy opencv-python
--- a/shark/examples/shark_training/stable-diffusion-img2img/stable_diffusion_img2img.py
+++ b/shark/examples/shark_training/stable-diffusion-img2img/stable_diffusion_img2img.py
@@ -1,600 +0,0 @@
-# Textual-inversion fine-tuning for Stable Diffusion using diffusers
-# This script shows how to "teach" Stable Diffusion a new concept via
-# textual-inversion using 🤗 Hugging Face [🧨 Diffusers library](https://github.com/huggingface/diffusers).
-# By using just 3-5 images you can teach new concepts to Stable Diffusion
-# and personalize the model on your own images.
-
-import argparse
-import itertools
-import math
-import os
-import random
-import cv2
-
-import numpy as np
-import torch
-import torch.nn.functional as F
-import torch.utils.checkpoint
-from torch.utils.data import Dataset
-
-import PIL
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from diffusers import (
-    AutoencoderKL,
-    DDPMScheduler,
-    PNDMScheduler,
-    StableDiffusionPipeline,
-    UNet2DConditionModel,
-)
-from diffusers.hub_utils import init_git_repo, push_to_hub
-from diffusers.optimization import get_scheduler
-from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
-from PIL import Image
-from torchvision import transforms
-from tqdm.auto import tqdm
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
-
-YOUR_TOKEN = "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"
-
-p = argparse.ArgumentParser(
-    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
-)
-p.add_argument(
-    "--input_dir",
-    type=str,
-    default="input_images/",
-    help="the directory contains the images used for fine tuning",
-)
-p.add_argument(
-    "--output_dir",
-    type=str,
-    default="sd_result",
-    help="the directory contains the images used for fine tuning",
-)
-p.add_argument(
-    "--training_steps",
-    type=int,
-    default=3000,
-    help="the maximum number of training steps",
-)
-p.add_argument("--seed", type=int, default=42, help="the random seed")
-p.add_argument(
-    "--what_to_teach",
-    type=str,
-    choices=["object", "style"],
-    default="object",
-    help="what is it that you are teaching?",
-)
-p.add_argument(
-    "--placeholder_token",
-    type=str,
-    default="<cat-toy>",
-    help="It is the token you are going to use to represent your new concept",
-)
-p.add_argument(
-    "--initializer_token",
-    type=str,
-    default="toy",
-    help="It is a word that can summarise what is your new concept",
-)
-p.add_argument(
-    "--inference_steps",
-    type=int,
-    default=50,
-    help="the number of steps for inference",
-)
-p.add_argument(
-    "--num_inference_samples",
-    type=int,
-    default=4,
-    help="the number of samples for inference",
-)
-p.add_argument(
-    "--prompt",
-    type=str,
-    default="a grafitti in a wall with a *s on it",
-    help="the text prompt to use",
-)
-args = p.parse_args()
-
-if "*s" not in args.prompt:
-    raise ValueError(
-        f'The prompt should have a "*s" which will be replaced by a placeholder token.'
-    )
-
-prompt1, prompt2 = args.prompt.split("*s")
-args.prompt = prompt1 + args.placeholder_token + prompt2
-
-pretrained_model_name_or_path = "CompVis/stable-diffusion-v1-4"
-
-# Load input images.
-images = []
-for filename in os.listdir(args.input_dir):
-    img = cv2.imread(os.path.join(args.input_dir, filename))
-    if img is not None:
-        images.append(img)
-
-# Setup the prompt templates for training
-imagenet_templates_small = [
-    "a photo of a {}",
-    "a rendering of a {}",
-    "a cropped photo of the {}",
-    "the photo of a {}",
-    "a photo of a clean {}",
-    "a photo of a dirty {}",
-    "a dark photo of the {}",
-    "a photo of my {}",
-    "a photo of the cool {}",
-    "a close-up photo of a {}",
-    "a bright photo of the {}",
-    "a cropped photo of a {}",
-    "a photo of the {}",
-    "a good photo of the {}",
-    "a photo of one {}",
-    "a close-up photo of the {}",
-    "a rendition of the {}",
-    "a photo of the clean {}",
-    "a rendition of a {}",
-    "a photo of a nice {}",
-    "a good photo of a {}",
-    "a photo of the nice {}",
-    "a photo of the small {}",
-    "a photo of the weird {}",
-    "a photo of the large {}",
-    "a photo of a cool {}",
-    "a photo of a small {}",
-]
-
-imagenet_style_templates_small = [
-    "a painting in the style of {}",
-    "a rendering in the style of {}",
-    "a cropped painting in the style of {}",
-    "the painting in the style of {}",
-    "a clean painting in the style of {}",
-    "a dirty painting in the style of {}",
-    "a dark painting in the style of {}",
-    "a picture in the style of {}",
-    "a cool painting in the style of {}",
-    "a close-up painting in the style of {}",
-    "a bright painting in the style of {}",
-    "a cropped painting in the style of {}",
-    "a good painting in the style of {}",
-    "a close-up painting in the style of {}",
-    "a rendition in the style of {}",
-    "a nice painting in the style of {}",
-    "a small painting in the style of {}",
-    "a weird painting in the style of {}",
-    "a large painting in the style of {}",
-]
-
-
-# Setup the dataset
-class TextualInversionDataset(Dataset):
-    def __init__(
-        self,
-        data_root,
-        tokenizer,
-        learnable_property="object",  # [object, style]
-        size=512,
-        repeats=100,
-        interpolation="bicubic",
-        flip_p=0.5,
-        set="train",
-        placeholder_token="*",
-        center_crop=False,
-    ):
-        self.data_root = data_root
-        self.tokenizer = tokenizer
-        self.learnable_property = learnable_property
-        self.size = size
-        self.placeholder_token = placeholder_token
-        self.center_crop = center_crop
-        self.flip_p = flip_p
-
-        self.image_paths = [
-            os.path.join(self.data_root, file_path)
-            for file_path in os.listdir(self.data_root)
-        ]
-
-        self.num_images = len(self.image_paths)
-        self._length = self.num_images
-
-        if set == "train":
-            self._length = self.num_images * repeats
-
-        self.interpolation = {
-            "linear": PIL.Image.LINEAR,
-            "bilinear": PIL.Image.BILINEAR,
-            "bicubic": PIL.Image.BICUBIC,
-            "lanczos": PIL.Image.LANCZOS,
-        }[interpolation]
-
-        self.templates = (
-            imagenet_style_templates_small
-            if learnable_property == "style"
-            else imagenet_templates_small
-        )
-        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
-
-    def __len__(self):
-        return self._length
-
-    def __getitem__(self, i):
-        example = {}
-        image = Image.open(self.image_paths[i % self.num_images])
-
-        if not image.mode == "RGB":
-            image = image.convert("RGB")
-
-        placeholder_string = self.placeholder_token
-        text = random.choice(self.templates).format(placeholder_string)
-
-        example["input_ids"] = self.tokenizer(
-            text,
-            padding="max_length",
-            truncation=True,
-            max_length=self.tokenizer.model_max_length,
-            return_tensors="pt",
-        ).input_ids[0]
-
-        # default to score-sde preprocessing
-        img = np.array(image).astype(np.uint8)
-
-        if self.center_crop:
-            crop = min(img.shape[0], img.shape[1])
-            (
-                h,
-                w,
-            ) = (
-                img.shape[0],
-                img.shape[1],
-            )
-            img = img[
-                (h - crop) // 2 : (h + crop) // 2,
-                (w - crop) // 2 : (w + crop) // 2,
-            ]
-
-        image = Image.fromarray(img)
-        image = image.resize(
-            (self.size, self.size), resample=self.interpolation
-        )
-
-        image = self.flip_transform(image)
-        image = np.array(image).astype(np.uint8)
-        image = (image / 127.5 - 1.0).astype(np.float32)
-
-        example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
-        return example
-
-
-# Setting up the model
-# Load the tokenizer and add the placeholder token as a additional special token.
-# Please read and if you agree accept the LICENSE
-# [here](https://huggingface.co/CompVis/stable-diffusion-v1-4) if you see an error
-tokenizer = CLIPTokenizer.from_pretrained(
-    pretrained_model_name_or_path,
-    subfolder="tokenizer",
-    use_auth_token=YOUR_TOKEN,
-)
-
-# Add the placeholder token in tokenizer
-num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
-if num_added_tokens == 0:
-    raise ValueError(
-        f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
-        " `placeholder_token` that is not already in the tokenizer."
-    )
-
-# Get token ids for our placeholder and initializer token.
-# This code block will complain if initializer string is not a single token
-# Convert the initializer_token, placeholder_token to ids
-token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
-# Check if initializer_token is a single token or a sequence of tokens
-if len(token_ids) > 1:
-    raise ValueError("The initializer token must be a single token.")
-
-initializer_token_id = token_ids[0]
-placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)
-
-# Load the Stable Diffusion model
-# Load models and create wrapper for stable diffusion
-text_encoder = CLIPTextModel.from_pretrained(
-    pretrained_model_name_or_path,
-    subfolder="text_encoder",
-    use_auth_token=YOUR_TOKEN,
-)
-vae = AutoencoderKL.from_pretrained(
-    pretrained_model_name_or_path,
-    subfolder="vae",
-    use_auth_token=YOUR_TOKEN,
-)
-unet = UNet2DConditionModel.from_pretrained(
-    pretrained_model_name_or_path,
-    subfolder="unet",
-    use_auth_token=YOUR_TOKEN,
-)
-
-# We have added the `placeholder_token` in the `tokenizer` so we resize the token embeddings here,
-#  this will a new embedding vector in the token embeddings for our `placeholder_token`
-text_encoder.resize_token_embeddings(len(tokenizer))
-
-# Initialise the newly added placeholder token with the embeddings of the initializer token
-token_embeds = text_encoder.get_input_embeddings().weight.data
-token_embeds[placeholder_token_id] = token_embeds[initializer_token_id]
-
-# In Textual-Inversion we only train the newly added embedding vector,
-# so lets freeze rest of the model parameters here.
-
-
-def freeze_params(params):
-    for param in params:
-        param.requires_grad = False
-
-
-# Freeze vae and unet
-freeze_params(vae.parameters())
-freeze_params(unet.parameters())
-# Freeze all parameters except for the token embeddings in text encoder
-params_to_freeze = itertools.chain(
-    text_encoder.text_model.encoder.parameters(),
-    text_encoder.text_model.final_layer_norm.parameters(),
-    text_encoder.text_model.embeddings.position_embedding.parameters(),
-)
-freeze_params(params_to_freeze)
-
-# Creating our training data
-
-train_dataset = TextualInversionDataset(
-    data_root=args.input_dir,
-    tokenizer=tokenizer,
-    size=512,
-    placeholder_token=args.placeholder_token,
-    repeats=100,
-    learnable_property=args.what_to_teach,  # Option selected above between object and style
-    center_crop=False,
-    set="train",
-)
-
-
-def create_dataloader(train_batch_size=1):
-    return torch.utils.data.DataLoader(
-        train_dataset, batch_size=train_batch_size, shuffle=True
-    )
-
-
-# Create noise_scheduler for training.
-noise_scheduler = DDPMScheduler(
-    beta_start=0.00085,
-    beta_end=0.012,
-    beta_schedule="scaled_linear",
-    num_train_timesteps=1000,
-    tensor_format="pt",
-)
-
-# Define hyperparameters for our training
-hyperparameters = {
-    "learning_rate": 5e-04,
-    "scale_lr": True,
-    "max_train_steps": args.training_steps,
-    "train_batch_size": 1,
-    "gradient_accumulation_steps": 4,
-    "seed": args.seed,
-    "output_dir": "sd-concept-output",
-}
-
-
-def training_function(text_encoder, vae, unet):
-    logger = get_logger(__name__)
-
-    train_batch_size = hyperparameters["train_batch_size"]
-    gradient_accumulation_steps = hyperparameters[
-        "gradient_accumulation_steps"
-    ]
-    learning_rate = hyperparameters["learning_rate"]
-    max_train_steps = hyperparameters["max_train_steps"]
-    output_dir = hyperparameters["output_dir"]
-
-    accelerator = Accelerator(
-        gradient_accumulation_steps=gradient_accumulation_steps,
-    )
-
-    train_dataloader = create_dataloader(train_batch_size)
-
-    if hyperparameters["scale_lr"]:
-        learning_rate = (
-            learning_rate
-            * gradient_accumulation_steps
-            * train_batch_size
-            * accelerator.num_processes
-        )
-
-    # Initialize the optimizer
-    optimizer = torch.optim.AdamW(
-        text_encoder.get_input_embeddings().parameters(),  # only optimize the embeddings
-        lr=learning_rate,
-    )
-
-    text_encoder, optimizer, train_dataloader = accelerator.prepare(
-        text_encoder, optimizer, train_dataloader
-    )
-
-    # Move vae and unet to device
-    vae.to(accelerator.device)
-    unet.to(accelerator.device)
-
-    # Keep vae and unet in eval model as we don't train these
-    vae.eval()
-    unet.eval()
-
-    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
-    num_update_steps_per_epoch = math.ceil(
-        len(train_dataloader) / gradient_accumulation_steps
-    )
-    num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
-
-    # Train!
-    total_batch_size = (
-        train_batch_size
-        * accelerator.num_processes
-        * gradient_accumulation_steps
-    )
-
-    logger.info("***** Running training *****")
-    logger.info(f"  Num examples = {len(train_dataset)}")
-    logger.info(f"  Instantaneous batch size per device = {train_batch_size}")
-    logger.info(
-        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
-    )
-    logger.info(
-        f"  Gradient Accumulation steps = {gradient_accumulation_steps}"
-    )
-    logger.info(f"  Total optimization steps = {max_train_steps}")
-    # Only show the progress bar once on each machine.
-    progress_bar = tqdm(
-        range(max_train_steps), disable=not accelerator.is_local_main_process
-    )
-    progress_bar.set_description("Steps")
-    global_step = 0
-
-    for epoch in range(num_train_epochs):
-        text_encoder.train()
-        for step, batch in enumerate(train_dataloader):
-            with accelerator.accumulate(text_encoder):
-                # Convert images to latent space
-                latents = (
-                    vae.encode(batch["pixel_values"])
-                    .latent_dist.sample()
-                    .detach()
-                )
-                latents = latents * 0.18215
-
-                # Sample noise that we'll add to the latents
-                noise = torch.randn(latents.shape).to(latents.device)
-                bsz = latents.shape[0]
-                # Sample a random timestep for each image
-                timesteps = torch.randint(
-                    0,
-                    noise_scheduler.num_train_timesteps,
-                    (bsz,),
-                    device=latents.device,
-                ).long()
-
-                # Add noise to the latents according to the noise magnitude at each timestep
-                # (this is the forward diffusion process)
-                noisy_latents = noise_scheduler.add_noise(
-                    latents, noise, timesteps
-                )
-
-                # Get the text embedding for conditioning
-                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
-
-                # Predict the noise residual
-                noise_pred = unet(
-                    noisy_latents, timesteps, encoder_hidden_states
-                ).sample
-
-                loss = (
-                    F.mse_loss(noise_pred, noise, reduction="none")
-                    .mean([1, 2, 3])
-                    .mean()
-                )
-                accelerator.backward(loss)
-
-                # Zero out the gradients for all token embeddings except the newly added
-                # embeddings for the concept, as we only want to optimize the concept embeddings
-                if accelerator.num_processes > 1:
-                    grads = (
-                        text_encoder.module.get_input_embeddings().weight.grad
-                    )
-                else:
-                    grads = text_encoder.get_input_embeddings().weight.grad
-                # Get the index for tokens that we want to zero the grads for
-                index_grads_to_zero = (
-                    torch.arange(len(tokenizer)) != placeholder_token_id
-                )
-                grads.data[index_grads_to_zero, :] = grads.data[
-                    index_grads_to_zero, :
-                ].fill_(0)
-
-                optimizer.step()
-                optimizer.zero_grad()
-
-            # Checks if the accelerator has performed an optimization step behind the scenes
-            if accelerator.sync_gradients:
-                progress_bar.update(1)
-                global_step += 1
-
-            logs = {"loss": loss.detach().item()}
-            progress_bar.set_postfix(**logs)
-
-            if global_step >= max_train_steps:
-                break
-
-        accelerator.wait_for_everyone()
-
-    # Create the pipeline using using the trained modules and save it.
-    if accelerator.is_main_process:
-        pipeline = StableDiffusionPipeline(
-            text_encoder=accelerator.unwrap_model(text_encoder),
-            vae=vae,
-            unet=unet,
-            tokenizer=tokenizer,
-            scheduler=PNDMScheduler(
-                beta_start=0.00085,
-                beta_end=0.012,
-                beta_schedule="scaled_linear",
-                skip_prk_steps=True,
-            ),
-            safety_checker=StableDiffusionSafetyChecker.from_pretrained(
-                "CompVis/stable-diffusion-safety-checker"
-            ),
-            feature_extractor=CLIPFeatureExtractor.from_pretrained(
-                "openai/clip-vit-base-patch32"
-            ),
-        )
-        pipeline.save_pretrained(output_dir)
-        # Also save the newly trained embeddings
-        learned_embeds = (
-            accelerator.unwrap_model(text_encoder)
-            .get_input_embeddings()
-            .weight[placeholder_token_id]
-        )
-        learned_embeds_dict = {
-            args.placeholder_token: learned_embeds.detach().cpu()
-        }
-        torch.save(
-            learned_embeds_dict, os.path.join(output_dir, "learned_embeds.bin")
-        )
-
-
-import accelerate
-
-accelerate.notebook_launcher(
-    training_function, args=(text_encoder, vae, unet), num_processes=1
-)
-
-# Set up the pipeline
-pipe = StableDiffusionPipeline.from_pretrained(
-    hyperparameters["output_dir"],
-    # torch_dtype=torch.float16,
-)
-
-all_images = []
-for _ in range(args.num_inference_samples):
-    images = pipe(
-        [args.prompt],
-        num_inference_steps=args.inference_steps,
-        guidance_scale=7.5,
-    ).images
-    all_images.extend(images)
-
-# output_path = os.path.abspath(os.path.join(os.getcwd(), args.output_dir))
-if not os.path.isdir(args.output_dir):
-    os.mkdir(args.output_dir)
-
-[
-    image.save(f"{args.output_dir}/{i}.jpeg")
-    for i, image in enumerate(all_images)
-]
--- a/Show More
+++ b/Show More