Compare commits


3 Commits

| Author | SHA1 | Message | Date |
|--------|------|---------|------|
| Prashant Kumar | d8c9225af8 | Add unet_torch reference. (#283) (adds unet_torch reference; deletes distilbert-base-uncased_torch_test.py) | 2022-08-19 13:35:10 +05:30 |
| Prashant Kumar | 62f3573d43 | Add distilbert_torch reference. | 2022-08-19 13:34:26 +05:30 |
| Stanley Winata | b73f79be66 | Add Substantial testing for IntelGPU | 2022-08-18 21:21:43 -07:00 |
239 changed files with 22782 additions and 141 deletions


@@ -1,37 +0,0 @@
# See: https://github.com/llvm/torch-mlir/issues/1374
name: Publish releases page
on:
workflow_dispatch:
jobs:
scrape_and_publish_releases:
name: "Scrape and publish releases"
runs-on: ubuntu-latest
# Don't run this in everyone's forks.
if: github.repository == 'nod-ai/SHARK'
steps:
- name: Checking out repository
uses: actions/checkout@v2
with:
token: ${{ secrets.NODAI_INVOCATION_TOKEN }}
- name: Run scrape releases script
run: python ./build_tools/scrape_releases.py nod-ai SHARK > /tmp/index.html
shell: bash
- run: git fetch --all
- run: git switch github-pages
- run: git config --global user.email "none@none.com"
- run: git config --global user.name "nod-team"
- run: mv /tmp/index.html package-index/index.html
- run: git add package-index/index.html
# Only try to make a commit if the file has changed.
- run: git diff --cached --exit-code || git commit -m "Update releases."
- name: GitHub Push
uses: ad-m/github-push-action@v0.6.0
with:
github_token: ${{ secrets.NODAI_INVOCATION_TOKEN }}
branch: github-pages


@@ -16,7 +16,6 @@ jobs:
fail-fast: false
matrix:
python-version: ["3.10"]
backend: [IREE, SHARK]
steps:
- uses: actions/checkout@v3
@@ -39,10 +38,6 @@ jobs:
tag_name="${package_version}"
echo "package_version=${package_version}" >> $GITHUB_ENV
echo "tag_name=${tag_name}" >> $GITHUB_ENV
- name: Set Environment Variables
run: |
echo "SHORT_SHA=`git rev-parse --short=4 HEAD`" >> $GITHUB_ENV
echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
- name: Create Release
id: create_release
uses: actions/create-release@v1
@@ -54,18 +49,12 @@ jobs:
body: |
Automatic snapshot release of nod.ai SHARK.
draft: true
prerelease: false
- name: Find Torch-MLIR Release
run: |
TM_HTML_URL="$(python3 -c "import urllib.request, json, sys; u=json.loads(urllib.request.urlopen('https://api.github.com/repos/llvm/torch-mlir/releases/latest').read().decode()).get('html_url', False); print(u) if u else sys.exit(1);")"
TM_RELEASE_DIR=${TM_HTML_URL/"tag"/"expanded_assets"}
echo "TM_RELEASE_DIR=${TM_RELEASE_DIR}" >> $GITHUB_ENV
prerelease: false
- name: Install dependencies
run: |
echo "Torch-MLIR Release DIR is ${{ env.TM_RELEASE_DIR }}"
python -m pip install --upgrade pip
python -m pip install flake8 pytest toml
if [ -f requirements.txt ]; then pip install -r requirements.txt -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/nod-ai/SHARK-Runtime/releases; fi
if [ -f requirements.txt ]; then pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/SHARK-Runtime/releases; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
@@ -73,19 +62,46 @@ jobs:
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude shark.venv,lit.cfg.py
- name: Build and validate the IREE package
if: ${{ matrix.backend == 'IREE' }}
run: |
cd $GITHUB_WORKSPACE
USE_IREE=1 VENV_DIR=iree.venv ./setup_venv.sh
source iree.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/iree-org/iree/releases
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://github.com/llvm/torch-mlir/releases -f https://github.com/iree-org/iree/releases
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models
/bin/bash "$GITHUB_WORKSPACE/build_tools/populate_sharktank_ci.sh"
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" tank/test_models.py |
pytest -k 'cpu' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
tail -n 1 |
tee -a pytest_results.txt
pytest -k 'gpu' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
tail -n 1 |
tee -a pytest_results.txt
pytest -k 'vulkan' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
tail -n 1 |
tee -a pytest_results.txt
rm -rf ./wheelhouse/nodai*
- name: Build and validate the SHARK Runtime package
run: |
cd $GITHUB_WORKSPACE
./setup_venv.sh
source shark.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/SHARK-Runtime/releases
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models
pytest -k 'cpu' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
tail -n 1 |
tee -a pytest_results.txt
pytest -k 'gpu' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
tail -n 1 |
tee -a pytest_results.txt
pytest -k 'vulkan' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
tail -n 1 |
tee -a pytest_results.txt
if !(grep -Fxq " failed" pytest_results.txt)
@@ -94,36 +110,20 @@ jobs:
gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/$SHA
gsutil -m cp -r gs://shark_tank/$SHA/* gs://shark_tank/latest/
fi
rm pytest_results.txt
rm -rf ./wheelhouse/nodai*
- name: Build and validate the SHARK Runtime package
if: ${{ matrix.backend == 'SHARK' }}
run: |
cd $GITHUB_WORKSPACE
./setup_venv.sh
source shark.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/nod-ai/SHARK-Runtime/releases
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" tank/test_models.py |
tail -n 1 |
tee -a pytest_results.txt
- name: Upload Release Assets
if: ${{ matrix.backend == 'SHARK' }}
id: upload-release-assets
uses: dwenegar/upload-release-assets@v1
env:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:
release_id: ${{ steps.create_release.outputs.id }}
assets_path: ${GITHUB_WORKSPACE}/wheelhouse/nodai_*.whl
assets_path: ./wheelhouse/nodai_*.whl
- name: Publish Release
if: ${{ matrix.backend == 'SHARK' }}
id: publish_release
uses: eregon/publish-release@v1
env:


@@ -15,8 +15,8 @@ jobs:
strategy:
fail-fast: true
matrix:
os: [icelake, a100, MacStudio, ubuntu-latest]
suite: [cpu,cuda,vulkan]
os: [a100, MacStudio, ubuntu-latest]
suite: [cpu,gpu,vulkan]
python-version: ["3.10"]
include:
- os: ubuntu-latest
@@ -25,21 +25,15 @@ jobs:
- os: ubuntu-latest
suite: vulkan
- os: ubuntu-latest
suite: cuda
suite: gpu
- os: ubuntu-latest
suite: cpu
- os: MacStudio
suite: cuda
suite: gpu
- os: MacStudio
suite: cpu
- os: MacStudio
suite: vulkan
- os: icelake
suite: vulkan
- os: icelake
suite: cuda
- os: a100
suite: cpu
runs-on: ${{ matrix.os }}
@@ -52,13 +46,13 @@ jobs:
echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
- name: Set up Python Version File ${{ matrix.python-version }}
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest' || matrix.os == 'icelake'
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest'
run: |
# See https://github.com/actions/setup-python/issues/433
echo ${{ matrix.python-version }} >> $GITHUB_WORKSPACE/.python-version
- name: Set up Python ${{ matrix.python-version }}
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest' || matrix.os == 'icelake'
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest'
uses: actions/setup-python@v4
with:
python-version: '${{ matrix.python-version }}'
@@ -84,30 +78,27 @@ jobs:
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude lit.cfg.py
- name: Validate Models on CPU
- name: Validate CPU Models
if: matrix.suite == 'cpu'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k cpu
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv
pytest -k 'cpu' --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py
- name: Validate Models on NVIDIA GPU
if: matrix.suite == 'cuda'
- name: Validate GPU Models
if: matrix.suite == 'gpu'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k cuda
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
pytest --benchmark -k "gpu" --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_gpu_${SHORT_SHA}.csv
- name: Validate Vulkan Models
if: matrix.suite == 'vulkan'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
source shark.venv/bin/activate
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k vulkan
pytest -k 'vulkan' --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py

.gitmodules (vendored, new file, 4 lines)

@@ -0,0 +1,4 @@
[submodule "inference/thirdparty/shark-runtime"]
path = inference/thirdparty/shark-runtime
url = https://github.com/nod-ai/SHARK-Runtime.git
branch = shark-06032022

.style.yapf (new file, 3 lines)

@@ -0,0 +1,3 @@
[style]
based_on_style = google
column_limit = 80

README.md (new file, 392 lines)

@@ -0,0 +1,392 @@
# SHARK
High Performance Machine Learning and Data Analytics for CPUs, GPUs, Accelerators and Heterogeneous Clusters
[![Nightly Release](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml)
[![Validate torch-models on Shark Runtime](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml)
## Communication Channels
* [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real-time discussions with the SHARK team and other users
* [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bugs, etc.
## Installation
<details>
<summary>Installation (Linux and macOS)</summary>
### Set up a new pip virtual environment
This step sets up a new virtual environment for Python.
```shell
python --version # Check that you have Python 3.7-3.10 on Linux or 3.10 on macOS
python -m venv shark_venv
source shark_venv/bin/activate
# If you are using conda, create and activate a new conda env instead
# Some older pip installs may not be able to handle the recent PyTorch deps
python -m pip install --upgrade pip
```
*macOS Metal* users: please install the Vulkan SDK from https://sdk.lunarg.com/sdk/download/latest/mac/vulkan-sdk.dmg and enable "System wide install".
### Install SHARK
This step pip-installs SHARK and related packages (Python 3.7-3.10 on Linux; Python 3.10 on macOS).
```shell
pip install nodai-shark -f https://github.com/nod-ai/SHARK/releases -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/shark-runtime/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
```
If you are on an Intel macOS machine you need this [workaround](https://github.com/nod-ai/SHARK/issues/102) for an upstream issue.
### Download and run Resnet50 sample
```shell
curl -O https://raw.githubusercontent.com/nod-ai/SHARK/main/shark/examples/shark_inference/resnet50_script.py
#Install deps for test script
pip install --pre torch torchvision torchaudio tqdm pillow --extra-index-url https://download.pytorch.org/whl/nightly/cpu
python ./resnet50_script.py --device="cpu" #use cuda or vulkan or metal
```
### Download and run BERT (MiniLM) sample
```shell
curl -O https://raw.githubusercontent.com/nod-ai/SHARK/main/shark/examples/shark_inference/minilm_jit.py
#Install deps for test script
pip install transformers torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu
python ./minilm_jit.py --device="cpu" #use cuda or vulkan or metal
```
</details>
<details>
<summary>Source Installation</summary>
## Check out the code
```shell
git clone https://github.com/nod-ai/SHARK.git
```
## Set up your Python virtual environment and dependencies
```shell
# Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...).
./setup_venv.sh
source shark.venv/bin/activate
```
For example, if you want to use Python 3.10 and upstream IREE with the TF import tools, you can set environment variables like:
```shell
PYTHON=python3.10 VENV_DIR=0617_venv IMPORTER=1 USE_IREE=1 ./setup_venv.sh
```
If you are a Torch-MLIR or IREE developer and want to test local changes, you can uninstall
the provided packages with `pip uninstall torch-mlir` and/or `pip uninstall iree-compiler iree-runtime`, then build locally
with Python bindings and set your PYTHONPATH as described [here](https://google.github.io/iree/bindings/python/)
for IREE and [here](https://github.com/llvm/torch-mlir/blob/main/development.md#setup-python-environment-to-export-the-built-python-packages)
for Torch-MLIR.
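A minimal sketch of that workflow (the build paths below are illustrative, not the actual locations; adjust them to wherever you built IREE and Torch-MLIR):
```shell
# Remove the pinned wheels so your local builds take precedence.
pip uninstall torch-mlir
pip uninstall iree-compiler iree-runtime
# Point Python at your local builds (hypothetical paths).
export PYTHONPATH=/path/to/iree-build/bindings/python:/path/to/torch-mlir/build/tools/torch-mlir/python_packages/torch_mlir:$PYTHONPATH
```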
### Run a demo script
```shell
python -m shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
# Or a pytest
pytest tank/tf/hf_masked_lm/albert-base-v2_test.py::AlbertBaseModuleTest::test_module_static_cpu
```
</details>
<details>
<summary>Testing</summary>
### Run all model tests on CPU/GPU/VULKAN/Metal
```shell
pytest tank
# If on Linux, for multithreading on CPU (faster results):
pytest tank -n auto
```
### Running specific tests
```shell
# Run tests for a specific model:
pytest tank/<MODEL_NAME> # e.g., pytest tank/bert-base-uncased
# Run tests for a specific case:
pytest tank/<MODEL_NAME> -k "keyword"
# e.g., pytest tank/bert-base-uncased/bert-base-uncased_test.py -k "static_gpu"
```
### Run benchmarks on SHARK tank pytests and generate bench_results.csv
(requires source installation with `IMPORTER=1 ./setup_venv.sh`)
```shell
pytest --benchmark tank
# Just do static GPU benchmarks for PyTorch tests:
pytest --benchmark tank --ignore-glob="_tf*" -k "static_gpu"
```
### Benchmark Resnet50, MiniLM on CPU
(requires source installation with `IMPORTER=1 ./setup_venv.sh`)
```shell
# We suggest running the following commands as root before running benchmarks on CPU:
cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | awk -F, '{print $2}' | sort -n | uniq | ( while read X ; do echo $X ; echo 0 > /sys/devices/system/cpu/cpu$X/online ; done )
echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
# Benchmark canonical Resnet50 on CPU via pytest
pytest --benchmark tank/resnet50/ -k "cpu"
# Benchmark canonical MiniLM on CPU via pytest
pytest --benchmark tank/MiniLM-L12-H384-uncased/ -k "cpu"
# Benchmark MiniLM on CPU via transformer-benchmarks:
git clone --recursive https://github.com/nod-ai/transformer-benchmarks.git
cd transformer-benchmarks
./perf-ci.sh -n
# Check detail.csv for MLIR/IREE results.
```
</details>
<details>
<summary>API Reference</summary>
### Shark Inference API
```python
from shark.shark_importer import SharkImporter

# SharkImporter imports an MLIR module from a torch, tensorflow, or tf-lite model.
mlir_importer = SharkImporter(
    torch_module,
    (input,),
    frontend="torch",  # or "tf", "tf-lite"
)
torch_mlir, func_name = mlir_importer.import_mlir(tracing_required=True)

# SharkInference accepts MLIR in the linalg, mhlo, or tosa dialect.
from shark.shark_inference import SharkInference

shark_module = SharkInference(torch_mlir, func_name, device="cpu", mlir_dialect="linalg")
shark_module.compile()
result = shark_module.forward((input,))
```
### Example demonstrating running MHLO IR
```python
from shark.shark_inference import SharkInference
import numpy as np

mhlo_ir = r"""builtin.module {
  func.func @forward(%arg0: tensor<1x4xf32>, %arg1: tensor<4x1xf32>) -> tensor<4x4xf32> {
    %0 = chlo.broadcast_add %arg0, %arg1 : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor<4x4xf32>
    %1 = "mhlo.abs"(%0) : (tensor<4x4xf32>) -> tensor<4x4xf32>
    return %1 : tensor<4x4xf32>
  }
}"""

arg0 = np.ones((1, 4)).astype(np.float32)
arg1 = np.ones((4, 1)).astype(np.float32)
shark_module = SharkInference(mhlo_ir, func_name="forward", device="cpu", mlir_dialect="mhlo")
shark_module.compile()
result = shark_module.forward((arg0, arg1))
```
</details>
## Supported and Validated Models
<details>
<summary>PyTorch Models</summary>
### Huggingface PyTorch Models
| Hugging Face Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
| Albert | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
| BigBird | :green_heart: (AOT) | | | |
| DistilBERT | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
| GPT2 | :broken_heart: (AOT) | | | |
| MobileBert | :green_heart: (JIT) | :green_heart: | :green_heart: | :green_heart: |
### Torchvision Models
| TORCHVISION Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|--------------------|----------------------|----------|----------|-------------|
| AlexNet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| DenseNet121 | :green_heart: (Script) | | | |
| MNasNet1_0 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| MobileNetV2 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| MobileNetV3 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Unet | :broken_heart: (Script) | | | |
| Resnet18 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnet50 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnet101 | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnext50_32x4d | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| ShuffleNet_v2 | :broken_heart: (Script) | | | |
| SqueezeNet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| EfficientNet | :green_heart: (Script) | | | |
| Regnet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| Resnest | :broken_heart: (Script) | | | |
| Vision Transformer | :green_heart: (Script) | | | |
| VGG 16 | :green_heart: (Script) | :green_heart: | :green_heart: | |
| Wide Resnet | :green_heart: (Script) | :green_heart: | :green_heart: | :green_heart: |
| RAFT | :broken_heart: (JIT) | | | |
For more information refer to [MODEL TRACKING SHEET](https://docs.google.com/spreadsheets/d/15PcjKeHZIrB5LfDyuw7DGEEE8XnQEX2aX8lm8qbxV8A/edit#gid=0)
### PyTorch Training Models
| Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :broken_heart: | :broken_heart: | | |
| FullyConnected | :green_heart: | :green_heart: | | |
</details>
<details>
<summary>JAX Models</summary>
### JAX Models
| Models | JAX-MHLO lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| DALL-E | :broken_heart: | :broken_heart: | | |
| FullyConnected | :green_heart: | :green_heart: | | |
</details>
<details>
<summary>TFLite Models</summary>
### TFLite Models
| Models | TOSA/LinAlg | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :broken_heart: | :broken_heart: | | |
| FullyConnected | :green_heart: | :green_heart: | | |
| albert | :green_heart: | :green_heart: | | |
| asr_conformer | :green_heart: | :green_heart: | | |
| bird_classifier | :green_heart: | :green_heart: | | |
| cartoon_gan | :green_heart: | :green_heart: | | |
| craft_text | :green_heart: | :green_heart: | | |
| deeplab_v3 | :green_heart: | :green_heart: | | |
| densenet | :green_heart: | :green_heart: | | |
| east_text_detector | :green_heart: | :green_heart: | | |
| efficientnet_lite0_int8 | :green_heart: | :green_heart: | | |
| efficientnet | :green_heart: | :green_heart: | | |
| gpt2 | :green_heart: | :green_heart: | | |
| image_stylization | :green_heart: | :green_heart: | | |
| inception_v4 | :green_heart: | :green_heart: | | |
| inception_v4_uint8 | :green_heart: | :green_heart: | | |
| lightning_fp16 | :green_heart: | :green_heart: | | |
| lightning_i8 | :green_heart: | :green_heart: | | |
| lightning | :green_heart: | :green_heart: | | |
| magenta | :green_heart: | :green_heart: | | |
| midas | :green_heart: | :green_heart: | | |
| mirnet | :green_heart: | :green_heart: | | |
| mnasnet | :green_heart: | :green_heart: | | |
| mobilebert_edgetpu_s_float | :green_heart: | :green_heart: | | |
| mobilebert_edgetpu_s_quant | :green_heart: | :green_heart: | | |
| mobilebert | :green_heart: | :green_heart: | | |
| mobilebert_tf2_float | :green_heart: | :green_heart: | | |
| mobilebert_tf2_quant | :green_heart: | :green_heart: | | |
| mobilenet_ssd_quant | :green_heart: | :green_heart: | | |
| mobilenet_v1 | :green_heart: | :green_heart: | | |
| mobilenet_v1_uint8 | :green_heart: | :green_heart: | | |
| mobilenet_v2_int8 | :green_heart: | :green_heart: | | |
| mobilenet_v2 | :green_heart: | :green_heart: | | |
| mobilenet_v2_uint8 | :green_heart: | :green_heart: | | |
| mobilenet_v3-large | :green_heart: | :green_heart: | | |
| mobilenet_v3-large_uint8 | :green_heart: | :green_heart: | | |
| mobilenet_v35-int8 | :green_heart: | :green_heart: | | |
| nasnet | :green_heart: | :green_heart: | | |
| person_detect | :green_heart: | :green_heart: | | |
| posenet | :green_heart: | :green_heart: | | |
| resnet_50_int8 | :green_heart: | :green_heart: | | |
| rosetta | :green_heart: | :green_heart: | | |
| spice | :green_heart: | :green_heart: | | |
| squeezenet | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v1 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v1_uint8 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2_fpnlite | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2_fpnlite_uint8 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2_int8 | :green_heart: | :green_heart: | | |
| ssd_mobilenet_v2 | :green_heart: | :green_heart: | | |
| ssd_spaghettinet_large | :green_heart: | :green_heart: | | |
| ssd_spaghettinet_large_uint8 | :green_heart: | :green_heart: | | |
| visual_wake_words_i8 | :green_heart: | :green_heart: | | |
</details>
<details>
<summary>TF Models</summary>
### Tensorflow Models (Inference)
| Hugging Face Models | tf-mhlo lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| albert-base-v2 | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| DistilBERT | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| CamemBert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| ConvBert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| Deberta | | | | |
| electra | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| funnel | | | | |
| layoutlm | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| longformer | | | | |
| mobile-bert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| remembert | | | | |
| tapas | | | | |
| flaubert | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| roberta | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| xlm-roberta | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
| mpnet | :green_heart: | :green_heart: | :green_heart: | :green_heart: |
</details>
## Related Projects
<details>
<summary>IREE Project Channels</summary>
* [Upstream IREE issues](https://github.com/google/iree/issues): Feature requests,
bugs, and other work tracking
* [Upstream IREE Discord server](https://discord.gg/26P4xW4): Daily development
discussions with the core team and collaborators
* [iree-discuss email list](https://groups.google.com/forum/#!forum/iree-discuss):
Announcements, general and low-priority discussion
</details>
<details>
<summary>MLIR and Torch-MLIR Project Channels</summary>
* `#torch-mlir` channel on the LLVM [Discord](https://discord.gg/xS7Z362) - this is the most active communication channel
* Torch-MLIR Github issues [here](https://github.com/llvm/torch-mlir/issues)
* [`torch-mlir` section](https://llvm.discourse.group/c/projects-that-want-to-become-official-llvm-projects/torch-mlir/41) of LLVM Discourse
* Weekly meetings on Mondays 9AM PST. See [here](https://discourse.llvm.org/t/community-meeting-developer-hour-refactoring-recurring-meetings/62575) for more information.
* [MLIR topic within LLVM Discourse](https://llvm.discourse.group/c/llvm-project/mlir/31)

SHARK and IREE are enabled by and heavily rely on [MLIR](https://mlir.llvm.org).
</details>
## License
nod.ai SHARK is licensed under the terms of the Apache 2.0 License with LLVM Exceptions.
See [LICENSE](LICENSE) for more information.

benchmarks/__init__.py (new file, empty)


@@ -0,0 +1,22 @@
import torch
from shark.parser import parser
from benchmarks.hf_transformer import SharkHFBenchmarkRunner
parser.add_argument(
"--model_name",
type=str,
required=True,
help='Specifies the name of the HF model to benchmark (for example, "microsoft/MiniLM-L12-H384-uncased").',
)
load_args, unknown = parser.parse_known_args()
if __name__ == "__main__":
model_name = load_args.model_name
test_input = torch.randint(2, (1, 128))
shark_module = SharkHFBenchmarkRunner(
model_name, (test_input,), jit_trace=True
)
shark_module.benchmark_c()
shark_module.benchmark_python((test_input,))
shark_module.benchmark_torch(test_input)
shark_module.benchmark_onnx(test_input)
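# Example invocation (the script path here is assumed, not confirmed by the diff;
# adjust it to wherever this file lives in the repo):
#   python benchmarks/hf_benchmark.py --model_name "microsoft/MiniLM-L12-H384-uncased"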


@@ -0,0 +1,181 @@
import torch
from shark.shark_benchmark_runner import SharkBenchmarkRunner
from shark.parser import shark_args
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from onnxruntime.transformers.benchmark import (
run_pytorch,
run_tensorflow,
run_onnxruntime,
)
from onnxruntime.transformers.huggingface_models import MODELS
from onnxruntime.transformers.benchmark_helper import ConfigModifier, Precision
import os
import psutil
class OnnxFusionOptions(object):
def __init__(self):
self.disable_gelu = False
self.disable_layer_norm = False
self.disable_attention = False
self.disable_skip_layer_norm = False
self.disable_embed_layer_norm = False
self.disable_bias_skip_layer_norm = False
self.disable_bias_gelu = False
self.enable_gelu_approximation = False
self.use_mask_index = False
self.no_attention_mask = False
class HuggingFaceLanguage(torch.nn.Module):
def __init__(self, hf_model_name):
super().__init__()
self.model = AutoModelForSequenceClassification.from_pretrained(
hf_model_name, # The pretrained model.
num_labels=2, # The number of output labels--2 for binary classification.
output_attentions=False, # Whether the model returns attentions weights.
output_hidden_states=False, # Whether the model returns all hidden-states.
torchscript=True,
)
def forward(self, tokens):
return self.model.forward(tokens)[0]
class SharkHFBenchmarkRunner(SharkBenchmarkRunner):
# SharkRunner derived class with Benchmarking capabilities.
def __init__(
self,
model_name: str,
input: tuple,
dynamic: bool = False,
device: str = None,
jit_trace: bool = False,
from_aot: bool = False,
frontend: str = "torch",
):
self.device = device if device is not None else shark_args.device
if self.device == "gpu":
raise ValueError(
"Currently GPU Benchmarking is not supported due to OOM from ORT."
)
self.model_name = model_name
model = HuggingFaceLanguage(model_name)
SharkBenchmarkRunner.__init__(
self,
model,
input,
dynamic,
self.device,
jit_trace,
from_aot,
frontend,
)
def benchmark_torch(self, inputs):
use_gpu = self.device == "gpu"
# Set the model's layer number to automatic.
config_modifier = ConfigModifier(None)
num_threads = psutil.cpu_count(logical=False)
batch_sizes = [inputs.shape[0]]
sequence_lengths = [inputs.shape[-1]]
cache_dir = os.path.join(".", "cache_models")
verbose = False
result = run_pytorch(
use_gpu,
[self.model_name],
None,
config_modifier,
Precision.FLOAT32,
num_threads,
batch_sizes,
sequence_lengths,
shark_args.num_iterations,
False,
cache_dir,
verbose,
)
print(
f"ONNX Pytorch-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
)
# TODO: Currently non-functional due to a TF runtime error. There might be some issue with initializing TF.
def benchmark_tf(self, inputs):
use_gpu = self.device == "gpu"
# Set the model's layer number to automatic.
config_modifier = ConfigModifier(None)
num_threads = psutil.cpu_count(logical=False)
batch_sizes = [inputs.shape[0]]
sequence_lengths = [inputs.shape[-1]]
cache_dir = os.path.join(".", "cache_models")
verbose = False
result = run_tensorflow(
use_gpu,
[self.model_name],
None,
config_modifier,
Precision.FLOAT32,
num_threads,
batch_sizes,
sequence_lengths,
shark_args.num_iterations,
cache_dir,
verbose,
)
print(
f"ONNX TF-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
)
def benchmark_onnx(self, inputs):
if self.model_name not in MODELS:
print(
f"{self.model_name} is currently not supported in ORT's HF. Check \
https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/huggingface_models.py \
for currently supported models. Exiting ONNX benchmark."
)
return
use_gpu = self.device == "gpu"
num_threads = psutil.cpu_count(logical=False)
batch_sizes = [inputs.shape[0]]
sequence_lengths = [inputs.shape[-1]]
cache_dir = os.path.join(".", "cache_models")
onnx_dir = os.path.join(".", "onnx_models")
verbose = False
input_counts = [1]
optimize_onnx = True
validate_onnx = False
disable_ort_io_binding = False
use_raw_attention_mask = True
model_fusion_statistics = {}
overwrite = False
model_source = "pt" # Either "pt" or "tf"
provider = None
config_modifier = ConfigModifier(None)
onnx_args = OnnxFusionOptions()
result = run_onnxruntime(
use_gpu,
provider,
[self.model_name],
None,
config_modifier,
Precision.FLOAT32,
num_threads,
batch_sizes,
sequence_lengths,
shark_args.num_iterations,
input_counts,
optimize_onnx,
validate_onnx,
cache_dir,
onnx_dir,
verbose,
overwrite,
disable_ort_io_binding,
use_raw_attention_mask,
model_fusion_statistics,
model_source,
onnx_args,
)
print(
f"ONNX ORT-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
)


@@ -0,0 +1,231 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers
import torch
import tensorflow as tf
import numpy as np
import torchvision.models as models
from transformers import (
AutoModelForSequenceClassification,
BertTokenizer,
TFBertModel,
)
import importlib
import pytest
import unittest
torch.manual_seed(0)
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
##################### Tensorflow Hugging Face LM Models ###################################
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
tf_bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
]
class TFHuggingFaceLanguage(tf.Module):
def __init__(self, hf_model_name):
super(TFHuggingFaceLanguage, self).__init__()
# Create a BERT trainer with the created network.
self.m = TFBertModel.from_pretrained(hf_model_name, from_pt=True)
# Invoke the trainer model on the inputs. This causes the layer to be built.
self.m.predict = lambda x, y, z: self.m.call(
input_ids=x, attention_mask=y, token_type_ids=z, training=False
)
@tf.function(input_signature=tf_bert_input)
def forward(self, input_ids, attention_mask, token_type_ids):
return self.m.predict(input_ids, attention_mask, token_type_ids)
def get_TFhf_model(name):
model = TFHuggingFaceLanguage(name)
tokenizer = BertTokenizer.from_pretrained(name)
text = "Replace me by any text you'd like."
encoded_input = tokenizer(
text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
)
for key in encoded_input:
encoded_input[key] = tf.expand_dims(
tf.convert_to_tensor(encoded_input[key]), 0
)
test_input = (
encoded_input["input_ids"],
encoded_input["attention_mask"],
encoded_input["token_type_ids"],
)
actual_out = model.forward(*test_input)
return model, test_input, actual_out
##################### Hugging Face LM Models ###################################
class HuggingFaceLanguage(torch.nn.Module):
def __init__(self, hf_model_name):
super().__init__()
self.model = AutoModelForSequenceClassification.from_pretrained(
hf_model_name, # The pretrained model.
num_labels=2, # The number of output labels--2 for binary classification.
output_attentions=False, # Whether the model returns attentions weights.
output_hidden_states=False, # Whether the model returns all hidden-states.
torchscript=True,
)
def forward(self, tokens):
return self.model.forward(tokens)[0]
def get_hf_model(name):
model = HuggingFaceLanguage(name)
# TODO: Currently the test input is set to (1,128)
test_input = torch.randint(2, (1, 128))
actual_out = model(test_input)
return model, test_input, actual_out
################################################################################
##################### Torch Vision Models ###################################
class VisionModule(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
self.train(False)
def forward(self, input):
return self.model.forward(input)
def get_vision_model(torch_model):
model = VisionModule(torch_model)
# TODO: Currently the test input is set to (1, 3, 224, 224)
test_input = torch.randn(1, 3, 224, 224)
actual_out = model(test_input)
return model, test_input, actual_out
############################# Benchmark Tests ####################################
pytest_benchmark_param = pytest.mark.parametrize(
("dynamic", "device"),
[
pytest.param(False, "cpu"),
# TODO: Language models are failing for the dynamic case.
pytest.param(True, "cpu", marks=pytest.mark.skip),
pytest.param(
False,
"gpu",
marks=pytest.mark.skipif(
check_device_drivers("gpu"), reason="nvidia-smi not found"
),
),
pytest.param(True, "gpu", marks=pytest.mark.skip),
pytest.param(
False,
"vulkan",
marks=pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases",
),
),
pytest.param(
True,
"vulkan",
marks=pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases",
),
),
],
)
@pytest.mark.skipif(
importlib.util.find_spec("iree.tools") is None,
reason="Cannot find tools to import TF",
)
@pytest_benchmark_param
def test_bench_minilm_torch(dynamic, device):
model, test_input, act_out = get_hf_model(
"microsoft/MiniLM-L12-H384-uncased"
)
shark_module = SharkInference(
model,
(test_input,),
device=device,
dynamic=dynamic,
jit_trace=True,
benchmark_mode=True,
)
try:
# If benchmarking is successful, assert True.
shark_module.compile()
shark_module.benchmark_all((test_input,))
assert True
except Exception as e:
# If anything goes wrong during benchmarking, assert False.
assert False
@pytest.mark.skipif(
importlib.util.find_spec("iree.tools") is None,
reason="Cannot find tools to import TF",
)
@pytest_benchmark_param
def test_bench_distilbert(dynamic, device):
model, test_input, act_out = get_TFhf_model("distilbert-base-uncased")
shark_module = SharkInference(
model,
test_input,
device=device,
dynamic=dynamic,
jit_trace=True,
benchmark_mode=True,
)
try:
# If benchmarking is successful, assert True.
shark_module.set_frontend("tensorflow")
shark_module.compile()
shark_module.benchmark_all(test_input)
assert True
except Exception as e:
# If anything goes wrong during benchmarking, assert False.
assert False
@pytest.mark.skip(reason="XLM Roberta too large to test.")
@pytest_benchmark_param
def test_bench_xlm_roberta(dynamic, device):
model, test_input, act_out = get_TFhf_model("xlm-roberta-base")
shark_module = SharkInference(
model,
test_input,
device=device,
dynamic=dynamic,
jit_trace=True,
benchmark_mode=True,
)
try:
# If benchmarking is successful, assert True.
shark_module.set_frontend("tensorflow")
shark_module.compile()
shark_module.benchmark_all(test_input)
assert True
except Exception as e:
# If anything goes wrong during benchmarking, assert False.
assert False


@@ -0,0 +1,45 @@
import torch
from benchmarks.hf_transformer import SharkHFBenchmarkRunner
import importlib
import pytest
torch.manual_seed(0)
############################# HF Benchmark Tests ####################################
# Test running benchmark module without failing.
pytest_benchmark_param = pytest.mark.parametrize(
("dynamic", "device"),
[
pytest.param(False, "cpu"),
# TODO: Language models are failing for the dynamic case.
pytest.param(True, "cpu", marks=pytest.mark.skip),
],
)
@pytest.mark.skipif(
importlib.util.find_spec("onnxruntime") is None,
reason="Cannot find ONNXRUNTIME.",
)
@pytest_benchmark_param
def test_HFbench_minilm_torch(dynamic, device):
model_name = "bert-base-uncased"
test_input = torch.randint(2, (1, 128))
try:
shark_module = SharkHFBenchmarkRunner(
model_name,
(test_input,),
jit_trace=True,
dynamic=dynamic,
device=device,
)
shark_module.benchmark_c()
shark_module.benchmark_python((test_input,))
shark_module.benchmark_torch(test_input)
shark_module.benchmark_onnx(test_input)
# If benchmarking is successful, assert True.
assert True
except Exception as e:
# If anything goes wrong during benchmarking, assert False.
assert False


@@ -0,0 +1,5 @@
#!/bin/bash
IMPORTER=1 ./setup_venv.sh
source $GITHUB_WORKSPACE/shark.venv/bin/activate
python generate_sharktank.py --upload=False

conftest.py (new file, 33 lines)

@@ -0,0 +1,33 @@
def pytest_addoption(parser):
# Attaches SHARK command-line arguments to the pytest machinery.
parser.addoption(
"--benchmark",
action="store_true",
default="False",
help="Pass option to benchmark and write results.csv",
)
parser.addoption(
"--onnx_bench",
action="store_true",
default="False",
help="Add ONNX benchmark results to pytest benchmarks.",
)
# The following options are deprecated and pending removal.
parser.addoption(
"--save_mlir",
action="store_true",
default="False",
help="Pass option to save input MLIR",
)
parser.addoption(
"--save_vmfb",
action="store_true",
default="False",
help="Pass option to save IREE output .vmfb",
)
parser.addoption(
"--save_temps",
action="store_true",
default="False",
help="Saves IREE reproduction artifacts for filing upstream issues.",
)

generate_sharktank.py (new file, 235 lines)

@@ -0,0 +1,235 @@
# Lint as: python3
"""SHARK Tank"""
# Running `python generate_sharktank.py` with a csv file of [model_name, model_download_url]
# will generate a local shark tank folder like this:
# /SHARK
# /gen_shark_tank
# /albert_lite_base
# /...model_name...
#
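# Example CSV rows (hypothetical; see the csv files under tank/ for the real lists):
#   torch_model_list.csv:   microsoft/MiniLM-L12-H384-uncased,True,hf
#   tflite_model_list.csv:  albert_lite_base,<model_download_url>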
import os
import csv
import argparse
from shark.shark_importer import SharkImporter
import tensorflow as tf
import subprocess as sp
import hashlib
import numpy as np
visible_default = tf.config.list_physical_devices("GPU")
try:
tf.config.set_visible_devices([], "GPU")
visible_devices = tf.config.get_visible_devices()
for device in visible_devices:
assert device.device_type != "GPU"
except:
# Invalid device or cannot modify virtual devices once initialized.
pass
# All generated models and metadata will be saved under this directory.
WORKDIR = os.path.join(os.path.dirname(__file__), "gen_shark_tank")
def create_hash(file_name):
with open(file_name, "rb") as f:
file_hash = hashlib.blake2b()
while chunk := f.read(2**20):
file_hash.update(chunk)
return file_hash.hexdigest()
def save_torch_model(torch_model_list):
from tank.model_utils import get_hf_model
from tank.model_utils import get_vision_model
with open(torch_model_list) as csvfile:
torch_reader = csv.reader(csvfile, delimiter=",")
fields = next(torch_reader)
for row in torch_reader:
torch_model_name = row[0]
tracing_required = row[1]
model_type = row[2]
tracing_required = False if tracing_required == "False" else True
model = None
input = None
if model_type == "vision":
model, input, _ = get_vision_model(torch_model_name)
elif model_type == "hf":
model, input, _ = get_hf_model(torch_model_name)
torch_model_name = torch_model_name.replace("/", "_")
torch_model_dir = os.path.join(
WORKDIR, str(torch_model_name) + "_torch"
)
os.makedirs(torch_model_dir, exist_ok=True)
mlir_importer = SharkImporter(
model,
(input,),
frontend="torch",
)
mlir_importer.import_debug(
is_dynamic=False,
tracing_required=tracing_required,
dir=torch_model_dir,
model_name=torch_model_name,
)
mlir_hash = create_hash(
os.path.join(
torch_model_dir, torch_model_name + "_torch" + ".mlir"
)
)
np.save(os.path.join(torch_model_dir, "hash"), np.array(mlir_hash))
# Generate torch dynamic models.
mlir_importer.import_debug(
is_dynamic=True,
tracing_required=tracing_required,
dir=torch_model_dir,
model_name=torch_model_name + "_dynamic",
)
def save_tf_model(tf_model_list):
from tank.model_utils_tf import (
get_causal_image_model,
get_causal_lm_model,
get_keras_model,
get_TFhf_model,
)
with open(tf_model_list) as csvfile:
tf_reader = csv.reader(csvfile, delimiter=",")
fields = next(tf_reader)
for row in tf_reader:
tf_model_name = row[0]
model_type = row[1]
model = None
input = None
print(f"Generating artifacts for model {tf_model_name}")
if model_type == "hf":
model, input, _ = get_causal_lm_model(tf_model_name)
if model_type == "img":
model, input, _ = get_causal_image_model(tf_model_name)
if model_type == "keras":
model, input, _ = get_keras_model(tf_model_name)
if model_type == "TFhf":
model, input, _ = get_TFhf_model(tf_model_name)
tf_model_name = tf_model_name.replace("/", "_")
tf_model_dir = os.path.join(WORKDIR, str(tf_model_name) + "_tf")
os.makedirs(tf_model_dir, exist_ok=True)
mlir_importer = SharkImporter(
model,
input,
frontend="tf",
)
mlir_importer.import_debug(
dir=tf_model_dir,
model_name=tf_model_name,
)
mlir_hash = create_hash(
os.path.join(tf_model_dir, tf_model_name + "_tf" + ".mlir")
)
np.save(os.path.join(tf_model_dir, "hash"), np.array(mlir_hash))
def save_tflite_model(tflite_model_list):
from shark.tflite_utils import TFLitePreprocessor
with open(tflite_model_list) as csvfile:
tflite_reader = csv.reader(csvfile, delimiter=",")
for row in tflite_reader:
print("\n")
tflite_model_name = row[0]
tflite_model_link = row[1]
print("tflite_model_name", tflite_model_name)
print("tflite_model_link", tflite_model_link)
tflite_model_name_dir = os.path.join(
WORKDIR, str(tflite_model_name) + "_tflite"
)
os.makedirs(tflite_model_name_dir, exist_ok=True)
print(f"TMP_TFLITE_MODELNAME_DIR = {tflite_model_name_dir}")
# Preprocess to get SharkImporter input args
tflite_preprocessor = TFLitePreprocessor(str(tflite_model_name))
raw_model_file_path = tflite_preprocessor.get_raw_model_file()
inputs = tflite_preprocessor.get_inputs()
tflite_interpreter = tflite_preprocessor.get_interpreter()
# Use SharkImporter to get SharkInference input args
my_shark_importer = SharkImporter(
module=tflite_interpreter,
inputs=inputs,
frontend="tflite",
raw_model_file=raw_model_file_path,
)
my_shark_importer.import_debug(
dir=tflite_model_name_dir,
model_name=tflite_model_name,
func_name="main",
)
mlir_hash = create_hash(
os.path.join(
tflite_model_name_dir,
tflite_model_name + "_tflite" + ".mlir",
)
)
np.save(
os.path.join(tflite_model_name_dir, "hash"),
np.array(mlir_hash),
)
# Validates whether the file is present or not.
def is_valid_file(arg):
if not os.path.exists(arg):
return None
else:
return arg
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--torch_model_csv",
type=lambda x: is_valid_file(x),
default="./tank/pytorch/torch_model_list.csv",
help="""Contains the file with torch_model name and args.
Please see: https://github.com/nod-ai/SHARK/blob/main/tank/pytorch/torch_model_list.csv""",
)
parser.add_argument(
"--tf_model_csv",
type=lambda x: is_valid_file(x),
default="./tank/tf/tf_model_list.csv",
help="Contains the file with tf model name and args.",
)
parser.add_argument(
"--tflite_model_csv",
type=lambda x: is_valid_file(x),
default="./tank/tflite/tflite_model_list.csv",
help="Contains the file with tf model name and args.",
)
parser.add_argument("--upload", type=bool, default=False)
args = parser.parse_args()
if args.torch_model_csv:
save_torch_model(args.torch_model_csv)
if args.tf_model_csv:
save_tf_model(args.tf_model_csv)
if args.tflite_model_csv:
save_tflite_model(args.tflite_model_csv)
if args.upload:
git_hash = sp.getoutput("git log -1 --format='%h'") + "/"
print("uploading files to gs://shark_tank/" + git_hash)
os.system(
"gsutil cp -r ./gen_shark_tank/* gs://shark_tank/" + git_hash
)

inference/CMakeLists.txt (new file, 192 lines)

@@ -0,0 +1,192 @@
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_minimum_required(VERSION 3.17)
project(sharkbackend LANGUAGES C CXX)
#
# Options
#
option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON)
option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)
set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
#
# Dependencies
#
# FetchContent requires us to include the transitive closure of all
# repos that we depend on so that we can override the tags.
#
include(FetchContent)
FetchContent_Declare(
repo-common
GIT_REPOSITORY https://github.com/triton-inference-server/common.git
GIT_TAG ${TRITON_COMMON_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_Declare(
repo-core
GIT_REPOSITORY https://github.com/triton-inference-server/core.git
GIT_TAG ${TRITON_CORE_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_Declare(
repo-backend
GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
GIT_TAG ${TRITON_BACKEND_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_MakeAvailable(repo-common repo-core repo-backend)
#
# The backend must be built into a shared library. Use an ldscript to
# hide all symbols except for the TRITONBACKEND API.
#
configure_file(src/libtriton_dshark.ldscript libtriton_dshark.ldscript COPYONLY)
add_library(
triton-dshark-backend SHARED
src/dshark.cc
#src/dshark_driver_module.c
)
add_library(
SharkBackend::triton-dshark-backend ALIAS triton-dshark-backend
)
target_include_directories(
triton-dshark-backend
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
)
list(APPEND CMAKE_MODULE_PATH "${PROJECT_BINARY_DIR}/lib/cmake/mlir")
add_subdirectory(thirdparty/shark-runtime EXCLUDE_FROM_ALL)
target_link_libraries(triton-dshark-backend PRIVATE iree_base_base
iree_hal_hal
iree_hal_cuda_cuda
iree_hal_cuda_registration_registration
iree_hal_vmvx_registration_registration
iree_hal_dylib_registration_registration
iree_modules_hal_hal
iree_vm_vm
iree_vm_bytecode_module
iree_hal_local_loaders_system_library_loader
iree_hal_local_loaders_vmvx_module_loader
)
target_compile_features(triton-dshark-backend PRIVATE cxx_std_11)
target_link_libraries(
triton-dshark-backend
PRIVATE
triton-core-serverapi # from repo-core
triton-core-backendapi # from repo-core
triton-core-serverstub # from repo-core
triton-backend-utils # from repo-backend
)
if(WIN32)
set_target_properties(
triton-dshark-backend PROPERTIES
POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_dshark
)
else()
set_target_properties(
triton-dshark-backend PROPERTIES
POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_dshark
LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_dshark.ldscript
LINK_FLAGS "-Wl,--version-script libtriton_dshark.ldscript"
)
endif()
#
# Install
#
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/SharkBackend)
install(
TARGETS
triton-dshark-backend
EXPORT
triton-dshark-backend-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/dshark
RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/dshark
)
install(
EXPORT
triton-dshark-backend-targets
FILE
SharkBackendTargets.cmake
NAMESPACE
SharkBackend::
DESTINATION
${INSTALL_CONFIGDIR}
)
include(CMakePackageConfigHelpers)
configure_package_config_file(
${CMAKE_CURRENT_LIST_DIR}/cmake/SharkBackendConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/SharkBackendConfig.cmake
INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)
install(
FILES
${CMAKE_CURRENT_BINARY_DIR}/SharkBackendConfig.cmake
DESTINATION ${INSTALL_CONFIGDIR}
)
#
# Export from build tree
#
export(
EXPORT triton-dshark-backend-targets
FILE ${CMAKE_CURRENT_BINARY_DIR}/SharkBackendTargets.cmake
NAMESPACE SharkBackend::
)
export(PACKAGE SharkBackend)

inference/README.md (new file, 100 lines)

@@ -0,0 +1,100 @@
# SHARK Triton Backend
The Triton backend for SHARK.
# Build
Install SHARK
```
git clone https://github.com/nod-ai/SHARK.git
# skip above step if dshark is already installed
cd SHARK/inference
```
Install dependencies
```
apt-get install patchelf rapidjson-dev python3-dev
git submodule update --init
```
Update the submodules of IREE
```
cd thirdparty/shark-runtime
git submodule update --init
```
Next, make the backend and install it
```
cd ../..
mkdir build && cd build
cmake -DTRITON_ENABLE_GPU=ON \
-DIREE_HAL_DRIVER_CUDA=ON \
-DIREE_TARGET_BACKEND_CUDA=ON \
-DMLIR_ENABLE_CUDA_RUNNER=ON \
-DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \
-DTRITON_BACKEND_REPO_TAG=r22.02 \
-DTRITON_CORE_REPO_TAG=r22.02 \
-DTRITON_COMMON_REPO_TAG=r22.02 ..
make install
```
# Incorporating into Triton
There are much more in-depth explanations of the following steps in Triton's documentation:
https://github.com/triton-inference-server/server/blob/main/docs/compose.md#triton-with-unsupported-and-custom-backends
There should be a file at `/build/install/backends/dshark/libtriton_dshark.so`. You will need to copy it into your Triton server image.
More documentation is in the link above, but to create the docker image you need to run the `compose.py` command in the Triton server repo.
To first build your image, clone the tritonserver repo.
```
git clone https://github.com/triton-inference-server/server.git
```
Then run `compose.py` to generate a `Dockerfile.compose`
```
cd server
python3 compose.py --repoagent checksum --dry-run
```
Because dshark is a third-party backend, you will need to manually modify the generated `Dockerfile.compose` to include the dshark backend. The dshark backend is located in the build folder from earlier, under `/build/install/backends`; copy it in with a line like this.
```
COPY /path/to/build/install/backends/dshark /opt/tritonserver/backends/dshark
```
Next run
```
docker build -t tritonserver_custom -f Dockerfile.compose .
docker run -it --gpus=1 --net=host -v/path/to/model_repos:/models tritonserver_custom:latest tritonserver --model-repository=/models
```
where `path/to/model_repos` is where you are storing the models you want to run
If you're not using GPUs, omit `--gpus=1`
```
docker run -it --net=host -v/path/to/model_repos:/models tritonserver_custom:latest tritonserver --model-repository=/models
```
# Setting up a model
To include a model in your backend, add a directory named after your model to your model-repository directory. Example models can be seen here: https://github.com/triton-inference-server/backend/tree/main/examples/model_repos/minimal_models
Make sure to adjust the inputs correctly in the `config.pbtxt` file, and save a compiled `.vmfb` file under `1/model.vmfb`, as sketched below.
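A minimal sketch of such a layout (the model name, tensor names, shapes, and data types below are hypothetical placeholders; the `backend` field is assumed to match the dshark backend built above):
```
mkdir -p model_repos/my_model/1
cp my_model.vmfb model_repos/my_model/1/model.vmfb
cat > model_repos/my_model/config.pbtxt <<'EOF'
name: "my_model"
backend: "dshark"
max_batch_size: 0
input [ { name: "input0", data_type: TYPE_FP32, dims: [ 1, 4 ] } ]
output [ { name: "output0", data_type: TYPE_FP32, dims: [ 4, 4 ] } ]
EOF
```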
# CUDA
If you're having issues with CUDA, make sure the correct drivers are installed, that `nvidia-smi` works, and that the `nvcc` compiler is on the PATH.
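A quick sanity check (assuming the NVIDIA driver and CUDA toolkit are already installed):
```
nvidia-smi       # the driver can see the GPU
nvcc --version   # the CUDA compiler is installed and on the PATH
```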


@@ -0,0 +1,39 @@
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(CMakeFindDependencyMacro)
get_filename_component(
SHARKBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
)
list(APPEND CMAKE_MODULE_PATH ${SHARKBACKEND_CMAKE_DIR})
if(NOT TARGET SharkBackend::triton-dshark-backend)
include("${SHARKBACKEND_CMAKE_DIR}/SharkBackendTargets.cmake")
endif()
set(SHARKBACKEND_LIBRARIES SharkBackend::triton-dshark-backend)

inference/src/dshark.cc (new file, 1409 lines): diff suppressed because it is too large.

@@ -0,0 +1,30 @@
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
{
global:
TRITONBACKEND_*;
local: *;
};


@@ -1,45 +0,0 @@
<!DOCTYPE html>
<html>
<body>
<a href='https://github.com/nod-ai/SHARK/releases/download/20230130.481/shark_sd_20230130_481.exe'>shark_sd_20230130_481.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230130.481/shark_sd_cli_20230130_481.exe'>shark_sd_cli_20230130_481.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.479/shark_sd_20230129_479.exe'>shark_sd_20230129_479.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.479/shark_sd_cli_20230129_479.exe'>shark_sd_cli_20230129_479.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.480/shark_sd_20230129_480.exe'>shark_sd_20230129_480.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.480/shark_sd_cli_20230129_480.exe'>shark_sd_cli_20230129_480.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.478/shark_sd_20230129_478.exe'>shark_sd_20230129_478.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.478/shark_sd_cli_20230129_478.exe'>shark_sd_cli_20230129_478.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230128.477/shark_sd_20230128_477.exe'>shark_sd_20230128_477.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230128.477/shark_sd_cli_20230128_477.exe'>shark_sd_cli_20230128_477.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230127.476/shark_sd_20230127_476.exe'>shark_sd_20230127_476.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230127.476/shark_sd_cli_20230127_476.exe'>shark_sd_cli_20230127_476.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230126.475/shark_sd_20230126_475.exe'>shark_sd_20230126_475.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230126.475/shark_sd_cli_20230126_475.exe'>shark_sd_cli_20230126_475.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.474/shark_sd_20230125_474.exe'>shark_sd_20230125_474.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.474/shark_sd_cli_20230125_474.exe'>shark_sd_cli_20230125_474.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.473/shark_sd_20230125_473.exe'>shark_sd_20230125_473.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.473/shark_sd_cli_20230125_473.exe'>shark_sd_cli_20230125_473.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.472/shark_sd_20230125_472.exe'>shark_sd_20230125_472.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.471/shark_sd_20230125_471.exe'>shark_sd_20230125_471.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.468/shark_sd_20230125_468.exe'>shark_sd_20230125_468.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.470/shark_sd_20230124_470.exe'>shark_sd_20230124_470.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.470/shark_sd_cli_20230124_470.exe'>shark_sd_cli_20230124_470.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.469/shark_sd_20230124_469.exe'>shark_sd_20230124_469.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.467/shark_sd_20230124_467.exe'>shark_sd_20230124_467.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.466/shark_sd_20230124_466.exe'>shark_sd_20230124_466.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.462/shark_sd_20230124_462.exe'>shark_sd_20230124_462.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230123.461/shark_sd_20230123_461.exe'>shark_sd_20230123_461.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230123.460/shark_sd_20230123_460.exe'>shark_sd_20230123_460.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230122.459/shark_sd_20230122_459.exe'>shark_sd_20230122_459.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230122.458/shark_sd_20230122_458.exe'>shark_sd_20230122_458.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230122.457/shark_sd_20230122_457.exe'>shark_sd_20230122_457.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230121.456/shark_sd_20230121_456.exe'>shark_sd_20230121_456.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230120.455/shark_sd_20230120_455.exe'>shark_sd_20230120_455.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230119.454/shark_sd_20230119_454.exe'>shark_sd_20230119_454.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230118.453/shark_sd_20230118_453.exe'>shark_sd_20230118_453.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230117.452/shark_sd_20230117_452.exe'>shark_sd_20230117_452.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230116.451/shark_sd_20230116_451.exe'>shark_sd_20230116_451.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230115.450/shark_sd_20230115_450.exe'>shark_sd_20230115_450.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230114.449/shark_sd_20230114_449.exe'>shark_sd_20230114_449.exe</a><br />
</body>
</html>

pyproject.toml (new file, 12 lines)

@@ -0,0 +1,12 @@
[build-system]
requires = [
"setuptools>=42",
"wheel",
"packaging",
"numpy==1.22.4",
"torch-mlir>=20220428.420",
"iree-compiler>=20220427.13",
"iree-runtime>=20220427.13",
]
build-backend = "setuptools.build_meta"

pytest.ini (new file, 3 lines)

@@ -0,0 +1,3 @@
[pytest]
addopts = --verbose -p no:warnings
norecursedirs = inference tank/tflite
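With this configuration, a typical single-model run might look like the following (the test file path is hypothetical, and the `--benchmark` flag is assumed to be defined in the repo's pytest conftest, since the tests below read it via `pytestconfig.getoption`):
```
# --verbose and warning suppression come from the addopts above;
# the inference/ and tank/tflite directories are never collected
pytest tank/resnet50_test.py -k "static_cpu"
```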


@@ -0,0 +1,109 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.shark_downloader import download_torch_model
from shark.parser import shark_args
import torch
import unittest
import numpy as np
import pytest
class BertBaseUncasedModuleTester:
def __init__(
self,
benchmark=False,
onnx_bench=False,
):
self.benchmark = benchmark
self.onnx_bench = onnx_bench
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"bert-base-uncased", dynamic
)
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert compare_tensors(act_out, results)
if self.benchmark:
shark_args.onnx_bench = self.onnx_bench
shark_module.shark_runner.benchmark_all_csv(
(input),
"bert-base-uncased",
dynamic,
device,
"torch",
)
class BertBaseUncasedModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = BertBaseUncasedModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
self.module_tester.onnx_bench = pytestconfig.getoption("onnx_bench")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,71 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class DistilBertModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"distilbert-base-uncased"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class DistilBertModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = DistilBertModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
@pytest.mark.xfail(reason="shark_tank hash issues -- awaiting triage")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="shark_tank hash issues -- awaiting triage")
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="shark_tank hash issues -- awaiting triage")
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,95 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.parser import shark_args
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class DistilBertModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"distilbert-base-uncased", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert compare_tensors(act_out, results)
if self.benchmark:
shark_module.shark_runner.benchmark_all_csv(
(input),
"distilbert-base-uncased",
dynamic,
device,
"torch",
)
class DistilBertModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = DistilBertModuleTester(self)
self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,114 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class MobileNetV3ModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"mobilenet_v3_small", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
np.testing.assert_allclose(act_out, results, rtol=1e-02, atol=1e-03)
if self.benchmark:
shark_module.shark_runner.benchmark_all_csv(
(input),
"alexnet",
dynamic,
device,
"torch",
)
class MobileNetV3ModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = MobileNetV3ModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="golden results don't match.")
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="golden results don't match.")
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.xfail(reason="stuck in the pipeline.")
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,114 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class Resnet101ModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"resnet101", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert compare_tensors(act_out, results)
if self.benchmark:
shark_module.shark_runner.benchmark_all_csv(
(input),
"resnet101",
dynamic,
device,
"torch",
)
class Resnet101ModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = Resnet101ModuleTester(self)
self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,114 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import get_vision_model, compare_tensors
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class Resnet50ModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"resnet50", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert compare_tensors(act_out, results)
if self.benchmark:
shark_module.shark_runner.benchmark_all_csv(
(input),
"resnet50",
dynamic,
device,
"torch",
)
class Resnet50ModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = Resnet50ModuleTester(self)
self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,91 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class UnetModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"unet", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
np.testing.assert_allclose(act_out, results, rtol=1e-02, atol=1e-03)
if self.benchmark:
shark_module.shark_runner.benchmark_all_csv(
(input),
"unet",
dynamic,
device,
"torch",
)
class UnetModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = UnetModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,41 @@
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
--pre
numpy
torch
torchvision
tqdm
#iree-compiler | iree-runtime should already be installed
# these don't work on macOS
#iree-tools-tflite
#iree-tools-xla
#iree-tools-tf
# TensorFlow and JAX.
gin-config
tensorflow-macos
tensorflow-metal
#tf-models-nightly
#tensorflow-text-nightly
transformers==4.18.0
tensorflow-probability
#jax[cpu]
# tflitehub dependencies.
Pillow
# Testing and support.
#lit
#pyyaml
#ONNX and ORT for benchmarking
#--extra-index-url https://test.pypi.org/simple/
#protobuf
#coloredlogs
#flatbuffers
#sympy
#psutil
#onnx-weekly
#ort-nightly

requirements-importer.txt (new file, 40 lines)

@@ -0,0 +1,40 @@
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
--pre
numpy==1.22.4
torch
torchvision
tqdm
#iree-compiler | iree-runtime should already be installed
iree-tools-tflite
iree-tools-xla
iree-tools-tf
# TensorFlow and JAX.
gin-config
tensorflow
#tf-models-nightly
#tensorflow-text-nightly
transformers==4.18.0
#tensorflow-probability
#jax[cpu]
# tflitehub dependencies.
Pillow
# Testing and support.
lit
pyyaml
#ONNX and ORT for benchmarking
#--extra-index-url https://test.pypi.org/simple/
#protobuf
#coloredlogs
#flatbuffers
#sympy
#psutil
#onnx-weekly
#ort-nightly

requirements.txt (new file, 13 lines)

@@ -0,0 +1,13 @@
setuptools
wheel
# SHARK Runner
tqdm
# SHARK Downloader
gsutil
# Testing
pytest
pytest-xdist
Pillow

setup.py (new file, 38 lines)

@@ -0,0 +1,38 @@
from setuptools import find_packages
from setuptools import setup
import os
with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()
PACKAGE_VERSION = os.environ.get("SHARK_PACKAGE_VERSION") or "0.0.4"
setup(
name="nodai-SHARK",
version=f"{PACKAGE_VERSION}",
description="SHARK provides a High Performance Machine Learning Framework",
author="nod.ai",
author_email="stdin@nod.ai",
url="https://nod.ai",
long_description=long_description,
long_description_content_type="text/markdown",
project_urls={
"Code": "https://github.com/nod-ai/SHARK",
"Bug Tracker": "https://github.com/nod-ai/SHARK/issues",
},
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
packages=find_packages(exclude=("examples",)),
python_requires=">=3.7",
install_requires=[
"numpy",
"PyYAML",
"torch-mlir>=20220428.420",
"iree-compiler>=20220427.13",
"iree-runtime>=20220427.13",
],
)
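As a usage sketch (the version string here is arbitrary), the package version can be overridden at build time via the `SHARK_PACKAGE_VERSION` environment variable read above:
```
SHARK_PACKAGE_VERSION=0.0.5 python -m pip install -e .
```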

setup_venv.sh (new executable file, 135 lines)

@@ -0,0 +1,135 @@
#!/bin/bash
# Sets up a venv suitable for running samples.
# e.g:
# ./setup_venv.sh #setup a default $PYTHON3 shark.venv
# Environment variables used by the script:
# PYTHON=$PYTHON3.10 ./setup_venv.sh #pass a version of $PYTHON to use
# VENV_DIR=myshark.venv #create a venv called myshark.venv
# USE_IREE=1 #use stock IREE instead of Nod.ai's SHARK build
# IMPORTER=1 #Install importer deps
# if you run the script from a conda env it will install in your conda env
TD="$(cd $(dirname $0) && pwd)"
if [ -z "$PYTHON" ]; then
PYTHON="$(which python3)"
fi
function die() {
echo "Error executing command: $*"
exit 1
}
PYTHON_VERSION_X_Y=`${PYTHON} -c 'import sys; version=sys.version_info[:2]; print("{0}.{1}".format(*version))'`
echo "Python: $PYTHON"
echo "Python version: $PYTHON_VERSION_X_Y"
if [[ -z "${CONDA_PREFIX}" ]]; then
# Not a conda env. So create a new VENV dir
VENV_DIR=${VENV_DIR:-shark.venv}
echo "Using pip venv.. Setting up venv dir: $VENV_DIR"
$PYTHON -m venv "$VENV_DIR" || die "Could not create venv."
source "$VENV_DIR/bin/activate" || die "Could not activate venv"
PYTHON="$(which python3)"
else
echo "Found conda env $CONDA_DEFAULT_ENV. Running pip install inside the conda env"
fi
Red=`tput setaf 1`
Green=`tput setaf 2`
Yellow=`tput setaf 3`
# Assume no binary torch-mlir.
# Currently available for macOS m1&intel (3.10) and Linux(3.7,3.8,3.9,3.10)
torch_mlir_bin=false
if [[ $(uname -s) = 'Darwin' ]]; then
echo "${Yellow}Apple macOS detected"
if [[ $(uname -m) == 'arm64' ]]; then
echo "${Yellow}Apple M1 Detected"
hash rustc 2>/dev/null
if [ $? -eq 0 ];then
echo "${Green}rustc found to compile HF tokenizers"
else
echo "${Red}Could not find rustc" >&2
echo "${Red}Please run:"
echo "${Red}curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh"
exit 1
fi
fi
echo "${Yellow}Run the following commands to setup your SSL certs for your Python version if you see SSL errors with tests"
echo "${Yellow}/Applications/Python\ 3.XX/Install\ Certificates.command"
if [ "$PYTHON_VERSION_X_Y" == "3.10" ]; then
torch_mlir_bin=true
fi
elif [[ $(uname -s) = 'Linux' ]]; then
echo "${Yellow}Linux detected"
if [ "$PYTHON_VERSION_X_Y" == "3.7" ] || [ "$PYTHON_VERSION_X_Y" == "3.8" ] || [ "$PYTHON_VERSION_X_Y" == "3.9" ] || [ "$PYTHON_VERSION_X_Y" == "3.10" ] ; then
torch_mlir_bin=true
fi
else
echo "${Red}OS not detected. Pray and Play"
fi
# Upgrade pip and install requirements.
$PYTHON -m pip install --upgrade pip || die "Could not upgrade pip"
$PYTHON -m pip install --upgrade -r "$TD/requirements.txt"
if [ "$torch_mlir_bin" = true ]; then
$PYTHON -m pip install --find-links https://github.com/llvm/torch-mlir/releases torch-mlir --extra-index-url https://download.pytorch.org/whl/nightly/cpu
if [ $? -eq 0 ];then
echo "Successfully Installed torch-mlir"
else
echo "Could not install torch-mlir" >&2
fi
else
echo "${Red}No binaries found for Python $PYTHON_VERSION_X_Y on $(uname -s)"
echo "${Yello}Python 3.10 supported on macOS and 3.7,3.8,3.9 and 3.10 on Linux"
echo "${Red}Please build torch-mlir from source in your environment"
exit 1
fi
if [[ -z "${USE_IREE}" ]]; then
RUNTIME="nod-ai/SHARK-Runtime"
else
RUNTIME="google/iree"
fi
echo "Installing ${RUNTIME}..."
$PYTHON -m pip install --find-links https://github.com/${RUNTIME}/releases iree-compiler iree-runtime
if [[ ! -z "${IMPORTER}" ]]; then
echo "${Yellow}Installing importer tools.."
if [[ $(uname -s) = 'Linux' ]]; then
echo "${Yellow}Linux detected.. installing Linux importer tools"
$PYTHON -m pip install --upgrade -r "$TD/requirements-importer.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://test.pypi.org/simple/ --extra-index-url https://download.pytorch.org/whl/nightly/cu116
elif [[ $(uname -s) = 'Darwin' ]]; then
echo "${Yellow}macOS detected.. installing macOS importer tools"
# Conda seems to have some problems installing these packages; we hope they get resolved upstream.
$PYTHON -m pip install --upgrade -r "$TD/requirements-importer-macos.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
fi
fi
$PYTHON -m pip install -e . --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://github.com/llvm/torch-mlir/releases -f https://github.com/${RUNTIME}/releases
if [[ $(uname -s) = 'Linux' && ! -z "${IMPORTER}" ]]; then
$PYTHON -m pip uninstall -y torch torchvision
$PYTHON -m pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116
if [ $? -eq 0 ];then
echo "Successfully Installed torch + cu116."
else
echo "Could not install torch + cu116." >&2
fi
fi
if [[ ! -z "${ONNX}" ]]; then
echo "${Yellow}Installing ONNX and onnxruntime for benchmarks..."
$PYTHON -m pip install onnx onnxruntime psutil
if [ $? -eq 0 ];then
echo "Successfully installed ONNX and ONNX runtime."
else
echo "Could not install ONNX." >&2
fi
fi
if [[ -z "${CONDA_PREFIX}" ]]; then
echo "${Green}Before running examples activate venv with:"
echo " ${Green}source $VENV_DIR/bin/activate"
fi

shark/__init__.py (new file, empty)

shark/backward_makefx.py (new file, 78 lines)

@@ -0,0 +1,78 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch._decomp import get_decompositions
from torch.fx.experimental.proxy_tensor import make_fx
from torch.nn.utils import _stateless
from torch import fx
import tempfile
class MakeFxModule:
def __init__(self, model, inputs, labels=None, custom_inference_fn=None):
self.model = model
self.inputs = inputs
self.custom_inference_fn = custom_inference_fn
self.training_graph = None
# Doesn't replace the None type.
def change_fx_graph_return_to_tuple(self, fx_g: fx.GraphModule):
for node in fx_g.graph.nodes:
if node.op == "output":
# output nodes always have one argument
node_arg = node.args[0]
out_nodes = []
if isinstance(node_arg, list):
# Don't return NoneType elements.
for out_node in node_arg:
if not isinstance(out_node, type(None)):
out_nodes.append(out_node)
# If there is a single tensor/element to be returned, don't
# create a tuple for it.
if len(out_nodes) == 1:
node.args = out_nodes
else:
node.args = (tuple(out_nodes),)
fx_g.graph.lint()
fx_g.recompile()
return fx_g
def generate_graph(self):
fx_g = make_fx(
self.custom_inference_fn,
decomposition_table=get_decompositions(
[
torch.ops.aten.embedding_dense_backward,
torch.ops.aten.native_layer_norm_backward,
torch.ops.aten.slice_backward,
torch.ops.aten.select_backward,
]
),
)(
dict(self.model.named_parameters()),
dict(self.model.named_buffers()),
self.inputs,
)
fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
fx_g.recompile()
fx_g = self.change_fx_graph_return_to_tuple(fx_g)
ts_g = torch.jit.script(fx_g)
temp = tempfile.NamedTemporaryFile(
suffix="_shark_ts", prefix="temp_ts_"
)
ts_g.save(temp.name)
new_ts = torch.jit.load(temp.name)
self.training_graph = new_ts
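A hypothetical usage sketch of this class (the model, inputs, and `inference_fn` below are illustrative, not from this repo; the only contract taken from the code above is that `custom_inference_fn` receives the named parameters, the named buffers, and the inputs):
```
import torch
from torch.nn.utils import _stateless
from shark.backward_makefx import MakeFxModule

model = torch.nn.Linear(4, 2)
inputs = torch.randn(1, 4)

def inference_fn(params, buffers, args):
    # run the model functionally against the given params/buffers
    return _stateless.functional_call(model, {**params, **buffers}, (args,))

maker = MakeFxModule(model, inputs, custom_inference_fn=inference_fn)
maker.generate_graph()
scripted = maker.training_graph  # a TorchScript module produced via make_fx
```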


@@ -0,0 +1,300 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mlevental/miniconda3/envs/torch-mlir/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"# standard imports\n",
"import torch\n",
"from shark.iree_utils import get_iree_compiled_module"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"# torch dynamo related imports\n",
"try:\n",
" import torchdynamo\n",
" from torchdynamo.optimizations.backends import create_backend\n",
" from torchdynamo.optimizations.subgraph import SubGraph\n",
"except ModuleNotFoundError:\n",
" print(\"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo\")\n",
" exit()\n",
"\n",
"# torch-mlir imports for compiling\n",
"from torch_mlir import compile, OutputType"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"[TorchDynamo](https://github.com/pytorch/torchdynamo) is a compiler for PyTorch programs that uses the [frame evaluation API](https://www.python.org/dev/peps/pep-0523/) in CPython to dynamically modify Python bytecode right before it is executed. It creates this FX Graph through bytecode analysis and is designed to mix Python execution with compiled backends."
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"def toy_example(*args):\n",
" a, b = args\n",
"\n",
" x = a / (torch.abs(a) + 1)\n",
" if b.sum() < 0:\n",
" b = b * -1\n",
" return x * b"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [],
"source": [
"# compiler that lowers fx_graph to through MLIR\n",
"def __torch_mlir(fx_graph, *args, **kwargs):\n",
" assert isinstance(\n",
" fx_graph, torch.fx.GraphModule\n",
" ), \"Model must be an FX GraphModule.\"\n",
"\n",
" def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule):\n",
" \"\"\"Replace tuple with tuple element in functions that return one-element tuples.\"\"\"\n",
"\n",
" for node in fx_g.graph.nodes:\n",
" if node.op == \"output\":\n",
" assert len(node.args) == 1, \"Output node must have a single argument\"\n",
" node_arg = node.args[0]\n",
" if isinstance(node_arg, tuple) and len(node_arg) == 1:\n",
" node.args = (node_arg[0],)\n",
" fx_g.graph.lint()\n",
" fx_g.recompile()\n",
" return fx_g\n",
"\n",
" fx_graph = _unwrap_single_tuple_return(fx_graph)\n",
" ts_graph = torch.jit.script(fx_graph)\n",
"\n",
" # torchdynamo does munges the args differently depending on whether you use\n",
" # the @torchdynamo.optimize decorator or the context manager\n",
" if isinstance(args, tuple):\n",
" args = list(args)\n",
" assert isinstance(args, list)\n",
" if len(args) == 1 and isinstance(args[0], list):\n",
" args = args[0]\n",
"\n",
" linalg_module = compile(ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS)\n",
" callable, _ = get_iree_compiled_module(linalg_module, \"cuda\", func_name=\"forward\")\n",
"\n",
" def forward(*inputs):\n",
" return callable(*inputs)\n",
"\n",
" return forward"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"Simplest way to use TorchDynamo with the `torchdynamo.optimize` context manager:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 1 device(s).\n",
"Device: 0\n",
" Name: NVIDIA GeForce RTX 3080\n",
" Compute Capability: 8.6\n",
"[-0.40066046 -0.4210303 0.03225489 -0.44849953 0.10370405 -0.04422468\n",
" 0.33262825 -0.20109026 0.02102537 -0.24882983]\n",
"[-0.07824923 -0.17004533 0.06439921 -0.06163602 0.26633525 -1.1560082\n",
" -0.06660341 0.24227881 0.1462235 -0.32055548]\n",
"[-0.01464001 0.442209 -0.0607936 -0.5477967 -0.25226554 -0.08588809\n",
" -0.30497575 0.00061084 -0.50069696 0.2317973 ]\n",
"[ 0.25726247 0.39388427 -0.24093066 0.12316308 -0.01981307 0.5661146\n",
" 0.26199922 0.8123446 -0.01576749 0.30846444]\n",
"[ 0.7878203 -0.45975062 -0.29956317 -0.07032048 -0.55817443 -0.62506855\n",
" -1.6837492 -0.38442805 0.28220773 -1.5325156 ]\n",
"[ 0.07975311 0.67754704 -0.30927914 0.00347631 -0.07326564 0.01893554\n",
" -0.7518105 -0.03078967 -0.07623022 0.38865626]\n",
"[-0.7751679 -0.5841397 -0.6622711 0.18574935 -0.6049372 0.02844244\n",
" -0.20471913 0.3337415 -0.3619432 -0.35087156]\n",
"[-0.08569919 -0.10775139 -0.02338934 0.21933547 -0.46712473 0.00062137\n",
" -0.58207744 0.06457533 0.18276742 0.03866556]\n",
"[-0.2311981 -0.43036282 0.20561649 -0.10363232 -0.13248594 0.02885137\n",
" -0.31241602 -0.36907142 0.08861586 0.2331427 ]\n",
"[-0.07273526 -0.31246194 -0.24218291 -0.24145737 0.0364486 0.14382267\n",
" -0.00531162 0.15447603 -0.5220248 -0.09016377]\n"
]
}
],
"source": [
"with torchdynamo.optimize(__torch_mlir):\n",
" for _ in range(10):\n",
" print(toy_example(torch.randn(10), torch.randn(10)))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"It can also be used through a decorator:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [],
"source": [
"@create_backend\n",
"def torch_mlir(subgraph, *args, **kwargs):\n",
" assert isinstance(subgraph, SubGraph), \"Model must be a dynamo SubGraph.\"\n",
" return __torch_mlir(subgraph.model, *list(subgraph.example_inputs))\n",
"\n",
"@torchdynamo.optimize(\"torch_mlir\")\n",
"def toy_example2(*args):\n",
" a, b = args\n",
"\n",
" x = a / (torch.abs(a) + 1)\n",
" if b.sum() < 0:\n",
" b = b * -1\n",
" return x * b"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 1 device(s).\n",
"Device: 0\n",
" Name: NVIDIA GeForce RTX 3080\n",
" Compute Capability: 8.6\n",
"[-0.35494277 0.03409214 -0.02271946 0.7335942 0.03122527 -0.41881397\n",
" -0.6609761 -0.6418614 0.29336175 -0.01973678]\n",
"[-2.7246824e-01 -3.5543957e-01 6.0087401e-01 -7.4570496e-03\n",
" -4.2481605e-02 -5.0296803e-04 7.2928613e-01 -1.4673788e-03\n",
" -2.7621329e-01 -6.0995776e-02]\n",
"[-0.03165906 0.3889693 0.24052973 0.27279532 -0.02773128 -0.12602475\n",
" -1.0124422 0.5720256 -0.35437614 -0.20992722]\n",
"[-0.41831446 0.5525326 -0.29749998 -0.17044766 0.11804754 -0.05210691\n",
" -0.46145165 -0.8776549 0.10090438 0.17463352]\n",
"[ 0.02194221 0.20959911 0.26973712 0.12551276 -0.0020404 0.1490246\n",
" -0.04456685 1.1100804 0.8105744 0.6676846 ]\n",
"[ 0.06528181 -0.13591261 0.5370964 -0.4398162 -0.03372452 0.9691372\n",
" -0.01120087 0.2947028 0.4804801 -0.3324341 ]\n",
"[ 0.33549032 -0.23001772 -0.08681437 0.16490957 -0.11223086 0.09168988\n",
" 0.02403045 0.17344482 0.46406478 -0.00129451]\n",
"[-0.27475086 0.42384806 1.9090122 -0.41147137 -0.6888369 0.08435658\n",
" -0.26628923 -0.17436793 -0.8058869 -0.02582378]\n",
"[-0.10109414 0.08681287 -0.10055986 0.6858881 0.29267687 -0.02797117\n",
" -0.01425194 0.4882803 0.3551982 -0.858935 ]\n",
"[-0.22086617 0.524994 0.17721705 -0.03813264 -0.54570735 -0.4421502\n",
" 0.11938014 -0.01122053 0.39294165 -0.61770755]\n"
]
}
],
"source": [
"for _ in range(10):\n",
" print(toy_example2(torch.randn(10), torch.randn(10)))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}


@@ -0,0 +1,92 @@
import torch
from torch_mlir import compile, OutputType
from shark.iree_utils import get_iree_compiled_module
try:
import torchdynamo
from torchdynamo.optimizations.backends import create_backend
from torchdynamo.optimizations.subgraph import SubGraph
except ModuleNotFoundError:
print(
"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo"
)
exit()
NUM_ITERS = 10
def __torch_mlir(fx_graph, *args, **kwargs):
assert isinstance(
fx_graph, torch.fx.GraphModule
), "Model must be an FX GraphModule."
def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule):
"""Replace tuple with tuple element in functions that return one-element tuples."""
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, tuple) and len(node_arg) == 1:
node.args = (node_arg[0],)
fx_g.graph.lint()
fx_g.recompile()
return fx_g
fx_graph = _unwrap_single_tuple_return(fx_graph)
ts_graph = torch.jit.script(fx_graph)
if isinstance(args, tuple):
args = list(args)
assert isinstance(args, list)
if len(args) == 1 and isinstance(args[0], list):
args = args[0]
linalg_module = compile(
ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS
)
callable, _ = get_iree_compiled_module(
linalg_module, "cuda", func_name="forward"
)
def forward(*inputs):
return callable(*inputs)
return forward
def toy_example(*args):
a, b = args
x = a / (torch.abs(a) + 1)
if b.sum() < 0:
b = b * -1
return x * b
with torchdynamo.optimize(__torch_mlir):
for _ in range(NUM_ITERS):
print(toy_example(torch.randn(10), torch.randn(10)))
@create_backend
def torch_mlir(subgraph, *args, **kwargs):
assert isinstance(subgraph, SubGraph), "Model must be a dynamo SubGraph."
return __torch_mlir(subgraph.model, *list(subgraph.example_inputs))
@torchdynamo.optimize("torch_mlir")
def toy_example2(*args):
a, b = args
x = a / (torch.abs(a) + 1)
if b.sum() < 0:
b = b * -1
return x * b
for _ in range(NUM_ITERS):
print(toy_example2(torch.randn(10), torch.randn(10)))


@@ -0,0 +1,805 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mlevental/miniconda3/envs/torch-mlir/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"# standard imports\n",
"import torch\n",
"from torch_mlir.eager_mode import torch_mlir_tensor"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"# eager mode imports\n",
"from torch_mlir.eager_mode.torch_mlir_tensor import TorchMLIRTensor\n",
"from shark.iree_eager_backend import EagerModeIREELinalgOnTensorsBackend"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"The simplest way of using Eager Mode (through IREE) requires setting a \"backend\":"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend(\"cpu\")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"and wrapping all your `torch.Tensor`s:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n"
]
}
],
"source": [
"NUM_ITERS = 10\n",
"\n",
"t = torch.ones((10, 10))\n",
"u = 2 * torch.ones((10, 10))\n",
"\n",
"tt = TorchMLIRTensor(t)\n",
"print(tt)\n",
"uu = TorchMLIRTensor(u)\n",
"print(uu)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"`TorchMLIRTensor` is a \"tensor wrapper subclass\" (more info [here](https://github.com/albanD/subclass_zoo)) that keeps the IREE `DeviceArray` in a field `elem`:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n"
]
}
],
"source": [
"for i in range(NUM_ITERS):\n",
" yy = tt + uu\n",
" print(type(yy))\n",
" print(yy.elem.to_host())\n",
" yy = tt * uu\n",
" print(type(yy))\n",
" print(yy.elem.to_host())"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"If you have a GPU (and CUDA installed) that works too (you can verify by having `watch -n1 nvidia-smi` up in a terminal while running the next cell):"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n"
]
}
],
"source": [
"torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend(\"gpu\")\n",
"\n",
"t = torch.ones((10, 10))\n",
"u = 2 * torch.ones((10, 10))\n",
"\n",
"tt = TorchMLIRTensor(t)\n",
"print(tt)\n",
"uu = TorchMLIRTensor(u)\n",
"print(uu)\n",
"\n",
"yy = tt + uu\n",
"print(yy.elem.to_host())\n",
"yy = tt * uu\n",
"print(yy.elem.to_host())"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"There is a convenience class `SharkEagerMode` that will handle both the installation of the backend and the wrapping of `torch.Tensor`s:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n"
]
}
],
"source": [
"# eager mode RAII\n",
"from shark.shark_runner import SharkEagerMode\n",
"\n",
"shark_eager_mode = SharkEagerMode(\"cpu\")\n",
"\n",
"t = torch.ones((10, 10))\n",
"u = torch.ones((10, 10))\n",
"\n",
"print(t)\n",
"print(u)\n",
"\n",
"for i in range(NUM_ITERS):\n",
" yy = t + u\n",
" print(type(yy))\n",
" print(yy.elem.to_host())\n",
" yy = t * u\n",
" print(type(yy))\n",
" print(yy.elem.to_host())"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"The `SharkEagerMode` class is a hacky take on [RAII](https://en.wikipedia.org/wiki/Resource_acquisition_is_initialization) that defines a \"deleter\" that runs when an instantiation (of `SharkEagerMode`) is garbage collected. Takeaway is that if you want to turn off `SharkEagerMode`, or switch backends, you need to `del` the instance:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n"
]
}
],
"source": [
"del shark_eager_mode\n",
"shark_eager_mode = SharkEagerMode(\"cuda\")\n",
"\n",
"t = torch.ones((10, 10))\n",
"u = torch.ones((10, 10))\n",
"\n",
"print(t)\n",
"print(u)\n",
"\n",
"yy = t + u\n",
"print(type(yy))\n",
"print(yy.elem.to_host())\n",
"yy = t * u\n",
"print(type(yy))\n",
"print(yy.elem.to_host())"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@@ -0,0 +1,148 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch.utils.cpp_extension import load_inline, include_paths
from torch_mlir.eager_mode import torch_mlir_tensor
from torch_mlir.eager_mode.torch_mlir_tensor import TorchMLIRTensor
from shark.iree_eager_backend import EagerModeIREELinalgOnTensorsBackend
from shark.shark_runner import SharkEagerMode
def test_cpu():
torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend("cpu")
t = torch.ones((10, 10), device="cpu")
u = 2 * torch.ones((10, 10), device="cpu")
tt = TorchMLIRTensor(t)
print(tt)
uu = TorchMLIRTensor(u)
print(uu)
for i in range(NUM_ITERS):
yy = tt + uu
print(type(yy))
print(yy.elem.to_host())
yy = tt * uu
print(type(yy))
print(yy.elem.to_host())
def test_gpu():
source = """
#include <iostream>
#include "cuda.h"
#include "cuda_runtime_api.h"
using namespace std;
void print_free_mem() {
size_t free, total;
cudaSetDevice(0);
int id;
cudaGetDevice(&id);
cudaMemGetInfo(&free, &total);
cout << "GPU " << id << " memory: used=" << (total-free)/(1<<20) << endl;
}
"""
gpu_stats = load_inline(
name="inline_extension",
cpp_sources=[source],
extra_include_paths=include_paths(cuda=True),
functions=["print_free_mem"],
)
torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend("gpu")
t = torch.ones((10, 10), device="cpu")
u = 2 * torch.ones((10, 10), device="cpu")
tt = TorchMLIRTensor(t)
print(tt)
uu = TorchMLIRTensor(u)
print(uu)
for i in range(NUM_ITERS):
yy = tt + uu
print(yy.elem.to_host())
yy = tt * uu
print(yy.elem.to_host())
gpu_stats.print_free_mem()
def test_python_mode_ref_backend():
# Keep the instance alive; eager mode stays installed until it is garbage collected.
_ = SharkEagerMode("refbackend")
t = torch.ones((10, 10), device="cpu")
u = torch.ones((10, 10), device="cpu")
print(t)
print(u)
for i in range(NUM_ITERS):
print(i)
yy = t + u
print(yy.elem)
yy = t * u
print(yy.elem)
def test_python_mode_iree_cpu():
# Keep the instance alive; eager mode stays installed until it is garbage collected.
_ = SharkEagerMode("cpu")
t = torch.ones((10, 10), device="cpu")
u = torch.ones((10, 10), device="cpu")
print(t)
print(u)
for i in range(NUM_ITERS):
yy = t + u
print(type(yy))
print(yy.elem.to_host())
yy = t * u
print(type(yy))
print(yy.elem.to_host())
def test_python_mode_iree_gpu():
_ = SharkEagerMode("gpu")
t = torch.ones((10, 10), device="cpu")
u = torch.ones((10, 10), device="cpu")
print(t)
print(u)
for i in range(NUM_ITERS):
yy = t + u
print(type(yy))
print(yy.elem.to_host())
yy = t * u
print(type(yy))
print(yy.elem.to_host())
if __name__ == "__main__":
NUM_ITERS = 10
test_cpu()
if torch.cuda.is_available():
test_gpu()
test_python_mode_ref_backend()
test_python_mode_iree_cpu()
test_python_mode_iree_gpu()

View File

@@ -0,0 +1,65 @@
from PIL import Image
import requests
from transformers import CLIPProcessor, TFCLIPModel
import tensorflow as tf
from shark.shark_inference import SharkInference
# Create a set of inputs
clip_vit_inputs = [
tf.TensorSpec(shape=[2, 7], dtype=tf.int32),
tf.TensorSpec(shape=[2, 7], dtype=tf.int32),
tf.TensorSpec(shape=[1, 3, 224, 224], dtype=tf.float32),
]
class CLIPModule(tf.Module):
def __init__(self):
super(CLIPModule, self).__init__()
self.m = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
self.m.predict = lambda x, y, z: self.m(
input_ids=x, attention_mask=y, pixel_values=z
)
@tf.function(input_signature=clip_vit_inputs)
def forward(self, input_ids, attention_mask, pixel_values):
return self.m.predict(
input_ids, attention_mask, pixel_values
).logits_per_image
if __name__ == "__main__":
# Prepping Data
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(
text=["a photo of a cat", "a photo of a dog"],
images=image,
return_tensors="tf",
padding=True,
)
shark_module = SharkInference(
CLIPModule(),
(
inputs["input_ids"],
inputs["attention_mask"],
inputs["pixel_values"],
),
)
shark_module.set_frontend("tensorflow")
shark_module.compile()
print(
shark_module.forward(
(
inputs["input_ids"],
inputs["attention_mask"],
inputs["pixel_values"],
)
)
)

View File

@@ -0,0 +1,88 @@
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
from shark.shark_inference import SharkInference
from shark.shark_importer import SharkImporter
from iree.compiler import compile_str
from iree import runtime as ireert
import os
import numpy as np
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
class AlbertModule(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = AutoModelForMaskedLM.from_pretrained("albert-base-v2")
self.model.eval()
def forward(self, input_ids, attention_mask):
return self.model(
input_ids=input_ids, attention_mask=attention_mask
).logits
if __name__ == "__main__":
# Prepping Data
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
text = "This [MASK] is very tasty."
encoded_inputs = tokenizer(
text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
return_tensors="pt",
)
inputs = (encoded_inputs["input_ids"], encoded_inputs["attention_mask"])
mlir_importer = SharkImporter(
AlbertModule(),
inputs,
frontend="torch",
)
minilm_mlir, func_name = mlir_importer.import_mlir(
is_dynamic=False, tracing_required=True
)
shark_module = SharkInference(
minilm_mlir, func_name, mlir_dialect="linalg"
)
shark_module.compile()
token_logits = torch.tensor(shark_module.forward(inputs))
mask_id = torch.where(
encoded_inputs["input_ids"] == tokenizer.mask_token_id
)[1]
mask_token_logits = token_logits[0, mask_id, :]
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
for token in top_5_tokens:
print(
f"'>>> Sample/Warmup output: {text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
)
while True:
try:
new_text = input("Give me a sentence with [MASK] to fill: ")
encoded_inputs = tokenizer(
new_text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
return_tensors="pt",
)
inputs = (
encoded_inputs["input_ids"],
encoded_inputs["attention_mask"],
)
token_logits = torch.tensor(shark_module.forward(inputs))
mask_id = torch.where(
encoded_inputs["input_ids"] == tokenizer.mask_token_id
)[1]
mask_token_logits = token_logits[0, mask_id, :]
top_5_tokens = (
torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
)
for token in top_5_tokens:
print(
f"'>>> {new_text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
)
except KeyboardInterrupt:
print("Exiting program.")
break

View File

@@ -0,0 +1,100 @@
from transformers import TFAutoModelForMaskedLM, AutoTokenizer
import tensorflow as tf
from shark.shark_inference import SharkInference
from shark.shark_importer import SharkImporter
from iree.compiler import tf as tfc
from iree.compiler import compile_str
from iree import runtime as ireert
import os
import numpy as np
import sys
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of inputs
t5_inputs = [
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
]
class AlbertModule(tf.Module):
def __init__(self):
super(AlbertModule, self).__init__()
self.m = TFAutoModelForMaskedLM.from_pretrained("albert-base-v2")
self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)
@tf.function(input_signature=t5_inputs)
def forward(self, input_ids, attention_mask):
return self.m.predict(input_ids, attention_mask)
if __name__ == "__main__":
# Prepping Data
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
# text = "This is a great [MASK]."
text = "This [MASK] is very tasty."
encoded_inputs = tokenizer(
text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
return_tensors="tf",
)
inputs = (encoded_inputs["input_ids"], encoded_inputs["attention_mask"])
mlir_importer = SharkImporter(
AlbertModule(),
inputs,
frontend="tf",
)
minilm_mlir, func_name = mlir_importer.import_mlir(
is_dynamic=False, tracing_required=False
)
shark_module = SharkInference(minilm_mlir, func_name, mlir_dialect="mhlo")
shark_module.compile()
output_idx = 0
data_idx = 1
token_logits = shark_module.forward(inputs)[output_idx][data_idx]
mask_id = np.where(
tf.squeeze(encoded_inputs["input_ids"]) == tokenizer.mask_token_id
)
mask_token_logits = token_logits[0, mask_id, :]
top_5_tokens = np.flip(np.argsort(mask_token_logits)).squeeze()[0:5]
for token in top_5_tokens:
print(
f"'>>> Sample/Warmup output: {text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
)
while True:
try:
new_text = input("Give me a sentence with [MASK] to fill: ")
encoded_inputs = tokenizer(
new_text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
return_tensors="tf",
)
inputs = (
encoded_inputs["input_ids"],
encoded_inputs["attention_mask"],
)
token_logits = shark_module.forward(inputs)[output_idx][data_idx]
mask_id = np.where(
tf.squeeze(encoded_inputs["input_ids"])
== tokenizer.mask_token_id
)
mask_token_logits = token_logits[0, mask_id, :]
top_5_tokens = np.flip(np.argsort(mask_token_logits)).squeeze()[
0:5
]
for token in top_5_tokens:
print(
f"'>>> {new_text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
)
except KeyboardInterrupt:
print("Exiting program.")
sys.exit()

View File

@@ -0,0 +1,40 @@
from transformers import GPT2Tokenizer, TFGPT2Model
import tensorflow as tf
from shark.shark_inference import SharkInference
# Create a set of inputs
gpt2_inputs = [
tf.TensorSpec(shape=[1, 8], dtype=tf.int32),
tf.TensorSpec(shape=[1, 8], dtype=tf.int32),
]
class GPT2Module(tf.Module):
def __init__(self):
super(GPT2Module, self).__init__()
self.m = TFGPT2Model.from_pretrained("distilgpt2")
self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)
@tf.function(input_signature=gpt2_inputs)
def forward(self, input_ids, attention_mask):
return self.m.predict(input_ids, attention_mask)
if __name__ == "__main__":
# Prepping Data
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
text = "I love the distilled version of models."
inputs = tokenizer(text, return_tensors="tf")
shark_module = SharkInference(
GPT2Module(), (inputs["input_ids"], inputs["attention_mask"])
)
shark_module.set_frontend("tensorflow")
shark_module.compile()
print(
shark_module.forward((inputs["input_ids"], inputs["attention_mask"]))
)

View File

@@ -0,0 +1,37 @@
from shark.shark_inference import SharkInference
import numpy as np
mhlo_ir = r"""builtin.module {
func.func @forward(%arg0: tensor<1x4xf32>, %arg1: tensor<4x1xf32>) -> tensor<4x4xf32> {
%0 = chlo.broadcast_add %arg0, %arg1 : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor<4x4xf32>
%1 = "mhlo.abs"(%0) : (tensor<4x4xf32>) -> tensor<4x4xf32>
return %1 : tensor<4x4xf32>
}
}"""
arg0 = np.ones((1, 4)).astype(np.float32)
arg1 = np.ones((4, 1)).astype(np.float32)
print("Running shark on cpu backend")
shark_module = SharkInference(
mhlo_ir, function_name="forward", device="cpu", mlir_dialect="mhlo"
)
# Generate the random inputs and feed into the graph.
x = shark_module.generate_random_inputs()
shark_module.compile()
print(shark_module.forward(x))
print("Running shark on cuda backend")
shark_module = SharkInference(
mhlo_ir, function_name="forward", device="cuda", mlir_dialect="mhlo"
)
shark_module.compile()
print(shark_module.forward(x))
print("Running shark on vulkan backend")
shark_module = SharkInference(
mhlo_ir, function_name="forward", device="vulkan", mlir_dialect="mhlo"
)
shark_module.compile()
print(shark_module.forward(x))

View File

@@ -0,0 +1,35 @@
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from shark.shark_inference import SharkInference
torch.manual_seed(0)
tokenizer = AutoTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
class MiniLMSequenceClassification(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = AutoModelForSequenceClassification.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased", # The pretrained model.
num_labels=2, # The number of output labels--2 for binary classification.
output_attentions=False, # Whether the model returns attentions weights.
output_hidden_states=False, # Whether the model returns all hidden-states.
torchscript=True,
)
def forward(self, tokens):
return self.model.forward(tokens)[0]
test_input = torch.randint(2, (1, 128))
shark_module = SharkInference(
MiniLMSequenceClassification(),
(test_input,),
jit_trace=True,
benchmark_mode=True,
)
shark_module.compile()
shark_module.forward((test_input,))
shark_module.benchmark_all((test_input,))

View File

@@ -0,0 +1,61 @@
import tensorflow as tf
from transformers import BertModel, BertTokenizer, TFBertModel
from shark.shark_inference import SharkInference
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
# Create a BERT trainer with the created network.
self.m = TFBertModel.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased", from_pt=True
)
# Invoke the trainer model on the inputs. This causes the layer to be built.
self.m.predict = lambda x, y, z: self.m.call(
input_ids=x, attention_mask=y, token_type_ids=z, training=False
)
@tf.function(input_signature=bert_input)
def forward(self, input_ids, attention_mask, token_type_ids):
return self.m.predict(input_ids, attention_mask, token_type_ids)
if __name__ == "__main__":
# Prepping Data
tokenizer = BertTokenizer.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased"
)
text = "Replace me by any text you'd like."
encoded_input = tokenizer(
text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
)
for key in encoded_input:
encoded_input[key] = tf.expand_dims(
tf.convert_to_tensor(encoded_input[key]), 0
)
test_input = (
encoded_input["input_ids"],
encoded_input["attention_mask"],
encoded_input["token_type_ids"],
)
shark_module = SharkInference(
BertModule(), test_input, benchmark_mode=True
)
shark_module.set_frontend("tensorflow")
shark_module.compile()
shark_module.benchmark_all(test_input)

View File

@@ -0,0 +1,24 @@
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_torch_model
mlir_model, func_name, inputs, golden_out = download_torch_model(
"microsoft/MiniLM-L12-H384-uncased"
)
shark_module = SharkInference(
mlir_model, func_name, device="cpu", mlir_dialect="linalg"
)
shark_module.compile()
result = shark_module.forward(inputs)
print("The obtained result via shark is: ", result)
print("The golden result is:", golden_out)
# Let's generate random inputs, currently supported
# for static models.
rand_inputs = shark_module.generate_random_inputs()
rand_results = shark_module.forward(rand_inputs)
print("Running shark_module with random_inputs is: ", rand_results)

View File

@@ -0,0 +1,70 @@
import tensorflow as tf
from transformers import BertModel, BertTokenizer, TFBertModel
from shark.shark_inference import SharkInference
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
# Create a BERT trainer with the created network.
self.m = TFBertModel.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased", from_pt=True
)
# Invoke the trainer model on the inputs. This causes the layer to be built.
self.m.predict = lambda x, y, z: self.m.call(
input_ids=x, attention_mask=y, token_type_ids=z, training=False
)
@tf.function(input_signature=bert_input)
def forward(self, input_ids, attention_mask, token_type_ids):
return self.m.predict(input_ids, attention_mask, token_type_ids)
if __name__ == "__main__":
# Prepping Data
tokenizer = BertTokenizer.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased"
)
text = "Replace me by any text you'd like."
encoded_input = tokenizer(
text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
)
for key in encoded_input:
encoded_input[key] = tf.expand_dims(
tf.convert_to_tensor(encoded_input[key]), 0
)
shark_module = SharkInference(
BertModule(),
(
encoded_input["input_ids"],
encoded_input["attention_mask"],
encoded_input["token_type_ids"],
),
)
shark_module.set_frontend("tensorflow")
shark_module.compile()
print(
shark_module.forward(
(
encoded_input["input_ids"],
encoded_input["attention_mask"],
encoded_input["token_type_ids"],
)
)
)

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,39 @@
import torch
import torchvision.models as models
from shark.shark_inference import SharkInference
from shark.shark_importer import SharkImporter
torch.hub.list("zhanghang1989/ResNeSt", force_reload=True)
class ResnestModule(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = torch.hub.load(
"zhanghang1989/ResNeSt", "resnest50", pretrained=True
)
self.model.eval()
def forward(self, input):
return self.model.forward(input)
input = torch.randn(1, 3, 224, 224)
mlir_importer = SharkImporter(
ResnestModule(),
(input,),
frontend="torch",
)
(vision_mlir, func_name), inputs, golden_out = mlir_importer.import_debug(
tracing_required=True
)
print(golden_out)
shark_module = SharkInference(vision_mlir, func_name, mlir_dialect="linalg")
shark_module.compile()
result = shark_module.forward((input,))
print("Obtained result", result)

View File

@@ -0,0 +1,81 @@
from PIL import Image
import requests
import torch
import torchvision.models as models
from torchvision import transforms
import sys
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_torch_model
################################## Preprocessing inputs and model ############
def load_and_preprocess_image(url: str):
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
}
img = Image.open(
requests.get(url, headers=headers, stream=True).raw
).convert("RGB")
# preprocessing pipeline
preprocess = transforms.Compose(
[
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
),
]
)
img_preprocessed = preprocess(img)
return torch.unsqueeze(img_preprocessed, 0)
def load_labels():
classes_text = requests.get(
"https://raw.githubusercontent.com/cathyzhyi/ml-data/main/imagenet-classes.txt",
stream=True,
).text
labels = [line.strip() for line in classes_text.splitlines()]
return labels
def top3_possibilities(res):
_, indexes = torch.sort(res, descending=True)
percentage = torch.nn.functional.softmax(res, dim=1)[0] * 100
top3 = [(labels[idx], percentage[idx].item()) for idx in indexes[0][:3]]
return top3
class Resnet50Module(torch.nn.Module):
def __init__(self):
super().__init__()
self.resnet = models.resnet50(pretrained=True)
self.train(False)
def forward(self, img):
return self.resnet.forward(img)
image_url = "https://upload.wikimedia.org/wikipedia/commons/2/26/YellowLabradorLooking_new.jpg"
print("load image from " + image_url, file=sys.stderr)
img = load_and_preprocess_image(image_url)
labels = load_labels()
##############################################################################
## Can pass any img or input to the forward module.
mlir_model, func_name, inputs, golden_out = download_torch_model("resnet50")
shark_module = SharkInference(mlir_model, func_name, mlir_dialect="linalg")
shark_module.compile()
result = shark_module.forward((img.detach().numpy(),))
print("The top 3 results obtained via shark_runner is:")
print(top3_possibilities(torch.from_numpy(result)))
print()
print("The top 3 results obtained via torch is:")
print(top3_possibilities(Resnet50Module()(img)))

View File

@@ -0,0 +1,35 @@
from transformers import T5Tokenizer, TFT5Model
import tensorflow as tf
from shark.shark_inference import SharkInference
# Create a set of inputs
t5_inputs = [
tf.TensorSpec(shape=[1, 10], dtype=tf.int32),
tf.TensorSpec(shape=[1, 10], dtype=tf.int32),
]
class T5Module(tf.Module):
def __init__(self):
super(T5Module, self).__init__()
self.m = TFT5Model.from_pretrained("t5-small")
self.m.predict = lambda x, y: self.m(input_ids=x, decoder_input_ids=y)
@tf.function(input_signature=t5_inputs)
def forward(self, input_ids, decoder_input_ids):
return self.m.predict(input_ids, decoder_input_ids)
if __name__ == "__main__":
# Prepping Data
tokenizer = T5Tokenizer.from_pretrained("t5-small")
text = "I love the distilled version of models."
inputs = tokenizer(text, return_tensors="tf").input_ids
shark_module = SharkInference(T5Module(), (inputs, inputs))
shark_module.set_frontend("tensorflow")
shark_module.compile()
print(shark_module.forward((inputs, inputs)))

View File

@@ -0,0 +1,43 @@
import torch
import torchvision.models as models
from shark.shark_inference import SharkInference
class VisionModule(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
self.train(False)
def forward(self, input):
return self.model.forward(input)
input = torch.randn(1, 3, 224, 224)
## The vision models present here: https://pytorch.org/vision/stable/models.html
vision_models_list = [
models.resnet18(pretrained=True),
models.alexnet(pretrained=True),
models.vgg16(pretrained=True),
models.squeezenet1_0(pretrained=True),
models.densenet161(pretrained=True),
models.inception_v3(pretrained=True),
models.shufflenet_v2_x1_0(pretrained=True),
models.mobilenet_v2(pretrained=True),
models.mobilenet_v3_small(pretrained=True),
models.resnext50_32x4d(pretrained=True),
models.wide_resnet50_2(pretrained=True),
models.mnasnet1_0(pretrained=True),
models.efficientnet_b0(pretrained=True),
models.regnet_y_400mf(pretrained=True),
models.regnet_x_400mf(pretrained=True),
]
for i, vision_model in enumerate(vision_models_list):
shark_module = SharkInference(
VisionModule(vision_model),
(input,),
)
shark_module.compile()
shark_module.forward((input,))

View File

@@ -0,0 +1,39 @@
import torch
import numpy as np
from shark.shark_inference import SharkInference
from shark.shark_importer import SharkImporter
class UnetModule(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = torch.hub.load(
"mateuszbuda/brain-segmentation-pytorch",
"unet",
in_channels=3,
out_channels=1,
init_features=32,
pretrained=True,
)
self.model.eval()
def forward(self, input):
return self.model(input)
input = torch.randn(1, 3, 224, 224)
mlir_importer = SharkImporter(
UnetModule(),
(input,),
frontend="torch",
)
(vision_mlir, func_name), inputs, golden_out = mlir_importer.import_debug(
tracing_required=False
)
shark_module = SharkInference(vision_mlir, func_name, mlir_dialect="linalg")
shark_module.compile()
result = shark_module.forward((input,))
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)

View File

@@ -0,0 +1,13 @@
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_torch_model
mlir_model, func_name, inputs, golden_out = download_torch_model("v_diffusion")
shark_module = SharkInference(
mlir_model, func_name, device="vulkan", mlir_dialect="linalg"
)
shark_module.compile()
result = shark_module.forward(inputs)
print("The obtained result via shark is: ", result)
print("The golden result is:", golden_out)

View File

@@ -0,0 +1,47 @@
import torch
from torch.nn.utils import _stateless
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from shark.shark_runner import SharkTrainer
class MiniLMSequenceClassification(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = AutoModelForSequenceClassification.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased", # The pretrained model.
num_labels=2, # The number of output labels--2 for binary classification.
output_attentions=False, # Whether the model returns attentions weights.
output_hidden_states=False, # Whether the model returns all hidden-states.
torchscript=True,
)
def forward(self, tokens):
return self.model.forward(tokens)[0]
mod = MiniLMSequenceClassification()
def get_sorted_params(named_params):
return [i[1] for i in sorted(named_params.items())]
print(dict(mod.named_buffers()))
inp = (torch.randint(2, (1, 128)),)
def forward(params, buffers, args):
params_and_buffers = {**params, **buffers}
_stateless.functional_call(
mod, params_and_buffers, args, {}
).sum().backward()
optim = torch.optim.SGD(get_sorted_params(params), lr=0.01)
# optim.load_state_dict(optim_state)
optim.step()
return params, buffers
shark_module = SharkTrainer(mod, inp, custom_inference_fn=forward)
print(shark_module.forward())

View File

@@ -0,0 +1,60 @@
import numpy as np
import os
import time
import tensorflow as tf
from shark.shark_trainer import SharkTrainer
from shark.parser import parser
from urllib import request
parser.add_argument(
"--download_mlir_path",
type=str,
default="bert_tf_training.mlir",
help="Specifies path to target mlir file that will be loaded.",
)
load_args, unknown = parser.parse_known_args()
tf.random.set_seed(0)
vocab_size = 100
NUM_CLASSES = 5
SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Download BERT model from tank and train.
if __name__ == "__main__":
predict_sample_input = [
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
]
file_link = "https://storage.googleapis.com/shark_tank/users/stanley/bert_tf_training.mlir"
response = request.urlretrieve(file_link, load_args.download_mlir_path)
sample_input_tensors = [
tf.convert_to_tensor(val, dtype=tf.int32)
for val in predict_sample_input
]
num_iter = 10
if not os.path.isfile(load_args.download_mlir_path):
raise ValueError(
f"Tried looking for target mlir in {load_args.download_mlir_path}, but cannot be found."
)
with open(load_args.download_mlir_path, "rb") as input_file:
bert_mlir = input_file.read()
shark_module = SharkTrainer(
bert_mlir,
(
sample_input_tensors,
tf.convert_to_tensor(
np.random.randint(5, size=(BATCH_SIZE)), dtype=tf.int32
),
),
)
shark_module.set_frontend("mhlo")
shark_module.compile()
start = time.time()
print(shark_module.train(num_iter))
end = time.time()
total_time = end - start
print("time: " + str(total_time))
print("time/iter: " + str(total_time / num_iter))

View File

@@ -0,0 +1,97 @@
from absl import app
import time
import numpy as np
import tensorflow as tf
from official.nlp.modeling import layers
from official.nlp.modeling import networks
from official.nlp.modeling.models import bert_classifier
from shark.shark_trainer import SharkTrainer
tf.random.set_seed(0)
vocab_size = 100
NUM_CLASSES = 5
SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
dict_outputs = False
test_network = networks.BertEncoder(
vocab_size=vocab_size, num_layers=2, dict_outputs=dict_outputs
)
# Create a BERT trainer with the created network.
bert_trainer_model = bert_classifier.BertClassifier(
test_network, num_classes=NUM_CLASSES
)
bert_trainer_model.summary()
# Invoke the trainer model on the inputs. This causes the layer to be built.
self.m = bert_trainer_model
self.m.predict = lambda x: self.m.call(x, training=False)
self.predict = tf.function(input_signature=[bert_input])(
self.m.predict
)
self.m.learn = lambda x, y: self.m.call(x, training=False)
self.loss = tf.keras.losses.SparseCategoricalCrossentropy()
self.optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)
@tf.function(
input_signature=[
bert_input, # inputs
tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32), # labels
]
)
def forward(self, inputs, labels):
with tf.GradientTape() as tape:
# Capture the gradients from forward prop...
probs = self.m(inputs, training=True)
loss = self.loss(labels, probs)
# ...and use them to update the model's weights.
variables = self.m.trainable_variables
gradients = tape.gradient(loss, variables)
self.optimizer.apply_gradients(zip(gradients, variables))
return loss
if __name__ == "__main__":
predict_sample_input = [
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
]
sample_input_tensors = [
tf.convert_to_tensor(val, dtype=tf.int32)
for val in predict_sample_input
]
num_iter = 10
shark_module = SharkTrainer(
BertModule(),
(
sample_input_tensors,
tf.convert_to_tensor(
np.random.randint(5, size=(BATCH_SIZE)), dtype=tf.int32
),
),
)
shark_module.set_frontend("tensorflow")
shark_module.compile()
start = time.time()
print(shark_module.train(num_iter))
end = time.time()
total_time = end - start
print("time: " + str(total_time))
print("time/iter: " + str(total_time / num_iter))

View File

@@ -0,0 +1,44 @@
import torch
from torch.nn.utils import _stateless
from shark.shark_trainer import SharkTrainer
class Foo(torch.nn.Module):
def __init__(self):
super(Foo, self).__init__()
self.l1 = torch.nn.Linear(10, 16)
self.relu = torch.nn.ReLU()
self.l2 = torch.nn.Linear(16, 2)
def forward(self, x):
out = self.l1(x)
out = self.relu(out)
out = self.l2(out)
return out
mod = Foo()
inp = (torch.randn(10, 10),)
def get_sorted_params(named_params):
return [i[1] for i in sorted(named_params.items())]
def forward(params, buffers, args):
params_and_buffers = {**params, **buffers}
_stateless.functional_call(
mod, params_and_buffers, args, {}
).sum().backward()
optim = torch.optim.SGD(get_sorted_params(params), lr=0.01)
optim.step()
return params, buffers
# fx_graph = forward(dict(mod.named_parameters()), dict(mod.named_buffers()), inp)
shark_module = SharkTrainer(mod, inp)
# Pass the training function in case of torch
shark_module.compile(training_fn=forward)
shark_module.train(num_iters=10)

View File

@@ -0,0 +1,88 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Any
import iree
import iree.runtime as ireert
import numpy as np
import torch
from iree.runtime import DeviceArray
from torch_mlir._mlir_libs._mlir.ir import Module
from torch_mlir.compiler_utils import (
get_module_name_for_debug_dump,
run_pipeline_with_repro_report,
)
from torch_mlir.eager_mode.torch_mlir_eager_backend import (
TorchMLIREagerBackend,
TensorMetaData,
)
from torch_mlir_e2e_test.eager_backends.refbackend import (
NUMPY_TO_TORCH_DTYPE_DICT,
)
from shark.iree_utils.compile_utils import (
get_iree_compiled_module,
IREE_DEVICE_MAP,
)
class EagerModeIREELinalgOnTensorsBackend(TorchMLIREagerBackend):
"""Main entry-point for the iree backend for torch-mlir eager mode.
EagerModeIREELinalgOnTensorsBackend uses iree.DeviceArray representations of tensors and
thus all of the wrapping and unwrapping and munging here is done to between torch.Tensor and iree.DeviceArray,
with np.ndarray as an intermediary.
"""
def __init__(self, device: str):
self.torch_device_str = device
self.config = ireert.Config(IREE_DEVICE_MAP[device])
self.raw_device_str = device
def get_torch_metadata(
self, tensor: DeviceArray, kwargs: Dict[str, Any]
) -> TensorMetaData:
return TensorMetaData(
size=tensor.shape,
dtype=NUMPY_TO_TORCH_DTYPE_DICT[tensor.dtype.type],
device=torch.device(self.torch_device_str),
requires_grad=tensor.dtype.type
in {np.float16, np.float32, np.float64}
and kwargs.get("requires_grad", False),
)
def compile(self, imported_module: Module):
fn_name = get_module_name_for_debug_dump(imported_module)
run_pipeline_with_repro_report(
imported_module,
"torch-function-to-torch-backend-pipeline,torch-backend-to-linalg-on-tensors-backend-pipeline",
"EagerMode",
)
callable, _ = get_iree_compiled_module(
imported_module, self.raw_device_str, func_name=fn_name
)
return callable
def copy_into(self, dst, src):
"""Copy output back to appropriate arg that it should alias."""
np.copyto(dst, src)
def transfer_from_device_to_torch(self, e):
return torch.from_numpy(e.to_host())
def transfer_from_torch_to_device(
self, tensor: torch.Tensor
) -> DeviceArray:
return iree.runtime.asdevicearray(self.config.device, tensor.numpy())
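# A minimal usage sketch (not part of this module): install this backend for
# torch-mlir eager mode, as the notebook earlier in this diff does.
#
#   import torch
#   from torch_mlir.eager_mode import torch_mlir_tensor
#   from torch_mlir.eager_mode.torch_mlir_tensor import TorchMLIRTensor
#
#   torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend("cpu")
#   tt = TorchMLIRTensor(torch.ones(10, 10))
#   print((tt + tt).elem.to_host())  # 10x10 matrix of 2s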

View File

View File

@@ -0,0 +1,95 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
## Common utilities to be shared by iree utilities.
import os
import sys
import subprocess
def run_cmd(cmd):
"""
Inputs: cli command string.
"""
try:
result = subprocess.run(
cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True,
)
result_str = result.stdout.decode()
return result_str
except Exception:
sys.exit("Exiting program due to error running:", cmd)
IREE_DEVICE_MAP = {
"cpu": "local-task",
"gpu": "cuda",
"cuda": "cuda",
"vulkan": "vulkan",
"metal": "vulkan",
"rocm": "rocm",
"intel-gpu": "level_zero",
}
IREE_TARGET_MAP = {
"cpu": "llvm-cpu",
"gpu": "cuda",
"cuda": "cuda",
"vulkan": "vulkan",
"metal": "vulkan",
"rocm": "rocm",
"intel-gpu": "opencl-spirv",
}
# Reports whether the required drivers for the given device are missing.
def check_device_drivers(device):
"""Returns True if the necessary drivers for the device are missing, False if present."""
if device in ["gpu", "cuda"]:
try:
subprocess.check_output("nvidia-smi")
except Exception:
return True
elif device in ["metal", "vulkan"]:
try:
subprocess.check_output("vulkaninfo")
except Exception:
return True
elif device in ["intel-gpu"]:
try:
subprocess.check_output(["dpkg", "-L", "intel-level-zero-gpu"])
return False
except Exception:
return True
elif device == "cpu":
return False
# Unknown device.
else:
return True
return False
# Installation info for the missing device drivers.
def device_driver_info(device):
if device in ["gpu", "cuda"]:
return "nvidia-smi not found, please install the required drivers from https://www.nvidia.in/Download/index.aspx?lang=en-in"
elif device in ["metal", "vulkan"]:
return "vulkaninfo not found, Install from https://vulkan.lunarg.com/sdk/home or your distribution"
else:
return f"{device} is not supported."

View File

@@ -0,0 +1,97 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import iree.runtime.scripts.iree_benchmark_module as benchmark_module
from shark.iree_utils._common import run_cmd, IREE_DEVICE_MAP
import numpy as np
import os
import re
UNIT_TO_SECOND_MAP = {"ms": 0.001, "s": 1}
def tensor_to_type_str(input_tensors: tuple, mlir_dialect: str):
"""
Input: a tuple of input tensors, i.e. tuple(torch.Tensor)
Output: list of strings representing MLIR types (e.g. "1x24xf64")
# TODO: Support more than floats and ints
"""
list_of_type = []
for input_tensor in input_tensors:
type_string = "x".join([str(dim) for dim in input_tensor.shape])
if mlir_dialect in ["linalg", "tosa"]:
dtype_string = str(input_tensor.dtype).replace("torch.", "")
elif mlir_dialect in ["mhlo", "tflite"]:
dtype = input_tensor.dtype
try:
dtype_string = re.findall("'[^\"]*'", str(dtype))[0].replace(
"'", ""
)
except IndexError:
dtype_string = str(dtype)
regex_split = re.compile("([a-zA-Z]+)([0-9]+)")
match = regex_split.match(dtype_string)
mlir_type_string = str(match.group(1)[0]) + str(match.group(2))
type_string += f"x{mlir_type_string}"
list_of_type.append(type_string)
return list_of_type
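# A hedged example of the mapping above: with the linalg dialect, a (1, 24)
# float64 torch tensor becomes "1x24xf64" ("float64" is regex-split into "f" + "64").
#
#   import torch
#   tensor_to_type_str((torch.ones(1, 24, dtype=torch.float64),), "linalg")
#   # -> ["1x24xf64"]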
def build_benchmark_args(
input_file: str,
device: str,
input_tensors: tuple,
mlir_dialect: str,
training=False,
):
"""
Inputs: input_file pointing to the compiled .vmfb, the target device,
the function's input tensors, and whether the module is for training.
Output: list of command-line arguments that run iree-benchmark-module on the target model.
"""
path = benchmark_module.__path__[0]
benchmarker_path = os.path.join(path, "..", "..", "iree-benchmark-module")
benchmark_cl = [benchmarker_path, f"--module_file={input_file}"]
# TODO: The function named can be passed as one of the args.
fn_name = "forward"
if training:
# TODO: Replace name of train with actual train fn name.
fn_name = "train"
benchmark_cl.append(f"--entry_function={fn_name}")
benchmark_cl.append(f"--device={IREE_DEVICE_MAP[device]}")
mlir_input_types = tensor_to_type_str(input_tensors, mlir_dialect)
for mlir_input in mlir_input_types:
benchmark_cl.append(f"--function_input={mlir_input}")
time_extractor = "| awk 'END{{print $2 $3}}'"
benchmark_cl.append(time_extractor)
return benchmark_cl
def run_benchmark_module(benchmark_cl):
"""
Run the benchmark command, extract the result, and return iterations per second.
# TODO: Add an example of the benchmark command.
Input: benchmark command.
"""
benchmark_path = benchmark_cl[0]
assert os.path.exists(
benchmark_path
), "Cannot find benchmark_module, Please contact SHARK maintainer on discord."
bench_result = run_cmd(" ".join(benchmark_cl))
regex_split = re.compile("([0-9]+[.]*[0-9]*)([a-zA-Z]+)")
match = regex_split.match(bench_result)
time = float(match.group(1))
unit = match.group(2)
return 1.0 / (time * UNIT_TO_SECOND_MAP[unit])
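# A hedged usage sketch (the .vmfb path is illustrative): build the benchmark
# command for a single 1x128 f32 input and report iterations per second.
#
#   import torch
#   cl = build_benchmark_args("forward.vmfb", "cpu", (torch.ones(1, 128),), "linalg")
#   print(run_benchmark_module(cl))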

View File

@@ -0,0 +1,173 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import iree.runtime as ireert
import iree.compiler as ireec
from shark.iree_utils._common import IREE_DEVICE_MAP, IREE_TARGET_MAP
import numpy as np
import os
# Get the iree-compile arguments given device.
def get_iree_device_args(device):
if device == "cpu":
from shark.iree_utils.cpu_utils import get_iree_cpu_args
return get_iree_cpu_args()
if device in ["gpu", "cuda"]:
from shark.iree_utils.gpu_utils import get_iree_gpu_args
return get_iree_gpu_args()
if device in ["metal", "vulkan"]:
from shark.iree_utils.vulkan_utils import get_iree_vulkan_args
return get_iree_vulkan_args()
return []
# Get the iree-compiler arguments given frontend.
def get_iree_frontend_args(frontend):
if frontend in ["torch", "pytorch", "linalg"]:
return ["--iree-llvm-target-cpu-features=host"]
elif frontend in ["tensorflow", "tf", "mhlo"]:
return [
"--iree-llvm-target-cpu-features=host",
"--iree-mhlo-demote-i64-to-i32=false",
"--iree-flow-demote-i64-to-i32",
]
else:
# Frontend not found.
return []
# Common args to be used given any frontend or device.
def get_iree_common_args():
return [
"--iree-stream-resource-index-bits=64",
"--iree-vm-target-index-bits=64",
]
def compile_module_to_flatbuffer(
module, device, frontend, func_name, model_config_path
):
    # Set up compile arguments for the given frontend and device.
input_type = ""
args = get_iree_frontend_args(frontend)
args += get_iree_device_args(device)
args += get_iree_common_args()
if frontend in ["tensorflow", "tf"]:
input_type = "mhlo"
elif frontend in ["mhlo", "tosa"]:
input_type = frontend
elif frontend in ["tflite", "tflite-tosa"]:
input_type = "tosa"
# TODO: make it simpler.
# Compile according to the input type, else just try compiling.
if input_type not in ["mhlo", "tosa"]:
module = str(module)
if input_type != "":
# Currently for MHLO/TOSA.
flatbuffer_blob = ireec.compile_str(
module,
target_backends=[IREE_TARGET_MAP[device]],
extra_args=args,
input_type=input_type,
)
else:
# Currently for Torch.
flatbuffer_blob = ireec.compile_str(
str(module),
target_backends=[IREE_TARGET_MAP[device]],
extra_args=args,
)
return flatbuffer_blob
def get_iree_module(flatbuffer_blob, device, func_name):
# Returns the compiled module and the configs.
config = ireert.Config(IREE_DEVICE_MAP[device])
vm_module = ireert.VmModule.from_flatbuffer(
config.vm_instance, flatbuffer_blob
)
ctx = ireert.SystemContext(config=config)
ctx.add_vm_module(vm_module)
ModuleCompiled = ctx.modules.module[func_name]
return ModuleCompiled, config
def get_iree_compiled_module(
module,
device: str,
frontend: str = "torch",
func_name: str = "forward",
model_config_path: str = None,
):
"""Given a module returns the compiled .vmfb and configs"""
flatbuffer_blob = compile_module_to_flatbuffer(
module, device, frontend, func_name, model_config_path
)
return get_iree_module(flatbuffer_blob, device, func_name)
def export_iree_module_to_vmfb(
module,
device: str,
directory: str,
mlir_dialect: str = "linalg",
func_name: str = "forward",
model_config_path: str = None,
):
# Compiles the module given specs and saves it as .vmfb file.
flatbuffer_blob = compile_module_to_flatbuffer(
module, device, mlir_dialect, func_name, model_config_path
)
module_name = f"{mlir_dialect}_{func_name}_{device}"
filename = os.path.join(directory, module_name + ".vmfb")
print(f"Saved vmfb in {filename}.")
with open(filename, "wb") as f:
f.write(flatbuffer_blob)
return filename
def export_module_to_mlir_file(module, frontend, directory: str):
# TODO: write proper documentation.
mlir_str = module
if frontend in ["tensorflow", "tf", "mhlo", "tflite"]:
mlir_str = module.decode("utf-8")
elif frontend in ["pytorch", "torch"]:
mlir_str = module.operation.get_asm()
filename = os.path.join(directory, "model.mlir")
with open(filename, "w") as f:
f.write(mlir_str)
print(f"Saved mlir in {filename}.")
return filename
def get_results(compiled_vm, input, config, frontend="torch"):
"""Runs a .vmfb file given inputs and config and returns output."""
device_inputs = [ireert.asdevicearray(config.device, a) for a in input]
result = compiled_vm(*device_inputs)
result_tensors = []
if isinstance(result, tuple):
for val in result:
result_tensors.append(np.copy(np.asarray(val, val.dtype)))
return result_tensors
elif isinstance(result, dict):
data = list(result.items())
res = np.array(data, dtype=object)
return np.copy(res)
else:
return np.copy(np.asarray(result, dtype=result.dtype))
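# A minimal sketch of the compile-and-run flow these helpers support; the
# MLIR source and input shape below are placeholders, not a real model.
mlir_source = "..."  # an MLIR module string containing a @forward function
compiled_module, config = get_iree_compiled_module(
    mlir_source, device="cpu", frontend="torch", func_name="forward"
)
example_input = (np.zeros((1, 4), dtype=np.float32),)
outputs = get_results(compiled_module, example_input, config, frontend="torch")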

shark/iree_utils/cpu_utils.py

@@ -0,0 +1,44 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# All the iree_cpu related functionalities go here.
import subprocess
# Get the default cpu args.
def get_iree_cpu_args():
find_triple_cmd = "uname -s -m"
os_name, proc_name = (
subprocess.run(
find_triple_cmd, shell=True, stdout=subprocess.PIPE, check=True
)
.stdout.decode("utf-8")
.split()
)
if os_name == "Darwin":
find_kernel_version_cmd = "uname -r"
kernel_version = subprocess.run(
find_kernel_version_cmd,
shell=True,
stdout=subprocess.PIPE,
check=True,
).stdout.decode("utf-8")
target_triple = f"{proc_name}-apple-darwin{kernel_version}"
elif os_name == "Linux":
target_triple = f"{proc_name}-linux-gnu"
else:
error_message = f"OS Type f{os_name} not supported and triple can't be determined, open issue to dSHARK team please :)"
raise Exception(error_message)
print(f"Target triple found:{target_triple}")
return [f"-iree-llvm-target-triple={target_triple}"]

shark/iree_utils/gpu_utils.py

@@ -0,0 +1,111 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# All the iree_gpu related functionalities go here.
import iree.runtime as ireert
import ctypes
from shark.parser import shark_args
# Get the default gpu args given the architecture.
def get_iree_gpu_args():
ireert.flags.FUNCTION_INPUT_VALIDATION = False
ireert.flags.parse_flags("--cuda_allow_inline_execution")
# TODO: Give the user_interface to pass the sm_arch.
sm_arch = get_cuda_sm_cc()
    if (
        sm_arch in ["sm_70", "sm_72", "sm_75", "sm_80", "sm_84", "sm_86"]
    ) and shark_args.enable_tf32:
return [
"--iree-hal-cuda-disable-loop-nounroll-wa",
f"--iree-hal-cuda-llvm-target-arch={sm_arch}",
]
else:
return ["--iree-hal-cuda-disable-loop-nounroll-wa"]
# Some constants taken from cuda.h
CUDA_SUCCESS = 0
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
def get_cuda_sm_cc():
libnames = ("libcuda.so", "libcuda.dylib", "cuda.dll")
for libname in libnames:
try:
cuda = ctypes.CDLL(libname)
except OSError:
continue
else:
break
else:
raise OSError("could not load any of: " + " ".join(libnames))
nGpus = ctypes.c_int()
name = b" " * 100
cc_major = ctypes.c_int()
cc_minor = ctypes.c_int()
result = ctypes.c_int()
device = ctypes.c_int()
context = ctypes.c_void_p()
error_str = ctypes.c_char_p()
result = cuda.cuInit(0)
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
print(
"cuInit failed with error code %d: %s"
% (result, error_str.value.decode())
)
return 1
result = cuda.cuDeviceGetCount(ctypes.byref(nGpus))
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
print(
"cuDeviceGetCount failed with error code %d: %s"
% (result, error_str.value.decode())
)
return 1
print("Found %d device(s)." % nGpus.value)
for i in range(nGpus.value):
result = cuda.cuDeviceGet(ctypes.byref(device), i)
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
print(
"cuDeviceGet failed with error code %d: %s"
% (result, error_str.value.decode())
)
return 1
print("Device: %d" % i)
if (
cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device)
== CUDA_SUCCESS
):
print(" Name: %s" % (name.split(b"\0", 1)[0].decode()))
if (
cuda.cuDeviceComputeCapability(
ctypes.byref(cc_major), ctypes.byref(cc_minor), device
)
== CUDA_SUCCESS
):
print(
" Compute Capability: %d.%d"
% (cc_major.value, cc_minor.value)
)
sm = f"sm_{cc_major.value}{cc_minor.value}"
return sm
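# Hedged example: on an Ampere-class card with --enable_tf32 set,
# get_iree_gpu_args() is expected to return
#   ["--iree-hal-cuda-disable-loop-nounroll-wa",
#    "--iree-hal-cuda-llvm-target-arch=sm_80"]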

shark/iree_utils/vulkan_utils.py

@@ -0,0 +1,60 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# All the iree_vulkan related functionalities go here.
from shark.iree_utils._common import run_cmd
def get_vulkan_triple_flag():
    vulkan_device_cmd = "vulkaninfo | grep deviceName | awk 'END{print $NF}'"
vulkan_device = run_cmd(vulkan_device_cmd).strip()
if vulkan_device == "Ultra":
print("Found MacStudio M1 Device. Using m1-moltenvk-macos")
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
elif vulkan_device == "M2":
print("Found Apple M2 Device. Using m1-moltenvk-macos")
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
elif vulkan_device == "Max":
print("Found Apple M1 Max Device. Using m1-moltenvk-macos")
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
elif vulkan_device == "Pro":
print("Found Apple M1 Pro Device. Using m1-moltenvk-macos")
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
elif vulkan_device == "M1":
print("Found Apple M1 Device. Using m1-moltenvk-macos")
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
elif vulkan_device == "A100-SXM4-40GB":
print("Found Nvidia Device. Using ampere-rtx3080-linux")
return "-iree-vulkan-target-triple=ampere-rtx3080-linux"
elif vulkan_device == "3090":
print("Found Nvidia Device. Using ampere-rtx3090-linux")
return "-iree-vulkan-target-triple=ampere-rtx3090-linux"
else:
        print(
            """Optimized kernel for your target device is not added yet.
        Contact SHARK Admin on discord[https://discord.com/invite/RUqY2h2s9u]
        or file an issue."""
        )
print(f"Target : {vulkan_device}")
return None
def get_iree_vulkan_args():
# vulkan_flag = ["--iree-flow-demote-i64-to-i32"]
vulkan_flag = []
vulkan_triple_flag = get_vulkan_triple_flag()
if vulkan_triple_flag is not None:
vulkan_flag.append(vulkan_triple_flag)
return vulkan_flag
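# Hedged example: on an RTX 3090 host, get_iree_vulkan_args() is expected to
# return ["-iree-vulkan-target-triple=ampere-rtx3090-linux"].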

shark/model_annotation.py

@@ -0,0 +1,164 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import json
import os
from typing import List, Dict
from iree.compiler import ir
from iree.compiler.transforms import ireec as ireec_trans
MATMUL_OP_NAMES = set(
["linalg.matmul", "linalg.batch_matmul", "mhlo.dot", "mhlo.dot_general"]
)
idx = 0
def model_annotation(
ctx: ir.Context, *, input_contents: str, config_path: str
):
if os.path.isfile(input_contents):
with open(input_contents, "rb") as f:
input_contents = f.read()
module = ir.Module.parse(input_contents)
with open(config_path, "r") as f:
data = json.load(f)
configs = data["options"]
# The Python API does not expose a general walk() function, so we just
# do it ourselves.
walk_children(module.operation, configs)
if not module.operation.verify():
raise RuntimeError("Modified program does not verify!")
# More efficient than: print(module)
# - Disables verification (already done above)
# - Writes as binary, avoiding costly unicode conversions
sys.stdout.buffer.write(
module.operation.get_asm(assume_verified=True, binary=True)
)
return module
def walk_children(op: ir.Operation, configs: List[Dict]):
for region in op.regions:
for block in region.blocks:
for child_op in block.operations:
# TODO: This is dumb. Both Operation and OpView should expose
# 'operation' and 'name' attributes.
if isinstance(child_op, ir.OpView):
child_op = child_op.operation
if child_op.name in MATMUL_OP_NAMES:
global idx
(
tile_sizes,
pipeline,
workgroup_size,
split_k,
pipeline_depth,
) = parse_config(configs[idx])
add_compilation_info(
child_op,
tile_sizes=tile_sizes,
pipeline=pipeline,
workgroup_size=workgroup_size,
pipeline_depth=pipeline_depth,
)
if split_k:
add_split_k(child_op, split_k)
idx = idx + 1
print(f"Updated op {child_op}", file=sys.stderr)
walk_children(child_op, configs)
def parse_config(config: Dict):
if config["pipeline"] == "GPU" or config["pipeline"] == "GPU_TENSORCORE":
pipeline = (
"LLVMGPUMatmulSimt"
if config["pipeline"] == "GPU"
else "LLVMGPUMatmulTensorCore"
)
tile_sizes = [config["work_group_tile_sizes"]]
workgroup_size = config["work_group_sizes"]
        # Optional fields; default to None when absent.
        pipeline_depth = config.get("pipeline_depth")
        split_k = config.get("split_k")
else:
pipeline = config["pipeline"]
tile_sizes = [
config["work_group_tile_sizes"],
config["l1_tile_sizes"],
config["vector_tile_sizes"],
]
workgroup_size = []
split_k = None
pipeline_depth = None
return tile_sizes, pipeline, workgroup_size, split_k, pipeline_depth
def add_compilation_info(
op: ir.Operation,
tile_sizes: List[List[int]],
pipeline: str,
workgroup_size: List[int],
pipeline_depth: int,
):
# We don't have a Python binding for CompilationInfo, so we just parse
# its string form.
if pipeline_depth:
attr = ir.Attribute.parse(
f"#iree_codegen.compilation_info<"
f"lowering_config = <tile_sizes = {repr(tile_sizes)}>, "
f"translation_info = <{pipeline} pipeline_depth = {pipeline_depth}>, "
f"workgroup_size = {repr(workgroup_size)}>"
)
else:
attr = ir.Attribute.parse(
f"#iree_codegen.compilation_info<"
f"lowering_config = <tile_sizes = {repr(tile_sizes)}>, "
f"translation_info = <{pipeline}>, "
f"workgroup_size = {repr(workgroup_size)}>"
)
op.attributes["compilation_info"] = attr
def add_split_k(op: ir.Operation, k: int):
attr = ir.IntegerAttr.get(ir.IntegerType.get_signless(64), k)
op.attributes["iree_flow_split_k"] = attr
def create_context() -> ir.Context:
context = ir.Context()
ireec_trans.register_all_dialects(context)
context.allow_unregistered_dialects = True
return context
if __name__ == "__main__":
with create_context() as ctx:
model_annotation(
ctx, input_contents=sys.argv[1], config_path=sys.argv[2]
)
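# A hedged sketch of a tuning config this script consumes: "options" holds
# one entry per matmul op, in walk order. The field values and the
# "model.mlir" path below are made up for illustration.
import json

example_config = {
    "options": [
        {
            "pipeline": "GPU_TENSORCORE",  # or "GPU", or a CPU pipeline name
            "work_group_tile_sizes": [32, 32, 16],
            "work_group_sizes": [64, 2, 1],
            "pipeline_depth": 4,  # optional
            "split_k": 2,  # optional
        }
    ]
}
with open("example_config.json", "w") as f:
    json.dump(example_config, f)

with create_context() as ctx:
    model_annotation(
        ctx, input_contents="model.mlir", config_path="example_config.json"
    )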

shark/parser.py

@@ -0,0 +1,80 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
def dir_path(path):
    # Create the directory if it does not already exist.
    if not os.path.isdir(path):
        os.makedirs(path, exist_ok=True)
    return path
def dir_file(path):
if os.path.isfile(path):
return path
else:
raise argparse.ArgumentTypeError(
f"readable_file:{path} is not a valid file"
)
parser = argparse.ArgumentParser(description="SHARK runner.")
parser.add_argument(
"--device",
type=str,
default="cpu",
help="Device on which shark_runner runs. options are cpu, gpu, and vulkan",
)
parser.add_argument(
"--repro_dir",
help="Directory to which module files will be saved for reproduction or debugging.",
type=dir_path,
default="./shark_tmp",
)
parser.add_argument(
    "--enable_tf32",
    default=False,
    action="store_true",
    help="Enables TF32 precision calculations on supported GPUs.",
)
parser.add_argument(
"--model_config_path",
help="Directory to where the tuned model config file is located.",
default=None,
)
parser.add_argument(
"--num_warmup_iterations",
type=int,
default=5,
help="Run the model for the specified number of warmup iterations.",
)
parser.add_argument(
"--num_iterations",
type=int,
default=100,
help="Run the model for the specified number of iterations.",
)
parser.add_argument(
"--onnx_bench",
default=False,
action="store_true",
help="When enabled, pytest bench results will include ONNX benchmark results.",
)
shark_args, unknown = parser.parse_known_args()
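# Flags are parsed via parse_known_args at import time, so any script that
# imports shark.parser inherits them from its own argv; e.g. (driver name is
# a placeholder):
#
#   python driver.py --device vulkan --num_iterations 10
#
#   from shark.parser import shark_args
#   print(shark_args.device)  # -> "vulkan"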

shark/shark_benchmark_runner.py

@@ -0,0 +1,301 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from shark.shark_runner import SharkRunner
from shark.iree_utils.compile_utils import export_iree_module_to_vmfb
from shark.iree_utils.benchmark_utils import (
build_benchmark_args,
run_benchmark_module,
)
from shark.parser import shark_args
from datetime import datetime
import time
import csv
import os
class OnnxFusionOptions(object):
def __init__(self):
self.disable_gelu = False
self.disable_layer_norm = False
self.disable_attention = False
self.disable_skip_layer_norm = False
self.disable_embed_layer_norm = False
self.disable_bias_skip_layer_norm = False
self.disable_bias_gelu = False
self.enable_gelu_approximation = False
self.use_mask_index = False
self.no_attention_mask = False
class SharkBenchmarkRunner(SharkRunner):
# SharkRunner derived class with Benchmarking capabilities.
def __init__(
self,
mlir_module: str,
function_name: str = "forward",
device: str = "none",
mlir_dialect: str = "linalg",
):
self.device = shark_args.device if device == "none" else device
self.frontend_model = None
self.vmfb_file = None
self.mlir_dialect = mlir_dialect
SharkRunner.__init__(
self,
mlir_module,
function_name,
device,
self.mlir_dialect,
)
        if self.vmfb_file is None:
self.vmfb_file = export_iree_module_to_vmfb(
mlir_module, device, shark_args.repro_dir, self.mlir_dialect
)
def setup_cl(self, input_tensors):
self.benchmark_cl = build_benchmark_args(
self.vmfb_file,
self.device,
input_tensors,
mlir_dialect=self.mlir_dialect,
)
# print(self.benchmark_cl)
def benchmark_frontend(self, modelname):
if self.mlir_dialect in ["linalg", "torch"]:
return self.benchmark_torch(modelname)
elif self.mlir_dialect in ["mhlo", "tf"]:
return self.benchmark_tf(modelname)
def benchmark_torch(self, modelname):
import torch
from tank.model_utils import get_torch_model
if self.device == "gpu":
torch.set_default_tensor_type(torch.cuda.FloatTensor)
else:
torch.set_default_tensor_type(torch.FloatTensor)
torch_device = torch.device(
"cuda:0" if self.device == "gpu" else "cpu"
)
HFmodel, input = get_torch_model(modelname)[:2]
frontend_model = HFmodel.model
frontend_model.to(torch_device)
        input = input.to(torch_device)  # Tensor.to is not in-place.
for i in range(shark_args.num_warmup_iterations):
frontend_model.forward(input)
begin = time.time()
for i in range(shark_args.num_iterations):
out = frontend_model.forward(input)
if i == shark_args.num_iterations - 1:
end = time.time()
break
print(
f"Torch benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
)
return [
f"{shark_args.num_iterations/(end-begin)}",
f"{((end-begin)/shark_args.num_iterations)*1000}",
]
def benchmark_tf(self, modelname):
import tensorflow as tf
from tank.model_utils_tf import get_tf_model
        model, input = get_tf_model(modelname)[:2]
frontend_model = model
for i in range(shark_args.num_warmup_iterations):
frontend_model.forward(*input)
begin = time.time()
for i in range(shark_args.num_iterations):
out = frontend_model.forward(*input)
if i == shark_args.num_iterations - 1:
end = time.time()
break
print(
f"TF benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
)
return [
f"{shark_args.num_iterations/(end-begin)}",
f"{((end-begin)/shark_args.num_iterations)*1000}",
]
def benchmark_c(self):
print(self.benchmark_cl)
result = run_benchmark_module(self.benchmark_cl)
print(f"Shark-IREE-C benchmark:{result} iter/second")
return [f"{result}", f"{1000/result}"]
def benchmark_python(self, inputs):
input_list = [x for x in inputs]
for i in range(shark_args.num_warmup_iterations):
self.run(input_list)
begin = time.time()
for i in range(shark_args.num_iterations):
out = self.run(input_list)
if i == shark_args.num_iterations - 1:
end = time.time()
print(
f"Shark-IREE Python benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
)
return [
f"{shark_args.num_iterations/(end-begin)}",
f"{((end-begin)/shark_args.num_iterations)*1000}",
]
def benchmark_onnx(self, modelname, inputs):
if self.device == "gpu":
print(
"Currently GPU benchmarking on ONNX is not supported in SHARK."
)
return ["N/A", "N/A"]
else:
from onnxruntime.transformers.benchmark import run_onnxruntime
from onnxruntime.transformers.huggingface_models import MODELS
from onnxruntime.transformers.benchmark_helper import (
ConfigModifier,
Precision,
)
import psutil
if modelname == "microsoft/MiniLM-L12-H384-uncased":
modelname = "bert-base-uncased"
if modelname not in MODELS:
print(
f"{modelname} is currently not supported in ORT's HF. Check \
https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/huggingface_models.py \
for currently supported models. Exiting benchmark ONNX."
)
return ["N/A", "N/A"]
use_gpu = self.device == "gpu"
num_threads = psutil.cpu_count(logical=False)
batch_sizes = [1]
sequence_lengths = [128]
cache_dir = os.path.join(".", "cache_models")
onnx_dir = os.path.join(".", "onnx_models")
verbose = False
input_counts = [1]
optimize_onnx = True
validate_onnx = False
disable_ort_io_binding = False
use_raw_attention_mask = True
model_fusion_statistics = {}
overwrite = False
model_source = "pt" # Either "pt" or "tf"
provider = None
config_modifier = ConfigModifier(None)
onnx_args = OnnxFusionOptions()
result = run_onnxruntime(
use_gpu,
provider,
(modelname,),
None,
config_modifier,
Precision.FLOAT32,
num_threads,
batch_sizes,
sequence_lengths,
shark_args.num_iterations,
input_counts,
optimize_onnx,
validate_onnx,
cache_dir,
onnx_dir,
verbose,
overwrite,
disable_ort_io_binding,
use_raw_attention_mask,
model_fusion_statistics,
model_source,
onnx_args,
)
print(
f"ONNX ORT-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
)
return [
result[0]["QPS"],
result[0]["average_latency_ms"],
]
def benchmark_all_csv(
self, inputs: tuple, modelname, dynamic, device_str, frontend
):
self.setup_cl(inputs)
field_names = [
"model",
"engine",
"dynamic",
"dialect",
"device",
"iter/sec",
"ms/iter",
"iterations",
"datetime",
]
engines = ["frontend", "shark_python", "shark_iree_c"]
        if shark_args.onnx_bench:
engines.append("onnxruntime")
if not os.path.exists("bench_results.csv"):
with open("bench_results.csv", mode="w", newline="") as f:
writer = csv.writer(f)
writer.writerow(field_names)
with open("bench_results.csv", mode="a", newline="") as f:
writer = csv.DictWriter(f, fieldnames=field_names)
bench_result = {}
bench_result["model"] = modelname
            bench_result["dynamic"] = str(dynamic)
bench_result["device"] = device_str
for e in engines:
if e == "frontend":
bench_result["engine"] = frontend
(
bench_result["iter/sec"],
bench_result["ms/iter"],
) = self.benchmark_frontend(modelname)
elif e == "shark_python":
bench_result["engine"] = "shark_python"
(
bench_result["iter/sec"],
bench_result["ms/iter"],
) = self.benchmark_python(inputs)
elif e == "shark_iree_c":
bench_result["engine"] = "shark_iree_c"
(
bench_result["iter/sec"],
bench_result["ms/iter"],
) = self.benchmark_c()
elif e == "onnxruntime":
bench_result["engine"] = "onnxruntime"
(
bench_result["iter/sec"],
bench_result["ms/iter"],
) = self.benchmark_onnx(modelname, inputs)
bench_result["dialect"] = self.mlir_dialect
bench_result["iterations"] = shark_args.num_iterations
bench_result["datetime"] = str(datetime.now())
writer.writerow(bench_result)
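# A hedged usage sketch: benchmark a tank model across the configured
# engines; the model name is illustrative and must exist in the tank.
from shark.shark_downloader import download_torch_model

mlir_model, func_name, inputs, golden_out = download_torch_model("resnet50")
runner = SharkBenchmarkRunner(
    mlir_model, func_name, device="cpu", mlir_dialect="linalg"
)
runner.benchmark_all_csv(inputs, "resnet50", False, "cpu", "torch")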

shark/shark_downloader.py

@@ -0,0 +1,236 @@
# Lint as: python3
"""SHARK Downloader"""
# Requirements: put shark_tank in the SHARK directory:
#   /SHARK
#     /gen_shark_tank
#       /tflite
#         /albert_lite_base
#         /...model_name...
#       /tf
#       /pytorch
import numpy as np
import os
import urllib.request
import json
import hashlib
from pathlib import Path
input_type_to_np_dtype = {
"float32": np.float32,
"float64": np.float64,
"bool": np.bool_,
"int32": np.int32,
"int64": np.int64,
"uint8": np.uint8,
"int8": np.int8,
}
# The default hash is updated when the nightly populate_sharktank_ci run succeeds.
shark_default_sha = "latest"
# Cache models under the user's home directory so they needn't be fetched every time in CI.
home = str(Path.home())
WORKDIR = os.path.join(home, ".local/shark_tank/")
print(WORKDIR)
# Checks whether the directory and files exists.
def check_dir_exists(model_name, frontend="torch", dynamic=""):
model_dir = os.path.join(WORKDIR, model_name)
    # Strip the frontend suffix (_tf/_tflite/_torch) from the model name.
if frontend in ["tf", "tensorflow"]:
model_name = model_name[:-3]
elif frontend in ["tflite"]:
model_name = model_name[:-7]
elif frontend in ["torch", "pytorch"]:
model_name = model_name[:-6]
if os.path.isdir(model_dir):
if (
os.path.isfile(
os.path.join(
model_dir,
model_name + dynamic + "_" + str(frontend) + ".mlir",
)
)
and os.path.isfile(os.path.join(model_dir, "function_name.npy"))
and os.path.isfile(os.path.join(model_dir, "inputs.npz"))
and os.path.isfile(os.path.join(model_dir, "golden_out.npz"))
and os.path.isfile(os.path.join(model_dir, "hash.npy"))
):
print(
f"""The models are present in the {WORKDIR}. If you want a fresh
download, consider deleting the directory."""
)
return True
return False
# Downloads the torch model from gs://shark_tank dir.
def download_torch_model(model_name, dynamic=False):
model_name = model_name.replace("/", "_")
dyn_str = "_dynamic" if dynamic else ""
os.makedirs(WORKDIR, exist_ok=True)
model_dir_name = model_name + "_torch"
def gs_download_model():
gs_command = (
'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank/'
+ shark_default_sha
+ "/"
+ model_dir_name
+ " "
+ WORKDIR
)
if os.system(gs_command) != 0:
raise Exception("model not present in the tank. Contact Nod Admin")
if not check_dir_exists(model_dir_name, frontend="torch", dynamic=dyn_str):
gs_download_model()
else:
model_dir = os.path.join(WORKDIR, model_dir_name)
local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
gs_hash = (
'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank/'
+ shark_default_sha
+ "/"
+ model_dir_name
+ "/hash.npy"
+ " "
+ os.path.join(model_dir, "upstream_hash.npy")
)
if os.system(gs_hash) != 0:
raise Exception("hash of the model not present in the tank.")
upstream_hash = str(
np.load(os.path.join(model_dir, "upstream_hash.npy"))
)
if local_hash != upstream_hash:
gs_download_model()
model_dir = os.path.join(WORKDIR, model_dir_name)
with open(
os.path.join(model_dir, model_name + dyn_str + "_torch.mlir")
) as f:
mlir_file = f.read()
function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
inputs = np.load(os.path.join(model_dir, "inputs.npz"))
golden_out = np.load(os.path.join(model_dir, "golden_out.npz"))
inputs_tuple = tuple([inputs[key] for key in inputs])
golden_out_tuple = tuple([golden_out[key] for key in golden_out])
return mlir_file, function_name, inputs_tuple, golden_out_tuple
# Downloads the tflite model from gs://shark_tank dir.
def download_tflite_model(model_name, dynamic=False):
dyn_str = "_dynamic" if dynamic else ""
os.makedirs(WORKDIR, exist_ok=True)
model_dir_name = model_name + "_tflite"
def gs_download_model():
gs_command = (
'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank/'
+ shark_default_sha
+ "/"
+ model_dir_name
+ " "
+ WORKDIR
)
if os.system(gs_command) != 0:
raise Exception("model not present in the tank. Contact Nod Admin")
if not check_dir_exists(
model_dir_name, frontend="tflite", dynamic=dyn_str
):
gs_download_model()
else:
model_dir = os.path.join(WORKDIR, model_dir_name)
local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
gs_hash = (
'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank/'
+ shark_default_sha
+ "/"
+ model_dir_name
+ "/hash.npy"
+ " "
+ os.path.join(model_dir, "upstream_hash.npy")
)
if os.system(gs_hash) != 0:
raise Exception("hash of the model not present in the tank.")
upstream_hash = str(
np.load(os.path.join(model_dir, "upstream_hash.npy"))
)
if local_hash != upstream_hash:
gs_download_model()
model_dir = os.path.join(WORKDIR, model_dir_name)
with open(
os.path.join(model_dir, model_name + dyn_str + "_tflite.mlir")
) as f:
mlir_file = f.read()
function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
inputs = np.load(os.path.join(model_dir, "inputs.npz"))
golden_out = np.load(os.path.join(model_dir, "golden_out.npz"))
inputs_tuple = tuple([inputs[key] for key in inputs])
golden_out_tuple = tuple([golden_out[key] for key in golden_out])
return mlir_file, function_name, inputs_tuple, golden_out_tuple
def download_tf_model(model_name):
model_name = model_name.replace("/", "_")
os.makedirs(WORKDIR, exist_ok=True)
model_dir_name = model_name + "_tf"
def gs_download_model():
gs_command = (
'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank/'
+ shark_default_sha
+ "/"
+ model_dir_name
+ " "
+ WORKDIR
)
if os.system(gs_command) != 0:
raise Exception("model not present in the tank. Contact Nod Admin")
if not check_dir_exists(model_dir_name, frontend="tf"):
gs_download_model()
else:
model_dir = os.path.join(WORKDIR, model_dir_name)
local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
gs_hash = (
'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank/'
+ shark_default_sha
+ "/"
+ model_dir_name
+ "/hash.npy"
+ " "
+ os.path.join(model_dir, "upstream_hash.npy")
)
if os.system(gs_hash) != 0:
raise Exception("hash of the model not present in the tank.")
upstream_hash = str(
np.load(os.path.join(model_dir, "upstream_hash.npy"))
)
if local_hash != upstream_hash:
gs_download_model()
model_dir = os.path.join(WORKDIR, model_dir_name)
with open(os.path.join(model_dir, model_name + "_tf.mlir")) as f:
mlir_file = f.read()
function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
inputs = np.load(os.path.join(model_dir, "inputs.npz"))
golden_out = np.load(os.path.join(model_dir, "golden_out.npz"))
inputs_tuple = tuple([inputs[key] for key in inputs])
golden_out_tuple = tuple([golden_out[key] for key in golden_out])
return mlir_file, function_name, inputs_tuple, golden_out_tuple
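# Typical use pairs a download with SharkInference; this mirrors the MiniLM
# test further down in this changeset.
from shark.shark_inference import SharkInference

mlir_model, func_name, inputs, golden_out = download_tf_model(
    "microsoft/MiniLM-L12-H384-uncased"
)
shark_module = SharkInference(
    mlir_model, func_name, device="cpu", mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)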

shark/shark_importer.py

@@ -0,0 +1,236 @@
# Lint as: python3
"""SHARK Importer"""
import sys
import tempfile
import os
# List of the supported frontends.
supported_frontends = {
"tensorflow",
"tf",
"pytorch",
"torch",
"tf-lite",
"tflite",
}
class SharkImporter:
"""
    SharkImporter converts frontend modules into an
    mlir_module. The supported frameworks are tensorflow,
    pytorch, and tf-lite.
...
Attributes
----------
module :
torch, tensorflow or tf-lite module.
inputs :
inputs to the module, may be required for the shape
information.
frontend: str
frontend to which the module belongs.
raw_model_file: str
temp tflite model path
Methods
-------
import_mlir(is_dynamic, tracing_required, func_name):
is_dynamic: input shapes to be totally dynamic (pytorch specific).
        tracing_required: whether tracing is required (pytorch specific).
func_name: The function to be traced out or imported to mlir.
import_debug(is_dynamic, tracing_required, func_name):
returns the converted (mlir_module,func_name) with inputs and golden
outputs.
The inputs and outputs are converted into np array.
"""
def __init__(
self,
module,
inputs: tuple = (),
frontend: str = "torch",
raw_model_file: str = "",
):
self.module = module
self.inputs = None if len(inputs) == 0 else inputs
self.frontend = frontend
        if self.frontend not in supported_frontends:
print(
f"The frontend is not in the supported_frontends: {supported_frontends}"
)
sys.exit(1)
self.raw_model_file = raw_model_file
# NOTE: The default function for torch is "forward" and tf-lite is "main".
def _torch_mlir(self, is_dynamic, tracing_required):
from shark.torch_mlir_utils import get_torch_mlir_module
return get_torch_mlir_module(
self.module, self.inputs, is_dynamic, tracing_required
)
def _tf_mlir(self, func_name):
from iree.compiler import tf as tfc
return tfc.compile_module(
self.module, exported_names=[func_name], import_only=True
)
def _tflite_mlir(self, func_name):
from iree.compiler import tflite as tflitec
from shark.iree_utils._common import IREE_TARGET_MAP
self.mlir_model = tflitec.compile_file(
self.raw_model_file, # in tflite, it is a path to .tflite file, not a tflite interpreter
input_type="tosa",
import_only=True,
)
return self.mlir_model
# Adds the conversion of the frontend with the private function.
def import_mlir(
self,
is_dynamic=False,
tracing_required=False,
func_name="forward",
):
if self.frontend in ["torch", "pytorch"]:
            if self.inputs is None:
print(
"Please pass in the inputs, the inputs are required to determine the shape of the mlir_module"
)
sys.exit(1)
return self._torch_mlir(is_dynamic, tracing_required), func_name
if self.frontend in ["tf", "tensorflow"]:
return self._tf_mlir(func_name), func_name
if self.frontend in ["tflite", "tf-lite"]:
func_name = "main"
return self._tflite_mlir(func_name), func_name
# Converts the frontend specific tensors into np array.
def convert_to_numpy(self, array_tuple: tuple):
if self.frontend in ["torch", "pytorch"]:
return [x.detach().numpy() for x in array_tuple]
if self.frontend in ["tf", "tensorflow"]:
return [x.numpy() for x in array_tuple]
# Saves `function_name.npy`, `inputs.npz`, `golden_out.npz` and `model_name.mlir` in the directory `dir`.
def save_data(
self, dir, model_name, mlir_data, func_name, inputs, outputs
):
import numpy as np
inputs_name = "inputs.npz"
outputs_name = "golden_out.npz"
func_file_name = "function_name"
model_name_mlir = model_name + "_" + self.frontend + ".mlir"
np.savez(os.path.join(dir, inputs_name), *inputs)
np.savez(os.path.join(dir, outputs_name), *outputs)
np.save(os.path.join(dir, func_file_name), np.array(func_name))
mlir_str = mlir_data
if self.frontend == "torch":
mlir_str = mlir_data.operation.get_asm()
elif self.frontend == "tf":
mlir_str = mlir_data.decode("utf-8")
elif self.frontend == "tflite":
mlir_str = mlir_data.decode("utf-8")
with open(os.path.join(dir, model_name_mlir), "w") as mlir_file:
mlir_file.write(mlir_str)
return
def import_debug(
self,
is_dynamic=False,
tracing_required=False,
func_name="forward",
dir=tempfile.gettempdir(),
model_name="model",
):
        if self.inputs is None:
print(
f"There is no input provided: {self.inputs}, please provide inputs or simply run import_mlir."
)
sys.exit(1)
imported_mlir = self.import_mlir(
is_dynamic, tracing_required, func_name
)
# TODO: Make sure that any generic function name is accepted. Currently takes in the default function names.
# TODO: Check for multiple outputs.
if self.frontend in ["torch", "pytorch"]:
import torch
golden_out = self.module(*self.inputs)
if torch.is_tensor(golden_out):
                # Wrap in a one-element tuple; tuple(ndarray) would iterate rows.
                golden_out = (golden_out.detach().numpy(),)
else:
golden_out = self.convert_to_numpy(golden_out)
# Save the artifacts in the directory dir.
self.save_data(
dir,
model_name,
imported_mlir[0],
imported_mlir[1],
self.inputs,
golden_out,
)
return (
imported_mlir,
self.convert_to_numpy(self.inputs),
golden_out,
)
if self.frontend in ["tf", "tensorflow"]:
import tensorflow as tf
golden_out = self.module.forward(*self.inputs)
if tf.is_tensor(golden_out):
                # Wrap in a one-element tuple; tuple(ndarray) would iterate rows.
                golden_out = (golden_out.numpy(),)
            elif isinstance(golden_out, tuple):
                golden_out = self.convert_to_numpy(golden_out)
elif hasattr(golden_out, "logits"):
# from transformers import TFSequenceClassifierOutput
golden_out = golden_out.logits
else:
golden_out = golden_out.last_hidden_state
# Save the artifacts in the directory dir.
self.save_data(
dir,
model_name,
imported_mlir[0],
imported_mlir[1],
self.inputs,
golden_out,
)
return (
imported_mlir,
self.convert_to_numpy(self.inputs),
golden_out,
)
if self.frontend in ["tflite", "tf-lite"]:
# TODO(Chi): Validate it for tflite models.
golden_out = self.module.invoke_tflite(self.inputs)
self.save_data(
dir,
model_name,
imported_mlir[0],
imported_mlir[1],
self.inputs,
golden_out,
)
return (
imported_mlir,
self.inputs,
golden_out,
)
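# A minimal sketch of the torch path; the module and input below are toy
# placeholders, with tracing enabled for illustration.
import torch

class AddOne(torch.nn.Module):
    def forward(self, x):
        return x + 1

importer = SharkImporter(AddOne(), inputs=(torch.ones(4),), frontend="torch")
mlir_module, func_name = importer.import_mlir(tracing_required=True)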

shark/shark_inference.py

@@ -0,0 +1,137 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from shark.shark_runner import SharkRunner
import numpy as np
dtype_to_np_dtype = {
"f32": np.float32,
"f64": np.float64,
"i32": np.int32,
"i64": np.int64,
"i1": np.bool_,
}
class SharkInference:
"""
Runs prediction or inference on mlir_module.
...
Attributes
----------
mlir_module : str
mlir_module represented in string.
function_name : str
function to execute in the given mlir_module.
device : str
device to execute the mlir_module on.
currently supports cpu, cuda, vulkan, and metal backends.
mlir_dialect: str
The dialect in which the given mlir_module is in.
Refer to {https://mlir.llvm.org/docs/Dialects/}
is_benchmark: bool
Whether this SharkInference module should be benchmark-enabled.
Methods
-------
    run(inputs=None):
        Runs the mlir_module with the given inputs; if the inputs are not
        given, it autogenerates them. The inputs should be numpy arrays.
    input_info():
        Gives information about the inputs required by the `function_name`.
        This can be expensive as it relies on string matching.
"""
def __init__(
self,
mlir_module: str,
function_name: str = "forward",
device: str = "none",
mlir_dialect: str = "linalg",
is_benchmark: bool = False,
):
self.mlir_module = mlir_module
self.function_name = function_name
self.device = device
self.mlir_dialect = mlir_dialect
self.is_benchmark = is_benchmark
self.shark_runner = None
def compile(self):
        if self.is_benchmark:
from shark.shark_benchmark_runner import SharkBenchmarkRunner
self.shark_runner = SharkBenchmarkRunner(
self.mlir_module,
self.function_name,
self.device,
self.mlir_dialect,
)
else:
self.shark_runner = SharkRunner(
self.mlir_module,
self.function_name,
self.device,
self.mlir_dialect,
)
    # Inputs are expected to be a tuple of np.arrays.
def forward(self, inputs: tuple):
return self.shark_runner.run(inputs)
# Captures the static input information from the mlir_module.
# TODO(pashu123): Generate the input information for dynamic shapes.
def _input_info(self):
# func_key to get the line which contains the function.
func_key = "func.func @" + self.function_name
func_header = None
for line in str(self.mlir_module).splitlines():
if func_key in line:
func_header = line
break
        if func_header is None:
            raise ValueError(f"Function {self.function_name} not found")
        import re

        inputs = re.findall(r"\(.*?\)", func_header)[0].split(",")
shapes = []
dtype = []
for inp in inputs:
shape_dtype = re.findall(r"<[^>]*>", inp)[0].split("x")
shape_dtype[0], shape_dtype[-1] = (
shape_dtype[0][1:],
shape_dtype[-1][:-1],
)
shapes.append(tuple([int(x) for x in shape_dtype[:-1]]))
dtype.append(shape_dtype[-1])
return shapes, dtype
    # Generates random inputs to be fed into the graph.
def generate_random_inputs(self, low=0, high=1):
shapes, dtype = self._input_info()
inputs = []
for i, j in zip(shapes, dtype):
inputs.append(
np.random.uniform(low, high, size=i).astype(
dtype_to_np_dtype[j]
)
)
return tuple(inputs)
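# A quick smoke-test sketch using auto-generated inputs; mlir_source is a
# placeholder for a real module string.
mlir_source = "..."  # an MLIR module string containing a @forward function
shark_module = SharkInference(
    mlir_source, "forward", device="cpu", mlir_dialect="linalg"
)
shark_module.compile()
rand_inputs = shark_module.generate_random_inputs()
outputs = shark_module.forward(rand_inputs)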

shark/shark_runner.py

@@ -0,0 +1,101 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from shark.iree_utils.compile_utils import (
get_iree_compiled_module,
get_results,
export_iree_module_to_vmfb,
)
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.parser import shark_args
import os
import sys
# supported dialects by the shark-runtime.
supported_dialects = {"linalg", "mhlo", "tosa", "tf-lite"}
class SharkRunner:
"""
Base class for SharkInference and SharkTrainer
used to execute an mlir_module.
...
Attributes
----------
mlir_module : str
mlir_module represented in string.
function_name : str
function to execute in the given mlir_module.
device : str
device to execute the mlir_module on.
currently supports cpu, cuda, vulkan, and metal backends.
mlir_dialect: str
The dialect in which the given mlir_module is in.
Refer to {https://mlir.llvm.org/docs/Dialects/}
Methods
-------
    run(inputs=None):
        Runs the mlir_module with the given inputs; if the inputs are not
        given, it autogenerates them. The inputs should be numpy arrays.
    input_info():
        Gives information about the inputs required by the `function_name`.
        This can be expensive as it relies on string matching.
"""
def __init__(
self,
mlir_module: str,
function_name: str = "forward",
device: str = "none",
mlir_dialect: str = "linalg",
):
self.mlir_module = mlir_module
self.function_name = function_name
self.device = shark_args.device if device == "none" else device
self.mlir_dialect = mlir_dialect
if check_device_drivers(self.device):
device_driver_info(self.device)
sys.exit(1)
# Compile the module to get the .vmfb.
(
self.iree_compilation_module,
self.iree_config,
) = get_iree_compiled_module(
self.mlir_module,
self.device,
self.mlir_dialect,
func_name=self.function_name,
)
def run(self, inputs: tuple):
return get_results(
self.iree_compilation_module,
inputs,
self.iree_config,
self.mlir_dialect,
)
    # TODO: Instead of passing a directory and having names decided by the
    # module, the user may want to save the module with manual names.
    def save_module(self, dir=os.getcwd()):
        return export_iree_module_to_vmfb(
            self.mlir_module, self.device, dir, self.mlir_dialect
        )

shark/shark_trainer.py

@@ -0,0 +1,152 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from shark.parser import shark_args
from shark.shark_runner import SharkRunner
from shark.backward_makefx import MakeFxModule
import numpy as np
from tqdm import tqdm
import sys
# Prints to stderr.
def print_err(*a):
print(*a, file=sys.stderr)
class SharkTrainer:
"""Training pytorch, tensorflow module on shark runtime."""
def __init__(
self,
model,
input: tuple,
dynamic: bool = False,
device: str = None,
jit_trace: bool = False,
from_aot: bool = True,
):
self.model = model
# Change tuple to list.
self.input = [x for x in input]
self.dynamic = dynamic
        self.from_aot = from_aot
        self.jit_trace = jit_trace
# By default it's the torch frontend.
self.frontend = "pytorch"
self.device = device if device is not None else shark_args.device
self.shark_runner = None
# Sets the frontend i.e `pytorch` or `tensorflow`.
def set_frontend(self, frontend: str):
if frontend not in [
"pytorch",
"torch",
"tensorflow",
"tf",
"mhlo",
"linalg",
"tosa",
]:
print_err("frontend not supported.")
else:
self.frontend = frontend
    # A custom training function may be passed in the torch case.
def compile(self, training_fn=None):
if self.frontend in ["torch", "pytorch"]:
aot_module = MakeFxModule(
self.model, tuple(self.input), custom_inference_fn=training_fn
)
aot_module.generate_graph()
# Returns the backward graph.
training_graph = aot_module.training_graph
weights = self.get_torch_params()
self.shark_runner = SharkRunner(
training_graph,
weights + self.input,
self.dynamic,
self.device,
self.jit_trace,
self.from_aot,
self.frontend,
)
elif self.frontend in ["tensorflow", "tf", "mhlo"]:
self.shark_runner = SharkRunner(
self.model,
self.input,
self.dynamic,
self.device,
self.jit_trace,
self.from_aot,
self.frontend,
)
else:
print_err("Unknown frontend")
return
# The inputs to the mlir-graph are weights, buffers and inputs respectively.
def get_torch_params(self):
params = [i.detach() for i in self.model.parameters()]
buffers = [i.detach() for i in self.model.buffers()]
return params + buffers
# Function to train pytorch module.
def _train_torch(self, num_iters):
"""Returns the updated weights after num_iters"""
params = self.get_torch_params()
params = [x.numpy() for x in params]
print(f"Training started for {num_iters} iterations:")
for i in tqdm(range(num_iters)):
params = self.shark_runner.forward(
params + self.input, self.frontend
)
return params
# Function to train tensorflow module.
# Output final loss.
# TODO(raikonenfnu): Save updated weight/states in SHARK.
def _train_tf(self, num_iters):
input_list = []
for x in self.input:
if isinstance(x, list):
nested_list = []
for val in x:
if isinstance(val, np.ndarray):
nested_list.append(val)
else:
nested_list.append(val.numpy())
input_list.append(nested_list)
elif isinstance(x, np.ndarray):
input_list.append(x)
else:
input_list.append(x.numpy())
print(f"Training started for {num_iters} iterations:")
for i in tqdm(range(num_iters)):
outputs = self.shark_runner.forward(input_list, self.frontend)
return outputs
def train(self, num_iters=1):
if self.frontend in ["torch", "pytorch"]:
return self._train_torch(num_iters)
elif self.frontend in ["tf", "tensorflow", "mhlo"]:
return self._train_tf(num_iters)
else:
print_err("Unknown frontend")
return
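# A heavily hedged sketch of the intended torch training flow; model and
# input are placeholders, and the iteration count is arbitrary:
#
#   trainer = SharkTrainer(model, (example_input,))
#   trainer.compile()
#   updated_params = trainer.train(num_iters=10)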


@@ -0,0 +1,144 @@
# RUN: %PYTHON %s
import numpy as np
from shark.shark_importer import SharkImporter
import pytest
from shark.parser import shark_args
from shark.shark_inference import SharkInference
from shark.tflite_utils import TFLitePreprocessor
import sys
# model_path = "https://tfhub.dev/tensorflow/lite-model/albert_lite_base/squadv1/1?lite-format=tflite"
# Inputs modified to be useful albert inputs.
def generate_inputs(input_details):
for input in input_details:
print(str(input["shape"]), input["dtype"].__name__)
args = []
args.append(
np.random.randint(
low=0,
high=256,
size=input_details[0]["shape"],
dtype=input_details[0]["dtype"],
)
)
args.append(
np.ones(
shape=input_details[1]["shape"], dtype=input_details[1]["dtype"]
)
)
args.append(
np.zeros(
shape=input_details[2]["shape"], dtype=input_details[2]["dtype"]
)
)
return args
def compare_results(mlir_results, tflite_results, details):
print("Compare mlir_results VS tflite_results: ")
assert len(mlir_results) == len(
tflite_results
), "Number of results do not match"
for i in range(len(details)):
mlir_result = mlir_results[i]
tflite_result = tflite_results[i]
mlir_result = mlir_result.astype(np.single)
tflite_result = tflite_result.astype(np.single)
        assert mlir_result.shape == tflite_result.shape, "shape does not match"
        max_error = np.max(np.abs(mlir_result - tflite_result))
        print("Max error (%d): %f" % (i, max_error))
class AlbertTfliteModuleTester:
def __init__(
self,
dynamic=False,
device="cpu",
save_mlir=False,
save_vmfb=False,
):
self.dynamic = dynamic
self.device = device
self.save_mlir = save_mlir
self.save_vmfb = save_vmfb
def create_and_check_module(self):
shark_args.save_mlir = self.save_mlir
shark_args.save_vmfb = self.save_vmfb
tflite_preprocessor = TFLitePreprocessor(model_name="albert_lite_base")
raw_model_file_path = tflite_preprocessor.get_raw_model_file()
inputs = tflite_preprocessor.get_inputs()
tflite_interpreter = tflite_preprocessor.get_interpreter()
my_shark_importer = SharkImporter(
module=tflite_interpreter,
inputs=inputs,
frontend="tflite",
raw_model_file=raw_model_file_path,
)
mlir_model, func_name = my_shark_importer.import_mlir()
shark_module = SharkInference(
mlir_module=mlir_model,
function_name=func_name,
device=self.device,
mlir_dialect="tflite",
)
# Case1: Use shark_importer default generate inputs
shark_module.compile()
mlir_results = shark_module.forward(inputs)
        # Post-process results for comparison.
input_details, output_details = tflite_preprocessor.get_model_details()
mlir_results = list(mlir_results)
for i in range(len(output_details)):
dtype = output_details[i]["dtype"]
mlir_results[i] = mlir_results[i].astype(dtype)
tflite_results = tflite_preprocessor.get_golden_output()
compare_results(mlir_results, tflite_results, output_details)
# Case2: Use manually set inputs
input_details, output_details = tflite_preprocessor.get_model_details()
inputs = generate_inputs(input_details) # new inputs
shark_module = SharkInference(
mlir_module=mlir_model,
function_name=func_name,
device=self.device,
mlir_dialect="tflite",
)
shark_module.compile()
mlir_results = shark_module.forward(inputs)
        # Post-process results for comparison.
tflite_results = tflite_preprocessor.get_golden_output()
compare_results(mlir_results, tflite_results, output_details)
# print(mlir_results)
# A specific case can be run by commenting out the other cases. Runs all the
# tests across cpu, gpu, and vulkan according to the available drivers.
pytest_param = pytest.mark.parametrize(
("dynamic", "device"),
[
pytest.param(False, "cpu"),
        # TODO: Language models are failing for the dynamic case.
pytest.param(True, "cpu", marks=pytest.mark.skip),
],
)
@pytest_param
@pytest.mark.xfail(
sys.platform == "darwin", reason="known macos tflite install issue"
)
def test_albert(dynamic, device):
module_tester = AlbertTfliteModuleTester(dynamic=dynamic, device=device)
module_tester.create_and_check_module()
if __name__ == "__main__":
test_albert(False, "cpu")

shark/tflite_utils.py

@@ -0,0 +1,208 @@
import tensorflow as tf
import numpy as np
import os
import csv
import urllib.request
class TFLiteModelUtil:
def __init__(self, raw_model_file):
self.raw_model_file = str(raw_model_file)
self.tflite_interpreter = None
self.input_details = None
self.output_details = None
self.inputs = []
def setup_tflite_interpreter(self):
self.tflite_interpreter = tf.lite.Interpreter(
model_path=self.raw_model_file
)
self.tflite_interpreter.allocate_tensors()
# default input initialization
return self.get_model_details()
def get_model_details(self):
print("Get tflite input output details")
self.input_details = self.tflite_interpreter.get_input_details()
self.output_details = self.tflite_interpreter.get_output_details()
return self.input_details, self.output_details
def invoke_tflite(self, inputs):
self.inputs = inputs
print("invoke_tflite")
for i, input in enumerate(self.inputs):
self.tflite_interpreter.set_tensor(
self.input_details[i]["index"], input
)
self.tflite_interpreter.invoke()
# post process tflite_result for compare with mlir_result,
# for tflite the output is a list of numpy.tensor
tflite_results = []
for output_detail in self.output_details:
tflite_results.append(
np.array(
self.tflite_interpreter.get_tensor(output_detail["index"])
)
)
for i in range(len(self.output_details)):
# print("output_details ", i, "shape", self.output_details[i]["shape"].__name__,
# ", dtype: ", self.output_details[i]["dtype"].__name__)
out_dtype = self.output_details[i]["dtype"]
tflite_results[i] = tflite_results[i].astype(out_dtype)
return tflite_results
class TFLitePreprocessor:
def __init__(
self,
model_name,
input_details=None,
output_details=None,
model_path=None,
):
self.model_name = model_name
self.input_details = (
input_details # used for tflite, optional for tf/pytorch
)
self.output_details = (
output_details # used for tflite, optional for tf/pytorch
)
self.inputs = []
self.model_path = model_path # url to download the model
self.raw_model_file = (
None # local address for raw tf/tflite/pytorch model
)
self.mlir_file = (
None # local address for .mlir file of tf/tflite/pytorch model
)
self.mlir_model = None # read of .mlir file
self.output_tensor = (
None # the raw tf/pytorch/tflite_output_tensor, not mlir_tensor
)
self.interpreter = (
None # could be tflite/tf/torch_interpreter in utils
)
self.input_file = None
self.output_file = None
# create tmp model file directory
if self.model_path is None and self.model_name is None:
            print(
                "Error: no model_path and no model_name; please provide either one."
            )
return
print("Setting up for TMP_WORK_DIR")
self.workdir = os.path.join(
os.path.dirname(__file__), "./../gen_shark_tank"
)
os.makedirs(self.workdir, exist_ok=True)
print(f"TMP_WORK_DIR = {self.workdir}")
# compile and run tfhub tflite
load_model_success = self.load_tflite_model()
if not load_model_success:
print("Error, load tflite model fail")
return
if (self.input_details is None) or (self.output_details is None):
# print("Setting up tflite interpreter to get model input details")
self.setup_interpreter()
inputs = self.generate_inputs(self.input_details) # device_inputs
self.setup_inputs(inputs)
def load_tflite_model(self):
# use model name get dir.
tflite_model_name_dir = os.path.join(
self.workdir, str(self.model_name)
)
os.makedirs(tflite_model_name_dir, exist_ok=True)
print(f"TMP_TFLITE_MODELNAME_DIR = {tflite_model_name_dir}")
self.raw_model_file = "/".join(
[tflite_model_name_dir, str(self.model_name) + "_tflite.tflite"]
)
self.mlir_file = "/".join(
[tflite_model_name_dir, str(self.model_name) + "_tflite.mlir"]
)
self.input_file = "/".join([tflite_model_name_dir, "inputs"])
self.output_file = "/".join([tflite_model_name_dir, "golden_out"])
# np.save("/".join([tflite_model_name_dir, "function_name"]), np.array("main"))
if os.path.exists(self.raw_model_file):
print(
"Local address for .tflite model file Exists: ",
self.raw_model_file,
)
else:
print("No local tflite file, Download tflite model")
if self.model_path is None:
# get model file from tflite_model_list.csv or download from gs://bucket
print("No model_path, get from tflite_model_list.csv")
tflite_model_list_path = os.path.join(
os.path.dirname(__file__),
"../tank/tflite/tflite_model_list.csv",
)
tflite_model_list = csv.reader(open(tflite_model_list_path))
for row in tflite_model_list:
if str(row[0]) == str(self.model_name):
self.model_path = row[1]
print("tflite_model_name", str(row[0]))
print("tflite_model_link", self.model_path)
if self.model_path is None:
print("Error, No model path find in tflite_model_list.csv")
return False
urllib.request.urlretrieve(self.model_path, self.raw_model_file)
return True
def setup_interpreter(self):
self.interpreter = TFLiteModelUtil(self.raw_model_file)
(
self.input_details,
self.output_details,
) = self.interpreter.setup_tflite_interpreter()
def generate_inputs(self, input_details):
self.inputs = []
for tmp_input in input_details:
print(
"input_details shape:",
str(tmp_input["shape"]),
" type:",
tmp_input["dtype"].__name__,
)
self.inputs.append(
np.ones(shape=tmp_input["shape"], dtype=tmp_input["dtype"])
)
return self.inputs
def setup_inputs(self, inputs):
# print("Setting up inputs")
self.inputs = inputs
def get_mlir_model(self):
return self.mlir_model
def get_mlir_file(self):
return self.mlir_file
def get_inputs(self):
return self.inputs
def get_golden_output(self):
self.output_tensor = self.interpreter.invoke_tflite(self.inputs)
np.savez(self.output_file, *self.output_tensor)
return self.output_tensor
def get_model_details(self):
return self.input_details, self.output_details
def get_raw_model_file(self):
return self.raw_model_file
def get_interpreter(self):
return self.interpreter

shark/torch_mlir_utils.py

@@ -0,0 +1,72 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from torch_mlir.ir import StringAttr
import torch_mlir
from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
def get_module_name_for_asm_dump(module):
"""Gets a name suitable for an assembly dump.
The name is not guaranteed to be unique.
"""
if not "torch.debug_module_name" in module.operation.attributes:
return "UnnammedModule"
return StringAttr(
module.operation.attributes["torch.debug_module_name"]
).value
def run_on_refbackend(torch_module, inputs):
backend = refbackend.RefBackendLinalgOnTensorsBackend()
compiled = backend.compile(torch_module)
jit_module = backend.load(compiled)
np_inputs = [x.numpy() for x in inputs]
return jit_module.forward(np_inputs[0])
# Creates dynamic dims for all dims.
# TODO: Pass user-specified dynamic dims.
def create_dynamic_placeholders(inputs):
placeholders = []
for inp in inputs:
placeholder = torch_mlir.TensorPlaceholder.like(
inp, dynamic_axes=[i for i in range(len(inp.shape))]
)
placeholders.append(placeholder)
return tuple(placeholders)
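# For example, a placeholder built from a (2, 3) tensor is given shape
# (-1, -1), so the compiled module accepts any extent along both axes
# (note added for clarity, based on torch_mlir.TensorPlaceholder semantics).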
def get_torch_mlir_module(
module,
input: tuple,
dynamic: bool,
jit_trace: bool,
from_torchscript: bool = False,
):
"""Get the MLIR's linalg-on-tensors module from torchscipt module."""
ignore_traced_shapes = False
if dynamic:
input = create_dynamic_placeholders(input)
if jit_trace:
ignore_traced_shapes = True
module = torch_mlir.compile(
module,
input,
output_type=torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=jit_trace,
ignore_traced_shapes=ignore_traced_shapes,
)
return module
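# Usage sketch (added for illustration, not part of the original file). The
# toy module below is hypothetical; any torch.nn.Module with tensor inputs
# should work the same way.
if __name__ == "__main__":
    import torch

    class SmallNet(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = torch.nn.Linear(8, 4)

        def forward(self, x):
            return torch.relu(self.fc(x))

    mlir_module = get_torch_mlir_module(
        SmallNet(), (torch.randn(1, 8),), dynamic=False, jit_trace=True
    )
    print("imported:", get_module_name_for_asm_dump(mlir_module))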


@@ -0,0 +1,101 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
from shark.parser import shark_args
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class MiniLMModuleTester:
def __init__(
self,
benchmark=False,
onnx_bench=False,
):
self.benchmark = benchmark
self.onnx_bench = onnx_bench
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"microsoft/MiniLM-L12-H384-uncased"
)
shark_module = SharkInference(
model,
func_name,
device=device,
mlir_dialect="mhlo",
is_benchmark=self.benchmark,
)
if self.benchmark:
shark_args.enable_tf32 = True
shark_module.compile()
shark_args.onnx_bench = self.onnx_bench
shark_module.shark_runner.benchmark_all_csv(
(inputs),
"microsoft/MiniLM-L12-H384-uncased",
dynamic,
device,
"tensorflow",
)
shark_args.enable_tf32 = False
rtol = 1e-01
atol = 1e-02
else:
shark_module.compile()
rtol = 1e-02
atol = 1e-03
# TODO: Remove the catch once the new MiniLM is stable.
try:
result = shark_module.forward(inputs)[0][1].to_host()
except Exception:
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=rtol, atol=atol)
class MiniLMModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = MiniLMModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
self.module_tester.onnx_bench = pytestconfig.getoption("onnx_bench")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()
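# How these tests are typically driven (a sketch; this file's path is not
# shown in the diff, and --benchmark / --onnx_bench are custom pytest options
# assumed to be registered in the repository's conftest.py):
#   pytest <path-to-this-test> -k static_cpu
#   pytest <path-to-this-test> --benchmark --onnx_bench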


@@ -0,0 +1,114 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.shark_downloader import download_torch_model
from shark.parser import shark_args
import unittest
import numpy as np
import pytest
class MiniLMModuleTester:
def __init__(
self,
benchmark=False,
onnx_bench=False,
):
self.benchmark = benchmark
self.onnx_bench = onnx_bench
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"microsoft/MiniLM-L12-H384-uncased", dynamic
)
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
if self.benchmark:
shark_args.enable_tf32 = True
shark_module.compile()
shark_args.onnx_bench = self.onnx_bench
shark_module.shark_runner.benchmark_all_csv(
(input),
"microsoft/MiniLM-L12-H384-uncased",
dynamic,
device,
"torch",
)
shark_args.enable_tf32 = False
rtol = 1e-01
atol = 1e-02
else:
shark_module.compile()
rtol = 1e-02
atol = 1e-03
results = shark_module.forward(input)
assert compare_tensors(act_out, results, rtol, atol)
class MiniLMModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = MiniLMModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
self.module_tester.onnx_bench = pytestconfig.getoption("onnx_bench")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()

tank/README.md Normal file

@@ -0,0 +1,13 @@
To run the fine tuning example, from the root SHARK directory, run:
```shell
IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pip install jupyter tf-models-nightly tf-datasets
jupyter-notebook
```
If running from a Google VM, you can view Jupyter notebooks on your local system with:
```shell
gcloud compute ssh <YOUR_INSTANCE_DETAILS> --ssh-flag="-N -L localhost:8888:localhost:8888"
```
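With the tunnel open, the Jupyter server running on the VM is reachable locally at http://localhost:8888 (the default Jupyter port, which the `-L` flag above forwards).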

tank/__init__.py Normal file


@@ -0,0 +1,69 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class AlbertBaseModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"albert-base-v2"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class AlbertBaseModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = AlbertBaseModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,113 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class AlbertModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"albert-base-v2", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert compare_tensors(act_out, results)
if self.benchmark:
shark_module.shark_runner.benchmark_all_csv(
(input),
"albert-base-v2",
dynamic,
device,
"torch",
)
class AlbertModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = AlbertModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,177 @@
# import numpy as np
# from shark.shark_importer import SharkImporter
# from shark.shark_inference import SharkInference
# import pytest
# import unittest
# from shark.parser import shark_args
# from shark.tflite_utils import TFLitePreprocessor
#
#
# # model_path = "https://tfhub.dev/tensorflow/lite-model/albert_lite_base/squadv1/1?lite-format=tflite"
# # model_path = model_path
#
# # Inputs modified to be useful albert inputs.
# def generate_inputs(input_details):
# for input in input_details:
# print(str(input["shape"]), input["dtype"].__name__)
# # [ 1 384] int32
# # [ 1 384] int32
# # [ 1 384] int32
#
# args = []
# args.append(
# np.random.randint(
# low=0,
# high=256,
# size=input_details[0]["shape"],
# dtype=input_details[0]["dtype"],
# )
# )
# args.append(
# np.ones(
# shape=input_details[1]["shape"], dtype=input_details[1]["dtype"]
# )
# )
# args.append(
# np.zeros(
# shape=input_details[2]["shape"], dtype=input_details[2]["dtype"]
# )
# )
# return args
#
#
# def compare_results(mlir_results, tflite_results):
# print("Compare mlir_results VS tflite_results: ")
# assert len(mlir_results) == len(
# tflite_results
# ), "Number of results do not match"
# rtol = 1e-02
# atol = 1e-03
# print(
# "numpy.allclose: ",
# np.allclose(mlir_results, tflite_results, rtol, atol),
# )
# for i in range(len(mlir_results)):
# mlir_result = mlir_results[i]
# tflite_result = tflite_results[i]
# mlir_result = mlir_result.astype(np.single)
# tflite_result = tflite_result.astype(np.single)
# assert mlir_result.shape == tflite_result.shape, "shape doesnot match"
# max_error = np.max(np.abs(mlir_result - tflite_result))
# print("Max error (%d): %f", i, max_error)
#
#
# class AlbertTfliteModuleTester:
# def __init__(
# self,
# dynamic=False,
# device="cpu",
# save_mlir=False,
# save_vmfb=False,
# ):
# self.dynamic = dynamic
# self.device = device
# self.save_mlir = save_mlir
# self.save_vmfb = save_vmfb
#
# def create_and_check_module(self):
# shark_args.save_mlir = self.save_mlir
# shark_args.save_vmfb = self.save_vmfb
#
# # Preprocess to get SharkImporter input args
# tflite_preprocessor = TFLitePreprocessor(model_name="albert_lite_base")
# raw_model_file_path = tflite_preprocessor.get_raw_model_file()
# inputs = tflite_preprocessor.get_inputs()
# tflite_interpreter = tflite_preprocessor.get_interpreter()
#
# # Use SharkImporter to get SharkInference input args
# my_shark_importer = SharkImporter(
# module=tflite_interpreter,
# inputs=inputs,
# frontend="tflite",
# raw_model_file=raw_model_file_path,
# )
# mlir_model, func_name = my_shark_importer.import_mlir()
#
# # Use SharkInference to get inference result
# shark_module = SharkInference(
# mlir_module=mlir_model,
# function_name=func_name,
# device=self.device,
# mlir_dialect="tflite",
# )
#
# # Case1: Use shark_importer default generate inputs
# shark_module.compile()
# mlir_results = shark_module.forward(inputs)
# ## post process results for compare
# # input_details, output_details = tflite_preprocessor.get_model_details()
# # mlir_results = list(mlir_results)
# # for i in range(len(output_details)):
# # dtype = output_details[i]["dtype"]
# # mlir_results[i] = mlir_results[i].astype(dtype)
# tflite_results = tflite_preprocessor.get_golden_output()
# compare_results(mlir_results, tflite_results)
# # import pdb
# # pdb.set_trace()
#
# # Case2: Use manually set inputs
# # input_details, output_details = tflite_preprocessor.get_model_details()
# input_details = [
# {
# "shape": [1, 384],
# "dtype": np.int32,
# },
# {
# "shape": [1, 384],
# "dtype": np.int32,
# },
# {
# "shape": [1, 384],
# "dtype": np.int32,
# },
# ]
# inputs = generate_inputs(input_details) # new inputs
#
# shark_module = SharkInference(
# mlir_module=mlir_model,
# function_name=func_name,
# device=self.device,
# mlir_dialect="tflite",
# )
# shark_module.compile()
# mlir_results = shark_module.forward(inputs)
# ## post process results for compare
# tflite_results = tflite_preprocessor.get_golden_output()
# compare_results(mlir_results, tflite_results)
# # print(mlir_results)
#
#
# class AlbertTfliteModuleTest(unittest.TestCase):
# @pytest.fixture(autouse=True)
# def configure(self, pytestconfig):
# self.save_mlir = pytestconfig.getoption("save_mlir")
# self.save_vmfb = pytestconfig.getoption("save_vmfb")
#
# def setUp(self):
# self.module_tester = AlbertTfliteModuleTester(self)
# self.module_tester.save_mlir = self.save_mlir
#
# import sys
#
# @pytest.mark.xfail(
# sys.platform == "darwin", reason="known macos tflite install issue"
# )
# def test_module_static_cpu(self):
# self.module_tester.dynamic = False
# self.module_tester.device = "cpu"
# self.module_tester.create_and_check_module()
# if __name__ == "__main__":
# module_tester = AlbertTfliteModuleTester()
# module_tester.save_mlir = True
# module_tester.save_vmfb = True
# module_tester.create_and_check_module()
# unittest.main()


@@ -0,0 +1,118 @@
import numpy as np
from shark.shark_downloader import download_tflite_model
from shark.shark_inference import SharkInference
import pytest
import unittest
from shark.parser import shark_args
# model_path = "https://tfhub.dev/tensorflow/lite-model/albert_lite_base/squadv1/1?lite-format=tflite"
# model_path = model_path
# Inputs modified to be useful albert inputs.
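# (Note added for clarity: the three [1, 384] int32 tensors are presumably
# input_ids, input_mask and segment_ids; the random/ones/zeros pattern below
# matches that reading, though the model metadata is not inspected here.)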
def generate_inputs(input_details):
for input in input_details:
print(str(input["shape"]), input["dtype"].__name__)
# [ 1 384] int32
# [ 1 384] int32
# [ 1 384] int32
args = []
args.append(
np.random.randint(
low=0,
high=256,
size=input_details[0]["shape"],
dtype=input_details[0]["dtype"],
)
)
args.append(
np.ones(
shape=input_details[1]["shape"], dtype=input_details[1]["dtype"]
)
)
args.append(
np.zeros(
shape=input_details[2]["shape"], dtype=input_details[2]["dtype"]
)
)
return args
def compare_results(mlir_results, tflite_results):
print("Compare mlir_results VS tflite_results: ")
assert len(mlir_results) == len(
tflite_results
), "Number of results do not match"
rtol = 1e-02
atol = 1e-03
print(
"numpy.allclose: ",
np.allclose(mlir_results, tflite_results, rtol, atol),
)
for i in range(len(mlir_results)):
mlir_result = mlir_results[i]
tflite_result = tflite_results[i]
mlir_result = mlir_result.astype(np.single)
tflite_result = tflite_result.astype(np.single)
assert mlir_result.shape == tflite_result.shape, "shape does not match"
max_error = np.max(np.abs(mlir_result - tflite_result))
print("Max error (%d): %f" % (i, max_error))
class AlbertTfliteModuleTester:
def __init__(
self,
dynamic=False,
device="cpu",
save_mlir=False,
save_vmfb=False,
):
self.dynamic = dynamic
self.device = device
self.save_mlir = save_mlir
self.save_vmfb = save_vmfb
def create_and_check_module(self):
shark_args.save_mlir = self.save_mlir
shark_args.save_vmfb = self.save_vmfb
(
mlir_model,
function_name,
inputs,
tflite_results,
) = download_tflite_model(model_name="albert_lite_base")
shark_module = SharkInference(
mlir_module=mlir_model,
function_name="main",
device=self.device,
mlir_dialect="tflite",
)
shark_module.compile()
mlir_results = shark_module.forward(inputs)
# print(shark_results)
compare_results(mlir_results, tflite_results)
class AlbertTfliteModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.save_mlir = pytestconfig.getoption("save_mlir")
self.save_vmfb = pytestconfig.getoption("save_vmfb")
def setUp(self):
self.module_tester = AlbertTfliteModuleTester(self)
self.module_tester.save_mlir = self.save_mlir
def test_module_static_cpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
if __name__ == "__main__":
unittest.main()
# module_tester = AlbertTfliteModuleTester()
# module_tester.create_and_check_module()


@@ -0,0 +1,115 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.shark_downloader import download_torch_model
import unittest
import numpy as np
import pytest
class AlexnetModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"alexnet", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert compare_tensors(act_out, results)
if self.benchmark:
shark_module.shark_runner.benchmark_all_csv(
(input),
"alexnet",
dynamic,
device,
"torch",
)
class AlexnetModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = AlexnetModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
@pytest.mark.xfail(
reason="Issue known, WIP",
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,97 @@
import numpy as np
from shark.shark_downloader import download_tflite_model
from shark.shark_inference import SharkInference
import pytest
import unittest
from shark.parser import shark_args
# model_path = "https://tfhub.dev/google/lite-model/magenta/arbitrary-image-stylization-v1-256/int8/prediction/1?lite-format=tflite"
def compare_results(mlir_results, tflite_results):
print("Compare mlir_results VS tflite_results: ")
assert len(mlir_results) == len(
tflite_results
), "Number of results do not match"
for i in range(len(mlir_results)):
mlir_result = mlir_results[i]
tflite_result = tflite_results[i]
mlir_result = mlir_result.astype(np.single)
tflite_result = tflite_result.astype(np.single)
mlir_result = np.expand_dims(mlir_result, axis=0)
print("mlir_result.shape", mlir_result.shape)
print("tflite_result.shape", tflite_result.shape)
assert mlir_result.shape == tflite_result.shape, "shape does not match"
max_error = np.max(np.abs(mlir_result - tflite_result))
print("Max error (%d): %f" % (i, max_error))
class ArbitraryImageStylizationV1TfliteModuleTester:
def __init__(
self,
dynamic=False,
device="cpu",
save_mlir=False,
save_vmfb=False,
):
self.dynamic = dynamic
self.device = device
self.save_mlir = save_mlir
self.save_vmfb = save_vmfb
def create_and_check_module(self):
shark_args.save_mlir = self.save_mlir
shark_args.save_vmfb = self.save_vmfb
(
mlir_model,
function_name,
inputs,
tflite_results,
) = download_tflite_model(
model_name="arbitrary-image-stylization-v1-256"
)
shark_module = SharkInference(
mlir_module=mlir_model,
function_name="main",
device=self.device,
mlir_dialect="tflite",
)
# Case1: Use shark_importer default generate inputs
shark_module.compile()
mlir_results = shark_module.forward(inputs)
# print(shark_results)
compare_results(mlir_results, tflite_results)
class ArbitraryImageStylizationV1TfliteModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.save_mlir = pytestconfig.getoption("save_mlir")
self.save_vmfb = pytestconfig.getoption("save_vmfb")
def setUp(self):
self.module_tester = ArbitraryImageStylizationV1TfliteModuleTester(
self
)
self.module_tester.save_mlir = self.save_mlir
import sys
@pytest.mark.xfail(
reason="'tosa.conv2d' op attribute 'quantization_info' failed ",
)
def test_module_static_cpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
if __name__ == "__main__":
# module_tester = ArbitraryImageStylizationV1TfliteModuleTester()
# module_tester.save_mlir = True
# module_tester.save_vmfb = True
# module_tester.create_and_check_module()
unittest.main()


@@ -0,0 +1,117 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.shark_downloader import download_torch_model
import torch
import unittest
import numpy as np
import pytest
class BertBaseUncasedModuleTester:
def __init__(
self,
save_mlir=False,
save_vmfb=False,
benchmark=False,
):
self.save_mlir = save_mlir
self.save_vmfb = save_vmfb
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"bert-base-cased", dynamic
)
# from shark.shark_importer import SharkImporter
# mlir_importer = SharkImporter(
# model,
# (input,),
# frontend="torch",
# )
# minilm_mlir, func_name = mlir_importer.import_mlir(
# is_dynamic=dynamic, tracing_required=True
# )
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert compare_tensors(act_out, results)
if self.benchmark:
shark_module.shark_runner.benchmark_all_csv(
(input),
"bert-base-cased",
dynamic,
device,
"torch",
)
class BertBaseUncasedModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = BertBaseUncasedModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,71 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
from shark.parser import shark_args
import unittest
import pytest
import numpy as np
class BertBaseUncasedModuleTester:
def __init__(
self,
benchmark=False,
onnx_bench=False,
):
self.benchmark = benchmark
self.onnx_bench = onnx_bench
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"bert-base-uncased"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class BertBaseUncasedModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = BertBaseUncasedModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,108 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from tank.model_utils import compare_tensors
from shark.shark_downloader import download_torch_model
from shark.parser import shark_args
import torch
import unittest
import numpy as np
import pytest
class BertBaseUncasedModuleTester:
def __init__(
self,
benchmark=False,
onnx_bench=False,
):
self.benchmark = benchmark
self.onnx_bench = onnx_bench
def create_and_check_module(self, dynamic, device):
model_mlir, func_name, input, act_out = download_torch_model(
"bert-base-uncased", dynamic
)
shark_module = SharkInference(
model_mlir,
func_name,
device=device,
mlir_dialect="linalg",
is_benchmark=self.benchmark,
)
shark_module.compile()
results = shark_module.forward(input)
assert compare_tensors(act_out, results)
if self.benchmark:
shark_args.onnx_bench = self.onnx_bench
shark_module.shark_runner.benchmark_all_csv(
(input),
"bert-base-uncased",
dynamic,
device,
"torch",
)
class BertBaseUncasedModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = BertBaseUncasedModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
self.module_tester.onnx_bench = pytestconfig.getoption("onnx_bench")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,182 @@
import numpy as np
from iree import runtime as ireert
from iree.tf.support import module_utils
from iree.compiler import tf as tfc
from iree.compiler import compile_str
import tensorflow as tf
try:
import tensorflow_datasets as tfds
import tensorflow_models as tfm
from official.nlp.modeling import layers
from official.nlp.modeling import networks
from official.nlp.modeling.models import bert_classifier
except ModuleNotFoundError:
print(
"tensorflow models or datasets not found please run the following command with your virtual env active:\npip install tf-models-nightly tf-datasets"
)
import json
import time
import os
gs_folder_bert = "gs://cloud-tpu-checkpoints/bert/v3/uncased_L-12_H-768_A-12"
tf.io.gfile.listdir(gs_folder_bert)
vocab_size = 100
NUM_CLASSES = 2
SEQUENCE_LENGTH = 128
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
dict_outputs = False
bert_config_file = os.path.join(gs_folder_bert, "bert_config.json")
config_dict = json.loads(tf.io.gfile.GFile(bert_config_file).read())
encoder_config = tfm.nlp.encoders.EncoderConfig(
{"type": "bert", "bert": config_dict}
)
bert_encoder = tfm.nlp.encoders.build_encoder(encoder_config)
# Create a BERT trainer with the created network.
bert_trainer_model = bert_classifier.BertClassifier(
bert_encoder, num_classes=NUM_CLASSES
)
bert_trainer_model.summary()
checkpoint = tf.train.Checkpoint(encoder=bert_encoder)
checkpoint.read(
os.path.join(gs_folder_bert, "bert_model.ckpt")
).assert_consumed()
# Invoke the trainer model on the inputs. This causes the layer to be built.
self.m = bert_trainer_model
self.m.predict = lambda x: self.m.call(x, training=False)
self.predict = tf.function(input_signature=[bert_input])(
self.m.predict
)
self.m.learn = lambda x, y: self.m.call(x, training=False)
self.loss = tf.keras.losses.SparseCategoricalCrossentropy()
self.optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)
@tf.function(
input_signature=[
bert_input, # inputs
tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32), # labels
]
)
def learn(self, inputs, labels):
with tf.GradientTape() as tape:
# Capture the gradients from forward prop...
probs = self.m.call(inputs, training=True)
loss = self.loss(labels, probs)
# ...and use them to update the model's weights.
variables = self.m.trainable_variables
gradients = tape.gradient(loss, variables)
self.optimizer.apply_gradients(zip(gradients, variables))
return loss
if __name__ == "__main__":
glue, info = tfds.load("glue/mrpc", with_info=True, batch_size=BATCH_SIZE)
tokenizer = tfm.nlp.layers.FastWordpieceBertTokenizer(
vocab_file=os.path.join(gs_folder_bert, "vocab.txt"), lower_case=True
)
max_seq_length = SEQUENCE_LENGTH
packer = tfm.nlp.layers.BertPackInputs(
seq_length=max_seq_length,
special_tokens_dict=tokenizer.get_special_tokens_dict(),
)
class BertInputProcessor(tf.keras.layers.Layer):
def __init__(self, tokenizer, packer):
super().__init__()
self.tokenizer = tokenizer
self.packer = packer
def call(self, inputs):
tok1 = self.tokenizer(inputs["sentence1"])
tok2 = self.tokenizer(inputs["sentence2"])
packed = self.packer([tok1, tok2])
if "label" in inputs:
return packed, inputs["label"]
else:
return packed
bert_inputs_processor = BertInputProcessor(tokenizer, packer)
glue_train = glue["train"].map(bert_inputs_processor).prefetch(1)
glue_validation = glue["validation"].map(bert_inputs_processor).prefetch(1)
glue_test = glue["test"].map(bert_inputs_processor).prefetch(1)
# base tensorflow model
bert_model = BertModule()
# Compile the model using IREE
compiler_module = tfc.compile_module(
bert_model, exported_names=["learn"], import_only=True
)
# choose from dylib-llvm-aot or cuda
backend = "dylib-llvm-aot"
if backend == "dylib-llvm-aot":
args = [
"--iree-llvm-target-cpu-features=host",
"--iree-mhlo-demote-i64-to-i32=false",
"--iree-flow-demote-i64-to-i32",
]
backend_config = "dylib"
else:
backend_config = "cuda"
args = [
"--iree-cuda-llvm-target-arch=sm_80",
"--iree-hal-cuda-disable-loop-nounroll-wa",
"--iree-enable-fusion-with-reduction-ops",
]
flatbuffer_blob = compile_str(
compiler_module,
target_backends=[backend],
extra_args=args,
input_type="mhlo",
)
# Load the compiled flatbuffer into the IREE runtime
vm_module = ireert.VmModule.from_flatbuffer(flatbuffer_blob)
tracer = ireert.Tracer(os.getcwd())
config = ireert.Config("local-sync", tracer)
ctx = ireert.SystemContext(config=config)
ctx.add_vm_module(vm_module)
BertCompiled = ctx.modules.module
# compare output losses:
iterations = 10
for i in range(iterations):
example_inputs, example_labels = next(iter(glue_train))
example_labels = tf.cast(example_labels, tf.int32)
example_inputs = [value for key, value in example_inputs.items()]
# iree version
iree_loss = BertCompiled.learn(
example_inputs, example_labels
).to_host()
# base tensorflow
tf_loss = np.array(bert_model.learn(example_inputs, example_labels))
print(np.allclose(iree_loss, tf_loss))


@@ -0,0 +1,131 @@
import numpy as np
from shark.shark_downloader import download_tflite_model
from shark.shark_inference import SharkInference
import pytest
import unittest
from shark.parser import shark_args
import os
import sys
import urllib.request
from PIL import Image
# model_path = "https://tfhub.dev/google/lite-model/aiy/vision/classifier/birds_V1/3?lite-format=tflite"
def generate_inputs(input_details):
# input_details shape: [ 1 224 224 3] type: uint8
exe_basename = os.path.basename(sys.argv[0])
workdir = os.path.join(os.path.dirname(__file__), "../tmp", exe_basename)
os.makedirs(workdir, exist_ok=True)
img_path = "https://github.com/google-coral/test_data/raw/master/bird.bmp"
local_path = "/".join([workdir, "bird.bmp"])
urllib.request.urlretrieve(img_path, local_path)
shape = input_details[0]["shape"]
im = np.array(Image.open(local_path).resize((shape[1], shape[2])))
args = [im.reshape(shape)]
return args
def compare_results(mlir_results, tflite_results):
print("Compare mlir_results VS tflite_results: ")
assert len(mlir_results) == len(
tflite_results
), "Number of results do not match"
for i in range(len(mlir_results)):
mlir_result = mlir_results[i]
tflite_result = tflite_results[i]
mlir_result = mlir_result.astype(np.single)
tflite_result = tflite_result.astype(np.single)
mlir_result = np.expand_dims(mlir_result, axis=0)
print("mlir_result.shape", mlir_result.shape)
print("tflite_result.shape", tflite_result.shape)
assert mlir_result.shape == tflite_result.shape, "shape does not match"
max_error = np.max(np.abs(mlir_result - tflite_result))
print("Max error (%d): %f" % (i, max_error))
class BirdsV1TfliteModuleTester:
def __init__(
self,
dynamic=False,
device="cpu",
save_mlir=False,
save_vmfb=False,
):
self.dynamic = dynamic
self.device = device
self.save_mlir = save_mlir
self.save_vmfb = save_vmfb
def create_and_check_module(self):
shark_args.save_mlir = self.save_mlir
shark_args.save_vmfb = self.save_vmfb
(
mlir_model,
function_name,
inputs,
tflite_results,
) = download_tflite_model(model_name="birds_V1")
shark_module = SharkInference(
mlir_module=mlir_model,
function_name="main",
device=self.device,
mlir_dialect="tflite",
)
# Case1: Use shark_importer default generate inputs
shark_module.compile()
mlir_results = shark_module.forward(inputs)
compare_results(mlir_results, tflite_results)
# Case2: Use manually set inputs
input_details = [
{
"shape": [1, 224, 224, 3],
"dtype": np.uint8,
}
]
inputs = generate_inputs(input_details) # device_inputs
shark_module = SharkInference(
mlir_module=mlir_model,
function_name="main",
device=self.device,
mlir_dialect="tflite",
)
shark_module.compile()
mlir_results = shark_module.forward(inputs)
compare_results(mlir_results, tflite_results)
# print(mlir_results)
class BirdsV1TfliteModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.save_mlir = pytestconfig.getoption("save_mlir")
self.save_vmfb = pytestconfig.getoption("save_vmfb")
def setUp(self):
self.module_tester = BirdsV1TfliteModuleTester(self)
self.module_tester.save_mlir = self.save_mlir
import sys
@pytest.mark.xfail(
reason="'tosa.conv2d' op attribute 'quantization_info' failed ",
)
def test_module_static_cpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
if __name__ == "__main__":
# module_tester = BirdsV1TfliteModuleTester()
# module_tester.save_mlir = True
# module_tester.save_vmfb = True
# module_tester.create_and_check_module()
unittest.main()


@@ -0,0 +1,68 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class CamemBertModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"camembert-base"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class CamemBertModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = CamemBertModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,88 @@
import numpy as np
from shark.shark_downloader import download_tflite_model
from shark.shark_inference import SharkInference
import pytest
import unittest
from shark.parser import shark_args
# model_path = "https://tfhub.dev/sayakpaul/lite-model/cartoongan/dr/1?lite-format=tflite"
def compare_results(mlir_results, tflite_results):
print("Compare mlir_results VS tflite_results: ")
assert len(mlir_results) == len(
tflite_results
), "Number of results do not match"
for i in range(len(mlir_results)):
mlir_result = mlir_results[i]
tflite_result = tflite_results[i]
mlir_result = mlir_result.astype(np.single)
tflite_result = tflite_result.astype(np.single)
mlir_result = np.expand_dims(mlir_result, axis=0)
print("mlir_result.shape", mlir_result.shape)
print("tflite_result.shape", tflite_result.shape)
assert mlir_result.shape == tflite_result.shape, "shape does not match"
max_error = np.max(np.abs(mlir_result - tflite_result))
print("Max error (%d): %f" % (i, max_error))
class CartoonganTfliteModuleTester:
def __init__(
self,
dynamic=False,
device="cpu",
save_mlir=False,
save_vmfb=False,
):
self.dynamic = dynamic
self.device = device
self.save_mlir = save_mlir
self.save_vmfb = save_vmfb
def create_and_check_module(self):
shark_args.save_mlir = self.save_mlir
shark_args.save_vmfb = self.save_vmfb
(
mlir_model,
function_name,
inputs,
tflite_results,
) = download_tflite_model(model_name="cartoongan")
shark_module = SharkInference(
mlir_module=mlir_model,
function_name="main",
device=self.device,
mlir_dialect="tflite",
)
# Case1: Use shark_importer default generate inputs
shark_module.compile()
mlir_results = shark_module.forward(inputs)
compare_results(mlir_results, tflite_results)
class CartoonganTfliteModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.save_mlir = pytestconfig.getoption("save_mlir")
self.save_vmfb = pytestconfig.getoption("save_vmfb")
def setUp(self):
self.module_tester = CartoonganTfliteModuleTester(self)
self.module_tester.save_mlir = self.save_mlir
def test_module_static_cpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
if __name__ == "__main__":
# module_tester = CartoonganTfliteModuleTester()
# module_tester.save_mlir = True
# module_tester.save_vmfb = True
# module_tester.create_and_check_module()
unittest.main()


@@ -0,0 +1,71 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
class ConvBertModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"dbmdz/convbert-base-turkish-cased"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
class ConvBertModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = ConvBertModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
@pytest.mark.xfail(
reason="Issue: https://github.com/iree-org/iree/issues/9971",
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,72 @@
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_tf_model
from shark.parser import shark_args
import iree.compiler as ireec
import unittest
import pytest
import numpy as np
import tempfile
import os
class DebertaBaseModuleTester:
def __init__(
self,
benchmark=False,
):
self.benchmark = benchmark
def create_and_check_module(self, dynamic, device):
model, func_name, inputs, golden_out = download_tf_model(
"microsoft/deberta-base"
)
shark_module = SharkInference(
model, func_name, device=device, mlir_dialect="mhlo"
)
shark_module.compile()
result = shark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
# Skip the whole module at collection time; the model can't be imported yet.
pytest.skip("Model can't be imported.", allow_module_level=True)
class DebertaBaseModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.module_tester = DebertaBaseModuleTester(self)
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("gpu"), reason=device_driver_info("gpu")
)
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skipif(
check_device_drivers("intel-gpu"),
reason=device_driver_info("intel-gpu"),
)
def test_module_static_intel_gpu(self):
dynamic = False
device = "intel-gpu"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,90 @@
import numpy as np
from shark.shark_downloader import download_tflite_model
from shark.shark_inference import SharkInference
import pytest
import unittest
from shark.parser import shark_args
# model_path = "https://tfhub.dev/google/lite-model/aiy/vision/classifier/birds_V1/3?lite-format=tflite"
def compare_results(mlir_results, tflite_results):
print("Compare mlir_results VS tflite_results: ")
assert len(mlir_results) == len(
tflite_results
), "Number of results do not match"
for i in range(len(mlir_results)):
mlir_result = mlir_results[i]
tflite_result = tflite_results[i]
mlir_result = mlir_result.astype(np.single)
tflite_result = tflite_result.astype(np.single)
mlir_result = np.expand_dims(mlir_result, axis=0)
print("mlir_result.shape", mlir_result.shape)
print("tflite_result.shape", tflite_result.shape)
assert mlir_result.shape == tflite_result.shape, "shape does not match"
max_error = np.max(np.abs(mlir_result - tflite_result))
print("Max error (%d): %f" % (i, max_error))
class DeepLabV3TfliteModuleTester:
def __init__(
self,
dynamic=False,
device="cpu",
save_mlir=False,
save_vmfb=False,
):
self.dynamic = dynamic
self.device = device
self.save_mlir = save_mlir
self.save_vmfb = save_vmfb
def create_and_check_module(self):
shark_args.save_mlir = self.save_mlir
shark_args.save_vmfb = self.save_vmfb
# preprocess to get SharkImporter input args
(
mlir_model,
function_name,
inputs,
tflite_results,
) = download_tflite_model(model_name="deeplabv3")
shark_module = SharkInference(
mlir_module=mlir_model,
function_name="main",
device=self.device,
mlir_dialect="tflite",
)
# Case1: Use shark_importer default generate inputs
shark_module.compile()
mlir_results = shark_module.forward(inputs)
compare_results(mlir_results, tflite_results)
class DeepLabV3TfliteModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.save_mlir = pytestconfig.getoption("save_mlir")
self.save_vmfb = pytestconfig.getoption("save_vmfb")
def setUp(self):
self.module_tester = DeepLabV3TfliteModuleTester(self)
self.module_tester.save_mlir = self.save_mlir
def test_module_static_cpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
if __name__ == "__main__":
# module_tester = DeepLabV3TfliteModuleTester()
# module_tester.save_mlir = True
# module_tester.save_vmfb = True
# module_tester.create_and_check_module()
unittest.main()

Some files were not shown because too many files have changed in this diff.