Compare commits


1 Commit

Author SHA1 Message Date
stanley
14a56ca9b0 Mini LM Loader Example
- Add example to load miniLM from SharkHUB and benchmark.
- Modify TF benchmark to have growing GPU allocation.
- Add shark_load helper function.
2022-06-15 02:57:42 +00:00
149 changed files with 10744 additions and 205 deletions

View File

@@ -1,37 +0,0 @@
# See: https://github.com/llvm/torch-mlir/issues/1374
name: Publish releases page
on:
workflow_dispatch:
jobs:
scrape_and_publish_releases:
name: "Scrape and publish releases"
runs-on: ubuntu-latest
# Don't run this in everyone's forks.
if: github.repository == 'nod-ai/SHARK'
steps:
- name: Checking out repository
uses: actions/checkout@v2
with:
token: ${{ secrets.NODAI_INVOCATION_TOKEN }}
- name: Run scrape releases script
run: python ./build_tools/scrape_releases.py nod-ai SHARK > /tmp/index.html
shell: bash
- run: git fetch --all
- run: git switch github-pages
- run: git config --global user.email "none@none.com"
- run: git config --global user.name "nod-team"
- run: mv /tmp/index.html package-index/index.html
- run: git add package-index/index.html
# Only try to make a commit if the file has changed.
- run: git diff --cached --exit-code || git commit -m "Update releases."
- name: GitHub Push
uses: ad-m/github-push-action@v0.6.0
with:
github_token: ${{ secrets.NODAI_INVOCATION_TOKEN }}
branch: github-pages

View File

@@ -11,12 +11,11 @@ on:
jobs:
build:
runs-on: a100
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.10"]
backend: [IREE, SHARK]
steps:
- uses: actions/checkout@v3
@@ -39,10 +38,6 @@ jobs:
tag_name="${package_version}"
echo "package_version=${package_version}" >> $GITHUB_ENV
echo "tag_name=${tag_name}" >> $GITHUB_ENV
- name: Set Environment Variables
run: |
echo "SHORT_SHA=`git rev-parse --short=4 HEAD`" >> $GITHUB_ENV
echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
- name: Create Release
id: create_release
uses: actions/create-release@v1
@@ -54,76 +49,43 @@ jobs:
body: |
Automatic snapshot release of nod.ai SHARK.
draft: true
prerelease: false
- name: Find Torch-MLIR Release
run: |
TM_HTML_URL="$(python3 -c "import urllib.request, json, sys; u=json.loads(urllib.request.urlopen('https://api.github.com/repos/llvm/torch-mlir/releases/latest').read().decode()).get('html_url', False); print(u) if u else sys.exit(1);")"
TM_RELEASE_DIR=${TM_HTML_URL/"tag"/"expanded_assets"}
echo "TM_RELEASE_DIR=${TM_RELEASE_DIR}" >> $GITHUB_ENV
prerelease: false
- name: Install dependencies
run: |
echo "Torch-MLIR Release DIR is ${{ env.TM_RELEASE_DIR }}"
python -m pip install --upgrade pip
python -m pip install flake8 pytest toml
if [ -f requirements.txt ]; then pip install -r requirements.txt -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/nod-ai/SHARK-Runtime/releases; fi
python -m pip install flake8 pytest yapf toml
if [ -f requirements.txt ]; then pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/SHARK-Runtime/releases; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude shark.venv,lit.cfg.py
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude shark.venv,lit.cfg.py
- name: Build and validate the IREE package
if: ${{ matrix.backend == 'IREE' }}
run: |
cd $GITHUB_WORKSPACE
USE_IREE=1 VENV_DIR=iree.venv ./setup_venv.sh
source iree.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/iree-org/iree/releases
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models
/bin/bash "$GITHUB_WORKSPACE/build_tools/populate_sharktank_ci.sh"
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" tank/test_models.py |
tail -n 1 |
tee -a pytest_results.txt
if !(grep -Fxq " failed" pytest_results.txt)
then
export SHA=$(git log -1 --format='%h')
gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/$SHA
gsutil -m cp -r gs://shark_tank/$SHA/* gs://shark_tank/latest/
fi
rm -rf ./wheelhouse/nodai*
yapf -i --style .style.yapf shark/*.py
- name: Build and validate the SHARK Runtime package
if: ${{ matrix.backend == 'SHARK' }}
- name: Build and validate the package
run: |
cd $GITHUB_WORKSPACE
./setup_venv.sh
IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f ${{ env.TM_RELEASE_DIR }} -f https://github.com/nod-ai/SHARK-Runtime/releases
pip wheel -v -w wheelhouse . --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/SHARK-Runtime/releases
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" tank/test_models.py |
tail -n 1 |
tee -a pytest_results.txt
pytest -k 'not benchmark' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/
- name: Upload Release Assets
if: ${{ matrix.backend == 'SHARK' }}
id: upload-release-assets
uses: dwenegar/upload-release-assets@v1
env:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:
release_id: ${{ steps.create_release.outputs.id }}
assets_path: ${GITHUB_WORKSPACE}/wheelhouse/nodai_*.whl
assets_path: ./wheelhouse/nodai_*.whl
- name: Publish Release
if: ${{ matrix.backend == 'SHARK' }}
id: publish_release
uses: eregon/publish-release@v1
env:

View File

@@ -1,7 +1,7 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: Validate Models on Shark Runtime
name: Validate torch-models on Shark Runtime
on:
push:
@@ -11,103 +11,92 @@ on:
workflow_dispatch:
jobs:
build-validate:
strategy:
fail-fast: true
matrix:
os: [icelake, a100, MacStudio, ubuntu-latest]
suite: [cpu,cuda,vulkan]
python-version: ["3.10"]
include:
- os: ubuntu-latest
suite: lint
exclude:
- os: ubuntu-latest
suite: vulkan
- os: ubuntu-latest
suite: cuda
- os: ubuntu-latest
suite: cpu
- os: MacStudio
suite: cuda
- os: MacStudio
suite: cpu
- os: MacStudio
suite: vulkan
- os: icelake
suite: vulkan
- os: icelake
suite: cuda
- os: a100
suite: cpu
build-linux:
runs-on: ${{ matrix.os }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.10"]
steps:
- uses: actions/checkout@v3
- name: Set Environment Variables
run: |
echo "SHORT_SHA=`git rev-parse --short=4 HEAD`" >> $GITHUB_ENV
echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
- name: Set up Python Version File ${{ matrix.python-version }}
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest' || matrix.os == 'icelake'
run: |
# See https://github.com/actions/setup-python/issues/433
echo ${{ matrix.python-version }} >> $GITHUB_WORKSPACE/.python-version
- name: Set up Python ${{ matrix.python-version }}
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest' || matrix.os == 'icelake'
uses: actions/setup-python@v4
uses: actions/setup-python@v3
with:
python-version: '${{ matrix.python-version }}'
#cache: 'pip'
#cache-dependency-path: |
# **/requirements-importer.txt
# **/requirements.txt
python-version: ${{ matrix.python-version }}
- name: Setup pip cache
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
- name: Install dependencies
if: matrix.suite == 'lint'
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest toml black
python -m pip install flake8 pytest yapf toml
- name: Lint with flake8
if: matrix.suite == 'lint'
run: |
# black format check
black --version
black --line-length 79 --check .
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude lit.cfg.py
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude lit.cfg.py
yapf -i --style .style.yapf shark/*.py
- name: Validate Models on CPU
if: matrix.suite == 'cpu'
- name: Validate Models
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k cpu
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv
pytest -k 'not benchmark' --ignore=tank/tf/ --ignore=shark/tests/test_shark_importer.py
perf-macOS:
runs-on: MacStudio
strategy:
fail-fast: false
matrix:
python-version: ["3.10"]
- name: Validate Models on NVIDIA GPU
if: matrix.suite == 'cuda'
steps:
- uses: actions/checkout@v3
- name: Validate Models dependencies
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
PYTHON=python3.10 IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --benchmark --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k cuda
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
pytest -k 'not benchmark' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=tank/tf/ --ignore=shark/tests/test_shark_importer.py
perf-linux:
runs-on: a100
timeout-minutes: 45
continue-on-error: true
strategy:
fail-fast: false
matrix:
python-version: ["3.10"]
- name: Validate Vulkan Models
if: matrix.suite == 'vulkan'
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Setup pip cache
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
- name: Validate Models
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/data/anush" tank/test_models.py -k vulkan
pytest --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/

1
.gitignore vendored
View File

@@ -162,7 +162,6 @@ cython_debug/
# Shark related artefacts
*venv/
shark_tmp/
# ORT related artefacts
cache_models/

4
.gitmodules vendored Normal file
View File

@@ -0,0 +1,4 @@
[submodule "inference/thirdparty/shark-runtime"]
path = inference/thirdparty/shark-runtime
url = https://github.com/nod-ai/SHARK-Runtime.git
branch = shark-06032022

3
.style.yapf Normal file
View File

@@ -0,0 +1,3 @@
[style]
based_on_style = google
column_limit = 80

260
README.md Normal file
View File

@@ -0,0 +1,260 @@
# SHARK
High Performance Machine Learning and Data Analytics for CPUs, GPUs, Accelerators and Heterogeneous Clusters
[![Nightly Release](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml)
[![Validate torch-models on Shark Runtime](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml)
## Communication Channels
* [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real time discussions with the SHARK team and other users
* [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bugs etc
## Installation
<details>
<summary>Installation (Linux and macOS)</summary>
### Setup a new pip Virtual Environment
This step sets up a new VirtualEnv for Python
```shell
python --version #Check you have 3.7->3.10 on Linux or 3.10 on macOS
python -m venv shark_venv
source shark_venv/bin/activate
# If you are using conda create and activate a new conda env
# Some older pip installs may not be able to handle the recent PyTorch deps
python -m pip install --upgrade pip
```
*macOS Metal* users please install https://sdk.lunarg.com/sdk/download/latest/mac/vulkan-sdk.dmg
### Install SHARK
This step pip installs SHARK and related packages on Linux Python 3.7, 3.8, 3.9, 3.10 and macOS Python 3.10
```shell
pip install nodai-shark -f https://github.com/nod-ai/SHARK/releases -f https://github.com/llvm/torch-mlir/releases -f https://github.com/nod-ai/shark-runtime/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
```
If you are on an Intel macOS machine you need this [workaround](https://github.com/nod-ai/SHARK/issues/102) for an upstream issue.
### Download and run Resnet50 sample
```shell
curl -O https://raw.githubusercontent.com/nod-ai/SHARK/main/shark/examples/shark_inference/resnet50_script.py
#Install deps for test script
pip install --pre torch torchvision torchaudio tqdm pillow --extra-index-url https://download.pytorch.org/whl/nightly/cpu
python ./resnet50_script.py --device="cpu" #use cuda or vulkan or metal
```
### Download and run BERT (MiniLM) sample
```shell
curl -O https://raw.githubusercontent.com/nod-ai/SHARK/main/shark/examples/shark_inference/minilm_jit.py
#Install deps for test script
pip install transformers torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu
python ./minilm_jit.py --device="cpu" #use cuda or vulkan or metal
```
</details>
<details>
<summary>Source Installation</summary>
## Check out the code
```shell
git clone https://github.com/nod-ai/SHARK.git
```
## Setup your Python VirtualEnvironment and Dependencies
```shell
# Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...).
./setup_venv.sh
# Please activate the venv after installation.
```
### Run a demo script
```shell
python -m shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
```
### Run all model tests on CPU/GPU/VULKAN/Metal
```shell
pytest shark/tests/models
# If on Linux for quicker results:
pytest shark/tests/models -n auto
```
### Run all model benchmark tests on CPU/GPU/VULKAN/Metal
```shell
pytest shark/tests/benchmarks
```
</details>
<details>
<summary>API Reference</summary>
### Shark Inference API
```
from shark.shark_inference import SharkInference
shark_module = SharkInference(
    module,          # the model class (torch.nn.Module)
    (input,),        # inputs to the model (must be torch tensors)
    dynamic=False,   # pass the input shapes as static (False) or dynamic (True)
    device="cpu",    # `cpu`, `gpu` or `vulkan` is supported
    jit_trace=False, # jit-trace the module with the given input, useful where jit.script doesn't work
)
shark_module.set_frontend("pytorch") # Use tensorflow, mhlo, linalg, tosa
shark_module.compile()
result = shark_module.forward(inputs)
```
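### Example demonstrating running a PyTorch model
The sketch below applies the API described above to a torchvision model. It is illustrative only: the model choice, input shape, and flag values are assumptions, and the constructor arguments mirror those used by the test code added in this change.
```
import torch
import torchvision.models as models
from shark.shark_inference import SharkInference

# Illustrative model and input; any torch.nn.Module that torch-mlir can lower works.
model = models.resnet18(pretrained=True).eval()
test_input = torch.randn(1, 3, 224, 224)

shark_module = SharkInference(model, (test_input,),
                              device="cpu",   # or "gpu" / "vulkan"
                              dynamic=False,
                              jit_trace=True)
shark_module.set_frontend("pytorch")
shark_module.compile()
result = shark_module.forward((test_input,))
print(result)
```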
### Example demonstrating running MHLO IR.
```
from shark.shark_inference import SharkInference
import numpy as np
mhlo_ir = r"""builtin.module {
func.func @forward(%arg0: tensor<1x4xf32>, %arg1: tensor<4x1xf32>) -> tensor<4x4xf32> {
%0 = chlo.broadcast_add %arg0, %arg1 : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor<4x4xf32>
%1 = "mhlo.abs"(%0) : (tensor<4x4xf32>) -> tensor<4x4xf32>
return %1 : tensor<4x4xf32>
}
}"""
arg0 = np.ones((1, 4)).astype(np.float32)
arg1 = np.ones((4, 1)).astype(np.float32)
shark_module = SharkInference(mhlo_ir, (arg0, arg1))
shark_module.set_frontend("mhlo")
shark_module.compile()
print(shark_module.forward((arg0, arg1)))
```
</details>
## Supported and Validated Models
<details>
<summary>PyTorch Models</summary>
### Huggingface PyTorch Models
| Hugging Face Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :heavy_check_mark: (JIT) | :heavy_check_mark: | | |
| Albert | :heavy_check_mark: (JIT) | :heavy_check_mark: | | |
| BigBird | :heavy_check_mark: (AOT) | | | |
| DistilBERT | :heavy_check_mark: (JIT) | :heavy_check_mark: | | |
| GPT2 | :x: (AOT) | | | |
### Torchvision Models
| TORCHVISION Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|--------------------|----------------------|----------|----------|-------------|
| AlexNet | :heavy_check_mark: (Script) | :heavy_check_mark: | :heavy_check_mark: | |
| DenseNet121 | :heavy_check_mark: (Script) | | | |
| MNasNet1_0 | :heavy_check_mark: (Script) | | | |
| MobileNetV2 | :heavy_check_mark: (Script) | | | |
| MobileNetV3 | :heavy_check_mark: (Script) | | | |
| Unet | :x: (Script) | | | |
| Resnet18 | :heavy_check_mark: (Script) | :heavy_check_mark: | :heavy_check_mark: | |
| Resnet50 | :heavy_check_mark: (Script) | :heavy_check_mark: | :heavy_check_mark: | |
| Resnet101 | :heavy_check_mark: (Script) | :heavy_check_mark: | :heavy_check_mark: | |
| Resnext50_32x4d | :heavy_check_mark: (Script) | | | |
| ShuffleNet_v2 | :x: (Script) | | | |
| SqueezeNet | :heavy_check_mark: (Script) | :heavy_check_mark: | :heavy_check_mark: | |
| EfficientNet | :heavy_check_mark: (Script) | | | |
| Regnet | :heavy_check_mark: (Script) | | | |
| Resnest | :x: (Script) | | | |
| Vision Transformer | :heavy_check_mark: (Script) | | | |
| VGG 16 | :heavy_check_mark: (Script) | :heavy_check_mark: | :heavy_check_mark: | |
| Wide Resnet | :heavy_check_mark: (Script) | :heavy_check_mark: | :heavy_check_mark: | |
| RAFT | :x: (JIT) | | | |
For more information refer to [MODEL TRACKING SHEET](https://docs.google.com/spreadsheets/d/15PcjKeHZIrB5LfDyuw7DGEEE8XnQEX2aX8lm8qbxV8A/edit#gid=0)
### PyTorch Training Models
| Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :x: | :x: | | |
| FullyConnected | :heavy_check_mark: | :heavy_check_mark: | | |
</details>
<details>
<summary>JAX Models</summary>
### JAX Models
| Models | JAX-MHLO lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| DALL-E | :x: | :x: | | |
| FullyConnected | :heavy_check_mark: | :heavy_check_mark: | | |
</details>
<details>
<summary>TFLite Models</summary>
### TFLite Models
| Models | TOSA/LinAlg | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :x: | :x: | | |
| FullyConnected | :heavy_check_mark: | :heavy_check_mark: | | |
</details>
<details>
<summary>TF Models</summary>
### Tensorflow Models
| Models | Torch-MLIR lowerable | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------------------|----------|----------|-------------|
| BERT | :x: | :x: | | |
| FullyConnected | :heavy_check_mark: | :heavy_check_mark: | | |
</details>
## Related Projects
<details>
<summary>IREE Project Channels</summary>
* [Upstream IREE issues](https://github.com/google/iree/issues): Feature requests,
bugs, and other work tracking
* [Upstream IREE Discord server](https://discord.gg/26P4xW4): Daily development
discussions with the core team and collaborators
* [iree-discuss email list](https://groups.google.com/forum/#!forum/iree-discuss):
Announcements, general and low-priority discussion
</details>
<details>
<summary>MLIR and Torch-MLIR Project Channels</summary>
* `#torch-mlir` channel on the LLVM [Discord](https://discord.gg/xS7Z362) - this is the most active communication channel
* Torch-MLIR Github issues [here](https://github.com/llvm/torch-mlir/issues)
* [`torch-mlir` section](https://llvm.discourse.group/c/projects-that-want-to-become-official-llvm-projects/torch-mlir/41) of LLVM Discourse
* Weekly meetings on Mondays 9AM PST. See [here](https://discourse.llvm.org/t/community-meeting-developer-hour-refactoring-recurring-meetings/62575) for more information.
* [MLIR topic within LLVM Discourse](https://llvm.discourse.group/c/llvm-project/mlir/31)
SHARK and IREE are enabled by and heavily rely on [MLIR](https://mlir.llvm.org).
</details>
## License
nod.ai SHARK is licensed under the terms of the Apache 2.0 License with LLVM Exceptions.
See [LICENSE](LICENSE) for more information.

0
benchmarks/__init__.py Normal file
View File

View File

@@ -0,0 +1,22 @@
import torch
from shark.parser import parser
from benchmarks.hf_transformer import SharkHFBenchmarkRunner
parser.add_argument(
"--model_name",
type=str,
required=True,
help=
"Specifies name of HF model to benchmark. (For exmaple \"microsoft/MiniLM-L12-H384-uncased\""
)
load_args, unknown = parser.parse_known_args()
if __name__ == "__main__":
model_name = load_args.model_name
test_input = torch.randint(2, (1, 128))
shark_module = SharkHFBenchmarkRunner(model_name, (test_input,),
jit_trace=True)
shark_module.benchmark_c()
shark_module.benchmark_python((test_input,))
shark_module.benchmark_torch(test_input)
shark_module.benchmark_onnx(test_input)

View File

@@ -0,0 +1,137 @@
import torch
from shark.shark_runner import SharkBenchmarkRunner
from shark.parser import shark_args
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from onnxruntime.transformers.benchmark import run_pytorch, run_tensorflow, run_onnxruntime
from onnxruntime.transformers.huggingface_models import MODELS
from onnxruntime.transformers.benchmark_helper import ConfigModifier, Precision
import os
import psutil
class OnnxFusionOptions(object):
def __init__(self):
self.disable_gelu = False
self.disable_layer_norm = False
self.disable_attention = False
self.disable_skip_layer_norm = False
self.disable_embed_layer_norm = False
self.disable_bias_skip_layer_norm = False
self.disable_bias_gelu = False
self.enable_gelu_approximation = False
self.use_mask_index = False
self.no_attention_mask = False
class HuggingFaceLanguage(torch.nn.Module):
def __init__(self, hf_model_name):
super().__init__()
self.model = AutoModelForSequenceClassification.from_pretrained(
hf_model_name, # The pretrained model.
num_labels=
2, # The number of output labels--2 for binary classification.
output_attentions=
False, # Whether the model returns attentions weights.
output_hidden_states=
False, # Whether the model returns all hidden-states.
torchscript=True,
)
def forward(self, tokens):
return self.model.forward(tokens)[0]
class SharkHFBenchmarkRunner(SharkBenchmarkRunner):
# SharkBenchmarkRunner-derived class with HF benchmarking capabilities.
def __init__(
self,
model_name: str,
input: tuple,
dynamic: bool = False,
device: str = None,
jit_trace: bool = False,
from_aot: bool = False,
frontend: str = "torch",
):
self.device = device if device is not None else shark_args.device
if self.device == "gpu":
raise ValueError(
"Currently GPU Benchmarking is not supported due to OOM from ORT."
)
self.model_name = model_name
model = HuggingFaceLanguage(model_name)
SharkBenchmarkRunner.__init__(self, model, input, dynamic, self.device,
jit_trace, from_aot, frontend)
def benchmark_torch(self, inputs):
use_gpu = self.device == "gpu"
# Set the model's layer number to automatic.
config_modifier = ConfigModifier(None)
num_threads = psutil.cpu_count(logical=False)
batch_sizes = [inputs.shape[0]]
sequence_lengths = [inputs.shape[-1]]
cache_dir = os.path.join(".", "cache_models")
verbose = False
result = run_pytorch(use_gpu, [self.model_name], None, config_modifier,
Precision.FLOAT32, num_threads, batch_sizes,
sequence_lengths, shark_args.num_iterations, False,
cache_dir, verbose)
print(
f"ONNX Pytorch-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
)
# TODO: Currently non-functional due to TF runtime error. There might be some issue with initializing TF.
def benchmark_tf(self, inputs):
use_gpu = self.device == "gpu"
# Set the model's layer number to automatic.
config_modifier = ConfigModifier(None)
num_threads = psutil.cpu_count(logical=False)
batch_sizes = [inputs.shape[0]]
sequence_lengths = [inputs.shape[-1]]
cache_dir = os.path.join(".", "cache_models")
verbose = False
result = run_tensorflow(use_gpu, [self.model_name], None,
config_modifier, Precision.FLOAT32, num_threads,
batch_sizes, sequence_lengths,
shark_args.num_iterations, cache_dir, verbose)
print(
f"ONNX TF-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
)
def benchmark_onnx(self, inputs):
if self.model_name not in MODELS:
print(
f"{self.model_name} is currently not supported in ORT's HF. Check \
https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/huggingface_models.py \
for currently supported models. Exiting benchmark ONNX.")
return
use_gpu = self.device == "gpu"
num_threads = psutil.cpu_count(logical=False)
batch_sizes = [inputs.shape[0]]
sequence_lengths = [inputs.shape[-1]]
cache_dir = os.path.join(".", "cache_models")
onnx_dir = os.path.join(".", "onnx_models")
verbose = False
input_counts = [1]
optimize_onnx = True
validate_onnx = False
disable_ort_io_binding = False
use_raw_attention_mask = True
model_fusion_statistics = {}
overwrite = False
model_source = "pt" #Either "pt" or "tf"
provider = None
config_modifier = ConfigModifier(None)
onnx_args = OnnxFusionOptions()
result = run_onnxruntime(
use_gpu, provider, [self.model_name], None, config_modifier,
Precision.FLOAT32, num_threads, batch_sizes, sequence_lengths,
shark_args.num_iterations, input_counts, optimize_onnx,
validate_onnx, cache_dir, onnx_dir, verbose, overwrite,
disable_ort_io_binding, use_raw_attention_mask,
model_fusion_statistics, model_source, onnx_args)
print(
f"ONNX ORT-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
)

View File

@@ -0,0 +1,210 @@
from shark.shark_inference import SharkInference
from shark.iree_utils import check_device_drivers
import torch
import tensorflow as tf
import numpy as np
import torchvision.models as models
from transformers import AutoModelForSequenceClassification, BertTokenizer, TFBertModel
import importlib
import pytest
import unittest
torch.manual_seed(0)
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
##################### Tensorflow Hugging Face LM Models ###################################
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
tf_bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32)
]
class TFHuggingFaceLanguage(tf.Module):
def __init__(self, hf_model_name):
super(TFHuggingFaceLanguage, self).__init__()
# Create a BERT trainer with the created network.
self.m = TFBertModel.from_pretrained(hf_model_name, from_pt=True)
# Invoke the trainer model on the inputs. This causes the layer to be built.
self.m.predict = lambda x, y, z: self.m.call(
input_ids=x, attention_mask=y, token_type_ids=z, training=False)
@tf.function(input_signature=tf_bert_input)
def forward(self, input_ids, attention_mask, token_type_ids):
return self.m.predict(input_ids, attention_mask, token_type_ids)
def get_TFhf_model(name):
model = TFHuggingFaceLanguage(name)
tokenizer = BertTokenizer.from_pretrained(name)
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text,
padding='max_length',
truncation=True,
max_length=MAX_SEQUENCE_LENGTH)
for key in encoded_input:
encoded_input[key] = tf.expand_dims(
tf.convert_to_tensor(encoded_input[key]), 0)
test_input = (encoded_input["input_ids"], encoded_input["attention_mask"],
encoded_input["token_type_ids"])
actual_out = model.forward(*test_input)
return model, test_input, actual_out
##################### Hugging Face LM Models ###################################
class HuggingFaceLanguage(torch.nn.Module):
def __init__(self, hf_model_name):
super().__init__()
self.model = AutoModelForSequenceClassification.from_pretrained(
hf_model_name, # The pretrained model.
num_labels=
2, # The number of output labels--2 for binary classification.
output_attentions=
False, # Whether the model returns attentions weights.
output_hidden_states=
False, # Whether the model returns all hidden-states.
torchscript=True,
)
def forward(self, tokens):
return self.model.forward(tokens)[0]
def get_hf_model(name):
model = HuggingFaceLanguage(name)
# TODO: Currently the test input is set to (1,128)
test_input = torch.randint(2, (1, 128))
actual_out = model(test_input)
return model, test_input, actual_out
################################################################################
##################### Torch Vision Models ###################################
class VisionModule(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
self.train(False)
def forward(self, input):
return self.model.forward(input)
def get_vision_model(torch_model):
model = VisionModule(torch_model)
# TODO: Currently the test input is set to (1, 3, 224, 224)
test_input = torch.randn(1, 3, 224, 224)
actual_out = model(test_input)
return model, test_input, actual_out
############################# Benchmark Tests ####################################
pytest_benchmark_param = pytest.mark.parametrize(
('dynamic', 'device'),
[
pytest.param(False, 'cpu'),
# TODO: Language models are failing for the dynamic case.
pytest.param(True, 'cpu', marks=pytest.mark.skip),
pytest.param(False,
'gpu',
marks=pytest.mark.skipif(check_device_drivers("gpu"),
reason="nvidia-smi not found")),
pytest.param(True,
'gpu',
marks=pytest.mark.skip),
pytest.param(
False,
'vulkan',
marks=pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)),
pytest.param(
True,
'vulkan',
marks=pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)),
])
@pytest.mark.skipif(importlib.util.find_spec("iree.tools") is None,
reason="Cannot find tools to import TF")
@pytest_benchmark_param
def test_bench_minilm_torch(dynamic, device):
model, test_input, act_out = get_hf_model(
"microsoft/MiniLM-L12-H384-uncased")
shark_module = SharkInference(model, (test_input,),
device=device,
dynamic=dynamic,
jit_trace=True,
benchmark_mode=True)
try:
# If benchmarking is successful, assert success/True.
shark_module.compile()
shark_module.benchmark_all((test_input,))
assert True
except Exception as e:
# If anything happens during benchmarking, assert False/failure.
assert False
@pytest.mark.skipif(importlib.util.find_spec("iree.tools") is None,
reason="Cannot find tools to import TF")
@pytest_benchmark_param
def test_bench_distilbert(dynamic, device):
model, test_input, act_out = get_TFhf_model("distilbert-base-uncased")
shark_module = SharkInference(model,
test_input,
device=device,
dynamic=dynamic,
jit_trace=True,
benchmark_mode=True)
try:
# If benchmarking is successful, assert success/True.
shark_module.set_frontend("tensorflow")
shark_module.compile()
shark_module.benchmark_all(test_input)
assert True
except Exception as e:
# If anything happens during benchmarking, assert False/failure.
assert False
@pytest.mark.skip(reason="XLM Roberta too large to test.")
@pytest_benchmark_param
def test_bench_xlm_roberta(dynamic, device):
model, test_input, act_out = get_TFhf_model("xlm-roberta-base")
shark_module = SharkInference(model,
test_input,
device=device,
dynamic=dynamic,
jit_trace=True,
benchmark_mode=True)
try:
# If benchmarking is successful, assert success/True.
shark_module.set_frontend("tensorflow")
shark_module.compile()
shark_module.benchmark_all(test_input)
assert True
except Exception as e:
# If anything happens during benchmarking, assert False/failure.
assert False

View File

@@ -0,0 +1,39 @@
import torch
from benchmarks.hf_transformer import SharkHFBenchmarkRunner
import importlib
import pytest
torch.manual_seed(0)
############################# HF Benchmark Tests ####################################
# Test running benchmark module without failing.
pytest_benchmark_param = pytest.mark.parametrize(
('dynamic', 'device'),
[
pytest.param(False, 'cpu'),
# TODO: Language models are failing for the dynamic case.
pytest.param(True, 'cpu', marks=pytest.mark.skip),
])
@pytest.mark.skipif(importlib.util.find_spec("onnxruntime") is None,
reason="Cannot find ONNXRUNTIME.")
@pytest_benchmark_param
def test_HFbench_minilm_torch(dynamic, device):
model_name = "bert-base-uncased"
test_input = torch.randint(2, (1, 128))
try:
shark_module = SharkHFBenchmarkRunner(model_name, (test_input,),
jit_trace=True,
dynamic=dynamic,
device=device)
shark_module.benchmark_c()
shark_module.benchmark_python((test_input,))
shark_module.benchmark_torch(test_input)
shark_module.benchmark_onnx(test_input)
# If benchmarking is successful, assert success/True.
assert True
except Exception as e:
# If anything happens during benchmarking, assert False/failure.
assert False

192
inference/CMakeLists.txt Normal file
View File

@@ -0,0 +1,192 @@
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_minimum_required(VERSION 3.17)
project(sharkbackend LANGUAGES C CXX)
#
# Options
#
option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON)
option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)
set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
#
# Dependencies
#
# FetchContent requires us to include the transitive closure of all
# repos that we depend on so that we can override the tags.
#
include(FetchContent)
FetchContent_Declare(
repo-common
GIT_REPOSITORY https://github.com/triton-inference-server/common.git
GIT_TAG ${TRITON_COMMON_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_Declare(
repo-core
GIT_REPOSITORY https://github.com/triton-inference-server/core.git
GIT_TAG ${TRITON_CORE_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_Declare(
repo-backend
GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
GIT_TAG ${TRITON_BACKEND_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_MakeAvailable(repo-common repo-core repo-backend)
#
# The backend must be built into a shared library. Use an ldscript to
# hide all symbols except for the TRITONBACKEND API.
#
configure_file(src/libtriton_dshark.ldscript libtriton_dshark.ldscript COPYONLY)
add_library(
triton-dshark-backend SHARED
src/dshark.cc
#src/dshark_driver_module.c
)
add_library(
SharkBackend::triton-dshark-backend ALIAS triton-dshark-backend
)
target_include_directories(
triton-dshark-backend
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
)
list(APPEND CMAKE_MODULE_PATH "${PROJECT_BINARY_DIR}/lib/cmake/mlir")
add_subdirectory(thirdparty/shark-runtime EXCLUDE_FROM_ALL)
target_link_libraries(triton-dshark-backend PRIVATE iree_base_base
iree_hal_hal
iree_hal_cuda_cuda
iree_hal_cuda_registration_registration
iree_hal_vmvx_registration_registration
iree_hal_dylib_registration_registration
iree_modules_hal_hal
iree_vm_vm
iree_vm_bytecode_module
iree_hal_local_loaders_system_library_loader
iree_hal_local_loaders_vmvx_module_loader
)
target_compile_features(triton-dshark-backend PRIVATE cxx_std_11)
target_link_libraries(
triton-dshark-backend
PRIVATE
triton-core-serverapi # from repo-core
triton-core-backendapi # from repo-core
triton-core-serverstub # from repo-core
triton-backend-utils # from repo-backend
)
if(WIN32)
set_target_properties(
triton-dshark-backend PROPERTIES
POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_dshark
)
else()
set_target_properties(
triton-dshark-backend PROPERTIES
POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_dshark
LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_dshark.ldscript
LINK_FLAGS "-Wl,--version-script libtriton_dshark.ldscript"
)
endif()
#
# Install
#
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/SharkBackend)
install(
TARGETS
triton-dshark-backend
EXPORT
triton-dshark-backend-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/dshark
RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/dshark
)
install(
EXPORT
triton-dshark-backend-targets
FILE
SharkBackendTargets.cmake
NAMESPACE
SharkBackend::
DESTINATION
${INSTALL_CONFIGDIR}
)
include(CMakePackageConfigHelpers)
configure_package_config_file(
${CMAKE_CURRENT_LIST_DIR}/cmake/SharkBackendConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/SharkBackendConfig.cmake
INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)
install(
FILES
${CMAKE_CURRENT_BINARY_DIR}/SharkBackendConfig.cmake
DESTINATION ${INSTALL_CONFIGDIR}
)
#
# Export from build tree
#
export(
EXPORT triton-dshark-backend-targets
FILE ${CMAKE_CURRENT_BINARY_DIR}/SharkBackendTargets.cmake
NAMESPACE SharkBackend::
)
export(PACKAGE SharkBackend)

100
inference/README.md Normal file
View File

@@ -0,0 +1,100 @@
# SHARK Triton Backend
The Triton backend for SHARK.
# Build
Install SHARK
```
git clone https://github.com/nod-ai/SHARK.git
# skip above step if dshark is already installed
cd SHARK/inference
```
Install dependencies
```
apt-get install patchelf rapidjson-dev python3-dev
git submodule update --init
```
Update the submodules of IREE
```
cd thirdparty/shark-runtime
git submodule update --init
```
Next, make the backend and install it
```
cd ../..
mkdir build && cd build
cmake -DTRITON_ENABLE_GPU=ON \
-DIREE_HAL_DRIVER_CUDA=ON \
-DIREE_TARGET_BACKEND_CUDA=ON \
-DMLIR_ENABLE_CUDA_RUNNER=ON \
-DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \
-DTRITON_BACKEND_REPO_TAG=r22.02 \
-DTRITON_CORE_REPO_TAG=r22.02 \
-DTRITON_COMMON_REPO_TAG=r22.02 ..
make install
```
# Incorporating into Triton
There are much more in-depth explanations of the following steps in Triton's documentation:
https://github.com/triton-inference-server/server/blob/main/docs/compose.md#triton-with-unsupported-and-custom-backends
There should be a file at /build/install/backends/dshark/libtriton_dshark.so. You will need to copy it into your Triton server image.
More documentation is in the link above, but to create the Docker image you need to run the `compose.py` command in the Triton server repo.
To build your image, first clone the tritonserver repo.
```
git clone https://github.com/triton-inference-server/server.git
```
Then run `compose.py` to generate a `Dockerfile.compose`
```
cd server
python3 compose.py --repoagent checksum --dry-run
```
Because dshark is a third-party backend, you will need to manually modify the generated `Dockerfile.compose` to include it. To do this, add the following `COPY` line to the `Dockerfile.compose` file produced.
The dshark backend will be located in the build folder from earlier, under `/build/install/backends`:
```
COPY /path/to/build/install/backends/dshark /opt/tritonserver/backends/dshark
```
Next run
```
docker build -t tritonserver_custom -f Dockerfile.compose .
docker run -it --gpus=1 --net=host -v/path/to/model_repos:/models tritonserver_custom:latest tritonserver --model-repository=/models
```
where `/path/to/model_repos` is where you are storing the models you want to run.
If you're not using GPUs, omit `--gpus=1`:
```
docker run -it --net=host -v/path/to/model_repos:/models tritonserver_custom:latest tritonserver --model-repository=/models
```
# Setting up a model
To include a model in your backend, add a directory with your model name to your model repository directory. Examples of models can be seen here: https://github.com/triton-inference-server/backend/tree/main/examples/model_repos/minimal_models
Make sure to adjust the inputs correctly in the `config.pbtxt` file, and save a VMFB file under `1/model.vmfb`; a minimal sketch of this layout follows.
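To make that layout concrete, here is a small illustrative helper (not part of this change); the model name, tensor names, data types, and dims are placeholders and must be adjusted to match the model you compiled to VMFB.
```
# Illustrative scaffold for a Triton model-repository entry using the dshark backend.
# All names, dtypes, and dims below are placeholders.
import os
import shutil

def scaffold_model_repo(repo_dir, model_name, vmfb_path):
    model_dir = os.path.join(repo_dir, model_name)
    version_dir = os.path.join(model_dir, "1")
    os.makedirs(version_dir, exist_ok=True)

    # Minimal config.pbtxt; adjust input/output names, types, and shapes.
    config = (
        f'name: "{model_name}"\n'
        'backend: "dshark"\n'
        'max_batch_size: 0\n'
        'input [ { name: "input0", data_type: TYPE_INT64, dims: [ 1, 128 ] } ]\n'
        'output [ { name: "output0", data_type: TYPE_FP32, dims: [ 1, 2 ] } ]\n'
    )
    with open(os.path.join(model_dir, "config.pbtxt"), "w") as f:
        f.write(config)

    # The compiled module goes under <model_name>/1/model.vmfb.
    shutil.copy(vmfb_path, os.path.join(version_dir, "model.vmfb"))

# Example: scaffold_model_repo("/path/to/model_repos", "minilm", "./minilm.vmfb")
```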
# CUDA
If you're having issues with CUDA, make sure the correct drivers are installed, that `nvidia-smi` works, and that the `nvcc` compiler is on the path; a quick check is sketched below.
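For convenience, an illustrative check (not part of this change) using only the Python standard library:
```
# Sanity-check the CUDA prerequisites mentioned above (illustrative helper).
import shutil
import subprocess

def check_cuda_setup():
    ok = True
    for tool, args in (("nvidia-smi", []), ("nvcc", ["--version"])):
        if shutil.which(tool) is None:
            print(f"{tool} not found on PATH")
            ok = False
            continue
        out = subprocess.run([tool] + args, capture_output=True, text=True)
        first_line = out.stdout.splitlines()[0] if out.stdout else "(no output)"
        print(f"{tool}: {first_line}")
    return ok

if __name__ == "__main__":
    check_cuda_setup()
```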

View File

@@ -0,0 +1,39 @@
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(CMakeFindDependencyMacro)
get_filename_component(
SHARKBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
)
list(APPEND CMAKE_MODULE_PATH ${SHARKBACKEND_CMAKE_DIR})
if(NOT TARGET SharkBackend::triton-dshark-backend)
include("${SHARKBACKEND_CMAKE_DIR}/SharkBackendTargets.cmake")
endif()
set(SHARKBACKEND_LIBRARIES SharkBackend::triton-dshark-backend)

1462
inference/src/dshark.cc Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,30 @@
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
{
global:
TRITONBACKEND_*;
local: *;
};

View File

@@ -1,45 +0,0 @@
<!DOCTYPE html>
<html>
<body>
<a href='https://github.com/nod-ai/SHARK/releases/download/20230130.481/shark_sd_20230130_481.exe'>shark_sd_20230130_481.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230130.481/shark_sd_cli_20230130_481.exe'>shark_sd_cli_20230130_481.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.479/shark_sd_20230129_479.exe'>shark_sd_20230129_479.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.479/shark_sd_cli_20230129_479.exe'>shark_sd_cli_20230129_479.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.480/shark_sd_20230129_480.exe'>shark_sd_20230129_480.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.480/shark_sd_cli_20230129_480.exe'>shark_sd_cli_20230129_480.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.478/shark_sd_20230129_478.exe'>shark_sd_20230129_478.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230129.478/shark_sd_cli_20230129_478.exe'>shark_sd_cli_20230129_478.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230128.477/shark_sd_20230128_477.exe'>shark_sd_20230128_477.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230128.477/shark_sd_cli_20230128_477.exe'>shark_sd_cli_20230128_477.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230127.476/shark_sd_20230127_476.exe'>shark_sd_20230127_476.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230127.476/shark_sd_cli_20230127_476.exe'>shark_sd_cli_20230127_476.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230126.475/shark_sd_20230126_475.exe'>shark_sd_20230126_475.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230126.475/shark_sd_cli_20230126_475.exe'>shark_sd_cli_20230126_475.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.474/shark_sd_20230125_474.exe'>shark_sd_20230125_474.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.474/shark_sd_cli_20230125_474.exe'>shark_sd_cli_20230125_474.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.473/shark_sd_20230125_473.exe'>shark_sd_20230125_473.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.473/shark_sd_cli_20230125_473.exe'>shark_sd_cli_20230125_473.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.472/shark_sd_20230125_472.exe'>shark_sd_20230125_472.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.471/shark_sd_20230125_471.exe'>shark_sd_20230125_471.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230125.468/shark_sd_20230125_468.exe'>shark_sd_20230125_468.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.470/shark_sd_20230124_470.exe'>shark_sd_20230124_470.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.470/shark_sd_cli_20230124_470.exe'>shark_sd_cli_20230124_470.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.469/shark_sd_20230124_469.exe'>shark_sd_20230124_469.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.467/shark_sd_20230124_467.exe'>shark_sd_20230124_467.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.466/shark_sd_20230124_466.exe'>shark_sd_20230124_466.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230124.462/shark_sd_20230124_462.exe'>shark_sd_20230124_462.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230123.461/shark_sd_20230123_461.exe'>shark_sd_20230123_461.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230123.460/shark_sd_20230123_460.exe'>shark_sd_20230123_460.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230122.459/shark_sd_20230122_459.exe'>shark_sd_20230122_459.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230122.458/shark_sd_20230122_458.exe'>shark_sd_20230122_458.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230122.457/shark_sd_20230122_457.exe'>shark_sd_20230122_457.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230121.456/shark_sd_20230121_456.exe'>shark_sd_20230121_456.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230120.455/shark_sd_20230120_455.exe'>shark_sd_20230120_455.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230119.454/shark_sd_20230119_454.exe'>shark_sd_20230119_454.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230118.453/shark_sd_20230118_453.exe'>shark_sd_20230118_453.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230117.452/shark_sd_20230117_452.exe'>shark_sd_20230117_452.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230116.451/shark_sd_20230116_451.exe'>shark_sd_20230116_451.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230115.450/shark_sd_20230115_450.exe'>shark_sd_20230115_450.exe</a><br />
<a href='https://github.com/nod-ai/SHARK/releases/download/20230114.449/shark_sd_20230114_449.exe'>shark_sd_20230114_449.exe</a><br />
</body>
</html>

12
pyproject.toml Normal file
View File

@@ -0,0 +1,12 @@
[build-system]
requires = [
"setuptools>=42",
"wheel",
"packaging",
"numpy==1.22.4",
"torch-mlir>=20220428.420",
"iree-compiler>=20220427.13",
"iree-runtime>=20220427.13",
]
build-backend = "setuptools.build_meta"

3
pytest.ini Normal file
View File

@@ -0,0 +1,3 @@
[pytest]
addopts = --verbose -p no:warnings
norecursedirs = inference tank/tflite

View File

@@ -0,0 +1,40 @@
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
--pre
numpy
torch
torchvision
tqdm
#iree-compiler | iree-runtime should already be installed
#these don't work on macOS
#iree-tools-tflite
#iree-tools-xla
#iree-tools-tf
# TensorFlow and JAX.
gin-config
tensorflow-macos
tensorflow-metal
#tf-models-nightly
#tensorflow-text-nightly
transformers==4.18.0
#jax[cpu]
# tflitehub dependencies.
Pillow
# Testing and support.
#lit
#pyyaml
#ONNX and ORT for benchmarking
#--extra-index-url https://test.pypi.org/simple/
#protobuf
#coloredlogs
#flatbuffers
#sympy
#psutil
#onnx-weekly
#ort-nightly

39
requirements-importer.txt Normal file
View File

@@ -0,0 +1,39 @@
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
--pre
numpy==1.22.4
torch
torchvision
tqdm
#iree-compiler | iree-runtime should already be installed
iree-tools-tflite
iree-tools-xla
iree-tools-tf
# TensorFlow and JAX.
gin-config
tensorflow
tf-models-nightly
tensorflow-text-nightly
transformers==4.18.0
#jax[cpu]
# tflitehub dependencies.
Pillow
# Testing and support.
lit
pyyaml
#ONNX and ORT for benchmarking
--extra-index-url https://test.pypi.org/simple/
protobuf
coloredlogs
flatbuffers
sympy
psutil
onnx-weekly
ort-nightly

9
requirements.txt Normal file
View File

@@ -0,0 +1,9 @@
setuptools
wheel
#SHARK Runner
tqdm
#Testing
pytest
pytest-xdist

38
setup.py Normal file
View File

@@ -0,0 +1,38 @@
from setuptools import find_packages
from setuptools import setup
import os
with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()
PACKAGE_VERSION = os.environ.get("SHARK_PACKAGE_VERSION") or "0.0.4"
setup(
name="nodai-SHARK",
version=f"{PACKAGE_VERSION}",
description="SHARK provides a High Performance Machine Learning Framework",
author="nod.ai",
author_email="stdin@nod.ai",
url="https://nod.ai",
long_description=long_description,
long_description_content_type="text/markdown",
project_urls={
"Code": "https://github.com/nod-ai/SHARK",
"Bug Tracker": "https://github.com/nod-ai/SHARK/issues",
},
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
packages=find_packages(exclude=('examples',)),
python_requires=">=3.7",
install_requires=[
"numpy",
"PyYAML",
"torch-mlir>=20220428.420",
"iree-compiler>=20220427.13",
"iree-runtime>=20220427.13",
],
)

115
setup_venv.sh Executable file
View File

@@ -0,0 +1,115 @@
#!/bin/bash
# Sets up a venv suitable for running samples.
# e.g:
# ./setup_venv.sh #setup a default $PYTHON3 shark.venv
# Environment variables used by the script:
# PYTHON=$PYTHON3.10 ./setup_venv.sh #pass a version of $PYTHON to use
# VENV_DIR=myshark.venv #create a venv called myshark.venv
# USE_IREE=1 #use stock IREE instead of Nod.ai's SHARK build
# IMPORTER=1 #Install importer deps
# if you run the script from a conda env it will install in your conda env
TD="$(cd $(dirname $0) && pwd)"
if [ -z "$PYTHON" ]; then
PYTHON="$(which python3)"
fi
function die() {
echo "Error executing command: $*"
exit 1
}
PYTHON_VERSION_X_Y=`${PYTHON} -c 'import sys; version=sys.version_info[:2]; print("{0}.{1}".format(*version))'`
echo "Python: $PYTHON"
echo "Python version: $PYTHON_VERSION_X_Y"
if [[ -z "${CONDA_PREFIX}" ]]; then
# Not a conda env. So create a new VENV dir
VENV_DIR=${VENV_DIR:-shark.venv}
echo "Using pip venv.. Setting up venv dir: $VENV_DIR"
$PYTHON -m venv "$VENV_DIR" || die "Could not create venv."
source "$VENV_DIR/bin/activate" || die "Could not activate venv"
PYTHON="$(which python3)"
else
echo "Found conda env $CONDA_DEFAULT_ENV. Running pip install inside the conda env"
fi
Red=`tput setaf 1`
Green=`tput setaf 2`
Yellow=`tput setaf 3`
# Assume no binary torch-mlir.
# Currently available for macOS M1 & Intel (3.10) and Linux (3.7, 3.8, 3.9, 3.10)
torch_mlir_bin=false
if [[ $(uname -s) = 'Darwin' ]]; then
echo "${Yellow}Apple macOS detected"
if [[ $(uname -m) == 'arm64' ]]; then
echo "${Yellow}Apple M1 Detected"
hash rustc 2>/dev/null
if [ $? -eq 0 ];then
echo "${Green}rustc found to compile HF tokenizers"
else
echo "${Red}Could not find rustc" >&2
echo "${Red}Please run:"
echo "${Red}curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh"
exit 1
fi
fi
echo "${Yellow}Run the following commands to setup your SSL certs for your Python version if you see SSL errors with tests"
echo "${Yellow}/Applications/Python\ 3.XX/Install\ Certificates.command"
if [ "$PYTHON_VERSION_X_Y" == "3.10" ]; then
torch_mlir_bin=true
fi
elif [[ $(uname -s) = 'Linux' ]]; then
echo "${Yellow}Linux detected"
if [ "$PYTHON_VERSION_X_Y" == "3.7" ] || [ "$PYTHON_VERSION_X_Y" == "3.8" ] || [ "$PYTHON_VERSION_X_Y" == "3.9" ] || [ "$PYTHON_VERSION_X_Y" == "3.10" ] ; then
torch_mlir_bin=true
fi
else
echo "${Red}OS not detected. Pray and Play"
fi
# Upgrade pip and install requirements.
$PYTHON -m pip install --upgrade pip || die "Could not upgrade pip"
$PYTHON -m pip install --upgrade -r "$TD/requirements.txt"
if [ "$torch_mlir_bin" = true ]; then
$PYTHON -m pip install --find-links https://github.com/llvm/torch-mlir/releases torch-mlir --extra-index-url https://download.pytorch.org/whl/nightly/cpu
if [ $? -eq 0 ];then
echo "Successfully Installed torch-mlir"
else
echo "Could not install torch-mlir" >&2
fi
else
echo "${Red}No binaries found for Python $PYTHON_VERSION_X_Y on $(uname -s)"
echo "${Yello}Python 3.10 supported on macOS and 3.7,3.8,3.9 and 3.10 on Linux"
echo "${Red}Please build torch-mlir from source in your environment"
exit 1
fi
if [[ -z "${USE_IREE}" ]]; then
RUNTIME="nod-ai/SHARK-Runtime"
else
RUNTIME="google/iree"
fi
echo "Installing ${RUNTIME}..."
$PYTHON -m pip install --find-links https://github.com/${RUNTIME}/releases iree-compiler iree-runtime
if [[ ! -z "${IMPORTER}" ]]; then
echo "${Yellow}Installing importer tools.."
if [[ $(uname -s) = 'Linux' ]]; then
echo "${Yellow}Linux detected.. installing Linux importer tools"
$PYTHON -m pip install --upgrade -r "$TD/requirements-importer.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://test.pypi.org/simple/ --extra-index-url https://download.pytorch.org/whl/nightly/cpu
elif [[ $(uname -s) = 'Darwin' ]]; then
echo "${Yellow}macOS detected.. installing macOS importer tools"
# Conda seems to have some problems installing these packages; hopefully they get resolved upstream.
$PYTHON -m pip install --upgrade -r "$TD/requirements-importer-macos.txt" -f https://github.com/${RUNTIME}/releases --extra-index-url https://download.pytorch.org/whl/nightly/cpu
fi
fi
$PYTHON -m pip install -e . --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://github.com/llvm/torch-mlir/releases -f https://github.com/${RUNTIME}/releases
if [[ -z "${CONDA_PREFIX}" ]]; then
echo "${Green}Before running examples activate venv with:"
echo " ${Green}source $VENV_DIR/bin/activate"
fi

0
shark/__init__.py Normal file
View File

72
shark/backward_makefx.py Normal file
View File

@@ -0,0 +1,72 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch._decomp import get_decompositions
from torch.fx.experimental.proxy_tensor import make_fx
from torch.nn.utils import _stateless
from torch import fx
import copy
import tempfile
class MakeFxModule:
def __init__(self, model, inputs, labels=None, custom_inference_fn=None):
self.model = model
self.inputs = inputs
self.custom_inference_fn = custom_inference_fn
self.training_graph = None
# Doesn't replace the None type.
def change_fx_graph_return_to_tuple(self, fx_g: fx.GraphModule):
for node in fx_g.graph.nodes:
if node.op == "output":
# output nodes always have one argument
node_arg = node.args[0]
out_nodes = []
if isinstance(node_arg, list):
# Don't return NoneType elements.
for out_node in node_arg:
if not isinstance(out_node, type(None)):
out_nodes.append(out_node)
# If there is a single tensor/element to be returned, don't
# wrap it in a tuple.
if len(out_nodes) == 1:
node.args = out_nodes
else:
node.args = (tuple(out_nodes),)
fx_g.graph.lint()
fx_g.recompile()
return fx_g
def generate_graph(self):
fx_g = make_fx(self.custom_inference_fn,
decomposition_table=get_decompositions([
torch.ops.aten.embedding_dense_backward,
torch.ops.aten.native_layer_norm_backward,
torch.ops.aten.slice_backward,
torch.ops.aten.select_backward
]))(dict(self.model.named_parameters()),
dict(self.model.named_buffers()), self.inputs)
fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
fx_g.recompile()
fx_g = self.change_fx_graph_return_to_tuple(fx_g)
ts_g = torch.jit.script(fx_g)
temp = tempfile.NamedTemporaryFile(suffix='_shark_ts',
prefix='temp_ts_')
ts_g.save(temp.name)
new_ts = torch.jit.load(temp.name)
self.training_graph = new_ts
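This diff ships MakeFxModule without a caller, so the expected shape of custom_inference_fn is easy to miss: generate_graph invokes it through make_fx with the model's parameter dict, buffer dict and the stored inputs, in that order. Below is a minimal, hypothetical sketch of wiring a toy training step through it; the Linear model, the loss and the _stateless.functional_call usage are illustrative assumptions, not part of this change.

import torch
from torch.nn.utils import _stateless
from shark.backward_makefx import MakeFxModule

# Toy stand-ins (assumptions for this sketch only).
model = torch.nn.Linear(4, 2)
example_input = torch.randn(1, 4)

def train_step(params, buffers, args):
    # Functional forward pass followed by backward; make_fx traces both,
    # which is why generate_graph registers backward-op decompositions.
    loss = _stateless.functional_call(model, {**params, **buffers},
                                      (args,)).sum()
    loss.backward()
    return loss

builder = MakeFxModule(model, example_input, custom_inference_fn=train_step)
builder.generate_graph()             # populates builder.training_graph
scripted = builder.training_graph    # TorchScript module produced via make_fx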

78
shark/cuda_utils.py Normal file
View File

@@ -0,0 +1,78 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import ctypes
# Some constants taken from cuda.h
CUDA_SUCCESS = 0
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
def get_cuda_sm_cc():
libnames = ('libcuda.so', 'libcuda.dylib', 'cuda.dll')
for libname in libnames:
try:
cuda = ctypes.CDLL(libname)
except OSError:
continue
else:
break
else:
raise OSError("could not load any of: " + ' '.join(libnames))
nGpus = ctypes.c_int()
name = b' ' * 100
cc_major = ctypes.c_int()
cc_minor = ctypes.c_int()
result = ctypes.c_int()
device = ctypes.c_int()
context = ctypes.c_void_p()
error_str = ctypes.c_char_p()
result = cuda.cuInit(0)
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
print("cuInit failed with error code %d: %s" %
(result, error_str.value.decode()))
return 1
result = cuda.cuDeviceGetCount(ctypes.byref(nGpus))
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
print("cuDeviceGetCount failed with error code %d: %s" %
(result, error_str.value.decode()))
return 1
print("Found %d device(s)." % nGpus.value)
for i in range(nGpus.value):
result = cuda.cuDeviceGet(ctypes.byref(device), i)
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
print("cuDeviceGet failed with error code %d: %s" %
(result, error_str.value.decode()))
return 1
print("Device: %d" % i)
if cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name),
device) == CUDA_SUCCESS:
print(" Name: %s" % (name.split(b'\0', 1)[0].decode()))
if cuda.cuDeviceComputeCapability(ctypes.byref(cc_major),
ctypes.byref(cc_minor),
device) == CUDA_SUCCESS:
print(" Compute Capability: %d.%d" %
(cc_major.value, cc_minor.value))
sm = f"sm_{cc_major.value}{cc_minor.value}"
return sm
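get_cuda_sm_cc returns a string such as "sm_86" on success and the integer 1 when a driver query fails, so callers should check the type. A hypothetical usage sketch (not part of this diff) that reports the local GPU's CUDA target:

from shark.cuda_utils import get_cuda_sm_cc

# Query the local GPU's compute capability before choosing a CUDA target.
sm = get_cuda_sm_cc()
if isinstance(sm, str):   # e.g. "sm_86" on an RTX 3080
    print(f"Targeting CUDA architecture {sm}")
else:                     # the helper returned its error code (1)
    print("CUDA query failed; falling back to a default architecture")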

View File

@@ -0,0 +1,300 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mlevental/miniconda3/envs/torch-mlir/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"# standard imports\n",
"import torch\n",
"from shark.iree_utils import get_iree_compiled_module"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"# torch dynamo related imports\n",
"try:\n",
" import torchdynamo\n",
" from torchdynamo.optimizations.backends import create_backend\n",
" from torchdynamo.optimizations.subgraph import SubGraph\n",
"except ModuleNotFoundError:\n",
" print(\"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo\")\n",
" exit()\n",
"\n",
"# torch-mlir imports for compiling\n",
"from torch_mlir import compile, OutputType"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"[TorchDynamo](https://github.com/pytorch/torchdynamo) is a compiler for PyTorch programs that uses the [frame evaluation API](https://www.python.org/dev/peps/pep-0523/) in CPython to dynamically modify Python bytecode right before it is executed. It creates this FX Graph through bytecode analysis and is designed to mix Python execution with compiled backends."
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"def toy_example(*args):\n",
" a, b = args\n",
"\n",
" x = a / (torch.abs(a) + 1)\n",
" if b.sum() < 0:\n",
" b = b * -1\n",
" return x * b"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [],
"source": [
"# compiler that lowers fx_graph to through MLIR\n",
"def __torch_mlir(fx_graph, *args, **kwargs):\n",
" assert isinstance(\n",
" fx_graph, torch.fx.GraphModule\n",
" ), \"Model must be an FX GraphModule.\"\n",
"\n",
" def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule):\n",
" \"\"\"Replace tuple with tuple element in functions that return one-element tuples.\"\"\"\n",
"\n",
" for node in fx_g.graph.nodes:\n",
" if node.op == \"output\":\n",
" assert len(node.args) == 1, \"Output node must have a single argument\"\n",
" node_arg = node.args[0]\n",
" if isinstance(node_arg, tuple) and len(node_arg) == 1:\n",
" node.args = (node_arg[0],)\n",
" fx_g.graph.lint()\n",
" fx_g.recompile()\n",
" return fx_g\n",
"\n",
" fx_graph = _unwrap_single_tuple_return(fx_graph)\n",
" ts_graph = torch.jit.script(fx_graph)\n",
"\n",
" # torchdynamo does munges the args differently depending on whether you use\n",
" # the @torchdynamo.optimize decorator or the context manager\n",
" if isinstance(args, tuple):\n",
" args = list(args)\n",
" assert isinstance(args, list)\n",
" if len(args) == 1 and isinstance(args[0], list):\n",
" args = args[0]\n",
"\n",
" linalg_module = compile(ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS)\n",
" callable, _ = get_iree_compiled_module(linalg_module, \"cuda\", func_name=\"forward\")\n",
"\n",
" def forward(*inputs):\n",
" return callable(*inputs)\n",
"\n",
" return forward"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"Simplest way to use TorchDynamo with the `torchdynamo.optimize` context manager:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 1 device(s).\n",
"Device: 0\n",
" Name: NVIDIA GeForce RTX 3080\n",
" Compute Capability: 8.6\n",
"[-0.40066046 -0.4210303 0.03225489 -0.44849953 0.10370405 -0.04422468\n",
" 0.33262825 -0.20109026 0.02102537 -0.24882983]\n",
"[-0.07824923 -0.17004533 0.06439921 -0.06163602 0.26633525 -1.1560082\n",
" -0.06660341 0.24227881 0.1462235 -0.32055548]\n",
"[-0.01464001 0.442209 -0.0607936 -0.5477967 -0.25226554 -0.08588809\n",
" -0.30497575 0.00061084 -0.50069696 0.2317973 ]\n",
"[ 0.25726247 0.39388427 -0.24093066 0.12316308 -0.01981307 0.5661146\n",
" 0.26199922 0.8123446 -0.01576749 0.30846444]\n",
"[ 0.7878203 -0.45975062 -0.29956317 -0.07032048 -0.55817443 -0.62506855\n",
" -1.6837492 -0.38442805 0.28220773 -1.5325156 ]\n",
"[ 0.07975311 0.67754704 -0.30927914 0.00347631 -0.07326564 0.01893554\n",
" -0.7518105 -0.03078967 -0.07623022 0.38865626]\n",
"[-0.7751679 -0.5841397 -0.6622711 0.18574935 -0.6049372 0.02844244\n",
" -0.20471913 0.3337415 -0.3619432 -0.35087156]\n",
"[-0.08569919 -0.10775139 -0.02338934 0.21933547 -0.46712473 0.00062137\n",
" -0.58207744 0.06457533 0.18276742 0.03866556]\n",
"[-0.2311981 -0.43036282 0.20561649 -0.10363232 -0.13248594 0.02885137\n",
" -0.31241602 -0.36907142 0.08861586 0.2331427 ]\n",
"[-0.07273526 -0.31246194 -0.24218291 -0.24145737 0.0364486 0.14382267\n",
" -0.00531162 0.15447603 -0.5220248 -0.09016377]\n"
]
}
],
"source": [
"with torchdynamo.optimize(__torch_mlir):\n",
" for _ in range(10):\n",
" print(toy_example(torch.randn(10), torch.randn(10)))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"It can also be used through a decorator:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [],
"source": [
"@create_backend\n",
"def torch_mlir(subgraph, *args, **kwargs):\n",
" assert isinstance(subgraph, SubGraph), \"Model must be a dynamo SubGraph.\"\n",
" return __torch_mlir(subgraph.model, *list(subgraph.example_inputs))\n",
"\n",
"@torchdynamo.optimize(\"torch_mlir\")\n",
"def toy_example2(*args):\n",
" a, b = args\n",
"\n",
" x = a / (torch.abs(a) + 1)\n",
" if b.sum() < 0:\n",
" b = b * -1\n",
" return x * b"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 1 device(s).\n",
"Device: 0\n",
" Name: NVIDIA GeForce RTX 3080\n",
" Compute Capability: 8.6\n",
"[-0.35494277 0.03409214 -0.02271946 0.7335942 0.03122527 -0.41881397\n",
" -0.6609761 -0.6418614 0.29336175 -0.01973678]\n",
"[-2.7246824e-01 -3.5543957e-01 6.0087401e-01 -7.4570496e-03\n",
" -4.2481605e-02 -5.0296803e-04 7.2928613e-01 -1.4673788e-03\n",
" -2.7621329e-01 -6.0995776e-02]\n",
"[-0.03165906 0.3889693 0.24052973 0.27279532 -0.02773128 -0.12602475\n",
" -1.0124422 0.5720256 -0.35437614 -0.20992722]\n",
"[-0.41831446 0.5525326 -0.29749998 -0.17044766 0.11804754 -0.05210691\n",
" -0.46145165 -0.8776549 0.10090438 0.17463352]\n",
"[ 0.02194221 0.20959911 0.26973712 0.12551276 -0.0020404 0.1490246\n",
" -0.04456685 1.1100804 0.8105744 0.6676846 ]\n",
"[ 0.06528181 -0.13591261 0.5370964 -0.4398162 -0.03372452 0.9691372\n",
" -0.01120087 0.2947028 0.4804801 -0.3324341 ]\n",
"[ 0.33549032 -0.23001772 -0.08681437 0.16490957 -0.11223086 0.09168988\n",
" 0.02403045 0.17344482 0.46406478 -0.00129451]\n",
"[-0.27475086 0.42384806 1.9090122 -0.41147137 -0.6888369 0.08435658\n",
" -0.26628923 -0.17436793 -0.8058869 -0.02582378]\n",
"[-0.10109414 0.08681287 -0.10055986 0.6858881 0.29267687 -0.02797117\n",
" -0.01425194 0.4882803 0.3551982 -0.858935 ]\n",
"[-0.22086617 0.524994 0.17721705 -0.03813264 -0.54570735 -0.4421502\n",
" 0.11938014 -0.01122053 0.39294165 -0.61770755]\n"
]
}
],
"source": [
"for _ in range(10):\n",
" print(toy_example2(torch.randn(10), torch.randn(10)))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@@ -0,0 +1,84 @@
import torch
from torch_mlir import compile, OutputType
from shark.iree_utils import get_iree_compiled_module
try:
import torchdynamo
from torchdynamo.optimizations.backends import create_backend
from torchdynamo.optimizations.subgraph import SubGraph
except ModuleNotFoundError:
print("Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo")
exit()
NUM_ITERS = 10
def __torch_mlir(fx_graph, *args, **kwargs):
assert isinstance(
fx_graph, torch.fx.GraphModule
), "Model must be an FX GraphModule."
def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule):
"""Replace tuple with tuple element in functions that return one-element tuples."""
for node in fx_g.graph.nodes:
if node.op == "output":
assert len(node.args) == 1, "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, tuple) and len(node_arg) == 1:
node.args = (node_arg[0],)
fx_g.graph.lint()
fx_g.recompile()
return fx_g
fx_graph = _unwrap_single_tuple_return(fx_graph)
ts_graph = torch.jit.script(fx_graph)
if isinstance(args, tuple):
args = list(args)
assert isinstance(args, list)
if len(args) == 1 and isinstance(args[0], list):
args = args[0]
linalg_module = compile(ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS)
callable, _ = get_iree_compiled_module(linalg_module, "cuda", func_name="forward")
def forward(*inputs):
return callable(*inputs)
return forward
def toy_example(*args):
a, b = args
x = a / (torch.abs(a) + 1)
if b.sum() < 0:
b = b * -1
return x * b
with torchdynamo.optimize(__torch_mlir):
for _ in range(10):
print(toy_example(torch.randn(10), torch.randn(10)))
@create_backend
def torch_mlir(subgraph, *args, **kwargs):
assert isinstance(subgraph, SubGraph), "Model must be a dynamo SubGraph."
return __torch_mlir(subgraph.model, *list(subgraph.example_inputs))
@torchdynamo.optimize("torch_mlir")
def toy_example2(*args):
a, b = args
x = a / (torch.abs(a) + 1)
if b.sum() < 0:
b = b * -1
return x * b
for _ in range(10):
print(toy_example2(torch.randn(10), torch.randn(10)))

View File

@@ -0,0 +1,805 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mlevental/miniconda3/envs/torch-mlir/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"# standard imports\n",
"import torch\n",
"from torch_mlir.eager_mode import torch_mlir_tensor"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"# eager mode imports\n",
"from torch_mlir.eager_mode.torch_mlir_tensor import TorchMLIRTensor\n",
"from shark.iree_eager_backend import EagerModeIREELinalgOnTensorsBackend"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"The simplest way of using Eager Mode (through IREE) requires setting a \"backend\":"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend(\"cpu\")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"and wrapping all your `torch.Tensor`s:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n"
]
}
],
"source": [
"NUM_ITERS = 10\n",
"\n",
"t = torch.ones((10, 10))\n",
"u = 2 * torch.ones((10, 10))\n",
"\n",
"tt = TorchMLIRTensor(t)\n",
"print(tt)\n",
"uu = TorchMLIRTensor(u)\n",
"print(uu)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"`TorchMLIRTensor` is a \"tensor wrapper subclass\" (more info [here](https://github.com/albanD/subclass_zoo)) that keeps the IREE `DeviceArray` in a field `elem`:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n"
]
}
],
"source": [
"for i in range(NUM_ITERS):\n",
" yy = tt + uu\n",
" print(type(yy))\n",
" print(yy.elem.to_host())\n",
" yy = tt * uu\n",
" print(type(yy))\n",
" print(yy.elem.to_host())"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"If you have a GPU (and CUDA installed) that works too (you can verify by having `watch -n1 nvidia-smi` up in a terminal while running the next cell):"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n"
]
}
],
"source": [
"torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend(\"gpu\")\n",
"\n",
"t = torch.ones((10, 10))\n",
"u = 2 * torch.ones((10, 10))\n",
"\n",
"tt = TorchMLIRTensor(t)\n",
"print(tt)\n",
"uu = TorchMLIRTensor(u)\n",
"print(uu)\n",
"\n",
"yy = tt + uu\n",
"print(yy.elem.to_host())\n",
"yy = tt * uu\n",
"print(yy.elem.to_host())"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"There is a convenience class `SharkEagerMode` that will handle both the installation of the backend and the wrapping of `torch.Tensor`s:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n"
]
}
],
"source": [
"# eager mode RAII\n",
"from shark.shark_runner import SharkEagerMode\n",
"\n",
"shark_eager_mode = SharkEagerMode(\"cpu\")\n",
"\n",
"t = torch.ones((10, 10))\n",
"u = torch.ones((10, 10))\n",
"\n",
"print(t)\n",
"print(u)\n",
"\n",
"for i in range(NUM_ITERS):\n",
" yy = t + u\n",
" print(type(yy))\n",
" print(yy.elem.to_host())\n",
" yy = t * u\n",
" print(type(yy))\n",
" print(yy.elem.to_host())"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"The `SharkEagerMode` class is a hacky take on [RAII](https://en.wikipedia.org/wiki/Resource_acquisition_is_initialization) that defines a \"deleter\" that runs when an instantiation (of `SharkEagerMode`) is garbage collected. Takeaway is that if you want to turn off `SharkEagerMode`, or switch backends, you need to `del` the instance:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n"
]
}
],
"source": [
"del shark_eager_mode\n",
"shark_eager_mode = SharkEagerMode(\"cuda\")\n",
"\n",
"t = torch.ones((10, 10))\n",
"u = torch.ones((10, 10))\n",
"\n",
"print(t)\n",
"print(u)\n",
"\n",
"yy = t + u\n",
"print(type(yy))\n",
"print(yy.elem.to_host())\n",
"yy = t * u\n",
"print(type(yy))\n",
"print(yy.elem.to_host())"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@@ -0,0 +1,148 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch.utils.cpp_extension import load_inline, include_paths
from torch_mlir.eager_mode import torch_mlir_tensor
from torch_mlir.eager_mode.torch_mlir_tensor import TorchMLIRTensor
from shark.iree_eager_backend import EagerModeIREELinalgOnTensorsBackend
from shark.shark_runner import SharkEagerMode
def test_cpu():
torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend("cpu")
t = torch.ones((10, 10), device="cpu")
u = 2 * torch.ones((10, 10), device="cpu")
tt = TorchMLIRTensor(t)
print(tt)
uu = TorchMLIRTensor(u)
print(uu)
for i in range(NUM_ITERS):
yy = tt + uu
print(type(yy))
print(yy.elem.to_host())
yy = tt * uu
print(type(yy))
print(yy.elem.to_host())
def test_gpu():
source = """
#include <iostream>
#include "cuda.h"
#include "cuda_runtime_api.h"
using namespace std;
void print_free_mem() {
int num_gpus;
size_t free, total;
cudaSetDevice(0);
int id;
cudaGetDevice(&id);
cudaMemGetInfo(&free, &total);
cout << "GPU " << id << " memory: used=" << (total-free)/(1<<20) << endl;
}
"""
gpu_stats = load_inline(
name="inline_extension",
cpp_sources=[source],
extra_include_paths=include_paths(cuda=True),
functions=["print_free_mem"],
)
torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend("gpu")
t = torch.ones((10, 10), device="cpu")
u = 2 * torch.ones((10, 10), device="cpu")
tt = TorchMLIRTensor(t)
print(tt)
uu = TorchMLIRTensor(u)
print(uu)
for i in range(NUM_ITERS):
yy = tt + uu
print(yy.elem.to_host())
yy = tt * uu
print(yy.elem.to_host())
gpu_stats.print_free_mem()
def test_python_mode_ref_backend():
# hide this wherever you want?
_ = SharkEagerMode("refbackend")
t = torch.ones((10, 10), device="cpu")
u = torch.ones((10, 10), device="cpu")
print(t)
print(u)
for i in range(NUM_ITERS):
print(i)
yy = t + u
print(yy.elem)
yy = t * u
print(yy.elem)
def test_python_mode_iree_cpu():
# hide this wherever you want?
_ = SharkEagerMode("cpu")
t = torch.ones((10, 10), device="cpu")
u = torch.ones((10, 10), device="cpu")
print(t)
print(u)
for i in range(NUM_ITERS):
yy = t + u
print(type(yy))
print(yy.elem.to_host())
yy = t * u
print(type(yy))
print(yy.elem.to_host())
def test_python_mode_iree_gpu():
_ = SharkEagerMode("gpu")
t = torch.ones((10, 10), device="cpu")
u = torch.ones((10, 10), device="cpu")
print(t)
print(u)
for i in range(NUM_ITERS):
yy = t + u
print(type(yy))
print(yy.elem.to_host())
yy = t * u
print(type(yy))
print(yy.elem.to_host())
NUM_ITERS = 10
if __name__ == "__main__":
test_cpu()
if torch.cuda.is_available():
test_gpu()
test_python_mode_ref_backend()
test_python_mode_iree_cpu()
test_python_mode_iree_gpu()

View File

@@ -0,0 +1,51 @@
from PIL import Image
import requests
from transformers import CLIPProcessor, TFCLIPModel
import tensorflow as tf
from shark.shark_inference import SharkInference
# Create a set of inputs
clip_vit_inputs = [
tf.TensorSpec(shape=[2, 7], dtype=tf.int32),
tf.TensorSpec(shape=[2, 7], dtype=tf.int32),
tf.TensorSpec(shape=[1, 3, 224, 224], dtype=tf.float32)
]
class CLIPModule(tf.Module):
def __init__(self):
super(CLIPModule, self).__init__()
self.m = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
self.m.predict = lambda x, y, z: self.m(
input_ids=x, attention_mask=y, pixel_values=z)
@tf.function(input_signature=clip_vit_inputs)
def forward(self, input_ids, attention_mask, pixel_values):
return self.m.predict(input_ids, attention_mask,
pixel_values).logits_per_image
if __name__ == "__main__":
# Prepping Data
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=["a photo of a cat", "a photo of a dog"],
images=image,
return_tensors="tf",
padding=True)
shark_module = SharkInference(
CLIPModule(),
(inputs["input_ids"], inputs["attention_mask"], inputs["pixel_values"]))
shark_module.set_frontend("tensorflow")
shark_module.compile()
print(
shark_module.forward((inputs["input_ids"], inputs["attention_mask"],
inputs["pixel_values"])))

View File

@@ -0,0 +1,38 @@
from PIL import Image
import requests
from transformers import GPT2Tokenizer, TFGPT2Model
import tensorflow as tf
from shark.shark_inference import SharkInference
# Create a set of inputs
gpt2_inputs = [
tf.TensorSpec(shape=[1, 8], dtype=tf.int32),
tf.TensorSpec(shape=[1, 8], dtype=tf.int32),
]
class GPT2Module(tf.Module):
def __init__(self):
super(GPT2Module, self).__init__()
self.m = TFGPT2Model.from_pretrained("distilgpt2")
self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)
@tf.function(input_signature=gpt2_inputs)
def forward(self, input_ids, attention_mask):
return self.m.predict(input_ids, attention_mask)
if __name__ == "__main__":
# Prepping Data
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
text = "I love the distilled version of models."
inputs = tokenizer(text, return_tensors='tf')
shark_module = SharkInference(
GPT2Module(), (inputs["input_ids"], inputs["attention_mask"]))
shark_module.set_frontend("tensorflow")
shark_module.compile()
print(shark_module.forward((inputs["input_ids"], inputs["attention_mask"])))

View File

@@ -0,0 +1,18 @@
from shark.shark_inference import SharkInference
import numpy as np
mhlo_ir = r"""builtin.module {
func.func @forward(%arg0: tensor<1x4xf32>, %arg1: tensor<4x1xf32>) -> tensor<4x4xf32> {
%0 = chlo.broadcast_add %arg0, %arg1 : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor<4x4xf32>
%1 = "mhlo.abs"(%0) : (tensor<4x4xf32>) -> tensor<4x4xf32>
return %1 : tensor<4x4xf32>
}
}"""
arg0 = np.ones((1, 4)).astype(np.float32)
arg1 = np.ones((4, 1)).astype(np.float32)
shark_module = SharkInference(mhlo_ir, (arg0, arg1))
shark_module.set_frontend("mhlo")
shark_module.compile()
print(shark_module.forward((arg0, arg1)))

View File

@@ -0,0 +1,36 @@
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from shark.shark_inference import SharkInference
torch.manual_seed(0)
tokenizer = AutoTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
class MiniLMSequenceClassification(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = AutoModelForSequenceClassification.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased", # The pretrained model.
num_labels=2,  # The number of output labels; 2 for binary classification.
output_attentions=False,  # Whether the model returns attention weights.
output_hidden_states=False,  # Whether the model returns all hidden states.
torchscript=True,
)
def forward(self, tokens):
return self.model.forward(tokens)[0]
test_input = torch.randint(2, (1, 128))
shark_module = SharkInference(MiniLMSequenceClassification(), (test_input,),
jit_trace=True,
benchmark_mode=True)
shark_module.compile()
shark_module.forward((test_input,))
shark_module.benchmark_all((test_input,))

View File

@@ -0,0 +1,58 @@
import tensorflow as tf
from transformers import BertModel, BertTokenizer, TFBertModel
from shark.shark_inference import SharkInference
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32)
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
# Load the pretrained MiniLM (BERT-architecture) model.
self.m = TFBertModel.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased", from_pt=True)
# Wrap the call in a predict lambda with a fixed, non-training signature.
self.m.predict = lambda x, y, z: self.m.call(
input_ids=x, attention_mask=y, token_type_ids=z, training=False)
@tf.function(input_signature=bert_input)
def forward(self, input_ids, attention_mask, token_type_ids):
return self.m.predict(input_ids, attention_mask, token_type_ids)
if __name__ == "__main__":
# Prepping Data
tokenizer = BertTokenizer.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text,
padding='max_length',
truncation=True,
max_length=MAX_SEQUENCE_LENGTH)
for key in encoded_input:
encoded_input[key] = tf.expand_dims(
tf.convert_to_tensor(encoded_input[key]), 0)
test_input = (encoded_input["input_ids"], encoded_input["attention_mask"],
encoded_input["token_type_ids"])
shark_module = SharkInference(
BertModule(),
test_input,
benchmark_mode=True)
shark_module.set_frontend("tensorflow")
shark_module.compile()
shark_module.benchmark_all(test_input)

View File

@@ -0,0 +1,35 @@
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from shark.shark_inference import SharkInference
torch.manual_seed(0)
tokenizer = AutoTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
class MiniLMSequenceClassification(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = AutoModelForSequenceClassification.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased", # The pretrained model.
num_labels=2,  # The number of output labels; 2 for binary classification.
output_attentions=False,  # Whether the model returns attention weights.
output_hidden_states=False,  # Whether the model returns all hidden states.
torchscript=True,
)
def forward(self, tokens):
return self.model.forward(tokens)[0]
test_input = torch.randint(2, (1, 128))
shark_module = SharkInference(MiniLMSequenceClassification(), (test_input,),
jit_trace=True)
shark_module.compile()
result = shark_module.forward((test_input,))
print("Obtained result", result)

View File

@@ -0,0 +1,41 @@
import tensorflow as tf
from transformers import BertModel, BertTokenizer, TFBertModel
from shark.shark_inference import SharkInference
from shark.shark_importer import shark_load
from shark.parser import parser
import os
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
parser.add_argument(
"--download_mlir_path",
type=str,
default="minilm_tf_inference.mlir",
help="Specifies path to target mlir file that will be loaded.")
load_args, unknown = parser.parse_known_args()
MAX_SEQUENCE_LENGTH = 512
if __name__ == "__main__":
# Prepping Data
tokenizer = BertTokenizer.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text,
padding='max_length',
truncation=True,
max_length=MAX_SEQUENCE_LENGTH)
for key in encoded_input:
encoded_input[key] = tf.expand_dims(
tf.convert_to_tensor(encoded_input[key]), 0)
model_name = "minilm_tf_inference"
minilm_mlir = shark_load(model_name, load_args.download_mlir_path)
test_input = (encoded_input["input_ids"], encoded_input["attention_mask"],
encoded_input["token_type_ids"])
shark_module = SharkInference(
minilm_mlir, test_input, benchmark_mode=True)
shark_module.set_frontend("mhlo")
shark_module.compile()
shark_module.benchmark_all(test_input)

View File

@@ -0,0 +1,56 @@
import tensorflow as tf
from transformers import BertModel, BertTokenizer, TFBertModel
from shark.shark_inference import SharkInference
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32)
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
# Load the pretrained MiniLM (BERT-architecture) model.
self.m = TFBertModel.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased", from_pt=True)
# Wrap the call in a predict lambda with a fixed, non-training signature.
self.m.predict = lambda x, y, z: self.m.call(
input_ids=x, attention_mask=y, token_type_ids=z, training=False)
@tf.function(input_signature=bert_input)
def forward(self, input_ids, attention_mask, token_type_ids):
return self.m.predict(input_ids, attention_mask, token_type_ids)
if __name__ == "__main__":
# Prepping Data
tokenizer = BertTokenizer.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text,
padding='max_length',
truncation=True,
max_length=MAX_SEQUENCE_LENGTH)
for key in encoded_input:
encoded_input[key] = tf.expand_dims(
tf.convert_to_tensor(encoded_input[key]), 0)
shark_module = SharkInference(
BertModule(),
(encoded_input["input_ids"], encoded_input["attention_mask"],
encoded_input["token_type_ids"]))
shark_module.set_frontend("tensorflow")
shark_module.compile()
print(
shark_module.forward(
(encoded_input["input_ids"], encoded_input["attention_mask"],
encoded_input["token_type_ids"])))

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,80 @@
from PIL import Image
import requests
import torch
import torchvision.models as models
from torchvision import transforms
import sys
from shark.shark_inference import SharkInference
################################## Preprocessing inputs and model ############
def load_and_preprocess_image(url: str):
headers = {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
}
img = Image.open(requests.get(url, headers=headers,
stream=True).raw).convert("RGB")
# preprocessing pipeline
preprocess = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
])
img_preprocessed = preprocess(img)
return torch.unsqueeze(img_preprocessed, 0)
def load_labels():
classes_text = requests.get(
"https://raw.githubusercontent.com/cathyzhyi/ml-data/main/imagenet-classes.txt",
stream=True,
).text
labels = [line.strip() for line in classes_text.splitlines()]
return labels
def top3_possibilities(res):
_, indexes = torch.sort(res, descending=True)
percentage = torch.nn.functional.softmax(res, dim=1)[0] * 100
top3 = [(labels[idx], percentage[idx].item()) for idx in indexes[0][:3]]
return top3
class Resnet50Module(torch.nn.Module):
def __init__(self):
super().__init__()
self.resnet = models.resnet50(pretrained=True)
self.train(False)
def forward(self, img):
return self.resnet.forward(img)
image_url = "https://upload.wikimedia.org/wikipedia/commons/2/26/YellowLabradorLooking_new.jpg"
print("load image from " + image_url, file=sys.stderr)
img = load_and_preprocess_image(image_url)
labels = load_labels()
##############################################################################
input = torch.randn(1, 3, 224, 224)
print(input.shape)
## The img is passed to determine the input shape.
shark_module = SharkInference(Resnet50Module(), (img,))
shark_module.compile()
## Can pass any img or input to the forward module.
results = shark_module.forward((img,))
print("The top 3 results obtained via shark_runner is:")
print(top3_possibilities(torch.from_numpy(results)))
print()
print("The top 3 results obtained via torch is:")
print(top3_possibilities(Resnet50Module()(img)))

View File

@@ -0,0 +1,38 @@
from PIL import Image
import requests
from transformers import T5Tokenizer, TFT5Model
import tensorflow as tf
from shark.shark_inference import SharkInference
# Create a set of inputs
t5_inputs = [
tf.TensorSpec(shape=[1, 10], dtype=tf.int32),
tf.TensorSpec(shape=[1, 10], dtype=tf.int32),
]
class T5Module(tf.Module):
def __init__(self):
super(T5Module, self).__init__()
self.m = TFT5Model.from_pretrained("t5-small")
self.m.predict = lambda x,y: self.m(input_ids=x, decoder_input_ids=y)
@tf.function(input_signature=t5_inputs)
def forward(self, input_ids, decoder_input_ids):
return self.m.predict(input_ids, decoder_input_ids)
if __name__ == "__main__":
# Prepping Data
tokenizer = T5Tokenizer.from_pretrained("t5-small")
text = "I love the distilled version of models."
inputs = tokenizer(
text, return_tensors="tf"
).input_ids
shark_module = SharkInference(
T5Module(), (inputs, inputs))
shark_module.set_frontend("tensorflow")
shark_module.compile()
print(shark_module.forward((inputs,inputs)))

View File

@@ -0,0 +1,44 @@
import torch
import torchvision.models as models
from shark.shark_inference import SharkInference
class VisionModule(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
self.train(False)
def forward(self, input):
return self.model.forward(input)
input = torch.randn(1, 3, 224, 224)
## The vision models present here: https://pytorch.org/vision/stable/models.html
vision_models_list = [
models.resnet18(pretrained=True),
models.alexnet(pretrained=True),
models.vgg16(pretrained=True),
models.squeezenet1_0(pretrained=True),
models.densenet161(pretrained=True),
models.inception_v3(pretrained=True),
models.shufflenet_v2_x1_0(pretrained=True),
models.mobilenet_v2(pretrained=True),
models.mobilenet_v3_small(pretrained=True),
models.resnext50_32x4d(pretrained=True),
models.wide_resnet50_2(pretrained=True),
models.mnasnet1_0(pretrained=True),
models.efficientnet_b0(pretrained=True),
models.regnet_y_400mf(pretrained=True),
models.regnet_x_400mf(pretrained=True),
]
for i, vision_model in enumerate(vision_models_list):
shark_module = SharkInference(
VisionModule(vision_model),
(input,),
)
shark_module.compile()
shark_module.forward((input,))
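The loop above runs a single forward pass per model. As a sketch (an adaptation, not code from this diff), the same loop can be switched to the benchmarking path that shark/shark_inference.py in this diff exposes via benchmark_mode and benchmark_all:

# Sketch: benchmark each vision model instead of a single forward pass.
for i, vision_model in enumerate(vision_models_list):
    shark_module = SharkInference(
        VisionModule(vision_model),
        (input,),
        benchmark_mode=True,
    )
    shark_module.compile()
    shark_module.benchmark_all((input,))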

View File

@@ -0,0 +1,32 @@
import torch
from shark.shark_inference import SharkInference
# Currently not supported: aten.transpose_conv2d is missing.
class UnetModule(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = torch.hub.load(
"mateuszbuda/brain-segmentation-pytorch",
"unet",
in_channels=3,
out_channels=1,
init_features=32,
pretrained=True,
)
self.train(False)
def forward(self, input):
return self.model(input)
input = torch.randn(1, 3, 224, 224)
print(input)
shark_module = SharkInference(
    UnetModule(),
    (input,),
    benchmark_mode=True,
)
shark_module.compile()
shark_module.benchmark_all((input,))
print(input)

View File

@@ -0,0 +1,50 @@
import torch
from torch.nn.utils import _stateless
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from shark.shark_runner import SharkTrainer
class MiniLMSequenceClassification(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = AutoModelForSequenceClassification.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased", # The pretrained model.
num_labels=
2, # The number of output labels--2 for binary classification.
output_attentions=
False, # Whether the model returns attentions weights.
output_hidden_states=
False, # Whether the model returns all hidden-states.
torchscript=True,
)
def forward(self, tokens):
return self.model.forward(tokens)[0]
mod = MiniLMSequenceClassification()
def get_sorted_params(named_params):
return [i[1] for i in sorted(named_params.items())]
print(dict(mod.named_buffers()))
inp = (torch.randint(2, (1, 128)),)
def forward(params, buffers, args):
params_and_buffers = {**params, **buffers}
_stateless.functional_call(mod, params_and_buffers, args,
{}).sum().backward()
optim = torch.optim.SGD(get_sorted_params(params), lr=0.01)
# optim.load_state_dict(optim_state)
optim.step()
return params, buffers
shark_module = SharkTrainer(mod, inp, custom_inference_fn=forward)
print(shark_module.forward())

View File

@@ -0,0 +1,45 @@
import numpy as np
import os
import time
import tensorflow as tf
from shark.shark_trainer import SharkTrainer
from shark.parser import parser
from shark.shark_importer import shark_load
parser.add_argument(
"--download_mlir_path",
type=str,
default="bert_tf_training.mlir",
help="Specifies path to target mlir file that will be loaded.")
load_args, unknown = parser.parse_known_args()
tf.random.set_seed(0)
vocab_size = 100
NUM_CLASSES = 5
SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Download BERT model from tank and train.
if __name__ == "__main__":
predict_sample_input = [
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH))
]
model_name = "bert_tf_training"
bert_mlir = shark_load(model_name, load_args.download_mlir_path)
sample_input_tensors = [tf.convert_to_tensor(val, dtype=tf.int32) for val in predict_sample_input]
num_iter = 10
shark_module = SharkTrainer(
bert_mlir,
(sample_input_tensors,
tf.convert_to_tensor(np.random.randint(5, size=(BATCH_SIZE)), dtype=tf.int32)))
shark_module.set_frontend("mhlo")
shark_module.compile()
start = time.time()
print(shark_module.train(num_iter))
end = time.time()
total_time = end - start
print("time: " + str(total_time))
print("time/iter: " + str(total_time / num_iter))

View File

@@ -0,0 +1,88 @@
import sys
from absl import app
import time
import numpy as np
import os
import tempfile
import tensorflow as tf
from official.nlp.modeling import layers
from official.nlp.modeling import networks
from official.nlp.modeling.models import bert_classifier
from shark.shark_trainer import SharkTrainer
tf.random.set_seed(0)
vocab_size = 100
NUM_CLASSES = 5
SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
dict_outputs = False
test_network = networks.BertEncoder(vocab_size=vocab_size,
num_layers=2,
dict_outputs=dict_outputs)
# Create a BERT trainer with the created network.
bert_trainer_model = bert_classifier.BertClassifier(
test_network, num_classes=NUM_CLASSES)
bert_trainer_model.summary()
# Invoke the trainer model on the inputs. This causes the layer to be built.
self.m = bert_trainer_model
self.m.predict = lambda x: self.m.call(x, training=False)
self.predict = tf.function(input_signature=[bert_input])(self.m.predict)
self.m.learn = lambda x, y: self.m.call(x, training=False)
self.loss = tf.keras.losses.SparseCategoricalCrossentropy()
self.optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)
@tf.function(input_signature=[
bert_input, # inputs
tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32) # labels
])
def forward(self, inputs, labels):
with tf.GradientTape() as tape:
# Capture the gradients from forward prop...
probs = self.m(inputs, training=True)
loss = self.loss(labels, probs)
# ...and use them to update the model's weights.
variables = self.m.trainable_variables
gradients = tape.gradient(loss, variables)
self.optimizer.apply_gradients(zip(gradients, variables))
return loss
if __name__ == "__main__":
predict_sample_input = [
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH))
]
sample_input_tensors = [tf.convert_to_tensor(val, dtype=tf.int32) for val in predict_sample_input]
num_iter = 10
shark_module = SharkTrainer(
BertModule(),
(sample_input_tensors,
tf.convert_to_tensor(np.random.randint(5, size=(BATCH_SIZE)), dtype=tf.int32)))
shark_module.set_frontend("tensorflow")
shark_module.compile()
start = time.time()
print(shark_module.train(num_iter))
end = time.time()
total_time = end - start
print("time: " + str(total_time))
print("time/iter: " + str(total_time / num_iter))

View File

@@ -0,0 +1,44 @@
import torch
from torch.nn.utils import _stateless
from shark.shark_trainer import SharkTrainer
class Foo(torch.nn.Module):
def __init__(self):
super(Foo, self).__init__()
self.l1 = torch.nn.Linear(10, 16)
self.relu = torch.nn.ReLU()
self.l2 = torch.nn.Linear(16, 2)
def forward(self, x):
out = self.l1(x)
out = self.relu(out)
out = self.l2(out)
return out
mod = Foo()
inp = (torch.randn(10, 10),)
def get_sorted_params(named_params):
return [i[1] for i in sorted(named_params.items())]
def forward(params, buffers, args):
params_and_buffers = {**params, **buffers}
_stateless.functional_call(mod, params_and_buffers, args,
{}).sum().backward()
optim = torch.optim.SGD(get_sorted_params(params), lr=0.01)
optim.step()
return params, buffers
# fx_graph = forward(dict(mod.named_parameters()), dict(mod.named_buffers()), inp)
shark_module = SharkTrainer(mod, inp)
# Pass the training function in case of torch
shark_module.compile(training_fn=forward)
shark_module.train(num_iters=10)

View File

@@ -0,0 +1,81 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Any
import iree
import iree.runtime as ireert
import numpy as np
import torch
from iree.runtime import DeviceArray
from torch_mlir._mlir_libs._mlir.ir import Module
from torch_mlir.compiler_utils import (
get_module_name_for_debug_dump,
run_pipeline_with_repro_report,
)
from torch_mlir.eager_mode.torch_mlir_eager_backend import (
TorchMLIREagerBackend,
TensorMetaData,
)
from torch_mlir_e2e_test.eager_backends.refbackend import NUMPY_TO_TORCH_DTYPE_DICT
from shark.iree_utils import get_iree_compiled_module, IREE_DEVICE_MAP
class EagerModeIREELinalgOnTensorsBackend(TorchMLIREagerBackend):
"""Main entry-point for the iree backend for torch-mlir eager mode.
EagerModeIREELinalgOnTensorsBackend uses iree.DeviceArray representations of tensors and
thus all of the wrapping and unwrapping and munging here is done to between torch.Tensor and iree.DeviceArray,
with np.ndarray as an intermediary.
"""
def __init__(self, device: str):
self.torch_device_str = device
self.iree_device_str = IREE_DEVICE_MAP[device]
self.config = ireert.Config(self.iree_device_str)
def get_torch_metadata(self, tensor: DeviceArray,
kwargs: Dict[str, Any]) -> TensorMetaData:
return TensorMetaData(
size=tensor.shape,
dtype=NUMPY_TO_TORCH_DTYPE_DICT[tensor.dtype.type],
device=torch.device(self.torch_device_str),
requires_grad=tensor.dtype.type
in {np.float, np.float32, np.float64} and
kwargs.get("requires_grad", False),
)
def compile(self, imported_module: Module):
fn_name = get_module_name_for_debug_dump(imported_module)
run_pipeline_with_repro_report(
imported_module,
"torch-function-to-torch-backend-pipeline,torch-backend-to-linalg-on-tensors-backend-pipeline",
"EagerMode",
)
callable, _ = get_iree_compiled_module(imported_module,
self.iree_device_str,
func_name=fn_name)
return callable
def copy_into(self, dst, src):
"""Copy output back to appropriate arg that it should alias."""
np.copyto(dst, src)
def transfer_from_device_to_torch(self, e):
return torch.from_numpy(e.to_host())
def transfer_from_torch_to_device(self,
tensor: torch.Tensor) -> DeviceArray:
return iree.runtime.asdevicearray(self.config.device, tensor.numpy())

359
shark/iree_utils.py Normal file
View File

@@ -0,0 +1,359 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import iree.runtime as ireert
import iree.runtime.scripts.iree_benchmark_module as benchmark_module
import iree.compiler as ireec
from shark.torch_mlir_utils import get_module_name_for_asm_dump
from shark.cuda_utils import get_cuda_sm_cc
from shark.model_annotation import *
import subprocess
import numpy as np
import os
import re
import sys
IREE_DEVICE_MAP = {
"cpu": "local-task",
"gpu": "cuda",
"cuda": "cuda",
"vulkan": "vulkan",
"metal": "vulkan",
"rocm": "rocm"
}
IREE_TARGET_MAP = {
"cpu": "dylib",
"gpu": "cuda",
"cuda": "cuda",
"vulkan": "vulkan",
"metal": "vulkan",
"rocm": "rocm"
}
UNIT_TO_SECOND_MAP = {"ms": 0.001, "s": 1}
def check_device_drivers(device):
"""Checks necessary drivers present for gpu and vulkan devices"""
if (device in ["gpu", "cuda"]):
try:
subprocess.check_output('nvidia-smi')
except Exception:
return True
elif (device in ["metal", "vulkan"]):
try:
subprocess.check_output('vulkaninfo')
except Exception:
return True
elif (device == "cpu"):
return False
# Unknown device.
else:
return True
return False
def get_iree_cpu_args():
find_triple_cmd = "uname -s -m"
os_name, proc_name = subprocess.run(
find_triple_cmd, shell=True, stdout=subprocess.PIPE,
check=True).stdout.decode('utf-8').split()
if os_name == "Darwin":
find_kernel_version_cmd = "uname -r"
kernel_version = subprocess.run(find_kernel_version_cmd,
shell=True,
stdout=subprocess.PIPE,
check=True).stdout.decode('utf-8')
target_triple = f"{proc_name}-apple-darwin{kernel_version}"
elif os_name == "Linux":
target_triple = f"{proc_name}-linux-gnu"
else:
error_message = f"OS Type f{os_name} not supported and triple can't be determined, open issue to dSHARK team please :)"
raise Exception(error_message)
print(f"Target triple found:{target_triple}")
return [f"-iree-llvm-target-triple={target_triple}"]
def get_iree_gpu_args():
ireert.flags.FUNCTION_INPUT_VALIDATION = False
ireert.flags.parse_flags("--cuda_allow_inline_execution")
sm_arch = get_cuda_sm_cc()
if sm_arch in ['sm_70', 'sm_72', 'sm_75', 'sm_80', 'sm_84', 'sm_86']:
return [
"--iree-hal-cuda-disable-loop-nounroll-wa",
f"--iree-hal-cuda-llvm-target-arch={sm_arch}"
]
else:
return ["--iree-hal-cuda-disable-loop-nounroll-wa"]
def get_vulkan_triple_flag():
vulkan_device_cmd = "vulkaninfo | grep deviceName | awk \'END{{print $NF}}\'"
vulkan_device = run_cmd(vulkan_device_cmd).strip()
if vulkan_device == "M1":
print("Found Apple Device. Using m1-moltenvk-macos")
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
elif vulkan_device == "A100-SXM4-40GB":
print("Found Nvidia Device. Using ampere-rtx3080-linux")
return "-iree-vulkan-target-triple=ampere-rtx3080-linux"
else:
print(
"Optimized kernel for your target device is not added yet. Contact SHARK Admin on discord[https://discord.com/invite/RUqY2h2s9u] or pull up an issue."
)
return None
def get_iree_vulkan_args():
#vulkan_flag = ["--iree-flow-demote-i64-to-i32"]
vulkan_flag = []
vulkan_triple_flag = get_vulkan_triple_flag()
if vulkan_triple_flag is not None:
vulkan_flag.append(vulkan_triple_flag)
return vulkan_flag
def get_iree_device_args(device):
if device == "cpu":
return get_iree_cpu_args()
if device in ["gpu", "cuda"]:
return get_iree_gpu_args()
if device in ["metal", "vulkan"]:
return get_iree_vulkan_args()
return []
def get_iree_frontend_args(frontend):
if frontend in ["torch", "pytorch", "linalg"]:
return ["--iree-llvm-target-cpu-features=host"]
elif frontend in ["tensorflow", "tf", "mhlo"]:
return [
"--iree-llvm-target-cpu-features=host",
"--iree-mhlo-demote-i64-to-i32=false",
"--iree-flow-demote-i64-to-i32"
]
else:
# Frontend not found.
return []
def compile_module_to_flatbuffer(module, device, frontend, func_name,
model_config_path):
# Setup Compile arguments wrt to frontends.
input_type = ""
args = get_iree_frontend_args(frontend)
args += get_iree_device_args(device)
if frontend in ["tensorflow", "tf"]:
input_type = "mhlo"
elif frontend in ["mhlo", "tosa"]:
input_type = frontend
elif frontend in ["tflite"]:
input_type = "tosa"
# Annotate the input module with the configs
if model_config_path != None:
# Currently tuned model only works on tf frontend
if frontend in ["tensorflow", "tf"]:
input_module = module.decode('utf-8')
elif frontend in ["pytorch", "torch"]:
input_module = module.operation.get_asm()
with create_context() as ctx:
module = model_annotation(ctx,
input_contents=input_module,
config_path=model_config_path)
module = str(module)
# Compile according to the input type, else just try compiling.
if input_type not in ["mhlo", "tosa"]:
module = str(module)
if input_type != "":
# Currently for MHLO/TOSA.
flatbuffer_blob = ireec.compile_str(
module,
target_backends=[IREE_TARGET_MAP[device]],
extra_args=args,
input_type=input_type)
else:
# Currently for Torch.
flatbuffer_blob = ireec.compile_str(
str(module),
target_backends=[IREE_TARGET_MAP[device]],
extra_args=args)
return flatbuffer_blob
def get_iree_module(flatbuffer_blob, device, func_name):
vm_module = ireert.VmModule.from_flatbuffer(flatbuffer_blob)
config = ireert.Config(IREE_DEVICE_MAP[device])
ctx = ireert.SystemContext(config=config)
ctx.add_vm_module(vm_module)
ModuleCompiled = ctx.modules.module[func_name]
return ModuleCompiled, config
def get_iree_compiled_module(module,
device: str,
frontend: str = "torch",
func_name: str = "forward",
model_config_path: str = None):
"""Given a module returns the compiled .vmfb and configs"""
flatbuffer_blob = compile_module_to_flatbuffer(module, device, frontend,
func_name, model_config_path)
return get_iree_module(flatbuffer_blob, device, func_name)
def export_iree_module_to_vmfb(module,
device: str,
directory: str,
frontend: str = "torch",
func_name: str = "forward",
model_config_path: str = None):
flatbuffer_blob = compile_module_to_flatbuffer(module, device, frontend,
func_name, model_config_path)
module_name = f"{frontend}_{func_name}_{device}"
filename = os.path.join(directory, module_name + ".vmfb")
print(f"Saved vmfb in {filename}.")
with open(filename, 'wb') as f:
f.write(flatbuffer_blob)
return filename
def export_module_to_mlir_file(module, frontend, directory: str):
mlir_str = module
if frontend in ["tensorflow", "tf", "mhlo"]:
mlir_str = module.decode('utf-8')
elif frontend in ["pytorch", "torch"]:
mlir_str = module.operation.get_asm()
filename = os.path.join(directory, "model.mlir")
with open(filename, 'w') as f:
f.write(mlir_str)
print(f"Saved mlir in {filename}.")
return filename
def get_results(compiled_vm, input, config, frontend="torch"):
"""Runs a .vmfb file given inputs and config and returns output."""
device_inputs = input
if frontend in ["torch", "pytorch"]:
device_inputs = [ireert.asdevicearray(config.device, a) for a in input]
if frontend in ["tensorflow", "tf", "tflite"]:
device_inputs = []
for a in input:
if (isinstance(a, list)):
device_inputs.append([
ireert.asdevicearray(config.device, val, dtype=np.int32)
for val in a
])
else:
device_inputs.append(ireert.asdevicearray(config.device, a))
result = compiled_vm(*device_inputs)
result_tensors = []
if (isinstance(result, tuple)):
for val in result:
result_tensors.append(np.copy(np.asarray(val, val.dtype)))
return result_tensors
elif (isinstance(result, dict)):
data = list(result.items())
res = np.array(data, dtype=object)
return np.copy(res)
else:
return np.copy(np.asarray(result, dtype=result.dtype))
######### Benchmark Related Tools ###########
def tensor_to_type_str(input_tensors: tuple, frontend: str):
"""
Input: A tuple of input tensors i.e tuple(torch.tensor)
Output: list of string that represent mlir types (i.e 1x24xf64)
# TODO: Support more than floats, and ints
"""
list_of_type = []
for input_tensor in input_tensors:
type_string = "x".join([str(dim) for dim in input_tensor.shape])
if frontend in ["torch", "pytorch"]:
dtype_string = str(input_tensor.dtype).replace("torch.", "")
elif frontend in ["tensorflow", "tf", "mhlo"]:
dtype = input_tensor.dtype
dtype_string = re.findall('\'[^"]*\'',
str(dtype))[0].replace("\'", "")
regex_split = re.compile("([a-zA-Z]+)([0-9]+)")
match = regex_split.match(dtype_string)
mlir_type_string = str(match.group(1)[0]) + str(match.group(2))
type_string += f"x{mlir_type_string}"
list_of_type.append(type_string)
return list_of_type
def build_benchmark_args(input_file: str,
device: str,
input_tensors: tuple,
frontend: str,
training=False):
"""
Inputs: input_file leading to vmfb, input_tensor to function, target device, and whether it is training or not.
Outputs: string that execute benchmark-module on target model.
"""
path = benchmark_module.__path__[0]
benchmarker_path = os.path.join(path, "..", "..", "iree-benchmark-module")
benchmark_cl = [benchmarker_path, f"--module_file={input_file}"]
fn_name = "forward"
if training == True:
# TODO: Replace name of train with actual train fn name.
fn_name = "train"
benchmark_cl.append(f"--entry_function={fn_name}")
benchmark_cl.append(f"--device={IREE_DEVICE_MAP[device]}")
mlir_input_types = tensor_to_type_str(input_tensors, frontend)
for mlir_input in mlir_input_types:
benchmark_cl.append(f"--function_input={mlir_input}")
time_extractor = "| awk \'END{{print $2 $3}}\'"
benchmark_cl.append(time_extractor)
return benchmark_cl
def run_cmd(cmd):
"""
Inputs: cli command string.
"""
try:
result = subprocess.run(cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True)
result_str = result.stdout.decode()
return result_str
except Exception:
sys.exit("Exiting program due to error running:", cmd)
def run_benchmark_module(benchmark_cl):
"""
Run benchmark command, extract result and return iteration/seconds.
Input: benchmark command.
"""
benchmark_path = benchmark_cl[0]
assert os.path.exists(
benchmark_path
), "Cannot find benchmark_module, Please contact SHARK maintainer on discord."
bench_result = run_cmd(' '.join(benchmark_cl))
regex_split = re.compile("([0-9]+[.]*[0-9]*)([a-zA-Z]+)")
match = regex_split.match(bench_result)
time = float(match.group(1))
unit = match.group(2)
return 1.0 / (time * UNIT_TO_SECOND_MAP[unit])
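As a usage sketch (mirroring how shark/shark_runner.py below wires these helpers together), compilation and execution are typically combined as follows; linalg_module stands for an already-imported torch-mlir module and the input shape is a placeholder:

import numpy as np

# linalg_module: assumed to be a torch-mlir module lowered to linalg-on-tensors.
compiled_module, config = get_iree_compiled_module(linalg_module,
                                                   device="cpu",
                                                   frontend="torch",
                                                   func_name="forward")
outputs = get_results(compiled_module,
                      [np.ones((1, 3, 224, 224), dtype=np.float32)],
                      config,
                      frontend="torch")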

143
shark/model_annotation.py Normal file
View File

@@ -0,0 +1,143 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import json
import os
from typing import List, Dict
from iree.compiler import ir
from iree.compiler.transforms import ireec as ireec_trans
MATMUL_OP_NAMES = set(
["linalg.matmul", "linalg.batch_matmul", "mhlo.dot", "mhlo.dot_general"])
idx = 0
def model_annotation(ctx: ir.Context, *, input_contents: str, config_path: str):
if os.path.isfile(input_contents):
with open(input_contents, "rb") as f:
input_contents = f.read()
module = ir.Module.parse(input_contents)
with open(config_path, "r") as f:
data = json.load(f)
configs = data["options"]
# The Python API does not expose a general walk() function, so we just
# do it ourselves.
walk_children(module.operation, configs)
if not module.operation.verify():
raise RuntimeError("Modified program does not verify!")
# More efficient than: print(module)
# - Disables verification (already done above)
# - Writes as binary, avoiding costly unicode conversions
sys.stdout.buffer.write(
module.operation.get_asm(assume_verified=True, binary=True))
return module
def walk_children(op: ir.Operation, configs: List[Dict]):
for region in op.regions:
for block in region.blocks:
for child_op in block.operations:
# TODO: This is dumb. Both Operation and OpView should expose
# 'operation' and 'name' attributes.
if isinstance(child_op, ir.OpView):
child_op = child_op.operation
if child_op.name in MATMUL_OP_NAMES:
global idx
tile_sizes, pipeline, workgroup_size, \
split_k, pipeline_depth = parse_config(configs[idx])
add_compilation_info(child_op,
tile_sizes=tile_sizes,
pipeline=pipeline,
workgroup_size=workgroup_size,
pipeline_depth=pipeline_depth)
if split_k:
add_split_k(child_op, split_k)
idx = idx + 1
print(f"Updated op {child_op}", file=sys.stderr)
walk_children(child_op, configs)
def parse_config(config: Dict):
if config["pipeline"] == "GPU" or config["pipeline"] == "GPU_TENSORCORE":
pipeline = "LLVMGPUMatmulSimt" if config[
"pipeline"] == "GPU" else "LLVMGPUMatmulTensorCore"
tile_sizes = [config["work_group_tile_sizes"]]
workgroup_size = config["work_group_sizes"]
try:
pipeline_depth = config["pipeline_depth"]
except:
pipeline_depth = None
try:
split_k = config["split_k"]
except:
split_k = None
else:
pipeline = config["pipeline"]
tile_sizes = [
config["work_group_tile_sizes"], config["l1_tile_sizes"],
config["vector_tile_sizes"]
]
workgroup_size = []
split_k = None
pipeline_depth = None
return tile_sizes, pipeline, workgroup_size, split_k, pipeline_depth
def add_compilation_info(op: ir.Operation, tile_sizes: List[List[int]],
pipeline: str, workgroup_size: List[int],
pipeline_depth: int):
# We don't have a Python binding for CompilationInfo, so we just parse
# its string form.
if pipeline_depth:
attr = ir.Attribute.parse(
f"#iree_codegen.compilation_info<"
f"lowering_config = <tile_sizes = {repr(tile_sizes)}>, "
f"translation_info = <{pipeline} pipeline_depth = {pipeline_depth}>, "
f"workgroup_size = {repr(workgroup_size)}>")
else:
attr = ir.Attribute.parse(
f"#iree_codegen.compilation_info<"
f"lowering_config = <tile_sizes = {repr(tile_sizes)}>, "
f"translation_info = <{pipeline}>, "
f"workgroup_size = {repr(workgroup_size)}>")
op.attributes["compilation_info"] = attr
def add_split_k(op: ir.Operation, k: int):
attr = ir.IntegerAttr.get(ir.IntegerType.get_signless(64), k)
op.attributes["iree_flow_split_k"] = attr
def create_context() -> ir.Context:
context = ir.Context()
ireec_trans.register_all_dialects(context)
context.allow_unregistered_dialects = True
return context
if __name__ == "__main__":
with create_context() as ctx:
model_annotation(ctx,
input_contents=sys.argv[1],
config_path=sys.argv[2])
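For context, a sketch of the JSON config file that parse_config above expects; the key names follow from the code, while the concrete values are placeholders rather than tuned numbers:

import json

# Hypothetical tuning config: one entry in "options" per matmul-like op,
# consumed in the order walk_children visits them.
example_config = {
    "options": [{
        "pipeline": "GPU_TENSORCORE",
        "work_group_tile_sizes": [32, 32, 16],
        "work_group_sizes": [64, 2, 1],
        "pipeline_depth": 4,
        "split_k": 2,
    }]
}
with open("model_config.json", "w") as f:
    json.dump(example_config, f, indent=2)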

71
shark/parser.py Normal file
View File

@@ -0,0 +1,71 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
def dir_path(path):
if os.path.isdir(path):
return path
else:
raise argparse.ArgumentTypeError(
f"readable_dir:{path} is not a valid path")
def dir_file(path):
if os.path.isfile(path):
return path
else:
raise argparse.ArgumentTypeError(
f"readable_file:{path} is not a valid file")
parser = argparse.ArgumentParser(description='SHARK runner.')
parser.add_argument(
"--device",
type=str,
default="cpu",
help="Device on which shark_runner runs. options are cpu, gpu, and vulkan")
parser.add_argument(
"--repro_dir",
help=
"Directory to which module files will be saved for reproduction or debugging.",
type=dir_path,
default="/tmp/")
parser.add_argument("--save_mlir",
default=False,
action="store_true",
help="Saves input MLIR module to /tmp/ directory.")
parser.add_argument("--save_vmfb",
default=False,
action="store_true",
help="Saves iree .vmfb module to /tmp/ directory.")
parser.add_argument(
"--model_config_path",
help="Directory to where the tuned model config file is located.",
default=None)
parser.add_argument(
"--num_warmup_iterations",
type=int,
default=2,
help="Run the model for the specified number of warmup iterations.")
parser.add_argument(
"--num_iterations",
type=int,
default=1,
help="Run the model for the specified number of iterations.")
shark_args, unknown = parser.parse_known_args()
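Because every example script above parses this shared parser with parse_known_args, these flags can be combined with any script-specific ones; a hypothetical invocation (the script name is illustrative) looks like:

    python resnet50_script.py --device=vulkan --save_vmfb --num_warmup_iterations=5 --num_iterations=20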

136
shark/shark_importer.py Normal file
View File

@@ -0,0 +1,136 @@
# Lint as: python3
"""SHARK Importer"""
import iree.compiler.tflite as iree_tflite_compile
import iree.runtime as iree_rt
import numpy as np
import os
import sys
import tensorflow.compat.v2 as tf
import urllib.request
from shark.shark_inference import SharkInference
class SharkImporter:
def __init__(self,
model_path,
model_type: str = "tflite",
model_source_hub: str = "tfhub",
device: str = None,
dynamic: bool = False,
jit_trace: bool = False,
benchmark_mode: bool = False):
self.model_path = model_path
self.model_type = model_type
self.model_source_hub = model_source_hub
self.device = device
self.dynamic = dynamic
self.jit_trace = jit_trace
self.benchmark_mode = benchmark_mode
self.inputs = None
self.input_details = None
self.output_details = None
# create tmp model file directory
if self.model_path is None:
print("Error. No model_path, Please input model path.")
return
if self.model_source_hub == "tfhub":
# compile and run tfhub tflite
if self.model_type == "tflite":
print("Setting up for TMP_DIR")
exe_basename = os.path.basename(sys.argv[0])
self.workdir = os.path.join(os.path.dirname(__file__), "tmp",
exe_basename)
print(f"TMP_DIR = {self.workdir}")
os.makedirs(self.workdir, exist_ok=True)
self.tflite_file = '/'.join([self.workdir, 'model.tflite'])
print("Setting up local address for tflite model file: ",
self.tflite_file)
if os.path.exists(self.model_path):
self.tflite_file = self.model_path
else:
print("Download tflite model")
urllib.request.urlretrieve(self.model_path,
self.tflite_file)
print("Setting up tflite interpreter")
self.tflite_interpreter = tf.lite.Interpreter(
model_path=self.tflite_file)
self.tflite_interpreter.allocate_tensors()
# default input initialization
self.input_details, self.output_details = self.get_model_details(
)
inputs = self.generate_inputs(
self.input_details) # device_inputs
self.setup_inputs(inputs)
def generate_inputs(self, input_details):
args = []
for input in input_details:
print(str(input["shape"]), input["dtype"].__name__)
args.append(np.zeros(shape=input["shape"], dtype=input["dtype"]))
return args
def get_model_details(self):
if self.model_type == "tflite":
print("Get tflite input output details")
self.input_details = self.tflite_interpreter.get_input_details()
self.output_details = self.tflite_interpreter.get_output_details()
return self.input_details, self.output_details
def setup_inputs(self, inputs):
print("Setting up inputs")
self.inputs = inputs
def compile(self, inputs=None):
if inputs is not None:
self.setup_inputs(inputs)
# preprocess model_path to get model_type and Model Source Hub
print("Shark Importer Intialize SharkInference and Do Compile")
if self.model_source_hub == "tfhub":
# compile and run tfhub tflite
print("Inference tfhub model")
self.shark_module = SharkInference(self.tflite_file,
self.inputs,
device=self.device,
dynamic=self.dynamic,
jit_trace=self.jit_trace)
self.shark_module.set_frontend("tflite")
self.shark_module.compile()
elif self.model_source_hub == "huggingface":
print("Inference", self.model_source_hub, " not implemented yet")
elif self.model_source_hub == "jaxhub":
print("Inference", self.model_source_hub, " not implemented yet")
def forward(self, inputs=None):
if inputs is not None:
self.setup_inputs(inputs)
# preprocess model_path to get model_type and Model Source Hub
print("Shark Importer forward Model")
if self.model_source_hub == "tfhub":
shark_results = self.shark_module.forward(self.inputs)
# Fix type information for unsigned cases.
# for test compare result
shark_results = list(shark_results)
for i in range(len(self.output_details)):
dtype = self.output_details[i]["dtype"]
shark_results[i] = shark_results[i].astype(dtype)
return shark_results
elif self.model_source_hub == "huggingface":
print("Inference", self.model_source_hub, " not implemented yet")
elif self.model_source_hub == "jaxhub":
print("Inference", self.model_source_hub, " not implemented yet")
def shark_load(model_name, file_path):
file_link = f"https://storage.googleapis.com/shark_tank/users/stanley/{model_name}.mlir"
response = urllib.request.urlretrieve(file_link, file_path)
if not os.path.isfile(file_path):
raise ValueError(
    f"Tried looking for the target mlir at {file_path}, but it could not be found."
)
with open(file_path, "rb") as input_file:
model_mlir = input_file.read()
return model_mlir
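A short usage sketch of shark_load (mirroring the MiniLM loader example earlier in this diff); the model name, local path and inputs are placeholders:

# Download a pre-imported mhlo module from the SHARK tank and run it.
minilm_mlir = shark_load("minilm_tf_inference", "/tmp/minilm_tf_inference.mlir")
shark_module = SharkInference(minilm_mlir, test_inputs)  # test_inputs: tuple of tf tensors
shark_module.set_frontend("mhlo")
shark_module.compile()
result = shark_module.forward(test_inputs)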

115
shark/shark_inference.py Normal file
View File

@@ -0,0 +1,115 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from shark.torch_mlir_utils import get_torch_mlir_module, run_on_refbackend
import os
from shark.parser import shark_args
from shark.shark_runner import SharkRunner, SharkBenchmarkRunner
import time
import sys
# Prints to stderr.
def print_err(*a):
print(*a, file=sys.stderr)
class SharkInference:
"""Inference API targeting pytorch, tensorflow, linalg, mhlo and tosa frontend."""
def __init__(self,
model,
input: tuple,
device: str = None,
dynamic: bool = False,
jit_trace: bool = False,
benchmark_mode: bool = False):
self.model = model
self.input = input
self.dynamic = dynamic
self.jit_trace = jit_trace
self.benchmark_mode = benchmark_mode
# By default it's torch frontend.
self.frontend = "pytorch"
# Sets the device.
self.device = device if device is not None else shark_args.device
self.model_config_path = shark_args.model_config_path
self.shark_runner = None
# Sets the frontend i.e `pytorch` or `tensorflow`.
def set_frontend(self, frontend: str):
if frontend not in [
"pytorch", "torch", "tensorflow", "tf", "mhlo", "linalg",
"tosa", "tflite"
]:
print_err("frontend not supported.")
else:
self.frontend = frontend
def compile(self):
# Inference does not use AOT.
from_aot = False
if (self.benchmark_mode == True):
self.shark_runner = SharkBenchmarkRunner(self.model, self.input,
self.dynamic, self.device,
self.jit_trace, from_aot,
self.frontend)
else:
self.shark_runner = SharkRunner(self.model, self.input,
self.dynamic, self.device,
self.jit_trace, from_aot,
self.frontend,
self.model_config_path)
# inputs are considered to be np.array.
def forward(self, inputs):
input_list = inputs
# converts the inputs to numpy.
if self.frontend in ["pytorch", "torch"]:
input_list = [x.detach().numpy() for x in inputs]
elif self.frontend in ["tensorflow", "tf"]:
input_list = [x.numpy() for x in inputs]
return self.shark_runner.forward(input_list, self.frontend)
# Saves the .vmfb module.
def save_module(self, dir=None):
if dir is None:
return self.shark_runner.save_module()
return self.shark_runner.save_module(dir)
######### Benchmark Related Functions #########
def benchmark_mode(func):
def inner(self, *args, **kwargs):
assert self.benchmark_mode, "SharkRunner needs to be in benchmark mode to run benchmark methods."
return func(self, *args, **kwargs)
return inner
@benchmark_mode
def benchmark_all(self, inputs):
self.shark_runner.benchmark_all(inputs)
@benchmark_mode
def benchmark_frontend(self, inputs):
self.shark_runner.benchmark_frontend(inputs)
@benchmark_mode
def benchmark_python(self, inputs):
self.shark_runner.benchmark_python(inputs)
@benchmark_mode
def benchmark_c(self):
self.shark_runner.benchmark_c()

205
shark/shark_runner.py Normal file
View File

@@ -0,0 +1,205 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from iree.compiler import tf as tfc
import iree.compiler.tflite as ireec_tflite
from torch.utils._python_dispatch import enable_torch_dispatch_mode
from torch_mlir.eager_mode import torch_mlir_tensor
from torch_mlir.eager_mode.torch_mlir_tensor import TorchMLIRTensor
from torch_mlir_e2e_test.eager_backends.refbackend import EagerModeRefBackend
from shark.iree_eager_backend import EagerModeIREELinalgOnTensorsBackend
from shark.torch_mlir_utils import get_torch_mlir_module, run_on_refbackend
from shark.iree_utils import get_results, get_iree_compiled_module, export_iree_module_to_vmfb, export_module_to_mlir_file, build_benchmark_args, run_benchmark_module
import os
from shark.parser import shark_args
from tqdm import tqdm
import time
class SharkRunner:
"""Base class for Shark Inference and Shark Runner."""
def __init__(
self,
model,
input: tuple,
dynamic: bool = False,
device: str = None,
jit_trace: bool = False,
from_aot: bool = False,
frontend: str = "torch",
model_config_path: str = None,
):
self.model = model
self.frontend_model = model
self.from_aot = from_aot
self.input = input
self.frontend = frontend
self.vmfb_file = None
func_name = "forward"
self.device = device if device is not None else shark_args.device
if self.frontend in ["pytorch", "torch"]:
# get torch-mlir dialect
# self.model = torch.Module
# TODO assert
self.model = get_torch_mlir_module(self.model, input, dynamic,
jit_trace, from_aot)
elif self.frontend in ["tensorflow", "tf"]:
# get mhlo dialect
# self.model = tf.Module
# TODO assert
self.model = tfc.compile_module(self.model,
exported_names=[func_name],
import_only=True)
elif self.frontend in ["tflite"]:
print("Setting up for IREE compiler tflite")
# get tosa dialect
# self.model = model.tflite
# TODO assert
self.model = ireec_tflite.compile_file(self.model,
input_type="tosa",
import_only=True)
func_name = "main"
# TODO: We can capture the .vmfb module here and later use it for saving
# rather than recompiling it again, if used for saving.
(
self.iree_compilation_module,
self.iree_config,
) = get_iree_compiled_module(self.model,
self.device,
self.frontend,
func_name=func_name,
model_config_path=model_config_path)
# Debugging Options:
if shark_args.save_mlir:
export_module_to_mlir_file(self.model, self.frontend,
shark_args.repro_dir)
if shark_args.save_vmfb:
self.vmfb_file = self.save_module(shark_args.repro_dir)
# All the timings and benchmarking can be done here.
def forward(self, input, frontend):
return get_results(self.iree_compilation_module, input,
self.iree_config, frontend)
# TODO: Instead of passing a directory and having names decided by the module,
# the user may want to save the module under a manual name.
def save_module(self, dir=os.getcwd()):
return export_iree_module_to_vmfb(self.model, self.device, dir,
self.frontend)
# TODO: Load a module and directly use it, we will need to set the frontend
# in this case.
def load_module(self, name):
pass
class SharkEagerMode:
def __init__(self, device="cpu"):
if device == "refbackend":
torch_mlir_tensor.backend = EagerModeRefBackend()
else:
torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend(
device)
self.guard = enable_torch_dispatch_mode(TorchMLIRTensor)
self.guard.__enter__()
def __del__(self):
self.guard.__exit__(None, None, None)
class SharkBenchmarkRunner(SharkRunner):
# SharkRunner derived class with Benchmarking capabilities.
def __init__(
self,
model,
input: tuple,
dynamic: bool = False,
device: str = None,
jit_trace: bool = False,
from_aot: bool = False,
frontend: str = "torch",
):
SharkRunner.__init__(self, model, input, dynamic, device, jit_trace,
from_aot, frontend)
if (self.vmfb_file == None):
self.vmfb_file = export_iree_module_to_vmfb(self.model, device,
shark_args.repro_dir,
frontend)
self.benchmark_cl = build_benchmark_args(self.vmfb_file, device, input,
frontend, from_aot)
def benchmark_frontend(self, inputs):
if self.frontend in ["pytorch", "torch"]:
self.benchmark_torch(inputs)
elif self.frontend in ["tensorflow", "tf"]:
self.benchmark_tf(inputs)
def benchmark_torch(self, inputs):
inputs = self.input if self.from_aot else inputs
inputs = inputs[0]
for i in range(shark_args.num_warmup_iterations):
self.frontend_model.forward(inputs)
begin = time.time()
for i in range(shark_args.num_iterations):
out = self.frontend_model.forward(inputs)
if i == shark_args.num_iterations - 1:
end = time.time()
break
print(
f"Torch benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
)
def benchmark_tf(self, inputs):
for i in range(shark_args.num_warmup_iterations):
self.frontend_model.forward(*inputs)
begin = time.time()
for i in range(shark_args.num_iterations):
out = self.frontend_model.forward(*inputs)
if i == shark_args.num_iterations - 1:
end = time.time()
break
print(
f"TF benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
)
return
def benchmark_c(self):
result = run_benchmark_module(self.benchmark_cl)
print(f"Shark-{self.frontend} C-benchmark:{result} iter/second")
def benchmark_python(self, inputs):
inputs = self.input if self.from_aot else inputs
input_list = [x for x in inputs]
for i in range(shark_args.num_warmup_iterations):
self.forward(input_list, self.frontend)
begin = time.time()
for i in range(shark_args.num_iterations):
out = self.forward(input_list, self.frontend)
if i == shark_args.num_iterations - 1:
end = time.time()
print(
f"Shark-{self.frontend} Python-benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
)
def benchmark_all(self, inputs):
self.benchmark_frontend(inputs)
self.benchmark_python(inputs)
self.benchmark_c()

139
shark/shark_trainer.py Normal file
View File

@@ -0,0 +1,139 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from shark.torch_mlir_utils import get_torch_mlir_module, run_on_refbackend
from shark.iree_utils import get_results, get_iree_compiled_module, export_iree_module_to_vmfb
import os
from shark.parser import shark_args
from shark.shark_runner import SharkRunner
from shark.backward_makefx import MakeFxModule
import numpy as np
from tqdm import tqdm
import time
import sys
# Prints to stderr.
def print_err(*a):
print(*a, file=sys.stderr)
class SharkTrainer:
"""Training pytorch, tensorflow module on shark runtime."""
def __init__(
self,
model,
input: tuple,
dynamic: bool = False,
device: str = None,
jit_trace: bool = False,
from_aot: bool = True,
):
self.model = model
# Change tuple to list.
self.input = [x for x in input]
self.dynamic = dynamic
self.from_aot = from_aot
self.jit_trace = jit_trace
self.from_aot = from_aot
# By default it's the torch frontend.
self.frontend = "pytorch"
self.device = device if device is not None else shark_args.device
self.shark_runner = None
# Sets the frontend i.e `pytorch` or `tensorflow`.
def set_frontend(self, frontend: str):
if frontend not in [
"pytorch", "torch", "tensorflow", "tf", "mhlo", "linalg", "tosa"
]:
print_err("frontend not supported.")
else:
self.frontend = frontend
# Training function is needed in the case of torch_fn.
def compile(self, training_fn=None):
if self.frontend in ["torch", "pytorch"]:
aot_module = MakeFxModule(self.model,
tuple(self.input),
custom_inference_fn=training_fn)
aot_module.generate_graph()
# Returns the backward graph.
training_graph = aot_module.training_graph
weights = self.get_torch_params()
self.shark_runner = SharkRunner(training_graph,
weights + self.input, self.dynamic,
self.device, self.jit_trace,
self.from_aot, self.frontend)
elif self.frontend in ["tensorflow", "tf", "mhlo"]:
self.shark_runner = SharkRunner(self.model, self.input,
self.dynamic, self.device,
self.jit_trace, self.from_aot,
self.frontend)
else:
print_err("Unknown frontend")
return
# The inputs to the mlir-graph are weights, buffers and inputs respectively.
def get_torch_params(self):
params = [i.detach() for i in self.model.parameters()]
buffers = [i.detach() for i in self.model.buffers()]
return params + buffers
# Function to train pytorch module.
def _train_torch(self, num_iters):
"""Returns the updated weights after num_iters"""
params = self.get_torch_params()
params = [x.numpy() for x in params]
print(f"Training started for {num_iters} iterations:")
for i in tqdm(range(num_iters)):
params = self.shark_runner.forward(params + self.input,
self.frontend)
return params
# Function to train tensorflow module.
# Output final loss.
# TODO(raikonenfnu): Save updated weight/states in SHARK.
def _train_tf(self, num_iters):
input_list = []
for x in self.input:
if (isinstance(x, list)):
nested_list = []
for val in x:
if (isinstance(val, np.ndarray)):
nested_list.append(val)
else:
nested_list.append(val.numpy())
input_list.append(nested_list)
elif (isinstance(x, np.ndarray)):
input_list.append(x)
else:
input_list.append(x.numpy())
print(f"Training started for {num_iters} iterations:")
for i in tqdm(range(num_iters)):
outputs = self.shark_runner.forward(input_list, self.frontend)
return outputs
def train(self, num_iters=1):
if self.frontend in ["torch", "pytorch"]:
return self._train_torch(num_iters)
elif self.frontend in ["tf", "tensorflow", "mhlo"]:
return self._train_tf(num_iters)
else:
print_err("Unknown frontend")
return

View File

@@ -0,0 +1,52 @@
# RUN: %PYTHON %s
import numpy as np
from shark.shark_importer import SharkImporter
import pytest
model_path = "https://tfhub.dev/tensorflow/lite-model/albert_lite_base/squadv1/1?lite-format=tflite"
# Inputs modified to be useful albert inputs.
def generate_inputs(input_details):
for input in input_details:
print("\t%s, %s", str(input["shape"]), input["dtype"].__name__)
args = []
args.append(
np.random.randint(low=0,
high=256,
size=input_details[0]["shape"],
dtype=input_details[0]["dtype"]))
args.append(
np.ones(shape=input_details[1]["shape"],
dtype=input_details[1]["dtype"]))
args.append(
np.zeros(shape=input_details[2]["shape"],
dtype=input_details[2]["dtype"]))
return args
# Runs all the tests across cpu, gpu and vulkan according to the available
# drivers; a specific case can be run by commenting out the others.
pytest_param = pytest.mark.parametrize(
('dynamic', 'device'),
[
pytest.param(False, 'cpu'),
# TODO: Language models are failing for the dynamic case.
pytest.param(True, 'cpu', marks=pytest.mark.skip),
])
@pytest_param
def test_albert(dynamic, device):
my_shark_importer = SharkImporter(model_path=model_path,
model_type="tflite",
model_source_hub="tfhub",
device=device,
dynamic=dynamic,
jit_trace=True)
input_details, output_details = my_shark_importer.get_model_details()
inputs = generate_inputs(input_details) # device_inputs
my_shark_importer.compile(inputs)
shark_results = my_shark_importer.forward(inputs)
# print(shark_results)

133
shark/torch_mlir_utils.py Normal file
View File

@@ -0,0 +1,133 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import io
import pickle
import sys
import os
from io import StringIO
from torch_mlir.dialects.torch.importer.jit_ir import (
ClassAnnotator,
ModuleBuilder,
)
from torch_mlir_e2e_test.torchscript.serialization import (
extract_serializable_annotations, apply_serializable_annotations,
SerializableTest)
from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
from torch_mlir.passmanager import PassManager
from torch_mlir_e2e_test.torchscript.annotations import annotate_args, export
from torch_mlir.ir import StringAttr
def get_module_name_for_asm_dump(module):
"""Gets a name suitable for an assembly dump.
The name is not guaranteed to be unique.
"""
if not "torch.debug_module_name" in module.operation.attributes:
return "UnnammedModule"
return StringAttr(
module.operation.attributes["torch.debug_module_name"]).value
def get_input_annotations(inputs: tuple, dynamic: bool) -> list:
"""TODO: Include necessary documentation"""
annotations_list = [None]
for i in inputs:
temp_list = []
if dynamic:
temp_list.append([-1 for i in range(len(i.shape))])
else:
temp_list.append(list(i.shape))
temp_list.append(i.dtype)
temp_list.append(True)
annotations_list.append(tuple(temp_list))
return annotations_list
def run_on_refbackend(torch_module, inputs):
backend = refbackend.RefBackendLinalgOnTensorsBackend()
compiled = backend.compile(torch_module)
jit_module = backend.load(compiled)
np_inputs = [x.numpy() for x in inputs]
return jit_module.forward(np_inputs[0])
def shark_jit_trace(module, input: tuple, dynamic: bool,
tracing_required: bool):
"""TODO: Include necessary documentation."""
if not tracing_required:
return torch.jit.script(module)
traced_module = torch.jit.trace_module(module, {"forward": input})
actual_script = traced_module._actual_script_module
export(actual_script.forward)
annotate_args_decorator = annotate_args(
get_input_annotations(input, dynamic))
annotate_args_decorator(actual_script.forward)
module = torch.jit.script(actual_script)
# TODO: remove saved annotations.pickle
torchscript_module_bytes = module.save_to_buffer({
"annotations.pkl":
pickle.dumps(extract_serializable_annotations(module))
})
serializable_test = SerializableTest(unique_name="",
program=torchscript_module_bytes,
trace=None)
_extra_files = {"annotations.pkl": ""}
module = torch.jit.load(io.BytesIO(serializable_test.program),
_extra_files=_extra_files)
# Load the pickled annotations.
annotations = pickle.loads(_extra_files["annotations.pkl"])
apply_serializable_annotations(module, annotations)
return module
def get_torch_mlir_module(
module,
input: tuple,
dynamic: bool,
tracing_required: bool,
from_aot: bool = False,
):
"""TODO: Include necessary documentation."""
# Tracing is not required from the aot_module.
if not from_aot:
module = shark_jit_trace(module, input, dynamic, tracing_required)
mb = ModuleBuilder()
class_annotator = ClassAnnotator()
class_annotator.exportNone(module._c._type())
class_annotator.exportPath(module._c._type(), ["forward"])
class_annotator.annotateArgs(
module._c._type(),
["forward"],
get_input_annotations(input, dynamic),
)
mb.import_module(module._c, class_annotator)
with mb.module.context:
pm = PassManager.parse(
"torchscript-module-to-torch-backend-pipeline,torch-backend-to-linalg-on-tensors-backend-pipeline"
)
pm.run(mb.module)
return mb.module
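As a minimal sketch (the toy module and input shape are illustrative, not taken from this diff), the two entry points above can be exercised end to end like this:

import torch

class AddOne(torch.nn.Module):
    def forward(self, x):
        return x + 1.0

example_input = (torch.randn(4),)
# Trace, annotate and lower the module to linalg-on-tensors...
linalg_module = get_torch_mlir_module(AddOne(), example_input,
                                      dynamic=False,
                                      tracing_required=True)
# ...and run it on torch-mlir's reference backend as a sanity check.
print(run_on_refbackend(linalg_module, example_input))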

0
tank/__init__.py Normal file
View File

74
tank/model_utils.py Normal file
View File

@@ -0,0 +1,74 @@
from shark.shark_inference import SharkInference
from shark.iree_utils import check_device_drivers
import torch
import numpy as np
import torchvision.models as models
from transformers import AutoModelForSequenceClassification, BertTokenizer, TFBertModel
import importlib
torch.manual_seed(0)
##################### Hugging Face LM Models ###################################
class HuggingFaceLanguage(torch.nn.Module):
def __init__(self, hf_model_name):
super().__init__()
self.model = AutoModelForSequenceClassification.from_pretrained(
hf_model_name, # The pretrained model.
num_labels=
2, # The number of output labels--2 for binary classification.
output_attentions=
False, # Whether the model returns attentions weights.
output_hidden_states=
False, # Whether the model returns all hidden-states.
torchscript=True,
)
def forward(self, tokens):
return self.model.forward(tokens)[0]
def get_hf_model(name):
model = HuggingFaceLanguage(name)
# TODO: Currently the test input is set to (1,128)
test_input = torch.randint(2, (1, 128))
actual_out = model(test_input)
return model, test_input, actual_out
################################################################################
##################### Torch Vision Models ###################################
class VisionModule(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
self.train(False)
def forward(self, input):
return self.model.forward(input)
def get_vision_model(torch_model):
model = VisionModule(torch_model)
# TODO: Currently the test input is set to (1, 3, 224, 224)
test_input = torch.randn(1, 3, 224, 224)
actual_out = model(test_input)
return model, test_input, actual_out
################################################################################
# Utility function for comparing two tensors (torch).
def compare_tensors(torch_tensor, numpy_tensor):
# setting the absolute and relative tolerance
rtol = 1e-02
atol = 1e-03
torch_to_numpy = torch_tensor.detach().numpy()
return np.allclose(torch_to_numpy, numpy_tensor, rtol, atol)

63
tank/model_utils_tf.py Normal file
View File

@@ -0,0 +1,63 @@
from shark.shark_inference import SharkInference
from shark.iree_utils import check_device_drivers
import tensorflow as tf
import numpy as np
from transformers import AutoModelForSequenceClassification, BertTokenizer, TFBertModel
import importlib
##################### Tensorflow Hugging Face LM Models ###################################
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
tf_bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32)
]
class TFHuggingFaceLanguage(tf.Module):
def __init__(self, hf_model_name):
super(TFHuggingFaceLanguage, self).__init__()
        # Load the pretrained TF model from the PyTorch checkpoint.
        self.m = TFBertModel.from_pretrained(hf_model_name, from_pt=True)
        # Wrap the model's call method for inference (training disabled).
        self.m.predict = lambda x, y, z: self.m.call(
            input_ids=x, attention_mask=y, token_type_ids=z, training=False)
@tf.function(input_signature=tf_bert_input)
def forward(self, input_ids, attention_mask, token_type_ids):
return self.m.predict(input_ids, attention_mask, token_type_ids)
def get_TFhf_model(name):
model = TFHuggingFaceLanguage(name)
tokenizer = BertTokenizer.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text,
padding='max_length',
truncation=True,
max_length=MAX_SEQUENCE_LENGTH)
for key in encoded_input:
encoded_input[key] = tf.expand_dims(
tf.convert_to_tensor(encoded_input[key]), 0)
test_input = (encoded_input["input_ids"], encoded_input["attention_mask"],
encoded_input["token_type_ids"])
actual_out = model.forward(*test_input)
return model, test_input, actual_out
# Utility function for comparing two tensors (tensorflow).
def compare_tensors_tf(tf_tensor, numpy_tensor):
# setting the absolute and relative tolerance
rtol = 1e-02
atol = 1e-03
tf_to_numpy = tf_tensor.pooler_output.numpy()
return np.allclose(tf_to_numpy, numpy_tensor, rtol, atol)

View File

@@ -0,0 +1,92 @@
from shark.shark_inference import SharkInference
from shark.iree_utils import check_device_drivers
from tank.model_utils import get_hf_model, compare_tensors
from shark.parser import shark_args
import torch
import unittest
import numpy as np
import pytest
#torch.manual_seed(0)
class AlbertModuleTester:
def __init__(
self,
dynamic=False,
device="cpu",
save_mlir=False,
):
self.dynamic = dynamic
self.device = device
self.save_mlir = save_mlir
def create_and_check_module(self):
model, input, act_out = get_hf_model("albert-base-v2")
shark_args.save_mlir = self.save_mlir
shark_module = SharkInference(model, (input,),
device=self.device,
dynamic=self.dynamic,
jit_trace=True)
shark_module.compile()
results = shark_module.forward((input,))
        assert compare_tensors(act_out, results)
class AlbertModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.save_mlir = pytestconfig.getoption("save_mlir")
def setUp(self):
        self.module_tester = AlbertModuleTester()
self.module_tester.save_mlir = self.save_mlir
def test_module_static_cpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
@pytest.mark.xfail(reason="Language models currently failing for dynamic case")
def test_module_dynamic_cpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
@pytest.mark.xfail(reason="Albert model on GPU currently fails to produce torch numbers")
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_static_gpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.xfail(reason="Language models currently failing for dynamic case")
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_dynamic_gpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.xfail(reason="Static albert model on vulkan currently fails to validate.")
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_static_vulkan(self):
self.module_tester.dynamic = False
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
@pytest.mark.xfail(reason="Language models currently failing for dynamic case")
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_dynamic_vulkan(self):
self.module_tester.dynamic = True
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,90 @@
from shark.shark_inference import SharkInference
from shark.iree_utils import check_device_drivers
from tank.model_utils import get_vision_model, compare_tensors
from shark.parser import shark_args
import torch
import unittest
import numpy as np
import torchvision.models as models
import pytest
torch.manual_seed(0)
class AlexnetModuleTester:
def __init__(
self,
dynamic=False,
device="cpu",
save_mlir=False,
):
self.dynamic = dynamic
self.device = device
self.save_mlir = save_mlir
def create_and_check_module(self):
model, input, act_out = get_vision_model(models.alexnet(pretrained=True))
shark_args.save_mlir = self.save_mlir
shark_module = SharkInference(
model,
(input,),
device=self.device,
dynamic=self.dynamic,
)
shark_module.compile()
results = shark_module.forward((input,))
        assert compare_tensors(act_out, results)
class AlexnetModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.save_mlir = pytestconfig.getoption("save_mlir")
def setUp(self):
        self.module_tester = AlexnetModuleTester(save_mlir=self.save_mlir)
def test_module_static_cpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
def test_module_dynamic_cpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_static_gpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_dynamic_gpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_static_vulkan(self):
self.module_tester.dynamic = False
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_dynamic_vulkan(self):
self.module_tester.dynamic = True
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
if __name__ == '__main__':
unittest.main()

91
tank/pytorch/bert_test.py Normal file
View File

@@ -0,0 +1,91 @@
from shark.shark_inference import SharkInference
from shark.iree_utils import check_device_drivers
from tank.model_utils import get_hf_model, compare_tensors
from shark.parser import shark_args
import torch
import unittest
import numpy as np
import pytest
#torch.manual_seed(0)
class BertModuleTester:
def __init__(
self,
dynamic=False,
device="cpu",
save_mlir=False,
):
self.dynamic = dynamic
self.device = device
self.save_mlir = save_mlir
def create_and_check_module(self):
model, input, act_out = get_hf_model("bert-base-uncased")
shark_args.save_mlir = self.save_mlir
shark_module = SharkInference(model, (input,),
device=self.device,
dynamic=self.dynamic,
jit_trace=True)
shark_module.compile()
results = shark_module.forward((input,))
        assert compare_tensors(act_out, results)
class BertModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.save_mlir = pytestconfig.getoption("save_mlir")
def setUp(self):
        self.module_tester = BertModuleTester(save_mlir=self.save_mlir)
def test_module_static_cpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
@pytest.mark.xfail(reason="Language models currently failing for dynamic case")
def test_module_dynamic_cpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
@pytest.mark.xfail(reason="BERT model on GPU currently fails to produce torch numbers")
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_static_gpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.xfail(reason="Language models currently failing for dynamic case")
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_dynamic_gpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_static_vulkan(self):
self.module_tester.dynamic = False
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
@pytest.mark.xfail(reason="Language models currently failing for dynamic case")
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_dynamic_vulkan(self):
self.module_tester.dynamic = True
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
if __name__ == '__main__':
unittest.main()

3
tank/pytorch/conftest.py Normal file
View File

@@ -0,0 +1,3 @@
def pytest_addoption(parser):
# Attaches SHARK command-line arguments to the pytest machinery.
parser.addoption("--save_mlir", action="store_true", default="False", help="Pass option to save input MLIR module to /tmp/ directory.")

View File

@@ -0,0 +1,91 @@
from shark.shark_inference import SharkInference
from shark.iree_utils import check_device_drivers
from tank.model_utils import get_hf_model, compare_tensors
from shark.parser import shark_args
import torch
import unittest
import numpy as np
import pytest
torch.manual_seed(0)
class MiniLMModuleTester:
def __init__(
self,
dynamic=False,
device="cpu",
save_mlir=False,
):
self.dynamic = dynamic
self.device = device
self.save_mlir = save_mlir
def create_and_check_module(self):
model, input, act_out = get_hf_model("microsoft/MiniLM-L12-H384-uncased")
shark_args.save_mlir = self.save_mlir
shark_module = SharkInference(model, (input,),
device=self.device,
dynamic=self.dynamic,
jit_trace=True)
shark_module.compile()
results = shark_module.forward((input,))
        assert compare_tensors(act_out, results)
class MiniLMModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.save_mlir = pytestconfig.getoption("save_mlir")
def setUp(self):
        self.module_tester = MiniLMModuleTester(save_mlir=self.save_mlir)
def test_module_static_cpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
@pytest.mark.xfail(reason="language models failing for dynamic case")
def test_module_dynamic_cpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
@pytest.mark.xfail(reason="minilm inference on gpu currently returns invalid results")
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_static_gpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.xfail(reason="language models failing for dynamic case")
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_dynamic_gpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_static_vulkan(self):
self.module_tester.dynamic = False
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
@pytest.mark.xfail(reason="language models failing for dynamic case")
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_dynamic_vulkan(self):
self.module_tester.dynamic = True
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,89 @@
from shark.shark_inference import SharkInference
from shark.iree_utils import check_device_drivers
from tank.model_utils import get_vision_model, compare_tensors
from shark.parser import shark_args
import torch
import unittest
import numpy as np
import torchvision.models as models
import pytest
torch.manual_seed(0)
class Resnet101ModuleTester:
def __init__(
self,
dynamic=False,
device="cpu",
save_mlir=False,
):
self.dynamic = dynamic
self.device = device
self.save_mlir = save_mlir
def create_and_check_module(self):
model, input, act_out = get_vision_model(models.resnet101(pretrained=True))
shark_args.save_mlir = self.save_mlir
shark_module = SharkInference(
model,
(input,),
device=self.device,
dynamic=self.dynamic,
)
shark_module.compile()
results = shark_module.forward((input,))
        assert compare_tensors(act_out, results)
class Resnet101ModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.save_mlir = pytestconfig.getoption("save_mlir")
def setUp(self):
        self.module_tester = Resnet101ModuleTester(save_mlir=self.save_mlir)
def test_module_static_cpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
def test_module_dynamic_cpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_static_gpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_dynamic_gpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_static_vulkan(self):
self.module_tester.dynamic = False
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_dynamic_vulkan(self):
self.module_tester.dynamic = True
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,90 @@
from shark.shark_inference import SharkInference
from shark.iree_utils import check_device_drivers
from tank.model_utils import get_vision_model, compare_tensors
from shark.parser import shark_args
import torch
import unittest
import numpy as np
import torchvision.models as models
import pytest
torch.manual_seed(0)
class Resnet18ModuleTester:
def __init__(
self,
dynamic=False,
device="cpu",
save_mlir=False,
):
self.dynamic = dynamic
self.device = device
self.save_mlir = save_mlir
def create_and_check_module(self):
model, input, act_out = get_vision_model(models.resnet18(pretrained=True))
shark_args.save_mlir = self.save_mlir
shark_module = SharkInference(
model,
(input,),
device=self.device,
dynamic=self.dynamic,
)
shark_module.compile()
results = shark_module.forward((input,))
        assert compare_tensors(act_out, results)
class Resnet18ModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.save_mlir = pytestconfig.getoption("save_mlir")
def setUp(self):
        self.module_tester = Resnet18ModuleTester(save_mlir=self.save_mlir)
def test_module_static_cpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
def test_module_dynamic_cpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_static_gpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_dynamic_gpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_static_vulkan(self):
self.module_tester.dynamic = False
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_dynamic_vulkan(self):
self.module_tester.dynamic = True
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,90 @@
from shark.shark_inference import SharkInference
from shark.iree_utils import check_device_drivers
from tank.model_utils import get_vision_model, compare_tensors
from shark.parser import shark_args
import torch
import unittest
import numpy as np
import torchvision.models as models
import pytest
torch.manual_seed(0)
class Resnet50ModuleTester:
def __init__(
self,
dynamic=False,
device="cpu",
save_mlir=False,
):
self.dynamic = dynamic
self.device = device
self.save_mlir = save_mlir
def create_and_check_module(self):
model, input, act_out = get_vision_model(models.resnet50(pretrained=True))
shark_args.save_mlir = self.save_mlir
shark_module = SharkInference(
model,
(input,),
device=self.device,
dynamic=self.dynamic,
)
shark_module.compile()
results = shark_module.forward((input,))
        assert compare_tensors(act_out, results)
class Resnet50ModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.save_mlir = pytestconfig.getoption("save_mlir")
def setUp(self):
        self.module_tester = Resnet50ModuleTester(save_mlir=self.save_mlir)
def test_module_static_cpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
def test_module_dynamic_cpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_static_gpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_dynamic_gpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_static_vulkan(self):
self.module_tester.dynamic = False
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_dynamic_vulkan(self):
self.module_tester.dynamic = True
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,90 @@
from shark.shark_inference import SharkInference
from shark.iree_utils import check_device_drivers
from tank.model_utils import get_vision_model, compare_tensors
from shark.parser import shark_args
import torch
import unittest
import numpy as np
import torchvision.models as models
import pytest
torch.manual_seed(0)
class SqueezenetModuleTester:
def __init__(
self,
dynamic=False,
device="cpu",
save_mlir=False
):
self.dynamic = dynamic
self.device = device
self.save_mlir = save_mlir
def create_and_check_module(self):
model, input, act_out = get_vision_model(models.squeezenet1_0(pretrained=True))
shark_args.save_mlir = self.save_mlir
shark_module = SharkInference(
model,
(input,),
device=self.device,
dynamic=self.dynamic,
)
shark_module.compile()
results = shark_module.forward((input,))
        assert compare_tensors(act_out, results)
class SqueezenetModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.save_mlir = pytestconfig.getoption("save_mlir")
def setUp(self):
        self.module_tester = SqueezenetModuleTester(save_mlir=self.save_mlir)
def test_module_static_cpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
def test_module_dynamic_cpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_static_gpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_dynamic_gpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_static_vulkan(self):
self.module_tester.dynamic = False
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_dynamic_vulkan(self):
self.module_tester.dynamic = True
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,111 @@
import argparse
import os
from functools import partial
import clip
import torch
from torchvision import transforms
from tqdm import trange
try:
from diffusion import get_model, sampling, utils
except ModuleNotFoundError:
print(
"You need to download v-diffusion source from https://github.com/crowsonkb/v-diffusion-pytorch"
)
raise
torch.manual_seed(0)
def parse_prompt(prompt, default_weight=3.0):
if prompt.startswith("http://") or prompt.startswith("https://"):
vals = prompt.rsplit(":", 2)
vals = [vals[0] + ":" + vals[1], *vals[2:]]
else:
vals = prompt.rsplit(":", 1)
vals = vals + ["", default_weight][len(vals) :]
return vals[0], float(vals[1])
args = argparse.Namespace(
prompts=["New York City, oil on canvas"],
batch_size=1,
device="cuda",
model="cc12m_1_cfg",
n=1,
steps=10,
)
device = torch.device(args.device)
print("Using device:", device)
model = get_model(args.model)()
_, side_y, side_x = model.shape
checkpoint = f"{args.model}.pth"
if os.path.exists(checkpoint):
model.load_state_dict(torch.load(checkpoint, map_location="cpu"))
model = model.to(device).eval().requires_grad_(False)
clip_model_name = model.clip_model if hasattr(model, "clip_model") else "ViT-B/16"
clip_model = clip.load(clip_model_name, jit=False, device=device)[0]
clip_model.eval().requires_grad_(False)
normalize = transforms.Normalize(
mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]
)
zero_embed = torch.zeros([1, clip_model.visual.output_dim], device=device)
target_embeds, weights = [zero_embed], []
txt, weight = parse_prompt(args.prompts[0])
target_embeds.append(clip_model.encode_text(clip.tokenize(txt).to(device)).float())
weights.append(weight)
weights = torch.tensor([1 - sum(weights), *weights], device=device)
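# cfg_model_fn below implements classifier-free guidance: the batch is repeated
# once per conditioning (the zero embedding and the text embedding), the model
# is evaluated on the stacked batch, and the per-conditioning predictions are
# blended using the weights tensor built above.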
def cfg_model_fn(model, x, t):
n = x.shape[0]
n_conds = len(target_embeds)
x_in = x.repeat([n_conds, 1, 1, 1])
t_in = t.repeat([n_conds])
clip_embed_in = torch.cat([*target_embeds]).repeat_interleave(n, 0)
vs = model(x_in, t_in, clip_embed_in).view([n_conds, n, *x.shape[1:]])
v = vs.mul(weights[:, None, None, None, None]).sum(0)
return v
x = torch.randn([args.n, 3, side_y, side_x], device=device)
t = torch.linspace(1, 0, args.steps + 1, device=device)[:-1]
def repro(model):
if device.type == "cuda":
model = model.half()
steps = utils.get_spliced_ddpm_cosine_schedule(t)
for i in trange(0, args.n, args.batch_size):
cur_batch_size = min(args.n - i, args.batch_size)
outs = sampling.plms_sample(
partial(cfg_model_fn, model), x[i : i + cur_batch_size], steps, {}
)
for j, out in enumerate(outs):
utils.to_pil_image(out).save(f"out_{i + j:05}.png")
def trace(model, x, t):
n = x.shape[0]
n_conds = len(target_embeds)
x_in = x.repeat([n_conds, 1, 1, 1])
t_in = t.repeat([n_conds])
clip_embed_in = torch.cat([*target_embeds]).repeat_interleave(n, 0)
ts_mod = torch.jit.trace(model, (x_in, t_in, clip_embed_in))
print(ts_mod.graph)
clip_model = clip.load(clip_model_name, jit=True, device=device)[0]
print(clip_model.graph)
# You can't run both of these because repro will `.half()` the model
# repro(model)
trace(model, x, t[0])

(Binary image file added in this change; 145 KiB, preview not shown.)

View File

@@ -0,0 +1,90 @@
from shark.shark_inference import SharkInference
from shark.iree_utils import check_device_drivers
from tank.model_utils import get_vision_model, compare_tensors
from shark.parser import shark_args
import torch
import unittest
import numpy as np
import torchvision.models as models
import pytest
torch.manual_seed(0)
class WideResnet50ModuleTester:
def __init__(
self,
dynamic=False,
device="cpu",
save_mlir=False,
):
self.dynamic = dynamic
self.device = device
self.save_mlir = save_mlir
def create_and_check_module(self):
model, input, act_out = get_vision_model(models.wide_resnet50_2(pretrained=True))
shark_args.save_mlir = self.save_mlir
shark_module = SharkInference(
model,
(input,),
device=self.device,
dynamic=self.dynamic,
)
shark_module.compile()
results = shark_module.forward((input,))
        assert compare_tensors(act_out, results)
class WideResnet50ModuleTest(unittest.TestCase):
@pytest.fixture(autouse=True)
def configure(self, pytestconfig):
self.save_mlir = pytestconfig.getoption("save_mlir")
def setUp(self):
        self.module_tester = WideResnet50ModuleTester(save_mlir=self.save_mlir)
def test_module_static_cpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
def test_module_dynamic_cpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "cpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_static_gpu(self):
self.module_tester.dynamic = False
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(check_device_drivers("gpu"), reason="nvidia-smi not found")
def test_module_dynamic_gpu(self):
self.module_tester.dynamic = True
self.module_tester.device = "gpu"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_static_vulkan(self):
self.module_tester.dynamic = False
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason="vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_dynamic_vulkan(self):
self.module_tester.dynamic = True
self.module_tester.device = "vulkan"
self.module_tester.create_and_check_module()
if __name__ == '__main__':
unittest.main()

15
tank/tf/README.md Normal file
View File

@@ -0,0 +1,15 @@
## Running SharkInference on CPUs, GPUs, and macOS
### Run the `seq_classification.py` script
#### Supported models: [Hugging Face sequence classification](https://huggingface.co/docs/transformers/model_doc/auto#transformers.TFAutoModelForSequenceClassification)
```shell
./seq_classification.py --hf_model_name="hf_model" --device="cpu" # Use gpu | vulkan
```
Once the model is compiled for the selected device, you can pass in text and
get back the classification logits.
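
If you prefer to drive the same flow from Python, a rough sketch is below; it assumes the repository root is on `PYTHONPATH` and reuses the helpers from `seq_classification.py`, so treat the import path as an assumption:

```python
from tank.tf.seq_classification import SeqClassification, preprocess_input
from shark.shark_inference import SharkInference

# Tokenize a sample sentence and compile the classifier once.
inputs = preprocess_input("SHARK makes TensorFlow inference portable.")
shark_module = SharkInference(
    SeqClassification("bert-base-uncased"),
    (inputs["input_ids"], inputs["attention_mask"]))
shark_module.set_frontend("tensorflow")
shark_module.compile()

# Softmaxed logits for the two labels.
print(shark_module.forward((inputs["input_ids"], inputs["attention_mask"])))
```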

View File

@@ -0,0 +1,47 @@
from transformers import TFAutoModelForMaskedLM
import tensorflow as tf
from shark.shark_inference import SharkInference
# Create a set of input signature.
inputs_signature = [
tf.TensorSpec(shape=[1, 512], dtype=tf.int32),
]
class AutoModelMaskedLM(tf.Module):
def __init__(self, model_name):
super(AutoModelMaskedLM, self).__init__()
self.m = TFAutoModelForMaskedLM.from_pretrained(model_name,
output_attentions=False)
self.m.predict = lambda x: self.m(input_ids=x)
@tf.function(input_signature=inputs_signature)
def forward(self, input_ids):
return self.m.predict(input_ids)
fail_models = ["microsoft/deberta-base", "google/rembert", "google/tapas-base"]
supported_models = [
"albert-base-v2", "bert-base-uncased", "camembert-base",
"dbmdz/convbert-base-turkish-cased", "distilbert-base-uncased",
"google/electra-small-discriminator",
"hf-internal-testing/tiny-random-flaubert", "funnel-transformer/small",
"microsoft/layoutlm-base-uncased", "allenai/longformer-base-4096",
"google/mobilebert-uncased", "microsoft/mpnet-base", "roberta-base",
"xlm-roberta-base"
]
if __name__ == "__main__":
inputs = tf.random.uniform(shape=[1, 512],
maxval=3,
dtype=tf.int32,
seed=10)
for model_name in supported_models:
print(f"Running model: {model_name}")
shark_module = SharkInference(AutoModelMaskedLM(model_name), (inputs,))
shark_module.set_frontend("tensorflow")
shark_module.compile()
print(shark_module.forward((inputs,)))

90
tank/tf/bert_large_gen.py Normal file
View File

@@ -0,0 +1,90 @@
from iree import runtime as ireert
from iree.tf.support import module_utils
from iree.compiler import tf as tfc
import sys
from absl import app
import numpy as np
import os
import tempfile
import tensorflow as tf
from official.nlp.modeling import layers
from official.nlp.modeling import networks
from official.nlp.modeling.models import bert_classifier
vocab_size = 100
NUM_CLASSES = 5
SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32)
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
dict_outputs = False
test_network = networks.BertEncoder(vocab_size=vocab_size,
num_layers=24,
hidden_size=1024,
num_attention_heads=16,
dict_outputs=dict_outputs)
# Create a BERT trainer with the created network.
bert_trainer_model = bert_classifier.BertClassifier(
test_network, num_classes=NUM_CLASSES)
bert_trainer_model.summary()
# Invoke the trainer model on the inputs. This causes the layer to be built.
self.m = bert_trainer_model
self.m.predict = lambda x: self.m.call(x, training=False)
self.m.learn = lambda x, y: self.m.call(x, training=False)
self.loss = tf.keras.losses.SparseCategoricalCrossentropy()
self.optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)
@tf.function(input_signature=[
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH],
dtype=tf.int32), #input0: input_word_ids
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH],
dtype=tf.int32), #input1: input_mask
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH],
dtype=tf.int32), #input2: segment_ids
tf.TensorSpec([BATCH_SIZE], tf.int32) # input3: labels
])
def learn(self, input_word_ids, input_mask, segment_ids, labels):
with tf.GradientTape() as tape:
# Capture the gradients from forward prop...
inputs = [input_word_ids, input_mask, segment_ids]
probs = self.m(inputs, training=True)
loss = self.loss(labels, probs)
# ...and use them to update the model's weights.
variables = self.m.trainable_variables
gradients = tape.gradient(loss, variables)
self.optimizer.apply_gradients(zip(gradients, variables))
return loss
@tf.function(input_signature=bert_input)
def predict(self, input_word_ids, input_mask, segment_ids):
inputs = [input_word_ids, input_mask, segment_ids]
return self.m.predict(inputs)
if __name__ == "__main__":
# BertModule()
# Compile the model using IREE
compiler_module = tfc.compile_module(BertModule(),
exported_names=["learn"],
import_only=True)
# Save module as MLIR file in a directory
    ARTIFACTS_DIR = os.getcwd()
    mlir_path = os.path.join(ARTIFACTS_DIR, "model.mlir")
with open(mlir_path, "wt") as output_file:
output_file.write(compiler_module.decode('utf-8'))
print(f"Wrote MLIR to path '{mlir_path}'")

123
tank/tf/bert_large_run.py Normal file
View File

@@ -0,0 +1,123 @@
from iree import runtime as ireert
from iree.tf.support import module_utils
from iree.compiler import tf as tfc
from iree.compiler import compile_str
import sys
from absl import app
import time
import numpy as np
import os
import tempfile
import tensorflow as tf
from official.nlp.modeling import layers
from official.nlp.modeling import networks
from official.nlp.modeling.models import bert_classifier
vocab_size = 100
NUM_CLASSES = 5
SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32)
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
dict_outputs = False
test_network = networks.BertEncoder(vocab_size=vocab_size,
num_layers=24,
hidden_size=1024,
num_attention_heads=16,
dict_outputs=dict_outputs)
# Create a BERT trainer with the created network.
bert_trainer_model = bert_classifier.BertClassifier(
test_network, num_classes=NUM_CLASSES)
bert_trainer_model.summary()
# Invoke the trainer model on the inputs. This causes the layer to be built.
self.m = bert_trainer_model
self.m.predict = lambda x: self.m.call(x, training=False)
self.predict = tf.function(input_signature=[bert_input])(self.m.predict)
self.m.learn = lambda x, y: self.m.call(x, training=False)
self.loss = tf.keras.losses.SparseCategoricalCrossentropy()
self.optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)
@tf.function(input_signature=[
bert_input, # inputs
tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32) # labels
])
def learn(self, inputs, labels):
with tf.GradientTape() as tape:
# Capture the gradients from forward prop...
probs = self.m(inputs, training=True)
loss = self.loss(labels, probs)
# ...and use them to update the model's weights.
variables = self.m.trainable_variables
gradients = tape.gradient(loss, variables)
self.optimizer.apply_gradients(zip(gradients, variables))
return loss
if __name__ == "__main__":
# BertModule()
# Compile the model using IREE
compiler_module = tfc.compile_module(BertModule(),
exported_names=["learn"],
import_only=True)
# Compile the model using IREE
backend = "dylib-llvm-aot"
args = [
"--iree-llvm-target-cpu-features=host",
"--iree-mhlo-demote-i64-to-i32=false",
"--iree-stream-resource-index-bits=64", "--iree-vm-target-index-bits=64"
]
backend_config = "dylib"
#backend = "cuda"
#backend_config = "cuda"
#args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-hal-cuda-disable-loop-nounroll-wa", "--iree-enable-fusion-with-reduction-ops"]
flatbuffer_blob = compile_str(compiler_module,
target_backends=[backend],
extra_args=args,
input_type="mhlo")
#flatbuffer_blob = compile_str(compiler_module, target_backends=["dylib-llvm-aot"])
# Save module as MLIR file in a directory
vm_module = ireert.VmModule.from_flatbuffer(flatbuffer_blob)
tracer = ireert.Tracer(os.getcwd())
config = ireert.Config("dylib", tracer)
ctx = ireert.SystemContext(config=config)
ctx.add_vm_module(vm_module)
BertCompiled = ctx.modules.module
predict_sample_input = [
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH))
]
learn_sample_input = [
predict_sample_input,
np.random.randint(5, size=(BATCH_SIZE))
]
warmup = 5
total_iter = 10
num_iter = total_iter - warmup
    for i in range(total_iter):
        print(
            BertCompiled.learn(predict_sample_input,
                               np.random.randint(5, size=(BATCH_SIZE))))
        # Start the timer only after the warmup iterations have completed,
        # so exactly num_iter iterations are measured.
        if i == warmup - 1:
            start = time.time()
end = time.time()
total_time = end - start
print("time: " + str(total_time))
print("time/iter: " + str(total_time / num_iter))

85
tank/tf/bert_large_tf.py Normal file
View File

@@ -0,0 +1,85 @@
import numpy as np
import os
import tempfile
import tensorflow as tf
import time
from official.nlp.modeling import layers
from official.nlp.modeling import networks
from official.nlp.modeling.models import bert_classifier
vocab_size = 100
NUM_CLASSES = 5
SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32)
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
dict_outputs = False
test_network = networks.BertEncoder(vocab_size=vocab_size,
num_layers=24,
hidden_size=1024,
num_attention_heads=16,
dict_outputs=dict_outputs)
# Create a BERT trainer with the created network.
bert_trainer_model = bert_classifier.BertClassifier(
test_network, num_classes=NUM_CLASSES)
bert_trainer_model.summary()
# Invoke the trainer model on the inputs. This causes the layer to be built.
self.m = bert_trainer_model
self.m.predict = lambda x: self.m.call(x, training=False)
self.predict = tf.function(input_signature=[bert_input])(self.m.predict)
self.m.learn = lambda x, y: self.m.call(x, training=False)
self.loss = tf.keras.losses.SparseCategoricalCrossentropy()
self.optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)
@tf.function(input_signature=[
bert_input, # inputs
tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32) # labels
])
def learn(self, inputs, labels):
with tf.GradientTape() as tape:
# Capture the gradients from forward prop...
probs = self.m(inputs, training=True)
loss = self.loss(labels, probs)
# ...and use them to update the model's weights.
variables = self.m.trainable_variables
gradients = tape.gradient(loss, variables)
self.optimizer.apply_gradients(zip(gradients, variables))
return loss
if __name__ == "__main__":
# BertModule()
predict_sample_input = [
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH))
]
bert_model = BertModule()
warmup = 1
total_iter = 10
num_iter = total_iter - warmup
for i in range(total_iter):
print(
bert_model.learn(predict_sample_input,
np.random.randint(5, size=(BATCH_SIZE))))
if (i == warmup - 1):
start = time.time()
end = time.time()
total_time = end - start
print("time: " + str(total_time))
print("time/iter: " + str(total_time / num_iter))

89
tank/tf/bert_small_gen.py Normal file
View File

@@ -0,0 +1,89 @@
from iree import runtime as ireert
#from iree.tf.support import module_utils
from iree.compiler import tf as tfc
import sys
from absl import app
import numpy as np
import os
import tempfile
import tensorflow as tf
from official.nlp.modeling import layers
from official.nlp.modeling import networks
from official.nlp.modeling.models import bert_classifier
vocab_size = 100
NUM_CLASSES = 5
SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32)
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
dict_outputs = False
test_network = networks.BertEncoder(vocab_size=vocab_size,
num_layers=2,
dict_outputs=dict_outputs)
# Create a BERT trainer with the created network.
bert_trainer_model = bert_classifier.BertClassifier(
test_network, num_classes=NUM_CLASSES)
bert_trainer_model.summary()
# Invoke the trainer model on the inputs. This causes the layer to be built.
self.m = bert_trainer_model
self.m.predict = lambda x: self.m.call(x, training=False)
self.m.learn = lambda x, y: self.m.call(x, training=False)
self.loss = tf.keras.losses.SparseCategoricalCrossentropy()
self.optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)
@tf.function(input_signature=[
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH],
dtype=tf.int32), #input0: input_word_ids
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH],
dtype=tf.int32), #input1: input_mask
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH],
dtype=tf.int32), #input2: segment_ids
tf.TensorSpec([BATCH_SIZE], tf.int32) # input3: labels
])
def learn(self, input_word_ids, input_mask, segment_ids, labels):
with tf.GradientTape() as tape:
# Capture the gradients from forward prop...
inputs = [input_word_ids, input_mask, segment_ids]
probs = self.m(inputs, training=True)
loss = self.loss(labels, probs)
# ...and use them to update the model's weights.
variables = self.m.trainable_variables
gradients = tape.gradient(loss, variables)
self.optimizer.apply_gradients(zip(gradients, variables))
return loss
@tf.function(input_signature=bert_input)
def predict(self, input_word_ids, input_mask, segment_ids):
inputs = [input_word_ids, input_mask, segment_ids]
return self.m.predict(inputs)
if __name__ == "__main__":
# BertModule()
# Compile the model using IREE
compiler_module = tfc.compile_module(BertModule(),
exported_names=["learn"],
import_only=True)
print(type(compiler_module))
# Save module as MLIR file in a directory
    ARTIFACTS_DIR = os.getcwd()
    mlir_path = os.path.join(ARTIFACTS_DIR, "model.mlir")
with open(mlir_path, "wt") as output_file:
output_file.write(compiler_module.decode('utf-8'))
print(f"Wrote MLIR to path '{mlir_path}'")

120
tank/tf/bert_small_run.py Normal file
View File

@@ -0,0 +1,120 @@
from iree import runtime as ireert
from iree.tf.support import module_utils
from iree.compiler import tf as tfc
from iree.compiler import compile_str
import sys
from absl import app
import time
import numpy as np
import os
import tempfile
import tensorflow as tf
from official.nlp.modeling import layers
from official.nlp.modeling import networks
from official.nlp.modeling.models import bert_classifier
vocab_size = 100
NUM_CLASSES = 5
SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32)
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
dict_outputs = False
test_network = networks.BertEncoder(vocab_size=vocab_size,
num_layers=2,
dict_outputs=dict_outputs)
# Create a BERT trainer with the created network.
bert_trainer_model = bert_classifier.BertClassifier(
test_network, num_classes=NUM_CLASSES)
bert_trainer_model.summary()
# Invoke the trainer model on the inputs. This causes the layer to be built.
self.m = bert_trainer_model
self.m.predict = lambda x: self.m.call(x, training=False)
self.predict = tf.function(input_signature=[bert_input])(self.m.predict)
self.m.learn = lambda x, y: self.m.call(x, training=False)
self.loss = tf.keras.losses.SparseCategoricalCrossentropy()
self.optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)
@tf.function(input_signature=[
bert_input, # inputs
tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32) # labels
])
def learn(self, inputs, labels):
with tf.GradientTape() as tape:
# Capture the gradients from forward prop...
probs = self.m(inputs, training=True)
loss = self.loss(labels, probs)
# ...and use them to update the model's weights.
variables = self.m.trainable_variables
gradients = tape.gradient(loss, variables)
self.optimizer.apply_gradients(zip(gradients, variables))
return loss
if __name__ == "__main__":
# BertModule()
# Compile the model using IREE
compiler_module = tfc.compile_module(BertModule(),
exported_names=["learn"],
import_only=True)
# Compile the model using IREE
backend = "dylib-llvm-aot"
args = [
"--iree-llvm-target-cpu-features=host",
"--iree-mhlo-demote-i64-to-i32=false", "--iree-flow-demote-i64-to-i32"
]
backend_config = "dylib"
#backend = "cuda"
#backend_config = "cuda"
#args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-hal-cuda-disable-loop-nounroll-wa", "--iree-enable-fusion-with-reduction-ops"]
flatbuffer_blob = compile_str(compiler_module,
target_backends=[backend],
extra_args=args,
input_type="mhlo")
#flatbuffer_blob = compile_str(compiler_module, target_backends=["dylib-llvm-aot"])
# Save module as MLIR file in a directory
vm_module = ireert.VmModule.from_flatbuffer(flatbuffer_blob)
tracer = ireert.Tracer(os.getcwd())
config = ireert.Config("dylib", tracer)
ctx = ireert.SystemContext(config=config)
ctx.add_vm_module(vm_module)
BertCompiled = ctx.modules.module
predict_sample_input = [
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH))
]
learn_sample_input = [
predict_sample_input,
np.random.randint(5, size=(BATCH_SIZE))
]
warmup = 5
total_iter = 10
num_iter = total_iter - warmup
    for i in range(total_iter):
        print(
            BertCompiled.learn(predict_sample_input,
                               np.random.randint(5, size=(BATCH_SIZE))))
        # Start the timer only after the warmup iterations have completed,
        # so exactly num_iter iterations are measured.
        if i == warmup - 1:
            start = time.time()
end = time.time()
total_time = end - start
print("time: " + str(total_time))
print("time/iter: " + str(total_time / num_iter))

View File

@@ -0,0 +1,83 @@
import numpy as np
import os
import tempfile
import tensorflow as tf
import time
from official.nlp.modeling import layers
from official.nlp.modeling import networks
from official.nlp.modeling.models import bert_classifier
vocab_size = 100
NUM_CLASSES = 5
SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32)
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
dict_outputs = False
test_network = networks.BertEncoder(vocab_size=vocab_size,
num_layers=2,
dict_outputs=dict_outputs)
# Create a BERT trainer with the created network.
bert_trainer_model = bert_classifier.BertClassifier(
test_network, num_classes=NUM_CLASSES)
bert_trainer_model.summary()
# Invoke the trainer model on the inputs. This causes the layer to be built.
self.m = bert_trainer_model
self.m.predict = lambda x: self.m.call(x, training=False)
self.predict = tf.function(input_signature=[bert_input])(self.m.predict)
self.m.learn = lambda x, y: self.m.call(x, training=False)
self.loss = tf.keras.losses.SparseCategoricalCrossentropy()
self.optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)
@tf.function(input_signature=[
bert_input, # inputs
tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32) # labels
])
def learn(self, inputs, labels):
with tf.GradientTape() as tape:
# Capture the gradients from forward prop...
probs = self.m(inputs, training=True)
loss = self.loss(labels, probs)
# ...and use them to update the model's weights.
variables = self.m.trainable_variables
gradients = tape.gradient(loss, variables)
self.optimizer.apply_gradients(zip(gradients, variables))
return loss
if __name__ == "__main__":
# BertModule()
predict_sample_input = [
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH))
]
bert_model = BertModule()
warmup = 1
total_iter = 10
num_iter = total_iter - warmup
for i in range(total_iter):
print(
bert_model.learn(predict_sample_input,
np.random.randint(5, size=(BATCH_SIZE))))
if (i == warmup - 1):
start = time.time()
end = time.time()
total_time = end - start
print("time: " + str(total_time))
print("time/iter: " + str(total_time / num_iter))

View File

@@ -0,0 +1,52 @@
from iree import runtime as ireert
from iree.compiler import tf as tfc
import sys
from absl import app
import numpy as np
import os
import tempfile
import tensorflow as tf
from transformers import BertModel, BertTokenizer, TFBertModel
SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32)
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
        # Load the pretrained MiniLM model from the PyTorch checkpoint.
        self.m = TFBertModel.from_pretrained(
            "microsoft/MiniLM-L12-H384-uncased", from_pt=True)
        # Wrap the model's call method for inference (training disabled).
        self.m.predict = lambda x, y, z: self.m.call(
            input_ids=x, attention_mask=y, token_type_ids=z, training=False)
@tf.function(input_signature=bert_input)
def predict(self, input_word_ids, input_mask, segment_ids):
return self.m.predict(input_word_ids, input_mask, segment_ids)
if __name__ == "__main__":
# BertModule()
# Compile the model using IREE
compiler_module = tfc.compile_module(BertModule(),
exported_names=["predict"],
import_only=True)
# Save module as MLIR file in a directory
    ARTIFACTS_DIR = os.getcwd()
    mlir_path = os.path.join(ARTIFACTS_DIR, "model.mlir")
with open(mlir_path, "wt") as output_file:
output_file.write(compiler_module.decode('utf-8'))
print(f"Wrote MLIR to path '{mlir_path}'")

View File

@@ -0,0 +1,87 @@
from iree import runtime as ireert
from iree.compiler import tf as tfc
from iree.compiler import compile_str
import sys
from absl import app
import numpy as np
import os
import tempfile
import tensorflow as tf
import time
from transformers import BertModel, BertTokenizer, TFBertModel
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32)
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
        # Load the pretrained MiniLM model from the PyTorch checkpoint.
        self.m = TFBertModel.from_pretrained(
            "microsoft/MiniLM-L12-H384-uncased", from_pt=True)
        # Wrap the model's call method for inference (training disabled).
        self.m.predict = lambda x, y, z: self.m.call(
            input_ids=x, attention_mask=y, token_type_ids=z, training=False)
@tf.function(input_signature=bert_input)
def predict(self, input_ids, attention_mask, token_type_ids):
return self.m.predict(input_ids, attention_mask, token_type_ids)
if __name__ == "__main__":
# Prepping Data
tokenizer = BertTokenizer.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text,
padding='max_length',
truncation=True,
max_length=MAX_SEQUENCE_LENGTH)
for key in encoded_input:
encoded_input[key] = tf.expand_dims(
tf.convert_to_tensor(encoded_input[key]), 0)
# Compile the model using IREE
compiler_module = tfc.compile_module(BertModule(),
exported_names=["predict"],
import_only=True)
# Compile the model using IREE
backend = "dylib-llvm-aot"
args = [
"--iree-llvm-target-cpu-features=host",
"--iree-mhlo-demote-i64-to-i32=false", "--iree-flow-demote-i64-to-i32"
]
backend_config = "dylib"
#backend = "cuda"
#backend_config = "cuda"
#args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-hal-cuda-disable-loop-nounroll-wa", "--iree-enable-fusion-with-reduction-ops"]
flatbuffer_blob = compile_str(compiler_module,
target_backends=[backend],
extra_args=args,
input_type="mhlo")
#flatbuffer_blob = compile_str(compiler_module, target_backends=["dylib-llvm-aot"])
# Save module as MLIR file in a directory
vm_module = ireert.VmModule.from_flatbuffer(flatbuffer_blob)
tracer = ireert.Tracer(os.getcwd())
config = ireert.Config("dylib", tracer)
ctx = ireert.SystemContext(config=config)
ctx.add_vm_module(vm_module)
BertCompiled = ctx.modules.module
result = BertCompiled.predict(encoded_input["input_ids"],
encoded_input["attention_mask"],
encoded_input["token_type_ids"])
print(result)

View File

@@ -0,0 +1,18 @@
import tensorflow as tf
from transformers import BertModel, BertTokenizer, TFBertModel
tf_model = TFBertModel.from_pretrained("microsoft/MiniLM-L12-H384-uncased",
from_pt=True)
tokenizer = BertTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text,
padding='max_length',
truncation=True,
max_length=512)
for key in encoded_input:
encoded_input[key] = tf.expand_dims(
tf.convert_to_tensor(encoded_input[key]), 0)
output = tf_model(encoded_input)
print(output)
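# For reference, the structured output exposes both the token-level states and
# the pooled representation used by the comparison utilities in
# tank/model_utils_tf.py. Expected shapes for MiniLM-L12-H384 with a padded
# sequence length of 512: (1, 512, 384) and (1, 384).
print(output.last_hidden_state.shape)
print(output.pooler_output.shape)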

99
tank/tf/minilm_tf_test.py Normal file
View File

@@ -0,0 +1,99 @@
from shark.shark_inference import SharkInference
from shark.iree_utils import check_device_drivers
from tank.model_utils_tf import get_TFhf_model, compare_tensors_tf
import tensorflow as tf
import unittest
import numpy as np
import pytest
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
#Create a set of 2-dimensional inputs
tf_bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32)
]
class MiniLMTFModuleTester:
def create_and_check_module(self, dynamic, device):
model, input, act_out = get_TFhf_model(
"microsoft/MiniLM-L12-H384-uncased")
shark_module = SharkInference(model, (input,),
device=device,
dynamic=dynamic,
jit_trace=True)
shark_module.set_frontend("tensorflow")
shark_module.compile()
        results = shark_module.forward(input)
        assert compare_tensors_tf(act_out, results)
class MiniLMTFModuleTest(unittest.TestCase):
def setUp(self):
self.module_tester = MiniLMTFModuleTester()
@pytest.mark.skip(reason="TF testing temporarily unavailable.")
def test_module_static_cpu(self):
dynamic = False
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skip(reason="TF testing temporarily unavailable.")
@pytest.mark.xfail(
reason="Language models currently failing for dynamic case")
def test_module_dynamic_cpu(self):
dynamic = True
device = "cpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skip(reason="TF testing temporarily unavailable.")
@pytest.mark.skipif(check_device_drivers("gpu"),
reason="nvidia-smi not found")
def test_module_static_gpu(self):
dynamic = False
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skip(reason="TF testing temporarily unavailable.")
@pytest.mark.xfail(
reason="Language models currently failing for dynamic case")
@pytest.mark.skipif(check_device_drivers("gpu"),
reason="nvidia-smi not found")
def test_module_dynamic_gpu(self):
dynamic = True
device = "gpu"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skip(reason="TF testing temporarily unavailable.")
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason=
"vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_static_vulkan(self):
dynamic = False
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
@pytest.mark.skip(reason="TF testing temporarily unavailable.")
@pytest.mark.xfail(
reason="Language models currently failing for dynamic case")
@pytest.mark.skipif(
check_device_drivers("vulkan"),
reason=
"vulkaninfo not found, install from https://github.com/KhronosGroup/MoltenVK/releases"
)
def test_module_dynamic_vulkan(self):
dynamic = True
device = "vulkan"
self.module_tester.create_and_check_module(dynamic, device)
if __name__ == '__main__':
unittest.main()

70
tank/tf/seq_classification.py Executable file
View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
import tensorflow as tf
from shark.shark_inference import SharkInference
from shark.parser import shark_args
import argparse
import os
seq_parser = argparse.ArgumentParser(description='Shark Sequence Classification.')
seq_parser.add_argument(
"--hf_model_name",
type=str,
default="bert-base-uncased",
help="Hugging face model to run sequence classification.")
seq_args, unknown = seq_parser.parse_known_args()
BATCH_SIZE = 1
MAX_SEQUENCE_LENGTH = 16
# Create the set of input signatures.
inputs_signature = [
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
]
# For supported models please see here:
# https://huggingface.co/docs/transformers/model_doc/auto#transformers.TFAutoModelForSequenceClassification
def preprocess_input(text="This is just used to compile the model"):
tokenizer = AutoTokenizer.from_pretrained(seq_args.hf_model_name)
inputs = tokenizer(text,
padding="max_length",
return_tensors="tf",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH)
return inputs
class SeqClassification(tf.Module):
def __init__(self, model_name):
super(SeqClassification, self).__init__()
self.m = TFAutoModelForSequenceClassification.from_pretrained(
model_name, output_attentions=False, num_labels=2)
self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)[0]
@tf.function(input_signature=inputs_signature)
def forward(self, input_ids, attention_mask):
return tf.math.softmax(self.m.predict(input_ids, attention_mask),
axis=-1)
if __name__ == "__main__":
inputs = preprocess_input()
shark_module = SharkInference(
SeqClassification(seq_args.hf_model_name),
(inputs["input_ids"], inputs["attention_mask"]))
shark_module.set_frontend("tensorflow")
shark_module.compile()
print(f"Model has been successfully compiled on {shark_args.device}")
while True:
input_text = input("Enter the text to classify (press q or nothing to exit): ")
if not input_text or input_text == "q":
break
inputs = preprocess_input(input_text)
print(shark_module.forward((inputs["input_ids"], inputs["attention_mask"])))
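
As a quick reference for what `SeqClassification.forward` computes, here is a minimal sketch using plain transformers/TensorFlow without SHARK; it mirrors the wrapper above (default `bert-base-uncased`, two labels, max length 16). The input sentence is only an illustration, and the classification head is untrained here, so the printed probabilities are arbitrary — this just shows the tensor plumbing:

```python
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Same defaults as the script above: bert-base-uncased, two labels, length 16.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", output_attentions=False, num_labels=2)

inputs = tokenizer("an illustrative sentence to classify",
                   padding="max_length",
                   truncation=True,
                   max_length=16,
                   return_tensors="tf")

# Equivalent to SeqClassification.forward: logits -> softmax over two labels.
logits = model(input_ids=inputs["input_ids"],
               attention_mask=inputs["attention_mask"])[0]
print(tf.math.softmax(logits, axis=-1))  # shape [1, 2] class probabilities
```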

2
tank/tflite/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
tmp/
.lit_test_times.txt

15
tank/tflite/README.md Normal file
View File

@@ -0,0 +1,15 @@
# Sample compilation and execution of TFLite models

This directory contains test scripts that compile, run, and compare various
TFLite models from TFHub. The scripts aim for simplicity and hackability.

Follow the instructions at the repository root to set up a working Python
venv; you can then run the individual Python files directly.

Alternatively, use something like the following to collect all artifacts and
traces, which can then be fed to other tools:
```
export IREE_SAVE_TEMPS="/tmp/iree/models/{main}/{id}"
for i in *.py; do export IREE_SAVE_CALLS=/tmp/iree/traces/$i; python $i; done
```
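
For reference, the per-model scripts in this directory all follow the same pattern: point `model_path` at a `?lite-format=tflite` URL, subclass `test_util.TFLiteModelTest` (the helper module in this directory), optionally override `generate_inputs` and/or `compare_results`, and call `compile_and_execute()`. A minimal sketch of that pattern — the URL, class name, and tolerance below are placeholders, not one of the files added in this change:

```python
# RUN: %PYTHON %s
import absl.testing
import numpy
import test_util

# Placeholder URL -- substitute any TFLite model hosted on TFHub.
model_path = "https://tfhub.dev/<publisher>/lite-model/<model>/1?lite-format=tflite"


class ExampleModelTest(test_util.TFLiteModelTest):
    def __init__(self, *args, **kwargs):
        super(ExampleModelTest, self).__init__(model_path, *args, **kwargs)

    def compare_results(self, iree_results, tflite_results, details):
        # Run the base comparison, then add a per-model tolerance check.
        super(ExampleModelTest, self).compare_results(iree_results,
                                                      tflite_results, details)
        self.assertTrue(
            numpy.isclose(iree_results[0], tflite_results[0], atol=1e-3).all())

    def test_compile_tflite(self):
        self.compile_and_execute()


if __name__ == '__main__':
    absl.testing.absltest.main()
```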

44
tank/tflite/albert.py Executable file
View File

@@ -0,0 +1,44 @@
# RUN: %PYTHON %s
import numpy as np
from shark.shark_importer import SharkImporter
import pytest
model_path = "https://tfhub.dev/tensorflow/lite-model/albert_lite_base/squadv1/1?lite-format=tflite"
# Inputs modified to be meaningful ALBERT inputs.
def generate_inputs(input_details):
for input in input_details:
print(str(input["shape"]), input["dtype"].__name__)
args = []
args.append(
np.random.randint(low=0,
high=256,
size=input_details[0]["shape"],
dtype=input_details[0]["dtype"]))
args.append(
np.ones(shape=input_details[1]["shape"],
dtype=input_details[1]["dtype"]))
args.append(
np.zeros(shape=input_details[2]["shape"],
dtype=input_details[2]["dtype"]))
return args
if __name__ == '__main__':
my_shark_importer = SharkImporter(model_path=model_path,
model_type="tflite",
model_source_hub="tfhub",
device="cpu",
dynamic=False,
jit_trace=True)
# Case1: Use default inputs
my_shark_importer.compile()
shark_results = my_shark_importer.forward()
# Case2: Use manually set inputs
input_details, output_details = my_shark_importer.get_model_details()
inputs = generate_inputs(input_details) # device_inputs
my_shark_importer.compile(inputs)
shark_results = my_shark_importer.forward(inputs)
# print(shark_results)

View File

@@ -0,0 +1,22 @@
# RUN: %PYTHON %s
# XFAIL: *
import absl.testing
import test_util
model_path = "https://tfhub.dev/neso613/lite-model/ASR_TFLite/pre_trained_models/English/1?lite-format=tflite"
# Failure is due to dynamic shapes:
# - Some improvements to the tfl.strided_slice lowering are the next steps.
class AsrConformerTest(test_util.TFLiteModelTest):
def __init__(self, *args, **kwargs):
super(AsrConformerTest, self).__init__(model_path, *args, **kwargs)
def test_compile_tflite(self):
self.compile_and_execute()
if __name__ == '__main__':
absl.testing.absltest.main()

View File

@@ -0,0 +1,39 @@
# RUN: %PYTHON %s
import absl.testing
import numpy
import test_util
import urllib.request
from PIL import Image
model_path = "https://tfhub.dev/google/lite-model/aiy/vision/classifier/birds_V1/3?lite-format=tflite"
class BirdClassifierTest(test_util.TFLiteModelTest):
def __init__(self, *args, **kwargs):
super(BirdClassifierTest, self).__init__(model_path, *args, **kwargs)
def compare_results(self, iree_results, tflite_results, details):
super(BirdClassifierTest, self).compare_results(iree_results,
tflite_results, details)
self.assertTrue(
numpy.isclose(iree_results[0], tflite_results[0], atol=1e-3).all())
def generate_inputs(self, input_details):
img_path = "https://github.com/google-coral/test_data/raw/master/bird.bmp"
local_path = "/".join([self.workdir, "bird.bmp"])
urllib.request.urlretrieve(img_path, local_path)
shape = input_details[0]["shape"]
im = numpy.array(Image.open(local_path).resize((shape[1], shape[2])))
args = [im.reshape(shape)]
return args
def test_compile_tflite(self):
self.compile_and_execute()
if __name__ == '__main__':
absl.testing.absltest.main()

View File

@@ -0,0 +1,20 @@
# RUN: %PYTHON %s
# REQUIRES: hugetest
import absl.testing
import test_util
model_path = "https://tfhub.dev/sayakpaul/lite-model/cartoongan/dr/1?lite-format=tflite"
class CartoonGanTest(test_util.TFLiteModelTest):
def __init__(self, *args, **kwargs):
super(CartoonGanTest, self).__init__(model_path, *args, **kwargs)
def test_compile_tflite(self):
self.compile_and_execute()
if __name__ == '__main__':
absl.testing.absltest.main()

View File

@@ -0,0 +1,16 @@
import numpy as np
import urllib.request
from PIL import Image
# Returns a sample image from the COCO 2017 dataset as uint8.
def generate_input(workdir, input_details):
# We use an image of a bear since this is an easy example.
img_path = "https://storage.googleapis.com/iree-model-artifacts/coco_2017_000000000285.jpg"
local_path = "/".join([workdir, "coco_2017_000000000285.jpg"])
urllib.request.urlretrieve(img_path, local_path)
shape = input_details[0]["shape"]
im = np.array(Image.open(local_path).resize((shape[1], shape[2])))
return im.reshape(shape)

View File

@@ -0,0 +1,26 @@
# RUN: %PYTHON %s
# XFAIL: *
import absl.testing
import test_util
model_path = "https://tfhub.dev/tulasiram58827/lite-model/craft-text-detector/dr/1?lite-format=tflite"
# Failure: Resize lowering does not handle inferred dynamic shapes. Furthermore, the entire model
# requires dynamic shape support.
class CraftTextTest(test_util.TFLiteModelTest):
def __init__(self, *args, **kwargs):
super(CraftTextTest, self).__init__(model_path, *args, **kwargs)
def compare_results(self, iree_results, tflite_results, details):
super(CraftTextTest, self).compare_results(iree_results, tflite_results,
details)
def test_compile_tflite(self):
self.compile_and_execute()
if __name__ == '__main__':
absl.testing.absltest.main()

View File

@@ -0,0 +1,26 @@
# RUN: %PYTHON %s
import absl.testing
import numpy
import test_util
model_path = "https://tfhub.dev/tensorflow/lite-model/deeplabv3/1/metadata/2?lite-format=tflite"
class DeepLabV3Test(test_util.TFLiteModelTest):
def __init__(self, *args, **kwargs):
super(DeepLabV3Test, self).__init__(model_path, *args, **kwargs)
def compare_results(self, iree_results, tflite_results, details):
super(DeepLabV3Test, self).compare_results(iree_results, tflite_results,
details)
self.assertTrue(
numpy.isclose(iree_results[0], tflite_results[0], atol=1e-3).all())
def test_compile_tflite(self):
self.compile_and_execute()
if __name__ == '__main__':
absl.testing.absltest.main()

View File

@@ -0,0 +1,26 @@
# RUN: %PYTHON %s
import absl.testing
import numpy
import test_util
model_path = "https://tfhub.dev/tensorflow/lite-model/densenet/1/metadata/1?lite-format=tflite"
class DenseNetTest(test_util.TFLiteModelTest):
def __init__(self, *args, **kwargs):
super(DenseNetTest, self).__init__(model_path, *args, **kwargs)
def compare_results(self, iree_results, tflite_results, details):
super(DenseNetTest, self).compare_results(iree_results, tflite_results,
details)
self.assertTrue(
numpy.isclose(iree_results[0], tflite_results[0], atol=1e-5).all())
def test_compile_tflite(self):
self.compile_and_execute()
if __name__ == '__main__':
absl.testing.absltest.main()

View File

@@ -0,0 +1,35 @@
# RUN: %PYTHON %s
import absl.testing
import numpy
import test_util
model_path = "https://tfhub.dev/sayakpaul/lite-model/east-text-detector/dr/1?lite-format=tflite"
class EastTextDetectorTest(test_util.TFLiteModelTest):
def __init__(self, *args, **kwargs):
super(EastTextDetectorTest, self).__init__(model_path, *args, **kwargs)
def compare_results(self, iree_results, tflite_results, details):
super(EastTextDetectorTest,
self).compare_results(iree_results, tflite_results, details)
self.assertTrue(
numpy.isclose(iree_results[0], tflite_results[0], atol=1e-3).all())
# The second output is extremely noisy since it is not a binary classification. To handle
# this, we check the normalized correlation with an expectation of "close enough".
iree_norm = numpy.sqrt(iree_results[1] * iree_results[1])
tflite_norm = numpy.sqrt(tflite_results[1] * tflite_results[1])
correlation = numpy.average(iree_results[1] * tflite_results[1] /
iree_norm / tflite_norm)
self.assertTrue(numpy.isclose(correlation, 1.0, atol=1e-2).all())
def test_compile_tflite(self):
self.compile_and_execute()
if __name__ == '__main__':
absl.testing.absltest.main()

View File

@@ -0,0 +1,39 @@
# RUN: %PYTHON %s
import absl.testing
import imagenet_test_data
import numpy
import test_util
# Source https://tfhub.dev/tensorflow/lite-model/efficientnet/lite0/int8/2
model_path = "https://storage.googleapis.com/iree-model-artifacts/efficientnet_lite0_int8_2.tflite"
class EfficientnetLite0Int8Test(test_util.TFLiteModelTest):
def __init__(self, *args, **kwargs):
super(EfficientnetLite0Int8Test, self).__init__(model_path, *args,
**kwargs)
def compare_results(self, iree_results, tflite_results, details):
super(EfficientnetLite0Int8Test,
self).compare_results(iree_results, tflite_results, details)
# Dequantize outputs.
zero_point = details[0]['quantization_parameters']['zero_points'][0]
scale = details[0]['quantization_parameters']['scales'][0]
dequantized_iree_results = (iree_results - zero_point) * scale
dequantized_tflite_results = (tflite_results - zero_point) * scale
self.assertTrue(
numpy.isclose(dequantized_iree_results,
dequantized_tflite_results,
atol=5e-3).all())
def generate_inputs(self, input_details):
return [imagenet_test_data.generate_input(self.workdir, input_details)]
def test_compile_tflite(self):
self.compile_and_execute()
if __name__ == '__main__':
absl.testing.absltest.main()
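
The `compare_results` override above dequantizes both outputs with the usual affine mapping, `real_value = (quantized_value - zero_point) * scale`, using the zero point and scale reported in the output tensor details. A quick numeric sketch with made-up int8 parameters:

```python
import numpy as np

scale, zero_point = 0.00390625, 128              # illustrative int8 parameters
quantized = np.array([128, 192, 64], dtype=np.int32)

# Affine dequantization, as in compare_results above.
dequantized = (quantized - zero_point) * scale
print(dequantized)                               # [ 0.    0.25 -0.25]
```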

Some files were not shown because too many files have changed in this diff.