mirror of
https://github.com/nod-ai/SHARK-Studio.git
synced 2026-01-12 07:18:27 -05:00
Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c9a310842d | ||
|
|
67bdfda58c |
43
.github/workflows/nightly.yml
vendored
43
.github/workflows/nightly.yml
vendored
@@ -61,30 +61,8 @@ jobs:
|
||||
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude shark.venv,lit.cfg.py
|
||||
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
|
||||
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude shark.venv,lit.cfg.py
|
||||
- name: Build and validate the IREE package
|
||||
run: |
|
||||
cd $GITHUB_WORKSPACE
|
||||
USE_IREE=1 VENV_DIR=iree.venv ./setup_venv.sh
|
||||
source iree.venv/bin/activate
|
||||
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
|
||||
SHARK_PACKAGE_VERSION=${package_version} \
|
||||
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://github.com/llvm/torch-mlir/releases -f https://github.com/iree-org/iree/releases
|
||||
# Install the built wheel
|
||||
pip install ./wheelhouse/nodai*
|
||||
# Validate the Models
|
||||
/bin/bash "$GITHUB_WORKSPACE/build_tools/populate_sharktank_ci.sh"
|
||||
pytest -k 'cpu' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
|
||||
tail -n 1 |
|
||||
tee -a pytest_results.txt
|
||||
pytest -k 'gpu' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
|
||||
tail -n 1 |
|
||||
tee -a pytest_results.txt
|
||||
pytest -k 'vulkan' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
|
||||
tail -n 1 |
|
||||
tee -a pytest_results.txt
|
||||
rm -rf ./wheelhouse/nodai*
|
||||
|
||||
- name: Build and validate the SHARK Runtime package
|
||||
- name: Build and validate the package
|
||||
run: |
|
||||
cd $GITHUB_WORKSPACE
|
||||
./setup_venv.sh
|
||||
@@ -95,24 +73,7 @@ jobs:
|
||||
# Install the built wheel
|
||||
pip install ./wheelhouse/nodai*
|
||||
# Validate the Models
|
||||
pytest -k 'cpu' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
|
||||
tail -n 1 |
|
||||
tee -a pytest_results.txt
|
||||
pytest -k 'gpu' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
|
||||
tail -n 1 |
|
||||
tee -a pytest_results.txt
|
||||
pytest -k 'vulkan' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/ |
|
||||
tail -n 1 |
|
||||
tee -a pytest_results.txt
|
||||
if !(grep -Fxq " failed" pytest_results.txt)
|
||||
then
|
||||
export SHA=$(git log -1 --format='%h')
|
||||
gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/$SHA
|
||||
gsutil -m cp -r gs://shark_tank/$SHA/* gs://shark_tank/latest/
|
||||
fi
|
||||
rm pytest_results.txt
|
||||
rm -rf ./wheelhouse/nodai*
|
||||
|
||||
pytest -k 'not benchmark' --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py --ignore=shark/tests/test_shark_importer.py --ignore=tank/tf/
|
||||
|
||||
- name: Upload Release Assets
|
||||
id: upload-release-assets
|
||||
|
||||
33
.github/workflows/test-models.yml
vendored
33
.github/workflows/test-models.yml
vendored
@@ -15,12 +15,14 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: true
|
||||
matrix:
|
||||
os: [a100, MacStudio, ubuntu-latest]
|
||||
os: [a100, MI100, MacStudio, ubuntu-latest]
|
||||
suite: [cpu,gpu,vulkan]
|
||||
python-version: ["3.10"]
|
||||
include:
|
||||
- os: ubuntu-latest
|
||||
suite: lint
|
||||
- os: MI100
|
||||
suite: rocm
|
||||
exclude:
|
||||
- os: ubuntu-latest
|
||||
suite: vulkan
|
||||
@@ -30,21 +32,19 @@ jobs:
|
||||
suite: cpu
|
||||
- os: MacStudio
|
||||
suite: gpu
|
||||
- os: MacStudio
|
||||
suite: cpu
|
||||
- os: MacStudio
|
||||
suite: vulkan
|
||||
- os: MI100
|
||||
suite: gpu
|
||||
- os: MI100
|
||||
suite: vulkan
|
||||
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Set Environment Variables
|
||||
run: |
|
||||
echo "SHORT_SHA=`git rev-parse --short=4 HEAD`" >> $GITHUB_ENV
|
||||
echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
|
||||
|
||||
- name: Set up Python Version File ${{ matrix.python-version }}
|
||||
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest'
|
||||
run: |
|
||||
@@ -82,18 +82,17 @@ jobs:
|
||||
if: matrix.suite == 'cpu'
|
||||
run: |
|
||||
cd $GITHUB_WORKSPACE
|
||||
PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
|
||||
PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
|
||||
source shark.venv/bin/activate
|
||||
pytest -k 'cpu' --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py
|
||||
|
||||
- name: Validate GPU Models
|
||||
- name: Validate GPU/CUDA Models
|
||||
if: matrix.suite == 'gpu'
|
||||
run: |
|
||||
cd $GITHUB_WORKSPACE
|
||||
PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
|
||||
PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
|
||||
source shark.venv/bin/activate
|
||||
pytest --benchmark -k "gpu" --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py
|
||||
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_gpu_${SHORT_SHA}.csv
|
||||
pytest -k "gpu" --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py
|
||||
|
||||
- name: Validate Vulkan Models
|
||||
if: matrix.suite == 'vulkan'
|
||||
@@ -102,3 +101,11 @@ jobs:
|
||||
PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
|
||||
source shark.venv/bin/activate
|
||||
pytest -k 'vulkan' --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py
|
||||
|
||||
- name: Validate GPU/ROCM Models
|
||||
if: matrix.suite == 'rocm'
|
||||
run: |
|
||||
cd $GITHUB_WORKSPACE
|
||||
PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
|
||||
source shark.venv/bin/activate
|
||||
pytest -k 'rocm' --ignore=shark/tests/test_shark_importer.py --ignore=benchmarks/tests/test_hf_benchmark.py --ignore=benchmarks/tests/test_benchmark.py
|
||||
|
||||
28
README.md
28
README.md
@@ -121,40 +121,14 @@ pytest tank/<MODEL_NAME> -k "keyword"
|
||||
```
|
||||
|
||||
### Run benchmarks on SHARK tank pytests and generate bench_results.csv with results.
|
||||
|
||||
|
||||
(requires source installation with `IMPORTER=1 ./setup_venv.sh`)
|
||||
|
||||
```shell
|
||||
pytest --benchmark tank
|
||||
|
||||
# Just do static GPU benchmarks for PyTorch tests:
|
||||
pytest --benchmark tank --ignore-glob="_tf*" -k "static_gpu"
|
||||
```
|
||||
|
||||
### Benchmark Resnet50, MiniLM on CPU
|
||||
|
||||
(requires source installation with `IMPORTER=1 ./setup_venv.sh`)
|
||||
|
||||
```shell
|
||||
# We suggest running the following commands as root before running benchmarks on CPU:
|
||||
|
||||
cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | awk -F, '{print $2}' | sort -n | uniq | ( while read X ; do echo $X ; echo 0 > /sys/devices/system/cpu/cpu$X/online ; done )
|
||||
echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
|
||||
|
||||
# Benchmark canonical Resnet50 on CPU via pytest
|
||||
pytest --benchmark tank/resnet50/ -k "cpu"
|
||||
|
||||
# Benchmark canonical MiniLM on CPU via pytest
|
||||
pytest --benchmark tank/MiniLM-L12-H384-uncased/ -k "cpu"
|
||||
|
||||
# Benchmark MiniLM on CPU via transformer-benchmarks:
|
||||
git clone --recursive https://github.com/nod-ai/transformer-benchmarks.git
|
||||
cd transformer-benchmarks
|
||||
./perf-ci.sh -n
|
||||
# Check detail.csv for MLIR/IREE results.
|
||||
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
IMPORTER=1 ./setup_venv.sh
|
||||
source $GITHUB_WORKSPACE/shark.venv/bin/activate
|
||||
python generate_sharktank.py --upload=False
|
||||
19
conftest.py
19
conftest.py
@@ -1,18 +1,5 @@
|
||||
def pytest_addoption(parser):
|
||||
# Attaches SHARK command-line arguments to the pytest machinery.
|
||||
parser.addoption(
|
||||
"--benchmark",
|
||||
action="store_true",
|
||||
default="False",
|
||||
help="Pass option to benchmark and write results.csv",
|
||||
)
|
||||
parser.addoption(
|
||||
"--onnx_bench",
|
||||
action="store_true",
|
||||
default="False",
|
||||
help="Add ONNX benchmark results to pytest benchmarks.",
|
||||
)
|
||||
# The following options are deprecated and pending removal.
|
||||
parser.addoption(
|
||||
"--save_mlir",
|
||||
action="store_true",
|
||||
@@ -25,6 +12,12 @@ def pytest_addoption(parser):
|
||||
default="False",
|
||||
help="Pass option to save IREE output .vmfb",
|
||||
)
|
||||
parser.addoption(
|
||||
"--benchmark",
|
||||
action="store_true",
|
||||
default="False",
|
||||
help="Pass option to benchmark and write results.csv",
|
||||
)
|
||||
parser.addoption(
|
||||
"--save_temps",
|
||||
action="store_true",
|
||||
|
||||
@@ -13,7 +13,6 @@ import csv
|
||||
import argparse
|
||||
from shark.shark_importer import SharkImporter
|
||||
import tensorflow as tf
|
||||
import subprocess as sp
|
||||
import hashlib
|
||||
import numpy as np
|
||||
|
||||
@@ -94,12 +93,8 @@ def save_torch_model(torch_model_list):
|
||||
|
||||
|
||||
def save_tf_model(tf_model_list):
|
||||
from tank.model_utils_tf import (
|
||||
get_causal_image_model,
|
||||
get_causal_lm_model,
|
||||
get_keras_model,
|
||||
get_TFhf_model,
|
||||
)
|
||||
from tank.model_utils_tf import get_causal_lm_model
|
||||
from tank.model_utils_tf import get_causal_image_model
|
||||
|
||||
with open(tf_model_list) as csvfile:
|
||||
tf_reader = csv.reader(csvfile, delimiter=",")
|
||||
@@ -110,15 +105,11 @@ def save_tf_model(tf_model_list):
|
||||
|
||||
model = None
|
||||
input = None
|
||||
print(f"Generating artifacts for model {tf_model_name}")
|
||||
print(model_type)
|
||||
if model_type == "hf":
|
||||
model, input, _ = get_causal_lm_model(tf_model_name)
|
||||
if model_type == "img":
|
||||
model, input, _ = get_causal_image_model(tf_model_name)
|
||||
if model_type == "keras":
|
||||
model, input, _ = get_keras_model(tf_model_name)
|
||||
if model_type == "TFhf":
|
||||
model, input, _ = get_TFhf_model(tf_model_name)
|
||||
|
||||
tf_model_name = tf_model_name.replace("/", "_")
|
||||
tf_model_dir = os.path.join(WORKDIR, str(tf_model_name) + "_tf")
|
||||
@@ -228,8 +219,5 @@ if __name__ == "__main__":
|
||||
save_tflite_model(args.tflite_model_csv)
|
||||
|
||||
if args.upload:
|
||||
git_hash = sp.getoutput("git log -1 --format='%h'") + "/"
|
||||
print("uploading files to gs://shark_tank/" + git_hash)
|
||||
os.system(
|
||||
"gsutil cp -r ./gen_shark_tank/* gs://shark_tank/" + git_hash
|
||||
)
|
||||
print("uploading files to gs://shark_tank/")
|
||||
os.system("gsutil cp -r ./gen_shark_tank/* gs://shark_tank/")
|
||||
|
||||
@@ -1,109 +0,0 @@
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.iree_utils._common import check_device_drivers, device_driver_info
|
||||
from tank.model_utils import compare_tensors
|
||||
from shark.shark_downloader import download_torch_model
|
||||
from shark.parser import shark_args
|
||||
|
||||
import torch
|
||||
import unittest
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
|
||||
class BertBaseUncasedModuleTester:
|
||||
def __init__(
|
||||
self,
|
||||
benchmark=False,
|
||||
onnx_bench=False,
|
||||
):
|
||||
self.benchmark = benchmark
|
||||
self.onnx_bench = onnx_bench
|
||||
|
||||
def create_and_check_module(self, dynamic, device):
|
||||
model_mlir, func_name, input, act_out = download_torch_model(
|
||||
"bert-base-uncased", dynamic
|
||||
)
|
||||
|
||||
shark_module = SharkInference(
|
||||
model_mlir,
|
||||
func_name,
|
||||
device=device,
|
||||
mlir_dialect="linalg",
|
||||
is_benchmark=self.benchmark,
|
||||
)
|
||||
shark_module.compile()
|
||||
results = shark_module.forward(input)
|
||||
assert True == compare_tensors(act_out, results)
|
||||
|
||||
if self.benchmark == True:
|
||||
shark_args.onnx_bench = self.onnx_bench
|
||||
shark_module.shark_runner.benchmark_all_csv(
|
||||
(input),
|
||||
"bert-base-uncased",
|
||||
dynamic,
|
||||
device,
|
||||
"torch",
|
||||
)
|
||||
|
||||
|
||||
class BertBaseUncasedModuleTest(unittest.TestCase):
|
||||
@pytest.fixture(autouse=True)
|
||||
def configure(self, pytestconfig):
|
||||
self.module_tester = BertBaseUncasedModuleTester(self)
|
||||
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
|
||||
self.module_tester.onnx_bench = pytestconfig.getoption("onnx_bench")
|
||||
|
||||
def test_module_static_cpu(self):
|
||||
dynamic = False
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
def test_module_dynamic_cpu(self):
|
||||
dynamic = True
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
def test_module_static_gpu(self):
|
||||
dynamic = False
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
def test_module_dynamic_gpu(self):
|
||||
dynamic = True
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
def test_module_static_vulkan(self):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
def test_module_dynamic_vulkan(self):
|
||||
dynamic = True
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -1,95 +0,0 @@
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.iree_utils._common import check_device_drivers, device_driver_info
|
||||
from tank.model_utils import compare_tensors
|
||||
from shark.parser import shark_args
|
||||
from shark.shark_downloader import download_torch_model
|
||||
|
||||
import unittest
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
|
||||
class DistilBertModuleTester:
|
||||
def __init__(
|
||||
self,
|
||||
benchmark=False,
|
||||
):
|
||||
self.benchmark = benchmark
|
||||
|
||||
def create_and_check_module(self, dynamic, device):
|
||||
model_mlir, func_name, input, act_out = download_torch_model(
|
||||
"distilbert-base-uncased", dynamic
|
||||
)
|
||||
|
||||
# from shark.shark_importer import SharkImporter
|
||||
# mlir_importer = SharkImporter(
|
||||
# model,
|
||||
# (input,),
|
||||
# frontend="torch",
|
||||
# )
|
||||
# minilm_mlir, func_name = mlir_importer.import_mlir(
|
||||
# is_dynamic=dynamic, tracing_required=True
|
||||
# )
|
||||
|
||||
shark_module = SharkInference(
|
||||
model_mlir,
|
||||
func_name,
|
||||
device=device,
|
||||
mlir_dialect="linalg",
|
||||
is_benchmark=self.benchmark,
|
||||
)
|
||||
shark_module.compile()
|
||||
results = shark_module.forward(input)
|
||||
assert True == compare_tensors(act_out, results)
|
||||
|
||||
if self.benchmark == True:
|
||||
shark_module.shark_runner.benchmark_all_csv(
|
||||
(input),
|
||||
"distilbert-base-uncased",
|
||||
dynamic,
|
||||
device,
|
||||
"torch",
|
||||
)
|
||||
|
||||
|
||||
class DistilBertModuleTest(unittest.TestCase):
|
||||
@pytest.fixture(autouse=True)
|
||||
def configure(self, pytestconfig):
|
||||
self.module_tester = DistilBertModuleTester(self)
|
||||
self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
|
||||
self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
|
||||
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
|
||||
|
||||
def test_module_static_cpu(self):
|
||||
dynamic = False
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
def test_module_static_gpu(self):
|
||||
dynamic = False
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
def test_module_static_vulkan(self):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -1,114 +0,0 @@
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.iree_utils._common import check_device_drivers, device_driver_info
|
||||
from shark.shark_downloader import download_torch_model
|
||||
|
||||
import unittest
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
|
||||
class MobileNetV3ModuleTester:
|
||||
def __init__(
|
||||
self,
|
||||
benchmark=False,
|
||||
):
|
||||
self.benchmark = benchmark
|
||||
|
||||
def create_and_check_module(self, dynamic, device):
|
||||
model_mlir, func_name, input, act_out = download_torch_model(
|
||||
"mobilenet_v3_small", dynamic
|
||||
)
|
||||
|
||||
# from shark.shark_importer import SharkImporter
|
||||
# mlir_importer = SharkImporter(
|
||||
# model,
|
||||
# (input,),
|
||||
# frontend="torch",
|
||||
# )
|
||||
# minilm_mlir, func_name = mlir_importer.import_mlir(
|
||||
# is_dynamic=dynamic, tracing_required=True
|
||||
# )
|
||||
|
||||
shark_module = SharkInference(
|
||||
model_mlir,
|
||||
func_name,
|
||||
device=device,
|
||||
mlir_dialect="linalg",
|
||||
is_benchmark=self.benchmark,
|
||||
)
|
||||
shark_module.compile()
|
||||
results = shark_module.forward(input)
|
||||
np.testing.assert_allclose(act_out, results, rtol=1e-02, atol=1e-03)
|
||||
|
||||
if self.benchmark == True:
|
||||
shark_module.shark_runner.benchmark_all_csv(
|
||||
(input),
|
||||
"alexnet",
|
||||
dynamic,
|
||||
device,
|
||||
"torch",
|
||||
)
|
||||
|
||||
|
||||
class MobileNetV3ModuleTest(unittest.TestCase):
|
||||
@pytest.fixture(autouse=True)
|
||||
def configure(self, pytestconfig):
|
||||
self.module_tester = MobileNetV3ModuleTester(self)
|
||||
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
|
||||
|
||||
def test_module_static_cpu(self):
|
||||
dynamic = False
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
def test_module_dynamic_cpu(self):
|
||||
dynamic = True
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.xfail(reason="golden results don't match.")
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
def test_module_static_gpu(self):
|
||||
dynamic = False
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.xfail(reason="golden results don't match.")
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
def test_module_dynamic_gpu(self):
|
||||
dynamic = True
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.xfail(reason="stuck in the pipeline.")
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
def test_module_static_vulkan(self):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
def test_module_dynamic_vulkan(self):
|
||||
dynamic = True
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -1,114 +0,0 @@
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.iree_utils._common import check_device_drivers, device_driver_info
|
||||
from tank.model_utils import compare_tensors
|
||||
from shark.shark_downloader import download_torch_model
|
||||
|
||||
import unittest
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
|
||||
class Resnet101ModuleTester:
|
||||
def __init__(
|
||||
self,
|
||||
benchmark=False,
|
||||
):
|
||||
self.benchmark = benchmark
|
||||
|
||||
def create_and_check_module(self, dynamic, device):
|
||||
model_mlir, func_name, input, act_out = download_torch_model(
|
||||
"resnet101", dynamic
|
||||
)
|
||||
|
||||
# from shark.shark_importer import SharkImporter
|
||||
# mlir_importer = SharkImporter(
|
||||
# model,
|
||||
# (input,),
|
||||
# frontend="torch",
|
||||
# )
|
||||
# minilm_mlir, func_name = mlir_importer.import_mlir(
|
||||
# is_dynamic=dynamic, tracing_required=True
|
||||
# )
|
||||
|
||||
shark_module = SharkInference(
|
||||
model_mlir,
|
||||
func_name,
|
||||
device=device,
|
||||
mlir_dialect="linalg",
|
||||
is_benchmark=self.benchmark,
|
||||
)
|
||||
shark_module.compile()
|
||||
results = shark_module.forward(input)
|
||||
assert True == compare_tensors(act_out, results)
|
||||
|
||||
if self.benchmark == True:
|
||||
shark_module.shark_runner.benchmark_all_csv(
|
||||
(input),
|
||||
"resnet101",
|
||||
dynamic,
|
||||
device,
|
||||
"torch",
|
||||
)
|
||||
|
||||
|
||||
class Resnet101ModuleTest(unittest.TestCase):
|
||||
@pytest.fixture(autouse=True)
|
||||
def configure(self, pytestconfig):
|
||||
self.module_tester = Resnet101ModuleTester(self)
|
||||
self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
|
||||
self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
|
||||
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
|
||||
|
||||
def test_module_static_cpu(self):
|
||||
dynamic = False
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
def test_module_dynamic_cpu(self):
|
||||
dynamic = True
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
def test_module_static_gpu(self):
|
||||
dynamic = False
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
def test_module_dynamic_gpu(self):
|
||||
dynamic = True
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
def test_module_static_vulkan(self):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
def test_module_dynamic_vulkan(self):
|
||||
dynamic = True
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -1,114 +0,0 @@
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.iree_utils._common import check_device_drivers, device_driver_info
|
||||
from tank.model_utils import get_vision_model, compare_tensors
|
||||
from shark.shark_downloader import download_torch_model
|
||||
|
||||
import unittest
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
|
||||
class Resnet50ModuleTester:
|
||||
def __init__(
|
||||
self,
|
||||
benchmark=False,
|
||||
):
|
||||
self.benchmark = benchmark
|
||||
|
||||
def create_and_check_module(self, dynamic, device):
|
||||
model_mlir, func_name, input, act_out = download_torch_model(
|
||||
"resnet50", dynamic
|
||||
)
|
||||
|
||||
# from shark.shark_importer import SharkImporter
|
||||
# mlir_importer = SharkImporter(
|
||||
# model,
|
||||
# (input,),
|
||||
# frontend="torch",
|
||||
# )
|
||||
# minilm_mlir, func_name = mlir_importer.import_mlir(
|
||||
# is_dynamic=dynamic, tracing_required=True
|
||||
# )
|
||||
|
||||
shark_module = SharkInference(
|
||||
model_mlir,
|
||||
func_name,
|
||||
device=device,
|
||||
mlir_dialect="linalg",
|
||||
is_benchmark=self.benchmark,
|
||||
)
|
||||
shark_module.compile()
|
||||
results = shark_module.forward(input)
|
||||
assert True == compare_tensors(act_out, results)
|
||||
|
||||
if self.benchmark == True:
|
||||
shark_module.shark_runner.benchmark_all_csv(
|
||||
(input),
|
||||
"resnet50",
|
||||
dynamic,
|
||||
device,
|
||||
"torch",
|
||||
)
|
||||
|
||||
|
||||
class Resnet50ModuleTest(unittest.TestCase):
|
||||
@pytest.fixture(autouse=True)
|
||||
def configure(self, pytestconfig):
|
||||
self.module_tester = Resnet50ModuleTester(self)
|
||||
self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
|
||||
self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
|
||||
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
|
||||
|
||||
def test_module_static_cpu(self):
|
||||
dynamic = False
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
def test_module_dynamic_cpu(self):
|
||||
dynamic = True
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
def test_module_static_gpu(self):
|
||||
dynamic = False
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
def test_module_dynamic_gpu(self):
|
||||
dynamic = True
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
def test_module_static_vulkan(self):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
def test_module_dynamic_vulkan(self):
|
||||
dynamic = True
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -1,91 +0,0 @@
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.iree_utils._common import check_device_drivers, device_driver_info
|
||||
from shark.shark_downloader import download_torch_model
|
||||
|
||||
import unittest
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
|
||||
class UnetModuleTester:
|
||||
def __init__(
|
||||
self,
|
||||
benchmark=False,
|
||||
):
|
||||
self.benchmark = benchmark
|
||||
|
||||
def create_and_check_module(self, dynamic, device):
|
||||
model_mlir, func_name, input, act_out = download_torch_model(
|
||||
"unet", dynamic
|
||||
)
|
||||
|
||||
# from shark.shark_importer import SharkImporter
|
||||
# mlir_importer = SharkImporter(
|
||||
# model,
|
||||
# (input,),
|
||||
# frontend="torch",
|
||||
# )
|
||||
# minilm_mlir, func_name = mlir_importer.import_mlir(
|
||||
# is_dynamic=dynamic, tracing_required=True
|
||||
# )
|
||||
|
||||
shark_module = SharkInference(
|
||||
model_mlir,
|
||||
func_name,
|
||||
device=device,
|
||||
mlir_dialect="linalg",
|
||||
is_benchmark=self.benchmark,
|
||||
)
|
||||
shark_module.compile()
|
||||
results = shark_module.forward(input)
|
||||
np.testing.assert_allclose(act_out, results, rtol=1e-02, atol=1e-03)
|
||||
|
||||
if self.benchmark == True:
|
||||
shark_module.shark_runner.benchmark_all_csv(
|
||||
(input),
|
||||
"unet",
|
||||
dynamic,
|
||||
device,
|
||||
"torch",
|
||||
)
|
||||
|
||||
|
||||
class UnetModuleTest(unittest.TestCase):
|
||||
@pytest.fixture(autouse=True)
|
||||
def configure(self, pytestconfig):
|
||||
self.module_tester = UnetModuleTester(self)
|
||||
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
|
||||
|
||||
def test_module_static_cpu(self):
|
||||
dynamic = False
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
def test_module_static_gpu(self):
|
||||
dynamic = False
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
def test_module_static_vulkan(self):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -118,16 +118,6 @@ if [[ $(uname -s) = 'Linux' && ! -z "${IMPORTER}" ]]; then
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ ! -z "${ONNX}" ]]; then
|
||||
echo "${Yellow}Installing ONNX and onnxruntime for benchmarks..."
|
||||
$PYTHON -m pip install onnx onnxruntime psutil
|
||||
if [ $? -eq 0 ];then
|
||||
echo "Successfully installed ONNX and ONNX runtime."
|
||||
else
|
||||
echo "Could not install ONNX." >&2
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ -z "${CONDA_PREFIX}" ]]; then
|
||||
echo "${Green}Before running examples activate venv with:"
|
||||
echo " ${Green}source $VENV_DIR/bin/activate"
|
||||
|
||||
@@ -1,88 +0,0 @@
|
||||
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
||||
import torch
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.shark_importer import SharkImporter
|
||||
from iree.compiler import compile_str
|
||||
from iree import runtime as ireert
|
||||
import os
|
||||
import numpy as np
|
||||
|
||||
MAX_SEQUENCE_LENGTH = 512
|
||||
BATCH_SIZE = 1
|
||||
|
||||
|
||||
class AlbertModule(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.model = AutoModelForMaskedLM.from_pretrained("albert-base-v2")
|
||||
self.model.eval()
|
||||
|
||||
def forward(self, input_ids, attention_mask):
|
||||
return self.model(
|
||||
input_ids=input_ids, attention_mask=attention_mask
|
||||
).logits
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Prepping Data
|
||||
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
|
||||
text = "This [MASK] is very tasty."
|
||||
encoded_inputs = tokenizer(
|
||||
text,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
max_length=MAX_SEQUENCE_LENGTH,
|
||||
return_tensors="pt",
|
||||
)
|
||||
inputs = (encoded_inputs["input_ids"], encoded_inputs["attention_mask"])
|
||||
mlir_importer = SharkImporter(
|
||||
AlbertModule(),
|
||||
inputs,
|
||||
frontend="torch",
|
||||
)
|
||||
minilm_mlir, func_name = mlir_importer.import_mlir(
|
||||
is_dynamic=False, tracing_required=True
|
||||
)
|
||||
shark_module = SharkInference(
|
||||
minilm_mlir, func_name, mlir_dialect="linalg"
|
||||
)
|
||||
shark_module.compile()
|
||||
token_logits = torch.tensor(shark_module.forward(inputs))
|
||||
mask_id = torch.where(
|
||||
encoded_inputs["input_ids"] == tokenizer.mask_token_id
|
||||
)[1]
|
||||
mask_token_logits = token_logits[0, mask_id, :]
|
||||
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
|
||||
for token in top_5_tokens:
|
||||
print(
|
||||
f"'>>> Sample/Warmup output: {text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
|
||||
)
|
||||
while True:
|
||||
try:
|
||||
new_text = input("Give me a sentence with [MASK] to fill: ")
|
||||
encoded_inputs = tokenizer(
|
||||
new_text,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
max_length=MAX_SEQUENCE_LENGTH,
|
||||
return_tensors="pt",
|
||||
)
|
||||
inputs = (
|
||||
encoded_inputs["input_ids"],
|
||||
encoded_inputs["attention_mask"],
|
||||
)
|
||||
token_logits = torch.tensor(shark_module.forward(inputs))
|
||||
mask_id = torch.where(
|
||||
encoded_inputs["input_ids"] == tokenizer.mask_token_id
|
||||
)[1]
|
||||
mask_token_logits = token_logits[0, mask_id, :]
|
||||
top_5_tokens = (
|
||||
torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
|
||||
)
|
||||
for token in top_5_tokens:
|
||||
print(
|
||||
f"'>>> {new_text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
|
||||
)
|
||||
except KeyboardInterrupt:
|
||||
print("Exiting program.")
|
||||
break
|
||||
@@ -1,100 +0,0 @@
|
||||
from PIL import Image
|
||||
import requests
|
||||
|
||||
from transformers import TFAutoModelForMaskedLM, AutoTokenizer
|
||||
import tensorflow as tf
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.shark_importer import SharkImporter
|
||||
from iree.compiler import tf as tfc
|
||||
from iree.compiler import compile_str
|
||||
from iree import runtime as ireert
|
||||
import os
|
||||
import numpy as np
|
||||
import sys
|
||||
|
||||
MAX_SEQUENCE_LENGTH = 512
|
||||
BATCH_SIZE = 1
|
||||
|
||||
# Create a set of inputs
|
||||
t5_inputs = [
|
||||
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
|
||||
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
|
||||
]
|
||||
|
||||
|
||||
class AlbertModule(tf.Module):
|
||||
def __init__(self):
|
||||
super(AlbertModule, self).__init__()
|
||||
self.m = TFAutoModelForMaskedLM.from_pretrained("albert-base-v2")
|
||||
self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)
|
||||
|
||||
@tf.function(input_signature=t5_inputs)
|
||||
def forward(self, input_ids, attention_mask):
|
||||
return self.m.predict(input_ids, attention_mask)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Prepping Data
|
||||
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
|
||||
# text = "This is a great [MASK]."
|
||||
text = "This [MASK] is very tasty."
|
||||
encoded_inputs = tokenizer(
|
||||
text,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
max_length=MAX_SEQUENCE_LENGTH,
|
||||
return_tensors="tf",
|
||||
)
|
||||
inputs = (encoded_inputs["input_ids"], encoded_inputs["attention_mask"])
|
||||
mlir_importer = SharkImporter(
|
||||
AlbertModule(),
|
||||
inputs,
|
||||
frontend="tf",
|
||||
)
|
||||
minilm_mlir, func_name = mlir_importer.import_mlir(
|
||||
is_dynamic=False, tracing_required=False
|
||||
)
|
||||
shark_module = SharkInference(minilm_mlir, func_name, mlir_dialect="mhlo")
|
||||
shark_module.compile()
|
||||
output_idx = 0
|
||||
data_idx = 1
|
||||
token_logits = shark_module.forward(inputs)[output_idx][data_idx]
|
||||
mask_id = np.where(
|
||||
tf.squeeze(encoded_inputs["input_ids"]) == tokenizer.mask_token_id
|
||||
)
|
||||
mask_token_logits = token_logits[0, mask_id, :]
|
||||
top_5_tokens = np.flip(np.argsort(mask_token_logits)).squeeze()[0:5]
|
||||
for token in top_5_tokens:
|
||||
print(
|
||||
f"'>>> Sample/Warmup output: {text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
|
||||
)
|
||||
while True:
|
||||
try:
|
||||
new_text = input("Give me a sentence with [MASK] to fill: ")
|
||||
encoded_inputs = tokenizer(
|
||||
new_text,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
max_length=MAX_SEQUENCE_LENGTH,
|
||||
return_tensors="tf",
|
||||
)
|
||||
inputs = (
|
||||
encoded_inputs["input_ids"],
|
||||
encoded_inputs["attention_mask"],
|
||||
)
|
||||
token_logits = shark_module.forward(inputs)[output_idx][data_idx]
|
||||
mask_id = np.where(
|
||||
tf.squeeze(encoded_inputs["input_ids"])
|
||||
== tokenizer.mask_token_id
|
||||
)
|
||||
mask_token_logits = token_logits[0, mask_id, :]
|
||||
top_5_tokens = np.flip(np.argsort(mask_token_logits)).squeeze()[
|
||||
0:5
|
||||
]
|
||||
for token in top_5_tokens:
|
||||
print(
|
||||
f"'>>> {new_text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
|
||||
)
|
||||
except KeyboardInterrupt:
|
||||
print("Exiting program.")
|
||||
sys.exit()
|
||||
@@ -23,7 +23,7 @@ input = torch.randn(1, 3, 224, 224)
|
||||
|
||||
mlir_importer = SharkImporter(
|
||||
ResnestModule(),
|
||||
(input,),
|
||||
(input),
|
||||
frontend="torch",
|
||||
)
|
||||
|
||||
@@ -33,7 +33,9 @@ mlir_importer = SharkImporter(
|
||||
|
||||
print(golden_out)
|
||||
|
||||
shark_module = SharkInference(vision_mlir, func_name, mlir_dialect="linalg")
|
||||
shark_module = SharkInference(
|
||||
vision_mlir, func_name, device="cpu", mlir_dialect="linalg"
|
||||
)
|
||||
shark_module.compile()
|
||||
result = shark_module.forward((input,))
|
||||
result = shark_module.forward((input))
|
||||
print("Obtained result", result)
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
import torch
|
||||
import numpy as np
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.shark_importer import SharkImporter
|
||||
from shark_runner import SharkInference
|
||||
|
||||
|
||||
# Currently not supported aten.transpose_conv2d missing.
|
||||
class UnetModule(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
@@ -15,7 +14,7 @@ class UnetModule(torch.nn.Module):
|
||||
init_features=32,
|
||||
pretrained=True,
|
||||
)
|
||||
self.model.eval()
|
||||
self.train(False)
|
||||
|
||||
def forward(self, input):
|
||||
return self.model(input)
|
||||
@@ -23,17 +22,10 @@ class UnetModule(torch.nn.Module):
|
||||
|
||||
input = torch.randn(1, 3, 224, 224)
|
||||
|
||||
mlir_importer = SharkImporter(
|
||||
print(input)
|
||||
shark_module = SharkInference(
|
||||
UnetModule(),
|
||||
(input,),
|
||||
frontend="torch",
|
||||
)
|
||||
|
||||
(vision_mlir, func_name), inputs, golden_out = mlir_importer.import_debug(
|
||||
tracing_required=False
|
||||
)
|
||||
|
||||
shark_module = SharkInference(vision_mlir, func_name, mlir_dialect="linalg")
|
||||
shark_module.compile()
|
||||
result = shark_module.forward((input,))
|
||||
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
|
||||
shark_module.benchmark_forward((input,))
|
||||
print(input)
|
||||
|
||||
@@ -48,8 +48,8 @@ class EagerModeIREELinalgOnTensorsBackend(TorchMLIREagerBackend):
|
||||
|
||||
def __init__(self, device: str):
|
||||
self.torch_device_str = device
|
||||
self.config = ireert.Config(IREE_DEVICE_MAP[device])
|
||||
self.raw_device_str = device
|
||||
self.iree_device_str = IREE_DEVICE_MAP[device]
|
||||
self.config = ireert.Config(self.iree_device_str)
|
||||
|
||||
def get_torch_metadata(
|
||||
self, tensor: DeviceArray, kwargs: Dict[str, Any]
|
||||
@@ -71,7 +71,7 @@ class EagerModeIREELinalgOnTensorsBackend(TorchMLIREagerBackend):
|
||||
"EagerMode",
|
||||
)
|
||||
callable, _ = get_iree_compiled_module(
|
||||
imported_module, self.raw_device_str, func_name=fn_name
|
||||
imported_module, self.iree_device_str, func_name=fn_name
|
||||
)
|
||||
return callable
|
||||
|
||||
|
||||
@@ -44,17 +44,15 @@ IREE_DEVICE_MAP = {
|
||||
"vulkan": "vulkan",
|
||||
"metal": "vulkan",
|
||||
"rocm": "rocm",
|
||||
"intel-gpu": "level_zero",
|
||||
}
|
||||
|
||||
IREE_TARGET_MAP = {
|
||||
"cpu": "llvm-cpu",
|
||||
"cpu": "dylib",
|
||||
"gpu": "cuda",
|
||||
"cuda": "cuda",
|
||||
"vulkan": "vulkan",
|
||||
"metal": "vulkan",
|
||||
"rocm": "rocm",
|
||||
"intel-gpu": "opencl-spirv",
|
||||
}
|
||||
|
||||
# Finds whether the required drivers are installed for the given device.
|
||||
@@ -70,12 +68,6 @@ def check_device_drivers(device):
|
||||
subprocess.check_output("vulkaninfo")
|
||||
except Exception:
|
||||
return True
|
||||
elif device in ["intel-gpu"]:
|
||||
try:
|
||||
subprocess.check_output(["dpkg", "-L", "intel-level-zero-gpu"])
|
||||
return False
|
||||
except Exception:
|
||||
return True
|
||||
elif device == "cpu":
|
||||
return False
|
||||
# Unknown device.
|
||||
|
||||
@@ -34,12 +34,9 @@ def tensor_to_type_str(input_tensors: tuple, mlir_dialect: str):
|
||||
dtype_string = str(input_tensor.dtype).replace("torch.", "")
|
||||
elif mlir_dialect in ["mhlo", "tflite"]:
|
||||
dtype = input_tensor.dtype
|
||||
try:
|
||||
dtype_string = re.findall("'[^\"]*'", str(dtype))[0].replace(
|
||||
"'", ""
|
||||
)
|
||||
except IndexError:
|
||||
dtype_string = str(dtype)
|
||||
dtype_string = re.findall("'[^\"]*'", str(dtype))[0].replace(
|
||||
"'", ""
|
||||
)
|
||||
regex_split = re.compile("([a-zA-Z]+)([0-9]+)")
|
||||
match = regex_split.match(dtype_string)
|
||||
mlir_type_string = str(match.group(1)[0]) + str(match.group(2))
|
||||
|
||||
@@ -98,10 +98,8 @@ def compile_module_to_flatbuffer(
|
||||
|
||||
def get_iree_module(flatbuffer_blob, device, func_name):
|
||||
# Returns the compiled module and the configs.
|
||||
vm_module = ireert.VmModule.from_flatbuffer(flatbuffer_blob)
|
||||
config = ireert.Config(IREE_DEVICE_MAP[device])
|
||||
vm_module = ireert.VmModule.from_flatbuffer(
|
||||
config.vm_instance, flatbuffer_blob
|
||||
)
|
||||
ctx = ireert.SystemContext(config=config)
|
||||
ctx.add_vm_module(vm_module)
|
||||
ModuleCompiled = ctx.modules.module[func_name]
|
||||
@@ -126,15 +124,15 @@ def export_iree_module_to_vmfb(
|
||||
module,
|
||||
device: str,
|
||||
directory: str,
|
||||
mlir_dialect: str = "linalg",
|
||||
frontend: str = "torch",
|
||||
func_name: str = "forward",
|
||||
model_config_path: str = None,
|
||||
):
|
||||
# Compiles the module given specs and saves it as .vmfb file.
|
||||
flatbuffer_blob = compile_module_to_flatbuffer(
|
||||
module, device, mlir_dialect, func_name, model_config_path
|
||||
module, device, frontend, func_name, model_config_path
|
||||
)
|
||||
module_name = f"{mlir_dialect}_{func_name}_{device}"
|
||||
module_name = f"{frontend}_{func_name}_{device}"
|
||||
filename = os.path.join(directory, module_name + ".vmfb")
|
||||
print(f"Saved vmfb in {filename}.")
|
||||
with open(filename, "wb") as f:
|
||||
|
||||
@@ -16,7 +16,6 @@
|
||||
|
||||
import iree.runtime as ireert
|
||||
import ctypes
|
||||
from shark.parser import shark_args
|
||||
|
||||
# Get the default gpu args given the architecture.
|
||||
def get_iree_gpu_args():
|
||||
@@ -24,9 +23,7 @@ def get_iree_gpu_args():
|
||||
ireert.flags.parse_flags("--cuda_allow_inline_execution")
|
||||
# TODO: Give the user_interface to pass the sm_arch.
|
||||
sm_arch = get_cuda_sm_cc()
|
||||
if (
|
||||
sm_arch in ["sm_70", "sm_72", "sm_75", "sm_80", "sm_84", "sm_86"]
|
||||
) and (shark_args.enable_tf32 == True):
|
||||
if sm_arch in ["sm_70", "sm_72", "sm_75", "sm_80", "sm_84", "sm_86"]:
|
||||
return [
|
||||
"--iree-hal-cuda-disable-loop-nounroll-wa",
|
||||
f"--iree-hal-cuda-llvm-target-arch={sm_arch}",
|
||||
|
||||
@@ -26,12 +26,6 @@ def get_vulkan_triple_flag():
|
||||
elif vulkan_device == "M2":
|
||||
print("Found Apple M2 Device. Using m1-moltenvk-macos")
|
||||
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
|
||||
elif vulkan_device == "Max":
|
||||
print("Found Apple M1 Max Device. Using m1-moltenvk-macos")
|
||||
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
|
||||
elif vulkan_device == "Pro":
|
||||
print("Found Apple M1 Pro Device. Using m1-moltenvk-macos")
|
||||
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
|
||||
elif vulkan_device == "M1":
|
||||
print("Found Apple M1 Device. Using m1-moltenvk-macos")
|
||||
return "-iree-vulkan-target-triple=m1-moltenvk-macos"
|
||||
|
||||
@@ -47,10 +47,16 @@ parser.add_argument(
|
||||
default="./shark_tmp",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable_tf32",
|
||||
type=bool,
|
||||
"--save_mlir",
|
||||
default=False,
|
||||
help="Enables TF32 precision calculations on supported GPUs.",
|
||||
action="store_true",
|
||||
help="Saves input MLIR module to /tmp/ directory.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save_vmfb",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Saves iree .vmfb module to /tmp/ directory.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_config_path",
|
||||
@@ -61,20 +67,14 @@ parser.add_argument(
|
||||
parser.add_argument(
|
||||
"--num_warmup_iterations",
|
||||
type=int,
|
||||
default=5,
|
||||
default=2,
|
||||
help="Run the model for the specified number of warmup iterations.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_iterations",
|
||||
type=int,
|
||||
default=100,
|
||||
default=1,
|
||||
help="Run the model for the specified number of iterations.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--onnx_bench",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="When enabled, pytest bench results will include ONNX benchmark results.",
|
||||
)
|
||||
|
||||
shark_args, unknown = parser.parse_known_args()
|
||||
|
||||
@@ -19,26 +19,13 @@ from shark.iree_utils.benchmark_utils import (
|
||||
run_benchmark_module,
|
||||
)
|
||||
from shark.parser import shark_args
|
||||
from tank.model_utils import get_torch_model
|
||||
from datetime import datetime
|
||||
import time
|
||||
import csv
|
||||
import os
|
||||
|
||||
|
||||
class OnnxFusionOptions(object):
|
||||
def __init__(self):
|
||||
self.disable_gelu = False
|
||||
self.disable_layer_norm = False
|
||||
self.disable_attention = False
|
||||
self.disable_skip_layer_norm = False
|
||||
self.disable_embed_layer_norm = False
|
||||
self.disable_bias_skip_layer_norm = False
|
||||
self.disable_bias_gelu = False
|
||||
self.enable_gelu_approximation = False
|
||||
self.use_mask_index = False
|
||||
self.no_attention_mask = False
|
||||
|
||||
|
||||
class SharkBenchmarkRunner(SharkRunner):
|
||||
# SharkRunner derived class with Benchmarking capabilities.
|
||||
def __init__(
|
||||
@@ -47,21 +34,22 @@ class SharkBenchmarkRunner(SharkRunner):
|
||||
function_name: str = "forward",
|
||||
device: str = "none",
|
||||
mlir_dialect: str = "linalg",
|
||||
frontend: str = "torch",
|
||||
):
|
||||
self.device = shark_args.device if device == "none" else device
|
||||
self.frontend = frontend
|
||||
self.frontend_model = None
|
||||
self.vmfb_file = None
|
||||
self.mlir_dialect = mlir_dialect
|
||||
SharkRunner.__init__(
|
||||
self,
|
||||
mlir_module,
|
||||
function_name,
|
||||
device,
|
||||
self.mlir_dialect,
|
||||
mlir_dialect,
|
||||
)
|
||||
if self.vmfb_file == None:
|
||||
self.vmfb_file = export_iree_module_to_vmfb(
|
||||
mlir_module, device, shark_args.repro_dir, self.mlir_dialect
|
||||
mlir_module, device, shark_args.repro_dir, self.frontend
|
||||
)
|
||||
|
||||
def setup_cl(self, input_tensors):
|
||||
@@ -71,17 +59,15 @@ class SharkBenchmarkRunner(SharkRunner):
|
||||
input_tensors,
|
||||
mlir_dialect=self.mlir_dialect,
|
||||
)
|
||||
# print(self.benchmark_cl)
|
||||
|
||||
def benchmark_frontend(self, modelname):
|
||||
if self.mlir_dialect in ["linalg", "torch"]:
|
||||
def benchmark_frontend(self, inputs, modelname):
|
||||
if self.frontend in ["pytorch", "torch"]:
|
||||
return self.benchmark_torch(modelname)
|
||||
elif self.mlir_dialect in ["mhlo", "tf"]:
|
||||
return self.benchmark_tf(modelname)
|
||||
elif self.frontend in ["tensorflow", "tf"]:
|
||||
return self.benchmark_tf(inputs, modelname)
|
||||
|
||||
def benchmark_torch(self, modelname):
|
||||
import torch
|
||||
from tank.model_utils import get_torch_model
|
||||
|
||||
if self.device == "gpu":
|
||||
torch.set_default_tensor_type(torch.cuda.FloatTensor)
|
||||
@@ -90,7 +76,7 @@ class SharkBenchmarkRunner(SharkRunner):
|
||||
torch_device = torch.device(
|
||||
"cuda:0" if self.device == "gpu" else "cpu"
|
||||
)
|
||||
HFmodel, input = get_torch_model(modelname)[:2]
|
||||
HFmodel, input, act_out = get_torch_model(modelname)
|
||||
frontend_model = HFmodel.model
|
||||
frontend_model.to(torch_device)
|
||||
input.to(torch_device)
|
||||
@@ -112,21 +98,13 @@ class SharkBenchmarkRunner(SharkRunner):
|
||||
f"{((end-begin)/shark_args.num_iterations)*1000}",
|
||||
]
|
||||
|
||||
def benchmark_tf(self, modelname):
|
||||
import tensorflow as tf
|
||||
from tank.model_utils_tf import get_tf_model
|
||||
|
||||
model, input, = get_tf_model(
|
||||
modelname
|
||||
)[:2]
|
||||
frontend_model = model
|
||||
|
||||
def benchmark_tf(self, frontend_model, inputs):
|
||||
for i in range(shark_args.num_warmup_iterations):
|
||||
frontend_model.forward(*input)
|
||||
frontend_model.forward(*inputs)
|
||||
|
||||
begin = time.time()
|
||||
for i in range(shark_args.num_iterations):
|
||||
out = frontend_model.forward(*input)
|
||||
out = frontend_model.forward(*inputs)
|
||||
if i == shark_args.num_iterations - 1:
|
||||
end = time.time()
|
||||
break
|
||||
@@ -139,9 +117,8 @@ class SharkBenchmarkRunner(SharkRunner):
|
||||
]
|
||||
|
||||
def benchmark_c(self):
|
||||
print(self.benchmark_cl)
|
||||
result = run_benchmark_module(self.benchmark_cl)
|
||||
print(f"Shark-IREE-C benchmark:{result} iter/second")
|
||||
print(f"Shark-{self.frontend} C-benchmark:{result} iter/second")
|
||||
return [f"{result}", f"{1000/result}"]
|
||||
|
||||
def benchmark_python(self, inputs):
|
||||
@@ -155,105 +132,32 @@ class SharkBenchmarkRunner(SharkRunner):
|
||||
if i == shark_args.num_iterations - 1:
|
||||
end = time.time()
|
||||
print(
|
||||
f"Shark-IREE Python benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
|
||||
f"Shark-{self.frontend} Python-benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
|
||||
)
|
||||
return [
|
||||
f"{shark_args.num_iterations/(end-begin)}",
|
||||
f"{((end-begin)/shark_args.num_iterations)*1000}",
|
||||
]
|
||||
|
||||
def benchmark_onnx(self, modelname, inputs):
|
||||
if self.device == "gpu":
|
||||
print(
|
||||
"Currently GPU benchmarking on ONNX is not supported in SHARK."
|
||||
)
|
||||
return ["N/A", "N/A"]
|
||||
else:
|
||||
from onnxruntime.transformers.benchmark import run_onnxruntime
|
||||
from onnxruntime.transformers.huggingface_models import MODELS
|
||||
from onnxruntime.transformers.benchmark_helper import (
|
||||
ConfigModifier,
|
||||
Precision,
|
||||
)
|
||||
import psutil
|
||||
|
||||
if modelname == "microsoft/MiniLM-L12-H384-uncased":
|
||||
modelname = "bert-base-uncased"
|
||||
if modelname not in MODELS:
|
||||
print(
|
||||
f"{modelname} is currently not supported in ORT's HF. Check \
|
||||
https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/huggingface_models.py \
|
||||
for currently supported models. Exiting benchmark ONNX."
|
||||
)
|
||||
return ["N/A", "N/A"]
|
||||
use_gpu = self.device == "gpu"
|
||||
num_threads = psutil.cpu_count(logical=False)
|
||||
batch_sizes = [1]
|
||||
sequence_lengths = [128]
|
||||
cache_dir = os.path.join(".", "cache_models")
|
||||
onnx_dir = os.path.join(".", "onnx_models")
|
||||
verbose = False
|
||||
input_counts = [1]
|
||||
optimize_onnx = True
|
||||
validate_onnx = False
|
||||
disable_ort_io_binding = False
|
||||
use_raw_attention_mask = True
|
||||
model_fusion_statistics = {}
|
||||
overwrite = False
|
||||
model_source = "pt" # Either "pt" or "tf"
|
||||
provider = None
|
||||
config_modifier = ConfigModifier(None)
|
||||
onnx_args = OnnxFusionOptions()
|
||||
result = run_onnxruntime(
|
||||
use_gpu,
|
||||
provider,
|
||||
(modelname,),
|
||||
None,
|
||||
config_modifier,
|
||||
Precision.FLOAT32,
|
||||
num_threads,
|
||||
batch_sizes,
|
||||
sequence_lengths,
|
||||
shark_args.num_iterations,
|
||||
input_counts,
|
||||
optimize_onnx,
|
||||
validate_onnx,
|
||||
cache_dir,
|
||||
onnx_dir,
|
||||
verbose,
|
||||
overwrite,
|
||||
disable_ort_io_binding,
|
||||
use_raw_attention_mask,
|
||||
model_fusion_statistics,
|
||||
model_source,
|
||||
onnx_args,
|
||||
)
|
||||
print(
|
||||
f"ONNX ORT-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
|
||||
)
|
||||
return [
|
||||
result[0]["QPS"],
|
||||
result[0]["average_latency_ms"],
|
||||
]
|
||||
def benchmark_all(self, inputs: tuple):
|
||||
self.benchmark_frontend(inputs)
|
||||
self.benchmark_python(inputs)
|
||||
self.benchmark_c()
|
||||
|
||||
def benchmark_all_csv(
|
||||
self, inputs: tuple, modelname, dynamic, device_str, frontend
|
||||
):
|
||||
self.setup_cl(inputs)
|
||||
field_names = [
|
||||
"platform",
|
||||
"model",
|
||||
"engine",
|
||||
"dynamic",
|
||||
"dialect",
|
||||
"device",
|
||||
"iter/sec",
|
||||
"ms/iter",
|
||||
"iterations",
|
||||
"datetime",
|
||||
]
|
||||
engines = ["frontend", "shark_python", "shark_iree_c"]
|
||||
if shark_args.onnx_bench == True:
|
||||
engines.append("onnxruntime")
|
||||
platforms = ["frontend", "shark_python", "shark_iree_c"]
|
||||
|
||||
if not os.path.exists("bench_results.csv"):
|
||||
with open("bench_results.csv", mode="w", newline="") as f:
|
||||
@@ -269,33 +173,22 @@ for currently supported models. Exiting benchmark ONNX."
|
||||
else:
|
||||
bench_result["dynamic"] = "False"
|
||||
bench_result["device"] = device_str
|
||||
for e in engines:
|
||||
if e == "frontend":
|
||||
bench_result["engine"] = frontend
|
||||
(
|
||||
bench_result["iter/sec"],
|
||||
bench_result["ms/iter"],
|
||||
) = self.benchmark_frontend(modelname)
|
||||
elif e == "shark_python":
|
||||
bench_result["engine"] = "shark_python"
|
||||
(
|
||||
bench_result["iter/sec"],
|
||||
bench_result["ms/iter"],
|
||||
) = self.benchmark_python(inputs)
|
||||
elif e == "shark_iree_c":
|
||||
bench_result["engine"] = "shark_iree_c"
|
||||
(
|
||||
bench_result["iter/sec"],
|
||||
bench_result["ms/iter"],
|
||||
) = self.benchmark_c()
|
||||
elif e == "onnxruntime":
|
||||
bench_result["engine"] = "onnxruntime"
|
||||
(
|
||||
bench_result["iter/sec"],
|
||||
bench_result["ms/iter"],
|
||||
) = self.benchmark_onnx(modelname, inputs)
|
||||
|
||||
bench_result["dialect"] = self.mlir_dialect
|
||||
bench_result["iterations"] = shark_args.num_iterations
|
||||
for p in platforms:
|
||||
if p == "frontend":
|
||||
bench_result["platform"] = frontend
|
||||
bench_result["iter/sec"] = self.benchmark_frontend(
|
||||
inputs, modelname
|
||||
)[0]
|
||||
bench_result["ms/iter"] = self.benchmark_frontend(
|
||||
inputs, modelname
|
||||
)[1]
|
||||
elif p == "shark_python":
|
||||
bench_result["platform"] = "shark_python"
|
||||
bench_result["iter/sec"] = self.benchmark_python(inputs)[0]
|
||||
bench_result["ms/iter"] = self.benchmark_python(inputs)[1]
|
||||
else:
|
||||
bench_result["platform"] = "shark_iree_c"
|
||||
bench_result["iter/sec"] = self.benchmark_c()[0]
|
||||
bench_result["ms/iter"] = self.benchmark_c()[1]
|
||||
bench_result["datetime"] = str(datetime.now())
|
||||
writer.writerow(bench_result)
|
||||
|
||||
@@ -29,8 +29,6 @@ input_type_to_np_dtype = {
|
||||
"int8": np.int8,
|
||||
}
|
||||
|
||||
# default hash is updated when nightly populate_sharktank_ci is successful
|
||||
shark_default_sha = "latest"
|
||||
|
||||
# Save the model in the home local so it needn't be fetched everytime in the CI.
|
||||
home = str(Path.home())
|
||||
@@ -80,8 +78,7 @@ def download_torch_model(model_name, dynamic=False):
|
||||
|
||||
def gs_download_model():
|
||||
gs_command = (
|
||||
'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank/'
|
||||
+ shark_default_sha
|
||||
'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank'
|
||||
+ "/"
|
||||
+ model_dir_name
|
||||
+ " "
|
||||
@@ -96,8 +93,7 @@ def download_torch_model(model_name, dynamic=False):
|
||||
model_dir = os.path.join(WORKDIR, model_dir_name)
|
||||
local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
|
||||
gs_hash = (
|
||||
'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank/'
|
||||
+ shark_default_sha
|
||||
'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank'
|
||||
+ "/"
|
||||
+ model_dir_name
|
||||
+ "/hash.npy"
|
||||
@@ -135,8 +131,7 @@ def download_tflite_model(model_name, dynamic=False):
|
||||
|
||||
def gs_download_model():
|
||||
gs_command = (
|
||||
'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank/'
|
||||
+ shark_default_sha
|
||||
'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank'
|
||||
+ "/"
|
||||
+ model_dir_name
|
||||
+ " "
|
||||
@@ -153,8 +148,7 @@ def download_tflite_model(model_name, dynamic=False):
|
||||
model_dir = os.path.join(WORKDIR, model_dir_name)
|
||||
local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
|
||||
gs_hash = (
|
||||
'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank/'
|
||||
+ shark_default_sha
|
||||
'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank'
|
||||
+ "/"
|
||||
+ model_dir_name
|
||||
+ "/hash.npy"
|
||||
@@ -191,8 +185,7 @@ def download_tf_model(model_name):
|
||||
|
||||
def gs_download_model():
|
||||
gs_command = (
|
||||
'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank/'
|
||||
+ shark_default_sha
|
||||
'gsutil -o "GSUtil:parallel_process_count=1" cp -r gs://shark_tank'
|
||||
+ "/"
|
||||
+ model_dir_name
|
||||
+ " "
|
||||
@@ -207,8 +200,7 @@ def download_tf_model(model_name):
|
||||
model_dir = os.path.join(WORKDIR, model_dir_name)
|
||||
local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
|
||||
gs_hash = (
|
||||
'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank/'
|
||||
+ shark_default_sha
|
||||
'gsutil -o "GSUtil:parallel_process_count=1" cp gs://shark_tank'
|
||||
+ "/"
|
||||
+ model_dir_name
|
||||
+ "/hash.npy"
|
||||
|
||||
@@ -199,11 +199,9 @@ class SharkImporter:
|
||||
)
|
||||
elif golden_out is tuple:
|
||||
golden_out = self.convert_to_numpy(golden_out)
|
||||
elif hasattr(golden_out, "logits"):
|
||||
else:
|
||||
# from transformers import TFSequenceClassifierOutput
|
||||
golden_out = golden_out.logits
|
||||
else:
|
||||
golden_out = golden_out.last_hidden_state
|
||||
# Save the artifacts in the directory dir.
|
||||
self.save_data(
|
||||
dir,
|
||||
|
||||
@@ -12,9 +12,26 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
import io
|
||||
import pickle
|
||||
|
||||
from torch_mlir.dialects.torch.importer.jit_ir import (
|
||||
ClassAnnotator,
|
||||
ModuleBuilder,
|
||||
)
|
||||
from torch_mlir_e2e_test.torchscript.serialization import (
|
||||
extract_serializable_annotations,
|
||||
apply_serializable_annotations,
|
||||
SerializableTest,
|
||||
)
|
||||
|
||||
from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
|
||||
|
||||
from torch_mlir.passmanager import PassManager
|
||||
from torch_mlir_e2e_test.torchscript.annotations import annotate_args, export
|
||||
from torch_mlir.ir import StringAttr
|
||||
import torch_mlir
|
||||
from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
|
||||
|
||||
|
||||
def get_module_name_for_asm_dump(module):
|
||||
@@ -28,6 +45,22 @@ def get_module_name_for_asm_dump(module):
|
||||
).value
|
||||
|
||||
|
||||
def get_input_annotations(inputs: tuple, dynamic: bool) -> list:
    """TODO: Include necessary documentation"""

    annotations_list = [None]
    for i in inputs:
        temp_list = []
        if dynamic:
            temp_list.append([-1 for i in range(len(i.shape))])
        else:
            temp_list.append(list(i.shape))
        temp_list.append(i.dtype)
        temp_list.append(True)
        annotations_list.append(tuple(temp_list))
    return annotations_list
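
# Illustrative sketch (assumption, not part of the original diff): for a single
# static input of shape (1, 128) with dtype torch.int64, the list built above is
# [None, ([1, 128], torch.int64, True)]; with dynamic=True the shape entry
# becomes [-1, -1] instead.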
|
||||
|
||||
|
||||
def run_on_refbackend(torch_module, inputs):
|
||||
backend = refbackend.RefBackendLinalgOnTensorsBackend()
|
||||
compiled = backend.compile(torch_module)
|
||||
@@ -36,16 +69,42 @@ def run_on_refbackend(torch_module, inputs):
|
||||
return jit_module.forward(np_inputs[0])
|
||||
|
||||
|
||||
# Creates dynamic dims for all dims.
|
||||
# TODO: Pass user specified dynamic dims.
|
||||
def create_dynamic_placeholders(inputs):
|
||||
placeholders = []
|
||||
for inp in inputs:
|
||||
placeholder = torch_mlir.TensorPlaceholder.like(
|
||||
inp, dynamic_axes=[i for i in range(len(inp.shape))]
|
||||
)
|
||||
placeholders.append(placeholder)
|
||||
return tuple(placeholders)
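
# Illustrative note (assumption, not part of the original diff): for an input of
# shape (1, 128), TensorPlaceholder.like(inp, dynamic_axes=[0, 1]) marks both
# dimensions dynamic, so the compiled module accepts any batch or sequence length.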
|
||||
def shark_jit_trace(
|
||||
module, input: tuple, dynamic: bool, tracing_required: bool
|
||||
):
|
||||
"""TODO: Include necessary documentation."""
|
||||
|
||||
if not tracing_required:
|
||||
return torch.jit.script(module)
|
||||
|
||||
traced_module = torch.jit.trace_module(module, {"forward": input})
|
||||
actual_script = traced_module._actual_script_module
|
||||
export(actual_script.forward)
|
||||
annotate_args_decorator = annotate_args(
|
||||
get_input_annotations(input, dynamic)
|
||||
)
|
||||
annotate_args_decorator(actual_script.forward)
|
||||
module = torch.jit.script(actual_script)
|
||||
|
||||
# TODO: remove saved annotations.pickle
|
||||
torchscript_module_bytes = module.save_to_buffer(
|
||||
{
|
||||
"annotations.pkl": pickle.dumps(
|
||||
extract_serializable_annotations(module)
|
||||
)
|
||||
}
|
||||
)
|
||||
serializable_test = SerializableTest(
|
||||
unique_name="", program=torchscript_module_bytes, trace=None
|
||||
)
|
||||
_extra_files = {"annotations.pkl": ""}
|
||||
module = torch.jit.load(
|
||||
io.BytesIO(serializable_test.program), _extra_files=_extra_files
|
||||
)
|
||||
# Load the pickled annotations.
|
||||
annotations = pickle.loads(_extra_files["annotations.pkl"])
|
||||
apply_serializable_annotations(module, annotations)
|
||||
return module
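
# Illustrative usage sketch (assumption, not part of the original diff):
#   scripted = shark_jit_trace(
#       MyModule().eval(), (example_input,), dynamic=False, tracing_required=True
#   )
# returns a torch.jit.ScriptModule that carries the shape/dtype annotations
# consumed by ModuleBuilder.import_module in get_torch_mlir_module below.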
|
||||
|
||||
|
||||
def get_torch_mlir_module(
|
||||
@@ -55,18 +114,39 @@ def get_torch_mlir_module(
|
||||
jit_trace: bool,
|
||||
from_torchscript: bool = False,
|
||||
):
|
||||
"""Get the MLIR's linalg-on-tensors module from torchscipt module."""
|
||||
ignore_traced_shapes = False
|
||||
if dynamic:
|
||||
input = create_dynamic_placeholders(input)
|
||||
if jit_trace:
|
||||
ignore_traced_shapes = True
|
||||
"""TODO: Include necessary documentation."""
|
||||
|
||||
module = torch_mlir.compile(
|
||||
module,
|
||||
input,
|
||||
output_type=torch_mlir.OutputType.LINALG_ON_TENSORS,
|
||||
use_tracing=jit_trace,
|
||||
ignore_traced_shapes=ignore_traced_shapes,
|
||||
# Static modules compiles well with the torch_mlir.compile API.
|
||||
# We will always jit_trace = True with the API since we always
|
||||
# want to propagate static shapes.
|
||||
if not dynamic:
|
||||
module = torch_mlir.compile(
|
||||
module,
|
||||
input,
|
||||
output_type=torch_mlir.OutputType.LINALG_ON_TENSORS,
|
||||
use_tracing=jit_trace,
|
||||
)
|
||||
return module
|
||||
|
||||
# Tracing is not required from the aot_module.
|
||||
if not from_torchscript:
|
||||
module = shark_jit_trace(module, input, dynamic, jit_trace)
|
||||
|
||||
mb = ModuleBuilder()
|
||||
class_annotator = ClassAnnotator()
|
||||
class_annotator.exportNone(module._c._type())
|
||||
class_annotator.exportPath(module._c._type(), ["forward"])
|
||||
class_annotator.annotateArgs(
|
||||
module._c._type(),
|
||||
["forward"],
|
||||
get_input_annotations(input, dynamic),
|
||||
)
|
||||
return module
|
||||
mb.import_module(module._c, class_annotator)
|
||||
|
||||
with mb.module.context:
|
||||
pm = PassManager.parse(
|
||||
"torchscript-module-to-torch-backend-pipeline,torch-backend-to-linalg-on-tensors-backend-pipeline"
|
||||
)
|
||||
pm.run(mb.module)
|
||||
|
||||
return mb.module
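
# Illustrative usage sketch (assumption, not part of the original diff): the
# static-shape path above reduces to a single torch_mlir.compile call. The
# Linear module and example input below are made up for illustration.
#
# import torch
# import torch_mlir
#
# example_model = torch.nn.Linear(4, 4).eval()
# example_input = torch.randn(1, 4)
# linalg_module = torch_mlir.compile(
#     example_model,
#     example_input,
#     output_type=torch_mlir.OutputType.LINALG_ON_TENSORS,
#     use_tracing=True,
# )
# print(linalg_module)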
|
||||
|
||||
@@ -1,101 +0,0 @@
|
||||
from shark.iree_utils._common import check_device_drivers, device_driver_info
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.shark_downloader import download_tf_model
|
||||
from shark.parser import shark_args
|
||||
|
||||
import iree.compiler as ireec
|
||||
import unittest
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
|
||||
class MiniLMModuleTester:
|
||||
def __init__(
|
||||
self,
|
||||
benchmark=False,
|
||||
onnx_bench=False,
|
||||
):
|
||||
self.benchmark = benchmark
|
||||
self.onnx_bench = onnx_bench
|
||||
|
||||
def create_and_check_module(self, dynamic, device):
|
||||
model, func_name, inputs, golden_out = download_tf_model(
|
||||
"microsoft/MiniLM-L12-H384-uncased"
|
||||
)
|
||||
|
||||
shark_module = SharkInference(
|
||||
model,
|
||||
func_name,
|
||||
device=device,
|
||||
mlir_dialect="mhlo",
|
||||
is_benchmark=self.benchmark,
|
||||
)
|
||||
if self.benchmark == True:
|
||||
shark_args.enable_tf32 = True
|
||||
shark_module.compile()
|
||||
shark_args.onnx_bench = self.onnx_bench
|
||||
shark_module.shark_runner.benchmark_all_csv(
|
||||
(inputs),
|
||||
"microsoft/MiniLM-L12-H384-uncased",
|
||||
dynamic,
|
||||
device,
|
||||
"tensorflow",
|
||||
)
|
||||
shark_args.enable_tf32 = False
|
||||
rtol = 1e-01
|
||||
atol = 1e-02
|
||||
|
||||
else:
|
||||
shark_module.compile()
|
||||
rtol = 1e-02
|
||||
atol = 1e-03
|
||||
|
||||
# TODO: Remove catch once new MiniLM stable
|
||||
try:
|
||||
result = shark_module.forward(inputs)[0][1].to_host()
|
||||
|
||||
except:
|
||||
result = shark_module.forward(inputs)
|
||||
|
||||
np.testing.assert_allclose(golden_out, result, rtol=rtol, atol=atol)
|
||||
|
||||
|
||||
class MiniLMModuleTest(unittest.TestCase):
|
||||
@pytest.fixture(autouse=True)
|
||||
def configure(self, pytestconfig):
|
||||
self.module_tester = MiniLMModuleTester(self)
|
||||
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
|
||||
self.module_tester.onnx_bench = pytestconfig.getoption("onnx_bench")
|
||||
|
||||
def test_module_static_cpu(self):
|
||||
dynamic = False
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
def test_module_static_gpu(self):
|
||||
dynamic = False
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
def test_module_static_vulkan(self):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -2,7 +2,6 @@ from shark.shark_inference import SharkInference
|
||||
from shark.iree_utils._common import check_device_drivers, device_driver_info
|
||||
from tank.model_utils import compare_tensors
|
||||
from shark.shark_downloader import download_torch_model
|
||||
from shark.parser import shark_args
|
||||
|
||||
import unittest
|
||||
import numpy as np
|
||||
@@ -13,15 +12,24 @@ class MiniLMModuleTester:
|
||||
def __init__(
|
||||
self,
|
||||
benchmark=False,
|
||||
onnx_bench=False,
|
||||
):
|
||||
self.benchmark = benchmark
|
||||
self.onnx_bench = onnx_bench
|
||||
|
||||
def create_and_check_module(self, dynamic, device):
|
||||
model_mlir, func_name, input, act_out = download_torch_model(
|
||||
"microsoft/MiniLM-L12-H384-uncased", dynamic
|
||||
)
|
||||
|
||||
# from shark.shark_importer import SharkImporter
|
||||
# mlir_importer = SharkImporter(
|
||||
# model,
|
||||
# (input,),
|
||||
# frontend="torch",
|
||||
# )
|
||||
# minilm_mlir, func_name = mlir_importer.import_mlir(
|
||||
# is_dynamic=dynamic, tracing_required=True
|
||||
# )
|
||||
|
||||
shark_module = SharkInference(
|
||||
model_mlir,
|
||||
func_name,
|
||||
@@ -29,10 +37,11 @@ class MiniLMModuleTester:
|
||||
mlir_dialect="linalg",
|
||||
is_benchmark=self.benchmark,
|
||||
)
|
||||
shark_module.compile()
|
||||
results = shark_module.forward(input)
|
||||
assert True == compare_tensors(act_out, results)
|
||||
|
||||
if self.benchmark == True:
|
||||
shark_args.enable_tf32 = True
|
||||
shark_module.compile()
|
||||
shark_args.onnx_bench = self.onnx_bench
|
||||
shark_module.shark_runner.benchmark_all_csv(
|
||||
(input),
|
||||
"microsoft/MiniLM-L12-H384-uncased",
|
||||
@@ -40,16 +49,6 @@ class MiniLMModuleTester:
|
||||
device,
|
||||
"torch",
|
||||
)
|
||||
shark_args.enable_tf32 = False
|
||||
rtol = 1e-01
|
||||
atol = 1e-02
|
||||
else:
|
||||
shark_module.compile()
|
||||
rtol = 1e-02
|
||||
atol = 1e-03
|
||||
|
||||
results = shark_module.forward(input)
|
||||
assert True == compare_tensors(act_out, results, rtol, atol)
|
||||
|
||||
|
||||
class MiniLMModuleTest(unittest.TestCase):
|
||||
@@ -57,7 +56,6 @@ class MiniLMModuleTest(unittest.TestCase):
|
||||
def configure(self, pytestconfig):
|
||||
self.module_tester = MiniLMModuleTester(self)
|
||||
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
|
||||
self.module_tester.onnx_bench = pytestconfig.getoption("onnx_bench")
|
||||
|
||||
def test_module_static_cpu(self):
|
||||
dynamic = False
|
||||
@@ -100,14 +98,6 @@ class MiniLMModuleTest(unittest.TestCase):
|
||||
dynamic = True
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,13 +0,0 @@
To run the fine-tuning example, from the root SHARK directory, run:

```shell
IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pip install jupyter tf-models-nightly tf-datasets
jupyter-notebook
```
If running from a Google VM, you can view Jupyter notebooks on your local system with:
```shell
gcloud compute ssh <YOUR_INSTANCE_DETAILS> --ssh-flag="-N -L localhost:8888:localhost:8888"
```
|
||||
|
||||
@@ -55,15 +55,6 @@ class AlbertBaseModuleTest(unittest.TestCase):
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -99,15 +99,6 @@ class AlbertModuleTest(unittest.TestCase):
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -101,14 +101,6 @@ class AlexnetModuleTest(unittest.TestCase):
|
||||
dynamic = True
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,117 +0,0 @@
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.iree_utils._common import check_device_drivers, device_driver_info
|
||||
from tank.model_utils import compare_tensors
|
||||
from shark.shark_downloader import download_torch_model
|
||||
|
||||
import torch
|
||||
import unittest
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
|
||||
class BertBaseUncasedModuleTester:
|
||||
def __init__(
|
||||
self,
|
||||
save_mlir=False,
|
||||
save_vmfb=False,
|
||||
benchmark=False,
|
||||
):
|
||||
self.save_mlir = save_mlir
|
||||
self.save_vmfb = save_vmfb
|
||||
self.benchmark = benchmark
|
||||
|
||||
def create_and_check_module(self, dynamic, device):
|
||||
model_mlir, func_name, input, act_out = download_torch_model(
|
||||
"bert-base-cased", dynamic
|
||||
)
|
||||
|
||||
# from shark.shark_importer import SharkImporter
|
||||
# mlir_importer = SharkImporter(
|
||||
# model,
|
||||
# (input,),
|
||||
# frontend="torch",
|
||||
# )
|
||||
# minilm_mlir, func_name = mlir_importer.import_mlir(
|
||||
# is_dynamic=dynamic, tracing_required=True
|
||||
# )
|
||||
|
||||
shark_module = SharkInference(
|
||||
model_mlir,
|
||||
func_name,
|
||||
device=device,
|
||||
mlir_dialect="linalg",
|
||||
is_benchmark=self.benchmark,
|
||||
)
|
||||
shark_module.compile()
|
||||
results = shark_module.forward(input)
|
||||
assert True == compare_tensors(act_out, results)
|
||||
|
||||
if self.benchmark == True:
|
||||
shark_module.shark_runner.benchmark_all_csv(
|
||||
(input),
|
||||
"bert-base-cased",
|
||||
dynamic,
|
||||
device,
|
||||
"torch",
|
||||
)
|
||||
|
||||
|
||||
class BertBaseUncasedModuleTest(unittest.TestCase):
|
||||
@pytest.fixture(autouse=True)
|
||||
def configure(self, pytestconfig):
|
||||
self.module_tester = BertBaseUncasedModuleTester(self)
|
||||
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
|
||||
|
||||
def test_module_static_cpu(self):
|
||||
dynamic = False
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
def test_module_dynamic_cpu(self):
|
||||
dynamic = True
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
def test_module_static_gpu(self):
|
||||
dynamic = False
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
def test_module_dynamic_gpu(self):
|
||||
dynamic = True
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
def test_module_static_vulkan(self):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
def test_module_dynamic_vulkan(self):
|
||||
dynamic = True
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -1,8 +1,8 @@
|
||||
from shark.iree_utils._common import check_device_drivers, device_driver_info
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.shark_downloader import download_tf_model
|
||||
from shark.parser import shark_args
|
||||
|
||||
import iree.compiler as ireec
|
||||
import unittest
|
||||
import pytest
|
||||
import numpy as np
|
||||
@@ -12,10 +12,8 @@ class BertBaseUncasedModuleTester:
|
||||
def __init__(
|
||||
self,
|
||||
benchmark=False,
|
||||
onnx_bench=False,
|
||||
):
|
||||
self.benchmark = benchmark
|
||||
self.onnx_bench = onnx_bench
|
||||
|
||||
def create_and_check_module(self, dynamic, device):
|
||||
model, func_name, inputs, golden_out = download_tf_model(
|
||||
@@ -35,7 +33,6 @@ class BertBaseUncasedModuleTest(unittest.TestCase):
|
||||
def configure(self, pytestconfig):
|
||||
self.module_tester = BertBaseUncasedModuleTester(self)
|
||||
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
|
||||
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
|
||||
|
||||
def test_module_static_cpu(self):
|
||||
dynamic = False
|
||||
@@ -57,14 +54,6 @@ class BertBaseUncasedModuleTest(unittest.TestCase):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -2,7 +2,6 @@ from shark.shark_inference import SharkInference
|
||||
from shark.iree_utils._common import check_device_drivers, device_driver_info
|
||||
from tank.model_utils import compare_tensors
|
||||
from shark.shark_downloader import download_torch_model
|
||||
from shark.parser import shark_args
|
||||
|
||||
import torch
|
||||
import unittest
|
||||
@@ -13,17 +12,29 @@ import pytest
|
||||
class BertBaseUncasedModuleTester:
|
||||
def __init__(
|
||||
self,
|
||||
save_mlir=False,
|
||||
save_vmfb=False,
|
||||
benchmark=False,
|
||||
onnx_bench=False,
|
||||
):
|
||||
self.save_mlir = save_mlir
|
||||
self.save_vmfb = save_vmfb
|
||||
self.benchmark = benchmark
|
||||
self.onnx_bench = onnx_bench
|
||||
|
||||
def create_and_check_module(self, dynamic, device):
|
||||
model_mlir, func_name, input, act_out = download_torch_model(
|
||||
"bert-base-uncased", dynamic
|
||||
)
|
||||
|
||||
# from shark.shark_importer import SharkImporter
|
||||
# mlir_importer = SharkImporter(
|
||||
# model,
|
||||
# (input,),
|
||||
# frontend="torch",
|
||||
# )
|
||||
# minilm_mlir, func_name = mlir_importer.import_mlir(
|
||||
# is_dynamic=dynamic, tracing_required=True
|
||||
# )
|
||||
|
||||
shark_module = SharkInference(
|
||||
model_mlir,
|
||||
func_name,
|
||||
@@ -36,7 +47,6 @@ class BertBaseUncasedModuleTester:
|
||||
assert True == compare_tensors(act_out, results)
|
||||
|
||||
if self.benchmark == True:
|
||||
shark_args.onnx_bench = self.onnx_bench
|
||||
shark_module.shark_runner.benchmark_all_csv(
|
||||
(input),
|
||||
"bert-base-uncased",
|
||||
@@ -51,7 +61,6 @@ class BertBaseUncasedModuleTest(unittest.TestCase):
|
||||
def configure(self, pytestconfig):
|
||||
self.module_tester = BertBaseUncasedModuleTester(self)
|
||||
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
|
||||
self.module_tester.onnx_bench = pytestconfig.getoption("onnx_bench")
|
||||
|
||||
def test_module_static_cpu(self):
|
||||
dynamic = False
|
||||
@@ -94,14 +103,6 @@ class BertBaseUncasedModuleTest(unittest.TestCase):
|
||||
dynamic = True
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,182 +0,0 @@
|
||||
import numpy as np
|
||||
|
||||
from iree import runtime as ireert
|
||||
from iree.tf.support import module_utils
|
||||
from iree.compiler import tf as tfc
|
||||
from iree.compiler import compile_str
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
try:
|
||||
import tensorflow_datasets as tfds
|
||||
import tensorflow_models as tfm
|
||||
from official.nlp.modeling import layers
|
||||
from official.nlp.modeling import networks
|
||||
from official.nlp.modeling.models import bert_classifier
|
||||
except ModuleNotFoundError:
|
||||
print(
|
||||
"tensorflow models or datasets not found please run the following command with your virtual env active:\npip install tf-models-nightly tf-datasets"
|
||||
)
|
||||
import json
|
||||
import time
|
||||
import os
|
||||
|
||||
gs_folder_bert = "gs://cloud-tpu-checkpoints/bert/v3/uncased_L-12_H-768_A-12"
|
||||
tf.io.gfile.listdir(gs_folder_bert)
|
||||
vocab_size = 100
|
||||
NUM_CLASSES = 2
|
||||
SEQUENCE_LENGTH = 128
|
||||
BATCH_SIZE = 1
|
||||
# Create a set of 2-dimensional inputs
|
||||
bert_input = [
|
||||
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
|
||||
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
|
||||
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
|
||||
]
|
||||
|
||||
|
||||
class BertModule(tf.Module):
|
||||
def __init__(self):
|
||||
super(BertModule, self).__init__()
|
||||
dict_outputs = False
|
||||
|
||||
bert_config_file = os.path.join(gs_folder_bert, "bert_config.json")
|
||||
|
||||
config_dict = json.loads(tf.io.gfile.GFile(bert_config_file).read())
|
||||
encoder_config = tfm.nlp.encoders.EncoderConfig(
|
||||
{"type": "bert", "bert": config_dict}
|
||||
)
|
||||
bert_encoder = tfm.nlp.encoders.build_encoder(encoder_config)
|
||||
|
||||
# Create a BERT trainer with the created network.
|
||||
bert_trainer_model = bert_classifier.BertClassifier(
|
||||
bert_encoder, num_classes=NUM_CLASSES
|
||||
)
|
||||
bert_trainer_model.summary()
|
||||
checkpoint = tf.train.Checkpoint(encoder=bert_encoder)
|
||||
checkpoint.read(
|
||||
os.path.join(gs_folder_bert, "bert_model.ckpt")
|
||||
).assert_consumed()
|
||||
|
||||
# Invoke the trainer model on the inputs. This causes the layer to be built.
|
||||
self.m = bert_trainer_model
|
||||
self.m.predict = lambda x: self.m.call(x, training=False)
|
||||
self.predict = tf.function(input_signature=[bert_input])(
|
||||
self.m.predict
|
||||
)
|
||||
self.m.learn = lambda x, y: self.m.call(x, training=False)
|
||||
self.loss = tf.keras.losses.SparseCategoricalCrossentropy()
|
||||
self.optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)
|
||||
|
||||
@tf.function(
|
||||
input_signature=[
|
||||
bert_input, # inputs
|
||||
tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32), # labels
|
||||
]
|
||||
)
|
||||
def learn(self, inputs, labels):
|
||||
with tf.GradientTape() as tape:
|
||||
# Capture the gradients from forward prop...
|
||||
probs = self.m.call(inputs, training=True)
|
||||
loss = self.loss(labels, probs)
|
||||
|
||||
# ...and use them to update the model's weights.
|
||||
variables = self.m.trainable_variables
|
||||
gradients = tape.gradient(loss, variables)
|
||||
self.optimizer.apply_gradients(zip(gradients, variables))
|
||||
return loss
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
glue, info = tfds.load("glue/mrpc", with_info=True, batch_size=BATCH_SIZE)
|
||||
|
||||
tokenizer = tfm.nlp.layers.FastWordpieceBertTokenizer(
|
||||
vocab_file=os.path.join(gs_folder_bert, "vocab.txt"), lower_case=True
|
||||
)
|
||||
|
||||
max_seq_length = SEQUENCE_LENGTH
|
||||
|
||||
packer = tfm.nlp.layers.BertPackInputs(
|
||||
seq_length=max_seq_length,
|
||||
special_tokens_dict=tokenizer.get_special_tokens_dict(),
|
||||
)
|
||||
|
||||
class BertInputProcessor(tf.keras.layers.Layer):
|
||||
def __init__(self, tokenizer, packer):
|
||||
super().__init__()
|
||||
self.tokenizer = tokenizer
|
||||
self.packer = packer
|
||||
|
||||
def call(self, inputs):
|
||||
tok1 = self.tokenizer(inputs["sentence1"])
|
||||
tok2 = self.tokenizer(inputs["sentence2"])
|
||||
|
||||
packed = self.packer([tok1, tok2])
|
||||
|
||||
if "label" in inputs:
|
||||
return packed, inputs["label"]
|
||||
else:
|
||||
return packed
|
||||
|
||||
bert_inputs_processor = BertInputProcessor(tokenizer, packer)
|
||||
glue_train = glue["train"].map(bert_inputs_processor).prefetch(1)
|
||||
glue_validation = glue["validation"].map(bert_inputs_processor).prefetch(1)
|
||||
glue_test = glue["test"].map(bert_inputs_processor).prefetch(1)
|
||||
|
||||
# base tensorflow model
|
||||
bert_model = BertModule()
|
||||
|
||||
# Compile the model using IREE
|
||||
compiler_module = tfc.compile_module(
|
||||
bert_model, exported_names=["learn"], import_only=True
|
||||
)
|
||||
|
||||
# choose from dylib-llvm-aot or cuda
|
||||
backend = "dylib-llvm-aot"
|
||||
if backend == "dylib-llvm-aot":
|
||||
args = [
|
||||
"--iree-llvm-target-cpu-features=host",
|
||||
"--iree-mhlo-demote-i64-to-i32=false",
|
||||
"--iree-flow-demote-i64-to-i32",
|
||||
]
|
||||
backend_config = "dylib"
|
||||
|
||||
else:
|
||||
backend_config = "cuda"
|
||||
args = [
|
||||
"--iree-cuda-llvm-target-arch=sm_80",
|
||||
"--iree-hal-cuda-disable-loop-nounroll-wa",
|
||||
"--iree-enable-fusion-with-reduction-ops",
|
||||
]
|
||||
|
||||
flatbuffer_blob = compile_str(
|
||||
compiler_module,
|
||||
target_backends=[backend],
|
||||
extra_args=args,
|
||||
input_type="mhlo",
|
||||
)
|
||||
|
||||
# Save module as MLIR file in a directory
|
||||
vm_module = ireert.VmModule.from_flatbuffer(flatbuffer_blob)
|
||||
tracer = ireert.Tracer(os.getcwd())
|
||||
config = ireert.Config("local-sync", tracer)
|
||||
ctx = ireert.SystemContext(config=config)
|
||||
ctx.add_vm_module(vm_module)
|
||||
BertCompiled = ctx.modules.module
|
||||
|
||||
# compare output losses:
|
||||
|
||||
iterations = 10
|
||||
for i in range(iterations):
|
||||
example_inputs, example_labels = next(iter(glue_train))
|
||||
example_labels = tf.cast(example_labels, tf.int32)
|
||||
example_inputs = [value for key, value in example_inputs.items()]
|
||||
|
||||
# iree version
|
||||
iree_loss = BertCompiled.learn(
|
||||
example_inputs, example_labels
|
||||
).to_host()
|
||||
|
||||
# base tensorflow
|
||||
tf_loss = np.array(bert_model.learn(example_inputs, example_labels))
|
||||
print(np.allclose(iree_loss, tf_loss))
|
||||
@@ -54,14 +54,6 @@ class CamemBertModuleTest(unittest.TestCase):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -57,14 +57,6 @@ class ConvBertModuleTest(unittest.TestCase):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -58,14 +58,6 @@ class DebertaBaseModuleTest(unittest.TestCase):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -34,13 +34,11 @@ class DistilBertModuleTest(unittest.TestCase):
|
||||
self.module_tester = DistilBertModuleTester(self)
|
||||
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
|
||||
|
||||
@pytest.mark.xfail(reason="shark_tank hash issues -- awaiting triage")
|
||||
def test_module_static_cpu(self):
|
||||
dynamic = False
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.xfail(reason="shark_tank hash issues -- awaiting triage")
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
@@ -49,7 +47,6 @@ class DistilBertModuleTest(unittest.TestCase):
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.xfail(reason="shark_tank hash issues -- awaiting triage")
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
@@ -57,14 +54,6 @@ class DistilBertModuleTest(unittest.TestCase):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -113,15 +113,6 @@ class DistilBertModuleTest(unittest.TestCase):
|
||||
dynamic = True
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
# @pytest.mark.skip(reason="DistilBert needs to be uploaded to cloud.")
|
||||
# @pytest.mark.skipif(
|
||||
# check_device_drivers("intel-gpu"),
|
||||
# reason=device_driver_info("intel-gpu"),
|
||||
# )
|
||||
# def test_module_static_intel_gpu(self):
|
||||
# dynamic = False
|
||||
# device = "intel-gpu"
|
||||
# self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -54,14 +54,6 @@ class ElectraModuleTest(unittest.TestCase):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -63,14 +63,6 @@ class ConvNextTinyModuleTest(unittest.TestCase):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
# @pytest.mark.skipif(
|
||||
# check_device_drivers("intel-gpu"),
|
||||
# reason=device_driver_info("intel-gpu"),
|
||||
# )
|
||||
# def test_module_static_intel_gpu(self):
|
||||
# dynamic = False
|
||||
# device = "intel-gpu"
|
||||
# self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -60,14 +60,6 @@ class FunnelModuleTest(unittest.TestCase):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
# @pytest.mark.skipif(
|
||||
# check_device_drivers("intel-gpu"),
|
||||
# reason=device_driver_info("intel-gpu"),
|
||||
# )
|
||||
# def test_module_static_intel_gpu(self):
|
||||
# dynamic = False
|
||||
# device = "intel-gpu"
|
||||
# self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -60,14 +60,6 @@ class VitBaseModuleTest(unittest.TestCase):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
# @pytest.mark.skipif(
|
||||
# check_device_drivers("intel-gpu"),
|
||||
# reason=device_driver_info("intel-gpu"),
|
||||
# )
|
||||
# def test_module_static_intel_gpu(self):
|
||||
# dynamic = False
|
||||
# device = "intel-gpu"
|
||||
# self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -54,14 +54,6 @@ class LayoutLMModuleTest(unittest.TestCase):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -55,14 +55,6 @@ class LongformerModuleTest(unittest.TestCase):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,114 +0,0 @@
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.iree_utils._common import check_device_drivers, device_driver_info
|
||||
from shark.shark_downloader import download_torch_model
|
||||
|
||||
import unittest
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
|
||||
class MobileNetV3ModuleTester:
|
||||
def __init__(
|
||||
self,
|
||||
benchmark=False,
|
||||
):
|
||||
self.benchmark = benchmark
|
||||
|
||||
def create_and_check_module(self, dynamic, device):
|
||||
model_mlir, func_name, input, act_out = download_torch_model(
|
||||
"mobilenet_v3_small", dynamic
|
||||
)
|
||||
|
||||
# from shark.shark_importer import SharkImporter
|
||||
# mlir_importer = SharkImporter(
|
||||
# model,
|
||||
# (input,),
|
||||
# frontend="torch",
|
||||
# )
|
||||
# minilm_mlir, func_name = mlir_importer.import_mlir(
|
||||
# is_dynamic=dynamic, tracing_required=True
|
||||
# )
|
||||
|
||||
shark_module = SharkInference(
|
||||
model_mlir,
|
||||
func_name,
|
||||
device=device,
|
||||
mlir_dialect="linalg",
|
||||
is_benchmark=self.benchmark,
|
||||
)
|
||||
shark_module.compile()
|
||||
results = shark_module.forward(input)
|
||||
np.testing.assert_allclose(act_out, results, rtol=1e-02, atol=1e-03)
|
||||
|
||||
if self.benchmark == True:
|
||||
shark_module.shark_runner.benchmark_all_csv(
|
||||
(input),
|
||||
"alexnet",
|
||||
dynamic,
|
||||
device,
|
||||
"torch",
|
||||
)
|
||||
|
||||
|
||||
class MobileNetV3ModuleTest(unittest.TestCase):
|
||||
@pytest.fixture(autouse=True)
|
||||
def configure(self, pytestconfig):
|
||||
self.module_tester = MobileNetV3ModuleTester(self)
|
||||
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
|
||||
|
||||
def test_module_static_cpu(self):
|
||||
dynamic = False
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
def test_module_dynamic_cpu(self):
|
||||
dynamic = True
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.xfail(reason="golden results don't match.")
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
def test_module_static_gpu(self):
|
||||
dynamic = False
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.xfail(reason="golden results don't match.")
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
def test_module_dynamic_gpu(self):
|
||||
dynamic = True
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.xfail(reason="stuck in the pipeline.")
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
def test_module_static_vulkan(self):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
def test_module_dynamic_vulkan(self):
|
||||
dynamic = True
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("intel-gpu"),
|
||||
reason=device_driver_info("intel-gpu"),
|
||||
)
|
||||
def test_module_static_intel_gpu(self):
|
||||
dynamic = False
|
||||
device = "intel-gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -1,5 +1,4 @@
|
||||
from shark.shark_inference import SharkInference
|
||||
from shark.parser import shark_args
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
@@ -14,7 +13,6 @@ vision_models = [
|
||||
"resnet50",
|
||||
"squeezenet1_0",
|
||||
"wide_resnet50_2",
|
||||
"mobilenet_v3_small",
|
||||
]
|
||||
|
||||
|
||||
@@ -86,7 +84,6 @@ def get_vision_model(torch_model):
|
||||
"resnet101": models.resnet101(pretrained=True),
|
||||
"squeezenet1_0": models.squeezenet1_0(pretrained=True),
|
||||
"wide_resnet50_2": models.wide_resnet50_2(pretrained=True),
|
||||
"mobilenet_v3_small": models.mobilenet_v3_small(pretrained=True),
|
||||
}
|
||||
if isinstance(torch_model, str):
|
||||
torch_model = vision_models_dict[torch_model]
|
||||
@@ -99,6 +96,9 @@ def get_vision_model(torch_model):
|
||||
################################################################################
|
||||
|
||||
# Utility function for comparing two tensors (torch).
def compare_tensors(torch_tensor, numpy_tensor, rtol=1e-02, atol=1e-03):
def compare_tensors(torch_tensor, numpy_tensor):
    # setting the absolute and relative tolerance
    rtol = 1e-02
    atol = 1e-03
    # torch_to_numpy = torch_tensor.detach().numpy()
    return np.allclose(torch_tensor, numpy_tensor, rtol, atol)
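
# Illustrative usage sketch (assumption, not part of the original diff),
# mirroring how the test modules above call this helper:
#   results = shark_module.forward(input)
#   assert compare_tensors(act_out, results, rtol=1e-02, atol=1e-03)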
|
||||
|
||||
@@ -16,50 +16,10 @@ except:
|
||||
# Invalid device or cannot modify virtual devices once initialized.
|
||||
pass
|
||||
|
||||
BATCH_SIZE = 1
|
||||
MAX_SEQUENCE_LENGTH = 128
|
||||
|
||||
################################## MHLO/TF models #########################################
|
||||
# TODO : Generate these lists or fetch model source from tank/tf/tf_model_list.csv
|
||||
keras_models = [
|
||||
"resnet50",
|
||||
]
|
||||
maskedlm_models = [
|
||||
"albert-base-v2",
|
||||
"bert-base-uncased",
|
||||
"camembert-base",
|
||||
"convbert-base-turkish-cased",
|
||||
"deberta-base",
|
||||
"distilbert-base-uncased",
|
||||
"electra-small-discriminator",
|
||||
"funnel-transformer",
|
||||
"layoutlm-base-uncased",
|
||||
"longformer-base-4096",
|
||||
"mobilebert-uncased",
|
||||
"mpnet-base",
|
||||
"rembert",
|
||||
"roberta-base",
|
||||
"tapas-base",
|
||||
"tiny-random-flaubert",
|
||||
"xlm-roberta",
|
||||
]
|
||||
tfhf_models = [
|
||||
"microsoft/MiniLM-L12-H384-uncased",
|
||||
]
|
||||
|
||||
|
||||
def get_tf_model(name):
|
||||
if name in keras_models:
|
||||
return get_keras_model(name)
|
||||
elif name in maskedlm_models:
|
||||
return get_causal_lm_model(name)
|
||||
elif name in tfhf_models:
|
||||
return get_TFhf_model(name)
|
||||
else:
|
||||
return get_causal_image_model(name)
|
||||
|
||||
|
||||
##################### Tensorflow Hugging Face LM Models ###################################
|
||||
MAX_SEQUENCE_LENGTH = 512
|
||||
BATCH_SIZE = 1
|
||||
|
||||
# Create a set of 2-dimensional inputs
|
||||
tf_bert_input = [
|
||||
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
|
||||
@@ -85,6 +45,9 @@ class TFHuggingFaceLanguage(tf.Module):
|
||||
|
||||
|
||||
def get_TFhf_model(name):
|
||||
# gpus = tf.config.experimental.list_physical_devices("GPU")
|
||||
# for gpu in gpus:
|
||||
# tf.config.experimental.set_memory_growth(gpu, True)
|
||||
model = TFHuggingFaceLanguage(name)
|
||||
tokenizer = BertTokenizer.from_pretrained(
|
||||
"microsoft/MiniLM-L12-H384-uncased"
|
||||
@@ -122,8 +85,22 @@ def compare_tensors_tf(tf_tensor, numpy_tensor):
|
||||
from transformers import TFAutoModelForMaskedLM, AutoTokenizer
|
||||
import tensorflow as tf
|
||||
|
||||
visible_default = tf.config.list_physical_devices("GPU")
|
||||
try:
|
||||
tf.config.set_visible_devices([], "GPU")
|
||||
visible_devices = tf.config.get_visible_devices()
|
||||
for device in visible_devices:
|
||||
assert device.device_type != "GPU"
|
||||
except:
|
||||
# Invalid device or cannot modify virtual devices once initialized.
|
||||
pass
|
||||
|
||||
# The max_sequence_length is set small for testing purpose.
|
||||
BATCH_SIZE = 1
|
||||
MAX_SEQUENCE_LENGTH = 16
|
||||
|
||||
# Create a set of input signature.
|
||||
input_signature_maskedlm = [
|
||||
inputs_signature = [
|
||||
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
|
||||
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
|
||||
]
|
||||
@@ -154,12 +131,15 @@ class MaskedLM(tf.Module):
|
||||
)
|
||||
self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)[0]
|
||||
|
||||
@tf.function(input_signature=input_signature_maskedlm)
|
||||
@tf.function(input_signature=inputs_signature)
|
||||
def forward(self, input_ids, attention_mask):
|
||||
return self.m.predict(input_ids, attention_mask)
|
||||
|
||||
|
||||
def get_causal_lm_model(hf_name, text="Hello, this is the default text."):
|
||||
# gpus = tf.config.experimental.list_physical_devices("GPU")
|
||||
# for gpu in gpus:
|
||||
# tf.config.experimental.set_memory_growth(gpu, True)
|
||||
model = MaskedLM(hf_name)
|
||||
encoded_input = preprocess_input(hf_name, text)
|
||||
test_input = (encoded_input["input_ids"], encoded_input["attention_mask"])
|
||||
@@ -167,59 +147,16 @@ def get_causal_lm_model(hf_name, text="Hello, this is the default text."):
|
||||
return model, test_input, actual_out
|
||||
|
||||
|
||||
##################### TensorFlow Keras Resnet Models #########################################################
|
||||
# Static shape, including batch size (1).
|
||||
# Can be dynamic once dynamic shape support is ready.
|
||||
INPUT_SHAPE = [1, 224, 224, 3]
|
||||
|
||||
tf_model = tf.keras.applications.resnet50.ResNet50(
|
||||
weights="imagenet", include_top=True, input_shape=tuple(INPUT_SHAPE[1:])
|
||||
)
|
||||
|
||||
|
||||
class ResNetModule(tf.Module):
|
||||
def __init__(self):
|
||||
super(ResNetModule, self).__init__()
|
||||
self.m = tf_model
|
||||
self.m.predict = lambda x: self.m.call(x, training=False)
|
||||
|
||||
@tf.function(input_signature=[tf.TensorSpec(INPUT_SHAPE, tf.float32)])
|
||||
def forward(self, inputs):
|
||||
return self.m.predict(inputs)
|
||||
|
||||
|
||||
def load_image(path_to_image):
|
||||
image = tf.io.read_file(path_to_image)
|
||||
image = tf.image.decode_image(image, channels=3)
|
||||
image = tf.image.resize(image, (224, 224))
|
||||
image = image[tf.newaxis, :]
|
||||
return image
|
||||
|
||||
|
||||
def get_keras_model(modelname):
|
||||
model = ResNetModule()
|
||||
content_path = tf.keras.utils.get_file(
|
||||
"YellowLabradorLooking_new.jpg",
|
||||
"https://storage.googleapis.com/download.tensorflow.org/example_images/YellowLabradorLooking_new.jpg",
|
||||
)
|
||||
content_image = load_image(content_path)
|
||||
input_tensor = tf.keras.applications.resnet50.preprocess_input(
|
||||
content_image
|
||||
)
|
||||
input_data = tf.expand_dims(input_tensor, 0)
|
||||
actual_out = model.forward(*input_data)
|
||||
return model, input_data, actual_out
|
||||
|
||||
|
||||
##################### Tensorflow Hugging Face Image Classification Models ###################################
|
||||
from transformers import TFAutoModelForImageClassification
|
||||
from transformers import ConvNextFeatureExtractor, ViTFeatureExtractor
|
||||
from transformers import BeitFeatureExtractor, AutoFeatureExtractor
|
||||
import tensorflow as tf
|
||||
from PIL import Image
|
||||
import requests
|
||||
|
||||
# Create a set of input signature.
|
||||
input_signature_img_cls = [
|
||||
inputs_signature = [
|
||||
tf.TensorSpec(shape=[1, 3, 224, 224], dtype=tf.float32),
|
||||
]
|
||||
|
||||
@@ -232,7 +169,7 @@ class AutoModelImageClassfication(tf.Module):
|
||||
)
|
||||
self.m.predict = lambda x: self.m(x)
|
||||
|
||||
@tf.function(input_signature=input_signature_img_cls)
|
||||
@tf.function(input_signature=inputs_signature)
|
||||
def forward(self, inputs):
|
||||
return self.m.predict(inputs)
|
||||
|
||||
|
||||
@@ -34,13 +34,11 @@ class MpNetModuleTest(unittest.TestCase):
|
||||
self.module_tester = MpNetModuleTester(self)
|
||||
self.module_tester.benchmark = pytestconfig.getoption("benchmark")
|
||||
|
||||
@pytest.mark.xfail(reason="https://github.com/nod-ai/SHARK/issues/203")
|
||||
def test_module_static_cpu(self):
|
||||
dynamic = False
|
||||
device = "cpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.xfail(reason="https://github.com/nod-ai/SHARK/issues/203")
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("gpu"), reason=device_driver_info("gpu")
|
||||
)
|
||||
@@ -49,7 +47,6 @@ class MpNetModuleTest(unittest.TestCase):
|
||||
device = "gpu"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
@pytest.mark.xfail(reason="https://github.com/nod-ai/SHARK/issues/203")
|
||||
@pytest.mark.skipif(
|
||||
check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
|
||||
)
|
||||
@@ -57,14 +54,6 @@ class MpNetModuleTest(unittest.TestCase):
|
||||
dynamic = False
|
||||
device = "vulkan"
|
||||
self.module_tester.create_and_check_module(dynamic, device)
|
||||
# @pytest.mark.skipif(
|
||||
# check_device_drivers("intel-gpu"),
|
||||
# reason=device_driver_info("intel-gpu"),
|
||||
# )
|
||||
# def test_module_static_intel_gpu(self):
|
||||
# dynamic = False
|
||||
# device = "intel-gpu"
|
||||
# self.module_tester.create_and_check_module(dynamic, device)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -2,7 +2,6 @@ model_name, use_tracing, model_type
microsoft/MiniLM-L12-H384-uncased,True,hf
albert-base-v2,True,hf
bert-base-uncased,True,hf
bert-base-cased,True,hf
google/mobilebert-uncased,True,hf
alexnet,False,vision
resnet18,False,vision
@@ -10,4 +9,3 @@ resnet50,False,vision
resnet101,False,vision
squeezenet1_0,False,vision
wide_resnet50_2,False,vision
mobilenet_v3_small,False,vision
|
||||
|
||||
|
@@ -17,17 +17,16 @@ python -m pip install --upgrade pip
### Install v-diffusion model and its dependencies

```shell
cd tank/pytorch/v_diffusion/
Run the script setup_v_diffusion_pytorch.sh
./setup_diffusion.sh
```

### Run v-diffusion-pytorch model

```shell
./v-diffusion-pytorch/cfg_sample.py "New York City, oil on canvas":5 -n 5 -bs 5
./v-diffusion-pytorch/cfg_sample.py "the rise of consciousness":5 -n 5 -bs 5 --seed 0
```

### Run the v-diffusion model via torch-mlir
### Compile v-diffusion model via torch-mlir
```shell
./cfg_sample.py "New York City, oil on canvas":5 -n 1 -bs 1 --steps 2
python v_diffusion.py 2> v_diffusion_ir.mlir
```
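
A possible next step (a sketch, not part of this README; whether `SharkInference` accepts the dumped textual IR unchanged, and the `"forward"` entry point name, are assumptions) is to load `v_diffusion_ir.mlir` back and compile it with SHARK:

```python
from shark.shark_inference import SharkInference

# Read the linalg-on-tensors IR emitted by v_diffusion.py above.
with open("v_diffusion_ir.mlir", "r") as f:
    model_mlir = f.read()

# Compile for CPU; other devices follow the same pattern used in the test suite.
shark_module = SharkInference(
    model_mlir, "forward", device="cpu", mlir_dialect="linalg"
)
shark_module.compile()
```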
|
||||
116  tank/pytorch/v_diffusion/cfg_sample.py  Normal file
@@ -0,0 +1,116 @@
|
||||
import argparse
|
||||
import os
|
||||
from functools import partial
|
||||
|
||||
import clip
|
||||
import torch
|
||||
from torchvision import transforms
|
||||
from tqdm import trange
|
||||
|
||||
try:
|
||||
from diffusion import get_model, sampling, utils
|
||||
except ModuleNotFoundError:
|
||||
print(
|
||||
"You need to download v-diffusion source from https://github.com/crowsonkb/v-diffusion-pytorch"
|
||||
)
|
||||
raise
|
||||
|
||||
torch.manual_seed(0)
|
||||
|
||||
|
||||
def parse_prompt(prompt, default_weight=3.0):
|
||||
if prompt.startswith("http://") or prompt.startswith("https://"):
|
||||
vals = prompt.rsplit(":", 2)
|
||||
vals = [vals[0] + ":" + vals[1], *vals[2:]]
|
||||
else:
|
||||
vals = prompt.rsplit(":", 1)
|
||||
vals = vals + ["", default_weight][len(vals) :]
|
||||
return vals[0], float(vals[1])
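
# Illustrative examples (not part of the original file):
#   parse_prompt("New York City, oil on canvas:5")  -> ("New York City, oil on canvas", 5.0)
#   parse_prompt("New York City, oil on canvas")    -> ("New York City, oil on canvas", 3.0)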
|
||||
|
||||
|
||||
args = argparse.Namespace(
|
||||
prompts=["New York City, oil on canvas"],
|
||||
batch_size=1,
|
||||
device="cuda",
|
||||
model="cc12m_1_cfg",
|
||||
n=1,
|
||||
steps=10,
|
||||
)
|
||||
|
||||
device = torch.device(args.device)
|
||||
print("Using device:", device)
|
||||
|
||||
model = get_model(args.model)()
|
||||
_, side_y, side_x = model.shape
|
||||
checkpoint = f"{args.model}.pth"
|
||||
if os.path.exists(checkpoint):
|
||||
model.load_state_dict(torch.load(checkpoint, map_location="cpu"))
|
||||
|
||||
model = model.to(device).eval().requires_grad_(False)
|
||||
clip_model_name = (
|
||||
model.clip_model if hasattr(model, "clip_model") else "ViT-B/16"
|
||||
)
|
||||
clip_model = clip.load(clip_model_name, jit=False, device=device)[0]
|
||||
clip_model.eval().requires_grad_(False)
|
||||
normalize = transforms.Normalize(
|
||||
mean=[0.48145466, 0.4578275, 0.40821073],
|
||||
std=[0.26862954, 0.26130258, 0.27577711],
|
||||
)
|
||||
|
||||
zero_embed = torch.zeros([1, clip_model.visual.output_dim], device=device)
|
||||
target_embeds, weights = [zero_embed], []
|
||||
|
||||
txt, weight = parse_prompt(args.prompts[0])
|
||||
target_embeds.append(
|
||||
clip_model.encode_text(clip.tokenize(txt).to(device)).float()
|
||||
)
|
||||
weights.append(weight)
|
||||
|
||||
weights = torch.tensor([1 - sum(weights), *weights], device=device)
|
||||
|
||||
|
||||
def cfg_model_fn(model, x, t):
|
||||
n = x.shape[0]
|
||||
n_conds = len(target_embeds)
|
||||
x_in = x.repeat([n_conds, 1, 1, 1])
|
||||
t_in = t.repeat([n_conds])
|
||||
clip_embed_in = torch.cat([*target_embeds]).repeat_interleave(n, 0)
|
||||
vs = model(x_in, t_in, clip_embed_in).view([n_conds, n, *x.shape[1:]])
|
||||
v = vs.mul(weights[:, None, None, None, None]).sum(0)
|
||||
return v
|
||||
|
||||
|
||||
x = torch.randn([args.n, 3, side_y, side_x], device=device)
|
||||
t = torch.linspace(1, 0, args.steps + 1, device=device)[:-1]
|
||||
|
||||
|
||||
def repro(model):
|
||||
if device.type == "cuda":
|
||||
model = model.half()
|
||||
|
||||
steps = utils.get_spliced_ddpm_cosine_schedule(t)
|
||||
for i in trange(0, args.n, args.batch_size):
|
||||
cur_batch_size = min(args.n - i, args.batch_size)
|
||||
outs = sampling.plms_sample(
|
||||
partial(cfg_model_fn, model), x[i : i + cur_batch_size], steps, {}
|
||||
)
|
||||
for j, out in enumerate(outs):
|
||||
utils.to_pil_image(out).save(f"out_{i + j:05}.png")
|
||||
|
||||
|
||||
def trace(model, x, t):
|
||||
n = x.shape[0]
|
||||
n_conds = len(target_embeds)
|
||||
x_in = x.repeat([n_conds, 1, 1, 1])
|
||||
t_in = t.repeat([n_conds])
|
||||
clip_embed_in = torch.cat([*target_embeds]).repeat_interleave(n, 0)
|
||||
ts_mod = torch.jit.trace(model, (x_in, t_in, clip_embed_in))
|
||||
print(ts_mod.graph)
|
||||
|
||||
clip_model = clip.load(clip_model_name, jit=True, device=device)[0]
|
||||
print(clip_model.graph)
|
||||
|
||||
|
||||
# You can't run both of these because repro will `.half()` the model
|
||||
# repro(model)
|
||||
trace(model, x, t[0])
|
||||
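In the script above, cfg_model_fn stacks the unconditional (zero) embedding and the prompt embedding into one batch and collapses the two predictions with the weight vector [1 - w, w]. For a single prompt this is the usual classifier-free guidance blend; a minimal, self-contained sketch of that equivalence (illustrative only, with made-up tensor sizes):

import torch

w = 3.0                                # default prompt weight from parse_prompt
v_uncond = torch.randn(1, 3, 8, 8)     # prediction for the zero embedding
v_cond = torch.randn(1, 3, 8, 8)       # prediction for the text embedding

vs = torch.stack([v_uncond, v_cond])   # [n_conds, n, C, H, W], as in cfg_model_fn
weights = torch.tensor([1 - w, w])
v_stacked = vs.mul(weights[:, None, None, None, None]).sum(0)
v_direct = (1 - w) * v_uncond + w * v_cond
assert torch.allclose(v_stacked, v_direct)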
BIN
tank/pytorch/v_diffusion/out_00000.png
Normal file
Binary file not shown. (new image, 145 KiB)
12
tank/pytorch/v_diffusion_pytorch/setup_v_diffusion_pytorch.sh → tank/pytorch/v_diffusion/setup_v_diffusion.sh
Normal file → Executable file
@@ -14,9 +14,11 @@ echo "Python: $PYTHON"
echo "Python version: $PYTHON_VERSION_X_Y"

git clone --recursive https://github.com/crowsonkb/v-diffusion-pytorch.git
pip install ftfy regex tqdm
pip uninstall -y torch torchvision
pip install -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html --pre torch torchvision

mkdir checkpoints
wget https://the-eye.eu/public/AI/models/v-diffusion/cc12m_1_cfg.pth -P checkpoints/
pip install -r v-diffusion-pytorch/requirements.txt
pip install ftfy regex tqdm
pip install git+https://github.com/openai/CLIP.git


mkdir v-diffusion-pytorch/checkpoints
wget https://the-eye.eu/public/AI/models/v-diffusion/cc12m_1_cfg.pth -P v-diffusion-pytorch/checkpoints/
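The updated setup script clones v-diffusion-pytorch, installs its requirements plus CLIP, and places cc12m_1_cfg.pth under v-diffusion-pytorch/checkpoints/. A small sanity check (illustrative sketch, assuming it is run from the same directory as the setup script) that the layout the Python scripts expect is in place:

import os
import sys

sys.path.append("v-diffusion-pytorch")
# Checkpoint downloaded by setup_v_diffusion.sh
assert os.path.isfile("v-diffusion-pytorch/checkpoints/cc12m_1_cfg.pth")
from diffusion import get_model  # resolves once the checkout is on sys.path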
86
tank/pytorch/v_diffusion/v_diffusion.py
Normal file
@@ -0,0 +1,86 @@
# # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# # See https://llvm.org/LICENSE.txt for license information.
# # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# # Also available under a BSD-style license. See LICENSE.

import torch

from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions
import tempfile

import math
import sys
import gc

from torchvision import utils as tv_utils
from torchvision.transforms import functional as TF
from tqdm.notebook import trange, tqdm

sys.path.append("v-diffusion-pytorch")

import clip
from diffusion import get_model, sampling, utils
import torch_mlir


# Load the models
model = get_model("cc12m_1_cfg")()
_, side_y, side_x = model.shape
model = model.eval().requires_grad_(False)
clip_model = clip.load(model.clip_model, jit=True, device="cpu")[0]

prompt = "New York City, oil on canvas"

weight = 1
n_images = 1
steps = 2

target_embed = clip_model.encode_text(clip.tokenize(prompt))
x = torch.randn([n_images, 3, side_y, side_x], device="cpu")
t = torch.linspace(1, 0, steps + 1, device="cpu")[:-1]

n = x.shape[0]
x_in = x.repeat([2, 1, 1, 1])
t_in = t
clip_embed_repeat = target_embed.repeat([n, 1])
clip_embed_in = torch.cat(
    [torch.zeros_like(clip_embed_repeat), clip_embed_repeat]
)


def model_inference(x_in, t_in, clip_embed_in):
    return model(x_in, t_in, clip_embed_in)


fx_g = make_fx(
    model_inference,
    decomposition_table=get_decompositions(
        [
            torch.ops.aten.embedding_dense_backward,
            torch.ops.aten.native_layer_norm_backward,
            torch.ops.aten.slice_backward,
            torch.ops.aten.select_backward,
            torch.ops.aten.norm.ScalarOpt_dim,
            torch.ops.aten.native_group_norm,
            torch.ops.aten.upsample_bilinear2d.vec,
            torch.ops.aten.split.Tensor,
        ]
    ),
)(x_in, t_in, clip_embed_in)

fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
fx_g.recompile()

ts_g = torch.jit.trace(fx_g, (x_in, t_in, clip_embed_in))
temp = tempfile.NamedTemporaryFile(suffix="_shark_ts", prefix="temp_ts_")
ts_g.save(temp.name)
new_ts = torch.jit.load(temp.name)

module = torch_mlir.compile(
    new_ts,
    [x_in, t_in, clip_embed_in],
    torch_mlir.OutputType.LINALG_ON_TENSORS,
    use_tracing=False,
)
module.dump()
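v_diffusion.py stops at module.dump(); the linalg-on-tensors module it produces is the same kind of artifact the removed script below feeds to the SHARK runtime. A sketch of that hand-off, following the SharkInference call pattern used elsewhere in this change (device and dialect arguments may need adjusting):

from shark.shark_inference import SharkInference

# Run the compiled linalg module through the SHARK runtime.
shark_module = SharkInference(
    module, "forward", device="cpu", mlir_dialect="linalg"
)
shark_module.compile()
result = shark_module.forward(
    (x_in.numpy(), t_in.numpy(), clip_embed_in.numpy())
)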
@@ -1,283 +0,0 @@
#!/usr/bin/env python3

"""Classifier-free guidance sampling from a diffusion model."""

import argparse
from functools import partial
from pathlib import Path

from PIL import Image
import torch
from torch import nn
from torch.nn import functional as F
from torchvision import transforms
from torchvision.transforms import functional as TF
from tqdm import trange

from shark.shark_inference import SharkInference

import sys

sys.path.append("v-diffusion-pytorch")
from CLIP import clip
from diffusion import get_model, get_models, sampling, utils

MODULE_DIR = Path(__file__).resolve().parent


def parse_prompt(prompt, default_weight=3.0):
    if prompt.startswith("http://") or prompt.startswith("https://"):
        vals = prompt.rsplit(":", 2)
        vals = [vals[0] + ":" + vals[1], *vals[2:]]
    else:
        vals = prompt.rsplit(":", 1)
    vals = vals + ["", default_weight][len(vals) :]
    return vals[0], float(vals[1])


def resize_and_center_crop(image, size):
    fac = max(size[0] / image.size[0], size[1] / image.size[1])
    image = image.resize(
        (int(fac * image.size[0]), int(fac * image.size[1])), Image.LANCZOS
    )
    return TF.center_crop(image, size[::-1])


# def main():
p = argparse.ArgumentParser(
    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
p.add_argument(
    "prompts", type=str, default=[], nargs="*", help="the text prompts to use"
)
p.add_argument(
    "--images",
    type=str,
    default=[],
    nargs="*",
    metavar="IMAGE",
    help="the image prompts",
)
p.add_argument(
    "--batch-size",
    "-bs",
    type=int,
    default=1,
    help="the number of images per batch",
)
p.add_argument("--checkpoint", type=str, help="the checkpoint to use")
p.add_argument("--device", type=str, help="the device to use")
p.add_argument(
    "--eta",
    type=float,
    default=0.0,
    help="the amount of noise to add during sampling (0-1)",
)
p.add_argument("--init", type=str, help="the init image")
p.add_argument(
    "--method",
    type=str,
    default="plms",
    choices=["ddpm", "ddim", "prk", "plms", "pie", "plms2", "iplms"],
    help="the sampling method to use",
)
p.add_argument(
    "--model",
    type=str,
    default="cc12m_1_cfg",
    choices=["cc12m_1_cfg"],
    help="the model to use",
)
p.add_argument(
    "-n", type=int, default=1, help="the number of images to sample"
)
p.add_argument("--seed", type=int, default=0, help="the random seed")
p.add_argument("--size", type=int, nargs=2, help="the output image size")
p.add_argument(
    "--starting-timestep",
    "-st",
    type=float,
    default=0.9,
    help="the timestep to start at (used with init images)",
)
p.add_argument("--steps", type=int, default=50, help="the number of timesteps")
args = p.parse_args()

if args.device:
    device = torch.device(args.device)
else:
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model = get_model(args.model)()
_, side_y, side_x = model.shape
if args.size:
    side_x, side_y = args.size
checkpoint = args.checkpoint
if not checkpoint:
    checkpoint = MODULE_DIR / f"checkpoints/{args.model}.pth"
model.load_state_dict(torch.load(checkpoint, map_location="cpu"))
if device.type == "cuda":
    model = model.half()
model = model.to(device).eval().requires_grad_(False)
clip_model_name = (
    model.clip_model if hasattr(model, "clip_model") else "ViT-B/16"
)
clip_model = clip.load(clip_model_name, jit=False, device=device)[0]
clip_model.eval().requires_grad_(False)
normalize = transforms.Normalize(
    mean=[0.48145466, 0.4578275, 0.40821073],
    std=[0.26862954, 0.26130258, 0.27577711],
)

if args.init:
    init = Image.open(utils.fetch(args.init)).convert("RGB")
    init = resize_and_center_crop(init, (side_x, side_y))
    init = (
        utils.from_pil_image(init).to(device)[None].repeat([args.n, 1, 1, 1])
    )

zero_embed = torch.zeros([1, clip_model.visual.output_dim], device=device)
target_embeds, weights = [zero_embed], []

for prompt in args.prompts:
    txt, weight = parse_prompt(prompt)
    target_embeds.append(
        clip_model.encode_text(clip.tokenize(txt).to(device)).float()
    )
    weights.append(weight)

for prompt in args.images:
    path, weight = parse_prompt(prompt)
    img = Image.open(utils.fetch(path)).convert("RGB")
    clip_size = clip_model.visual.input_resolution
    img = resize_and_center_crop(img, (clip_size, clip_size))
    batch = TF.to_tensor(img)[None].to(device)
    embed = F.normalize(
        clip_model.encode_image(normalize(batch)).float(), dim=-1
    )
    target_embeds.append(embed)
    weights.append(weight)

weights = torch.tensor([1 - sum(weights), *weights], device=device)

torch.manual_seed(args.seed)


def cfg_model_fn(x, t):
    n = x.shape[0]
    n_conds = len(target_embeds)
    x_in = x.repeat([n_conds, 1, 1, 1])
    t_in = t.repeat([n_conds])
    clip_embed_in = torch.cat([*target_embeds]).repeat([n, 1])
    vs = model(x_in, t_in, clip_embed_in).view([n_conds, n, *x.shape[1:]])
    v = vs.mul(weights[:, None, None, None, None]).sum(0)
    return v


x = torch.randn([args.n, 3, side_y, side_x], device=device)
t = torch.linspace(1, 0, args.steps + 1, device=device)[:-1]
steps = utils.get_spliced_ddpm_cosine_schedule(t)
min_batch_size = min(args.n, args.batch_size)
x_in = x[0:min_batch_size, :, :, :]
ts = x_in.new_ones([x_in.shape[0]])
t_in = t[0] * ts

from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions
import torch_mlir

fx_g = make_fx(
    cfg_model_fn,
    decomposition_table=get_decompositions(
        [
            torch.ops.aten.embedding_dense_backward,
            torch.ops.aten.native_layer_norm_backward,
            torch.ops.aten.slice_backward,
            torch.ops.aten.select_backward,
            torch.ops.aten.norm.ScalarOpt_dim,
            torch.ops.aten.native_group_norm,
            torch.ops.aten.upsample_bilinear2d.vec,
            torch.ops.aten.split.Tensor,
            torch.ops.aten.split_with_sizes,
        ]
    ),
)(x_in, t_in)

fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
fx_g.recompile()


def strip_overloads(gm):
    """
    Modifies the target of graph nodes in :attr:`gm` to strip overloads.
    Args:
        gm(fx.GraphModule): The input Fx graph module to be modified
    """
    for node in gm.graph.nodes:
        if isinstance(node.target, torch._ops.OpOverload):
            node.target = node.target.overloadpacket
    gm.recompile()


strip_overloads(fx_g)

ts_g = torch.jit.script(fx_g)

module = torch_mlir.compile(
    ts_g,
    [x_in, t_in],
    torch_mlir.OutputType.LINALG_ON_TENSORS,
    use_tracing=False,
)

mlir_model = module
func_name = "forward"


def compiled_cfg_model_fn(x, t):
    x_ny = x.detach().numpy()
    t_ny = t.detach().numpy()
    inputs = (x_ny, t_ny)
    shark_module = SharkInference(
        mlir_model, func_name, device="gpu", mlir_dialect="linalg"
    )
    shark_module.compile()
    result = shark_module.forward(inputs)
    return torch.from_numpy(result)


def run(x, steps):
    if args.method == "ddpm":
        return sampling.sample(compiled_cfg_model_fn, x, steps, 1.0, {})
    if args.method == "ddim":
        return sampling.sample(compiled_cfg_model_fn, x, steps, args.eta, {})
    if args.method == "prk":
        return sampling.prk_sample(compiled_cfg_model_fn, x, steps, {})
    if args.method == "plms":
        return sampling.plms_sample(compiled_cfg_model_fn, x, steps, {})
    if args.method == "pie":
        return sampling.pie_sample(compiled_cfg_model_fn, x, steps, {})
    if args.method == "plms2":
        return sampling.plms2_sample(compiled_cfg_model_fn, x, steps, {})
    if args.method == "iplms":
        return sampling.iplms_sample(compiled_cfg_model_fn, x, steps, {})
    assert False


def run_all(x, t, steps, n, batch_size):
    x = torch.randn([n, 3, side_y, side_x], device=device)
    t = torch.linspace(1, 0, args.steps + 1, device=device)[:-1]
    steps = utils.get_spliced_ddpm_cosine_schedule(t)
    if args.init:
        steps = steps[steps < args.starting_timestep]
        alpha, sigma = utils.t_to_alpha_sigma(steps[0])
        x = init * alpha + x * sigma
    for i in trange(0, n, batch_size):
        cur_batch_size = min(n - i, batch_size)
        outs = run(x[i : i + cur_batch_size], steps)
        for j, out in enumerate(outs):
            utils.to_pil_image(out).save(f"out_{i + j:05}.png")


run_all(x, t, steps, args.n, args.batch_size)
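One detail of the removed script worth noting: compiled_cfg_model_fn constructs and compiles a SharkInference module on every call, i.e. once per sampler step. A sketch of the same wrapper with compilation hoisted out of the hot path (same names and assumptions as the code above):

shark_module = SharkInference(
    mlir_model, func_name, device="gpu", mlir_dialect="linalg"
)
shark_module.compile()  # compile once, reuse for every sampler step


def compiled_cfg_model_fn(x, t):
    result = shark_module.forward((x.detach().numpy(), t.detach().numpy()))
    return torch.from_numpy(result)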
@@ -55,14 +55,6 @@ class RemBertModuleTest(unittest.TestCase):
        dynamic = False
        device = "vulkan"
        self.module_tester.create_and_check_module(dynamic, device)
    @pytest.mark.skipif(
        check_device_drivers("intel-gpu"),
        reason=device_driver_info("intel-gpu"),
    )
    def test_module_static_intel_gpu(self):
        dynamic = False
        device = "intel-gpu"
        self.module_tester.create_and_check_module(dynamic, device)


if __name__ == "__main__":
@@ -100,14 +100,6 @@ class Resnet101ModuleTest(unittest.TestCase):
        dynamic = True
        device = "vulkan"
        self.module_tester.create_and_check_module(dynamic, device)
    @pytest.mark.skipif(
        check_device_drivers("intel-gpu"),
        reason=device_driver_info("intel-gpu"),
    )
    def test_module_static_intel_gpu(self):
        dynamic = False
        device = "intel-gpu"
        self.module_tester.create_and_check_module(dynamic, device)


if __name__ == "__main__":
@@ -99,14 +99,7 @@ class Resnet18ModuleTest(unittest.TestCase):
        dynamic = True
        device = "vulkan"
        self.module_tester.create_and_check_module(dynamic, device)
    @pytest.mark.skipif(
        check_device_drivers("intel-gpu"),
        reason=device_driver_info("intel-gpu"),
    )
    def test_module_static_intel_gpu(self):
        dynamic = False
        device = "intel-gpu"
        self.module_tester.create_and_check_module(dynamic, device)


if __name__ == "__main__":
    unittest.main()
@@ -1,81 +0,0 @@
from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_downloader import download_tf_model
from shark.parser import shark_args

import unittest
import numpy as np
import pytest
import numpy as np


class Resnet50ModuleTester:
    def __init__(
        self,
        benchmark=False,
        onnx_bench=False,
    ):
        self.benchmark = benchmark
        self.onnx_bench = onnx_bench

    def create_and_check_module(self, dynamic, device):
        model, func_name, inputs, golden_out = download_tf_model("resnet50")

        shark_module = SharkInference(
            model,
            func_name,
            device=device,
            mlir_dialect="mhlo",
            is_benchmark=self.benchmark,
        )
        shark_module.compile()
        result = shark_module.forward(inputs)
        np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)

        if self.benchmark == True:
            shark_args.enable_tf32 = True
            shark_args.onnx_bench = self.onnx_bench
            shark_module.shark_runner.benchmark_all_csv(
                (inputs), "resnet50", dynamic, device, "tensorflow"
            )


class Resnet50ModuleTest(unittest.TestCase):
    @pytest.fixture(autouse=True)
    def configure(self, pytestconfig):
        self.module_tester = Resnet50ModuleTester(self)
        self.module_tester.benchmark = pytestconfig.getoption("benchmark")
        self.module_tester.onnx_bench = pytestconfig.getoption("onnx_bench")

    def test_module_static_cpu(self):
        dynamic = False
        device = "cpu"
        self.module_tester.create_and_check_module(dynamic, device)

    @pytest.mark.skipif(
        check_device_drivers("gpu"), reason=device_driver_info("gpu")
    )
    def test_module_static_gpu(self):
        dynamic = False
        device = "gpu"
        self.module_tester.create_and_check_module(dynamic, device)

    @pytest.mark.skipif(
        check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
    )
    def test_module_static_vulkan(self):
        dynamic = False
        device = "vulkan"
        self.module_tester.create_and_check_module(dynamic, device)
    @pytest.mark.skipif(
        check_device_drivers("intel-gpu"),
        reason=device_driver_info("intel-gpu"),
    )
    def test_module_static_intel_gpu(self):
        dynamic = False
        device = "intel-gpu"
        self.module_tester.create_and_check_module(dynamic, device)


if __name__ == "__main__":
    unittest.main()
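The TF test modules touched in this change all follow the same shape as the deleted Resnet50 file above: download a model from the tank, run it through SharkInference, and compare against the golden output, with per-device skip guards. A condensed sketch of that shared pattern (names and tolerances taken from the code above, expressed with pytest parametrization for brevity):

import numpy as np
import pytest

from shark.shark_inference import SharkInference
from shark.iree_utils._common import check_device_drivers, device_driver_info
from shark.shark_downloader import download_tf_model


@pytest.mark.parametrize("device", ["cpu", "gpu", "vulkan"])
def test_resnet50_static(device):
    if check_device_drivers(device):
        pytest.skip(device_driver_info(device))
    model, func_name, inputs, golden_out = download_tf_model("resnet50")
    shark_module = SharkInference(
        model, func_name, device=device, mlir_dialect="mhlo"
    )
    shark_module.compile()
    result = shark_module.forward(inputs)
    np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)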
@@ -100,14 +100,6 @@ class Resnet50ModuleTest(unittest.TestCase):
        dynamic = True
        device = "vulkan"
        self.module_tester.create_and_check_module(dynamic, device)
    @pytest.mark.skipif(
        check_device_drivers("intel-gpu"),
        reason=device_driver_info("intel-gpu"),
    )
    def test_module_static_intel_gpu(self):
        dynamic = False
        device = "intel-gpu"
        self.module_tester.create_and_check_module(dynamic, device)


if __name__ == "__main__":
@@ -28,9 +28,7 @@ class RobertaBaseModuleTester:
        )
        shark_module.compile()
        result = shark_module.forward(inputs)
        np.testing.assert_allclose(
            result, golden_out, rtol=1e-02, atol=1e-01, verbose=True
        )
        np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)


class RobertaBaseModuleTest(unittest.TestCase):
@@ -44,7 +42,6 @@ class RobertaBaseModuleTest(unittest.TestCase):
        device = "cpu"
        self.module_tester.create_and_check_module(dynamic, device)

    @pytest.mark.xfail(reason="https://github.com/nod-ai/SHARK/issues/274")
    @pytest.mark.skipif(
        check_device_drivers("gpu"), reason=device_driver_info("gpu")
    )
@@ -60,14 +57,6 @@ class RobertaBaseModuleTest(unittest.TestCase):
        dynamic = False
        device = "vulkan"
        self.module_tester.create_and_check_module(dynamic, device)
    @pytest.mark.skipif(
        check_device_drivers("intel-gpu"),
        reason=device_driver_info("intel-gpu"),
    )
    def test_module_static_intel_gpu(self):
        dynamic = False
        device = "intel-gpu"
        self.module_tester.create_and_check_module(dynamic, device)


if __name__ == "__main__":
@@ -57,14 +57,6 @@ class TapasBaseModuleTest(unittest.TestCase):
        dynamic = False
        device = "vulkan"
        self.module_tester.create_and_check_module(dynamic, device)
    @pytest.mark.skipif(
        check_device_drivers("intel-gpu"),
        reason=device_driver_info("intel-gpu"),
    )
    def test_module_static_intel_gpu(self):
        dynamic = False
        device = "intel-gpu"
        self.module_tester.create_and_check_module(dynamic, device)


if __name__ == "__main__":
@@ -8,7 +8,7 @@ import pytest
import numpy as np


class DistilBertModuleTester:
class MiniLMModuleTester:
    def __init__(
        self,
        benchmark=False,
@@ -17,7 +17,7 @@ class DistilBertModuleTester:

    def create_and_check_module(self, dynamic, device):
        model, func_name, inputs, golden_out = download_tf_model(
            "distilbert-base-uncased"
            "microsoft/MiniLM-L12-H384-uncased"
        )

        shark_module = SharkInference(
@@ -28,19 +28,18 @@ class DistilBertModuleTester:
        np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)


class DistilBertModuleTest(unittest.TestCase):
class MiniLMModuleTest(unittest.TestCase):
    @pytest.fixture(autouse=True)
    def configure(self, pytestconfig):
        self.module_tester = DistilBertModuleTester(self)
        self.module_tester = MiniLMModuleTester(self)
        self.module_tester.benchmark = pytestconfig.getoption("benchmark")

    @pytest.mark.xfail(reason="shark_tank hash issues -- awaiting triage")
    def test_module_static_cpu(self):
        dynamic = False
        device = "cpu"
        self.module_tester.create_and_check_module(dynamic, device)

    @pytest.mark.xfail(reason="shark_tank hash issues -- awaiting triage")
    @pytest.mark.skip(reason="MiniLM numerics issues on gpu")
    @pytest.mark.skipif(
        check_device_drivers("gpu"), reason=device_driver_info("gpu")
    )
@@ -49,7 +48,6 @@ class DistilBertModuleTest(unittest.TestCase):
        device = "gpu"
        self.module_tester.create_and_check_module(dynamic, device)

    @pytest.mark.xfail(reason="shark_tank hash issues -- awaiting triage")
    @pytest.mark.skipif(
        check_device_drivers("vulkan"), reason=device_driver_info("vulkan")
    )
@@ -57,14 +55,6 @@ class DistilBertModuleTest(unittest.TestCase):
        dynamic = False
        device = "vulkan"
        self.module_tester.create_and_check_module(dynamic, device)
    @pytest.mark.skipif(
        check_device_drivers("intel-gpu"),
        reason=device_driver_info("intel-gpu"),
    )
    def test_module_static_intel_gpu(self):
        dynamic = False
        device = "intel-gpu"
        self.module_tester.create_and_check_module(dynamic, device)


if __name__ == "__main__":
@@ -11,9 +11,8 @@ microsoft/layoutlm-base-uncased,hf
google/mobilebert-uncased,hf
microsoft/mpnet-base,hf
roberta-base,hf
resnet50,keras
xlm-roberta-base,hf
microsoft/MiniLM-L12-H384-uncased,TFhf
microsoft/MiniLM-L12-H384-uncased,hf
funnel-transformer/small,hf
microsoft/mpnet-base,hf
facebook/convnext-tiny-224,img
@@ -55,14 +55,6 @@ class FlauBertModuleTest(unittest.TestCase):
        dynamic = False
        device = "vulkan"
        self.module_tester.create_and_check_module(dynamic, device)
    @pytest.mark.skipif(
        check_device_drivers("intel-gpu"),
        reason=device_driver_info("intel-gpu"),
    )
    def test_module_static_intel_gpu(self):
        dynamic = False
        device = "intel-gpu"
        self.module_tester.create_and_check_module(dynamic, device)


if __name__ == "__main__":
@@ -100,14 +100,6 @@ class WideResnet50ModuleTest(unittest.TestCase):
        dynamic = True
        device = "vulkan"
        self.module_tester.create_and_check_module(dynamic, device)
    @pytest.mark.skipif(
        check_device_drivers("intel-gpu"),
        reason=device_driver_info("intel-gpu"),
    )
    def test_module_static_intel_gpu(self):
        dynamic = False
        device = "intel-gpu"
        self.module_tester.create_and_check_module(dynamic, device)


if __name__ == "__main__":
@@ -25,9 +25,7 @@ class XLMRobertaModuleTester:
        )
        shark_module.compile()
        result = shark_module.forward(inputs)
        np.testing.assert_allclose(
            result, golden_out, rtol=1e-02, atol=1e-01, verbose=True
        )
        np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)


class XLMRobertaModuleTest(unittest.TestCase):
@@ -41,7 +39,6 @@ class XLMRobertaModuleTest(unittest.TestCase):
        device = "cpu"
        self.module_tester.create_and_check_module(dynamic, device)

    @pytest.mark.xfail(reason="https://github.com/nod-ai/SHARK/issues/274")
    @pytest.mark.skipif(
        check_device_drivers("gpu"), reason=device_driver_info("gpu")
    )
@@ -57,14 +54,6 @@ class XLMRobertaModuleTest(unittest.TestCase):
        dynamic = False
        device = "vulkan"
        self.module_tester.create_and_check_module(dynamic, device)
    @pytest.mark.skipif(
        check_device_drivers("intel-gpu"),
        reason=device_driver_info("intel-gpu"),
    )
    def test_module_static_intel_gpu(self):
        dynamic = False
        device = "intel-gpu"
        self.module_tester.create_and_check_module(dynamic, device)


if __name__ == "__main__":