diff --git a/shark/iree_utils.py b/shark/iree_utils.py
index fa69c4fb..d1f86f16 100644
--- a/shark/iree_utils.py
+++ b/shark/iree_utils.py
@@ -290,18 +290,34 @@ def tensor_to_type_str(input_tensors: tuple, frontend: str):
     """
     list_of_type = []
     for input_tensor in input_tensors:
-        type_string = "x".join([str(dim) for dim in input_tensor.shape])
-        if frontend in ["torch", "pytorch"]:
-            dtype_string = str(input_tensor.dtype).replace("torch.", "")
-        elif frontend in ["tensorflow", "tf"]:
-            dtype = input_tensor.dtype
-            dtype_string = re.findall('\'[^"]*\'',
-                                      str(dtype))[0].replace("\'", "")
-        regex_split = re.compile("([a-zA-Z]+)([0-9]+)")
-        match = regex_split.match(dtype_string)
-        mlir_type_string = str(match.group(1)[0]) + str(match.group(2))
-        type_string += f"x{mlir_type_string}"
-        list_of_type.append(type_string)
+        if isinstance(input_tensor, tuple):
+            for val in input_tensor:
+                type_string = "x".join([str(dim) for dim in val.shape])
+                if frontend in ["torch", "pytorch"]:
+                    dtype_string = str(val.dtype).replace("torch.", "")
+                elif frontend in ["tensorflow", "tf"]:
+                    dtype = val.dtype
+                    dtype_string = re.findall('\'[^"]*\'',
+                                              str(dtype))[0].replace("\'", "")
+                regex_split = re.compile("([a-zA-Z]+)([0-9]+)")
+                match = regex_split.match(dtype_string)
+                mlir_type_string = str(match.group(1)[0]) + str(match.group(2))
+                type_string += f"x{mlir_type_string}"
+                list_of_type.append(type_string)
+
+        else:
+            type_string = "x".join([str(dim) for dim in input_tensor.shape])
+            if frontend in ["torch", "pytorch"]:
+                dtype_string = str(input_tensor.dtype).replace("torch.", "")
+            elif frontend in ["tensorflow", "tf"]:
+                dtype = input_tensor.dtype
+                dtype_string = re.findall('\'[^"]*\'',
+                                          str(dtype))[0].replace("\'", "")
+            regex_split = re.compile("([a-zA-Z]+)([0-9]+)")
+            match = regex_split.match(dtype_string)
+            mlir_type_string = str(match.group(1)[0]) + str(match.group(2))
+            type_string += f"x{mlir_type_string}"
+            list_of_type.append(type_string)
     return list_of_type
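
The tuple branch and the else branch above are identical except for the variable they read; a minimal deduplicated sketch, assuming the same inputs (the helper name `_single_type_str` is illustrative, not part of the patch):

    import re

    def _single_type_str(val, frontend: str) -> str:
        # Shape dims joined with "x", e.g. "1x128".
        type_string = "x".join(str(dim) for dim in val.shape)
        if frontend in ["torch", "pytorch"]:
            dtype_string = str(val.dtype).replace("torch.", "")
        elif frontend in ["tensorflow", "tf"]:
            dtype_string = re.findall("'[^\"]*'", str(val.dtype))[0].replace("'", "")
        match = re.match("([a-zA-Z]+)([0-9]+)", dtype_string)
        # First letter of the dtype name plus its bit width, e.g. float32 -> f32.
        return f"{type_string}x{match.group(1)[0]}{match.group(2)}"

    def tensor_to_type_str(input_tensors: tuple, frontend: str):
        list_of_type = []
        for input_tensor in input_tensors:
            # Treat a bare tensor as a 1-tuple so both cases share one path.
            vals = input_tensor if isinstance(input_tensor, tuple) else (input_tensor,)
            list_of_type.extend(_single_type_str(val, frontend) for val in vals)
        return list_of_type
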
diff --git a/shark/shark_inference.py b/shark/shark_inference.py
index fc841850..2db1e058 100644
--- a/shark/shark_inference.py
+++ b/shark/shark_inference.py
@@ -111,3 +111,7 @@ class SharkInference:
     @benchmark_mode
     def benchmark_c(self):
         self.shark_runner.benchmark_c()
+
+    @benchmark_mode
+    def benchmark_all_csv(self, inputs, modelname, dynamic, device_str):
+        self.shark_runner.benchmark_all_csv(inputs, modelname, dynamic, device_str)
diff --git a/shark/shark_runner.py b/shark/shark_runner.py
index 051a8182..71cf1c0c 100644
--- a/shark/shark_runner.py
+++ b/shark/shark_runner.py
@@ -24,7 +24,9 @@ from shark.iree_utils import get_results, get_iree_compiled_module, export_iree_
 import os
 from shark.parser import shark_args
 from tqdm import tqdm
+from datetime import datetime
 import time
+import csv


 class SharkRunner:
@@ -148,9 +150,9 @@ class SharkBenchmarkRunner(SharkRunner):

     def benchmark_frontend(self, inputs):
         if self.frontend in ["pytorch", "torch"]:
-            self.benchmark_torch(inputs)
+            return self.benchmark_torch(inputs)
         elif self.frontend in ["tensorflow", "tf"]:
-            self.benchmark_tf(inputs)
+            return self.benchmark_tf(inputs)

     def benchmark_torch(self, inputs):
         inputs = self.input if self.from_aot else inputs
@@ -167,6 +169,7 @@ class SharkBenchmarkRunner(SharkRunner):
         print(
             f"Torch benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
         )
+        return [f"{shark_args.num_iterations/(end-begin)}", f"{((end-begin)/shark_args.num_iterations)*1000}"]

     def benchmark_tf(self, inputs):
         for i in range(shark_args.num_warmup_iterations):
@@ -181,11 +184,12 @@ class SharkBenchmarkRunner(SharkRunner):
         print(
             f"TF benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
         )
-        return
-
+        return [f"{shark_args.num_iterations/(end-begin)}", f"{((end-begin)/shark_args.num_iterations)*1000}"]
+
     def benchmark_c(self):
         result = run_benchmark_module(self.benchmark_cl)
         print(f"Shark-{self.frontend} C-benchmark:{result} iter/second")
+        return [f"{result}", f"{1000/result}"]

     def benchmark_python(self, inputs):
         inputs = self.input if self.from_aot else inputs
@@ -201,8 +205,54 @@ class SharkBenchmarkRunner(SharkRunner):
         print(
             f"Shark-{self.frontend} Python-benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
         )
+        return [f"{shark_args.num_iterations/(end-begin)}", f"{((end-begin)/shark_args.num_iterations)*1000}"]

     def benchmark_all(self, inputs):
         self.benchmark_frontend(inputs)
         self.benchmark_python(inputs)
         self.benchmark_c()
+
+    def benchmark_all_csv(self, inputs, modelname, dynamic, device_str):
+        field_names = [
+            'platform',
+            'model',
+            'dynamic',
+            'device',
+            'iter/sec',
+            'ms/iter',
+            'datetime'
+        ]
+        platforms = [
+            'frontend',
+            'shark_python',
+            'shark_iree_c'
+        ]
+
+        if not os.path.exists('bench_results.csv'):
+            with open('bench_results.csv', mode='w', newline='') as f:
+                writer = csv.writer(f)
+                writer.writerow(field_names)
+
+        with open('bench_results.csv', mode='a', newline='') as f:
+            writer = csv.DictWriter(f, fieldnames=field_names)
+            bench_result = {}
+            bench_result['model'] = modelname
+            if dynamic == True:
+                bench_result['dynamic'] = "True"
+            else:
+                bench_result['dynamic'] = "False"
+            bench_result['device'] = device_str
+            for p in platforms:
+                bench_result['platform'] = p
+                if p == 'frontend':
+                    result = self.benchmark_frontend(inputs)
+                elif p == 'shark_python':
+                    result = self.benchmark_python(inputs)
+                else:
+                    result = self.benchmark_c()
+                # Run each benchmark once and reuse its pair of numbers, so
+                # iter/sec and ms/iter come from the same measurement.
+                bench_result['iter/sec'] = result[0]
+                bench_result['ms/iter'] = result[1]
+                bench_result['datetime'] = str(datetime.now())
+                writer.writerow(bench_result)
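
benchmark_all_csv appends one row per platform (frontend, shark_python, shark_iree_c) to bench_results.csv. A small sketch of reading the results back, using only the field names defined above:

    import csv

    with open('bench_results.csv', newline='') as f:
        for row in csv.DictReader(f):
            print(f"{row['model']} [{row['platform']}] on {row['device']} "
                  f"(dynamic={row['dynamic']}): {row['iter/sec']} iter/sec, "
                  f"{row['ms/iter']} ms/iter at {row['datetime']}")
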
- parser.addoption("--save_mlir", action="store_true", default="False", help="Pass option to save input MLIR module to /tmp/ directory.") - parser.addoption("--save_vmfb", action="store_true", default="False", help="Pass option to save input MLIR module to /tmp/ directory.") diff --git a/tank/tf/conftest.py b/tank/tf/conftest.py deleted file mode 100644 index 681f6bc1..00000000 --- a/tank/tf/conftest.py +++ /dev/null @@ -1,3 +0,0 @@ -def pytest_addoption(parser): - # Attaches SHARK command-line arguments to the pytest machinery. - parser.addoption("--save_temps", action="store_true", default="False", help="Saves IREE reproduction artifacts for filing upstream issues.") diff --git a/tank/tf/hf_masked_lm/albert-base-v2_test.py b/tank/tf/hf_masked_lm/albert-base-v2_test.py index b8070655..901d8fbd 100644 --- a/tank/tf/hf_masked_lm/albert-base-v2_test.py +++ b/tank/tf/hf_masked_lm/albert-base-v2_test.py @@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model from tank.model_utils_tf import compare_tensors_tf from shark.iree_utils import check_device_drivers from shark.shark_inference import SharkInference +from shark.parser import shark_args import iree.compiler as ireec import unittest @@ -14,14 +15,21 @@ class AlbertBaseModuleTester: def __init__( self, - save_temps=False + save_temps=False, + save_mlir=False, + save_vmfb=False, + benchmark=False ): self.save_temps = save_temps + self.save_mlir = save_mlir + self.save_vmfb = save_vmfb + self.benchmark = benchmark def create_and_check_module(self, dynamic, device): model, input, act_out = get_causal_lm_model("albert-base-v2") - save_temps = self.save_temps - if save_temps == True: + shark_args.save_mlir = self.save_mlir + shark_args.save_vmfb = self.save_vmfb + if self.save_temps == True: if dynamic == True: repro_dir = f"albert_base_v2_dynamic_{device}" else: @@ -37,7 +45,8 @@ class AlbertBaseModuleTester: shark_module = SharkInference(model, (input,), device=device, dynamic=dynamic, - jit_trace=True) + jit_trace=True, + benchmark_mode=self.benchmark) shark_module.set_frontend("tensorflow") shark_module.compile() results = shark_module.forward((input)) @@ -47,19 +56,29 @@ class AlbertBaseModuleTester: shark_module = SharkInference(model, (input,), device=device, dynamic=dynamic, - jit_trace=True) + jit_trace=True, + benchmark_mode=self.benchmark) shark_module.set_frontend("tensorflow") shark_module.compile() results = shark_module.forward((input)) assert True == compare_tensors_tf(act_out, results) + if self.benchmark == True: + shark_module.benchmark_all_csv((input), + "albert-base-v2", + dynamic, + device) + class AlbertBaseModuleTest(unittest.TestCase): @pytest.fixture(autouse=True) def configure(self, pytestconfig): self.module_tester = AlbertBaseModuleTester(self) - self.module_tester.save_temps = pytestconfig.getoption("save_temps") + self.module_tester.save_temps=pytestconfig.getoption("save_temps") + self.module_tester.save_mlir=pytestconfig.getoption("save_mlir") + self.module_tester.save_vmfb=pytestconfig.getoption("save_vmfb") + self.module_tester.benchmark=pytestconfig.getoption("benchmark") @pytest.mark.xfail(reason="Upstream IREE issue, see https://github.com/google/iree/issues/9536") def test_module_static_cpu(self): diff --git a/tank/tf/hf_masked_lm/bert-base-uncased_test.py b/tank/tf/hf_masked_lm/bert-base-uncased_test.py index 0150d6f6..24c6a25b 100644 --- a/tank/tf/hf_masked_lm/bert-base-uncased_test.py +++ b/tank/tf/hf_masked_lm/bert-base-uncased_test.py @@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model from 
diff --git a/tank/tf/hf_masked_lm/bert-base-uncased_test.py b/tank/tf/hf_masked_lm/bert-base-uncased_test.py
index 0150d6f6..24c6a25b 100644
--- a/tank/tf/hf_masked_lm/bert-base-uncased_test.py
+++ b/tank/tf/hf_masked_lm/bert-base-uncased_test.py
@@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model
 from tank.model_utils_tf import compare_tensors_tf
 from shark.iree_utils import check_device_drivers
 from shark.shark_inference import SharkInference
+from shark.parser import shark_args

 import iree.compiler as ireec
 import unittest
@@ -14,14 +15,21 @@ class BertBaseUncasedModuleTester:

     def __init__(
         self,
-        save_temps=False
+        save_temps=False,
+        save_mlir=False,
+        save_vmfb=False,
+        benchmark=False
     ):
         self.save_temps = save_temps
+        self.save_mlir = save_mlir
+        self.save_vmfb = save_vmfb
+        self.benchmark = benchmark

     def create_and_check_module(self, dynamic, device):
         model, input, act_out = get_causal_lm_model("bert-base-uncased")
-        save_temps = self.save_temps
-        if save_temps == True:
+        shark_args.save_mlir = self.save_mlir
+        shark_args.save_vmfb = self.save_vmfb
+        if self.save_temps == True:
             if dynamic == True:
                 repro_dir = f"bert_base_uncased_dynamic_{device}"
             else:
@@ -37,7 +45,8 @@ class BertBaseUncasedModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
@@ -47,12 +56,19 @@ class BertBaseUncasedModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
         assert True == compare_tensors_tf(act_out, results)

+        if self.benchmark == True:
+            shark_module.benchmark_all_csv((input),
+                                           "bert_base_uncased",
+                                           dynamic,
+                                           device)
+

 class BertBaseUncasedModuleTest(unittest.TestCase):

@@ -60,6 +76,10 @@ class BertBaseUncasedModuleTest(unittest.TestCase):
     def configure(self, pytestconfig):
         self.module_tester = BertBaseUncasedModuleTester(self)
         self.module_tester.save_temps = pytestconfig.getoption("save_temps")
+        self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
+        self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
+        self.module_tester.benchmark = pytestconfig.getoption("benchmark")
+

     @pytest.mark.xfail(reason="Upstream IREE issue, see https://github.com/google/iree/issues/9536")
     def test_module_static_cpu(self):
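
The remaining testers below repeat the albert/bert structure verbatim with only the model name changed; a hedged sketch of the shared shape, were it ever factored into one helper (this helper is not part of the patch):

    from masked_lm import get_causal_lm_model
    from tank.model_utils_tf import compare_tensors_tf
    from shark.shark_inference import SharkInference
    from shark.parser import shark_args

    def check_masked_lm_module(tester, model_name, dynamic, device):
        # Mirrors the per-model create_and_check_module bodies.
        model, input, act_out = get_causal_lm_model(model_name)
        shark_args.save_mlir = tester.save_mlir
        shark_args.save_vmfb = tester.save_vmfb
        shark_module = SharkInference(model, (input,),
                                      device=device,
                                      dynamic=dynamic,
                                      jit_trace=True,
                                      benchmark_mode=tester.benchmark)
        shark_module.set_frontend("tensorflow")
        shark_module.compile()
        results = shark_module.forward((input))
        assert compare_tensors_tf(act_out, results)
        if tester.benchmark:
            shark_module.benchmark_all_csv((input), model_name, dynamic, device)
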
diff --git a/tank/tf/hf_masked_lm/camembert-base_test.py b/tank/tf/hf_masked_lm/camembert-base_test.py
index 97151708..8bffff36 100644
--- a/tank/tf/hf_masked_lm/camembert-base_test.py
+++ b/tank/tf/hf_masked_lm/camembert-base_test.py
@@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model
 from tank.model_utils_tf import compare_tensors_tf
 from shark.iree_utils import check_device_drivers
 from shark.shark_inference import SharkInference
+from shark.parser import shark_args

 import iree.compiler as ireec
 import unittest
@@ -14,14 +15,21 @@ class CamemBertModuleTester:

     def __init__(
         self,
-        save_temps=False
+        save_temps=False,
+        save_mlir=False,
+        save_vmfb=False,
+        benchmark=False
     ):
         self.save_temps = save_temps
+        self.save_mlir = save_mlir
+        self.save_vmfb = save_vmfb
+        self.benchmark = benchmark

     def create_and_check_module(self, dynamic, device):
         model, input, act_out = get_causal_lm_model("camembert-base")
-        save_temps = self.save_temps
-        if save_temps == True:
+        shark_args.save_mlir = self.save_mlir
+        shark_args.save_vmfb = self.save_vmfb
+        if self.save_temps == True:
             if dynamic == True:
                 repro_dir = f"camembert-base_dynamic_{device}"
             else:
@@ -37,7 +45,8 @@ class CamemBertModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
@@ -47,12 +56,18 @@ class CamemBertModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
         assert True == compare_tensors_tf(act_out, results)
-
+
+        if self.benchmark == True:
+            shark_module.benchmark_all_csv((input),
+                                           "camembert-base",
+                                           dynamic,
+                                           device)

 class CamemBertModuleTest(unittest.TestCase):

@@ -60,7 +75,10 @@ class CamemBertModuleTest(unittest.TestCase):
     def configure(self, pytestconfig):
         self.module_tester = CamemBertModuleTester(self)
         self.module_tester.save_temps = pytestconfig.getoption("save_temps")
-
+        self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
+        self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
+        self.module_tester.benchmark = pytestconfig.getoption("benchmark")
+
     @pytest.mark.xfail(reason="Upstream IREE issue, see https://github.com/google/iree/issues/9536")
     def test_module_static_cpu(self):
         dynamic = False
diff --git a/tank/tf/hf_masked_lm/convbert-base-turkish-cased_test.py b/tank/tf/hf_masked_lm/convbert-base-turkish-cased_test.py
index 1a723e80..7f203ec7 100644
--- a/tank/tf/hf_masked_lm/convbert-base-turkish-cased_test.py
+++ b/tank/tf/hf_masked_lm/convbert-base-turkish-cased_test.py
@@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model
 from tank.model_utils_tf import compare_tensors_tf
 from shark.iree_utils import check_device_drivers
 from shark.shark_inference import SharkInference
+from shark.parser import shark_args

 import iree.compiler as ireec
 import unittest
@@ -10,19 +11,27 @@
 import numpy as np
 import tempfile


-class ConvBertModuleTester:
+class ConvBertModuleTester:

     def __init__(
         self,
-        save_temps=False
+        save_temps=False,
+        save_mlir=False,
+        save_vmfb=False,
+        benchmark=False
     ):
         self.save_temps = save_temps
+        self.save_mlir = save_mlir
+        self.save_vmfb = save_vmfb
+        self.benchmark = benchmark
+

     def create_and_check_module(self, dynamic, device):
         model, input, act_out = get_causal_lm_model(
             "dbmdz/convbert-base-turkish-cased")
-        save_temps = self.save_temps
-        if save_temps == True:
+        shark_args.save_mlir = self.save_mlir
+        shark_args.save_vmfb = self.save_vmfb
+        if self.save_temps == True:
             if dynamic == True:
                 repro_dir = f"convbert_base_dynamic_{device}"
             else:
@@ -38,7 +47,8 @@ class ConvBertModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
@@ -48,12 +58,19 @@ class ConvBertModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
         assert True == compare_tensors_tf(act_out, results)

+        if self.benchmark == True:
+            shark_module.benchmark_all_csv((input),
+                                           "convbert-base-turkish-cased",
+                                           dynamic,
+                                           device)
+

 class ConvBertModuleTest(unittest.TestCase):

@@ -61,8 +78,11 @@ class ConvBertModuleTest(unittest.TestCase):
     def configure(self, pytestconfig):
         self.module_tester = ConvBertModuleTester(self)
         self.module_tester.save_temps = pytestconfig.getoption("save_temps")
+        self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
+        self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
+        self.module_tester.benchmark = pytestconfig.getoption("benchmark")

-    @pytest.mark.xfail(reason="Upstream IREE issue, see https://github.com/google/iree/issues/9536")
+    @pytest.mark.xfail(reason="Upstream IREE issue, see https://github.com/google/iree/issues/9536.")
     def test_module_static_cpu(self):
         dynamic = False
         device = "cpu"
diff --git a/tank/tf/hf_masked_lm/deberta-base_test.py b/tank/tf/hf_masked_lm/deberta-base_test.py
index 925f0930..410d6d86 100644
--- a/tank/tf/hf_masked_lm/deberta-base_test.py
+++ b/tank/tf/hf_masked_lm/deberta-base_test.py
@@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model
 from tank.model_utils_tf import compare_tensors_tf
 from shark.iree_utils import check_device_drivers
 from shark.shark_inference import SharkInference
+from shark.parser import shark_args

 import iree.compiler as ireec
 import unittest
@@ -14,14 +15,21 @@ class DebertaModuleTester:

     def __init__(
         self,
-        save_temps=False
+        save_temps=False,
+        save_mlir=False,
+        save_vmfb=False,
+        benchmark=False
     ):
         self.save_temps = save_temps
-
+        self.save_mlir = save_mlir
+        self.save_vmfb = save_vmfb
+        self.benchmark = benchmark
+
     def create_and_check_module(self, dynamic, device):
         model, input, act_out = get_causal_lm_model("microsoft/deberta-base")
-        save_temps = self.save_temps
-        if save_temps == True:
+        shark_args.save_mlir = self.save_mlir
+        shark_args.save_vmfb = self.save_vmfb
+        if self.save_temps == True:
             if dynamic == True:
                 repro_dir = f"deberta-base_dynamic_{device}"
             else:
@@ -37,7 +44,8 @@ class DebertaModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
@@ -47,12 +55,19 @@ class DebertaModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
         assert True == compare_tensors_tf(act_out, results)

+        if self.benchmark == True:
+            shark_module.benchmark_all_csv((input),
+                                           "deberta-base",
+                                           dynamic,
+                                           device)
+

 class DebertaModuleTest(unittest.TestCase):

@@ -60,7 +75,10 @@ class DebertaModuleTest(unittest.TestCase):
     def configure(self, pytestconfig):
         self.module_tester = DebertaModuleTester(self)
         self.module_tester.save_temps = pytestconfig.getoption("save_temps")
-
+        self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
+        self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
+        self.module_tester.benchmark = pytestconfig.getoption("benchmark")
+
     @pytest.mark.xfail
     @pytest.mark.skip(reason="deberta currently failing in the lowering passes."
                      )
diff --git a/tank/tf/hf_masked_lm/distilbert-base-uncased_test.py b/tank/tf/hf_masked_lm/distilbert-base-uncased_test.py
index 7bcd488c..c4d0e06f 100644
--- a/tank/tf/hf_masked_lm/distilbert-base-uncased_test.py
+++ b/tank/tf/hf_masked_lm/distilbert-base-uncased_test.py
@@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model
 from tank.model_utils_tf import compare_tensors_tf
 from shark.iree_utils import check_device_drivers
 from shark.shark_inference import SharkInference
+from shark.parser import shark_args

 import iree.compiler as ireec
 import unittest
@@ -14,14 +15,21 @@ class DistilBertModuleTester:

     def __init__(
         self,
-        save_temps=False
+        save_temps=False,
+        save_mlir=False,
+        save_vmfb=False,
+        benchmark=False
     ):
         self.save_temps = save_temps
+        self.save_mlir = save_mlir
+        self.save_vmfb = save_vmfb
+        self.benchmark = benchmark

     def create_and_check_module(self, dynamic, device):
         model, input, act_out = get_causal_lm_model("distilbert-base-uncased")
-        save_temps = self.save_temps
-        if save_temps == True:
+        shark_args.save_mlir = self.save_mlir
+        shark_args.save_vmfb = self.save_vmfb
+        if self.save_temps == True:
             if dynamic == True:
                 repro_dir = f"distilbert_dynamic_{device}"
             else:
@@ -37,7 +44,8 @@ class DistilBertModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
@@ -47,18 +55,28 @@ class DistilBertModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
         assert True == compare_tensors_tf(act_out, results)

+        if self.benchmark == True:
+            shark_module.benchmark_all_csv((input),
+                                           "distilbert-base-uncased",
+                                           dynamic,
+                                           device)
+

 class DistilBertModuleTest(unittest.TestCase):

     @pytest.fixture(autouse=True)
     def configure(self, pytestconfig):
         self.module_tester = DistilBertModuleTester(self)
         self.module_tester.save_temps = pytestconfig.getoption("save_temps")
+        self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
+        self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
+        self.module_tester.benchmark = pytestconfig.getoption("benchmark")

     @pytest.mark.xfail(reason="Upstream IREE issue, see https://github.com/google/iree/issues/9536")
     def test_module_static_cpu(self):
diff --git a/tank/tf/hf_masked_lm/electra-small-discriminator_test.py b/tank/tf/hf_masked_lm/electra-small-discriminator_test.py
index 8377c1bf..66b77168 100644
--- a/tank/tf/hf_masked_lm/electra-small-discriminator_test.py
+++ b/tank/tf/hf_masked_lm/electra-small-discriminator_test.py
@@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model
 from tank.model_utils_tf import compare_tensors_tf
 from shark.iree_utils import check_device_drivers
 from shark.shark_inference import SharkInference
+from shark.parser import shark_args

 import iree.compiler as ireec
 import unittest
@@ -14,14 +15,21 @@ class ElectraModuleTester:

     def __init__(
         self,
-        save_temps=False
+        save_temps=False,
+        save_mlir=False,
+        save_vmfb=False,
+        benchmark=False
     ):
         self.save_temps = save_temps
+        self.save_mlir = save_mlir
+        self.save_vmfb = save_vmfb
+        self.benchmark = benchmark

     def create_and_check_module(self, dynamic, device):
         model, input, act_out = get_causal_lm_model("google/electra-small-discriminator")
-        save_temps = self.save_temps
-        if save_temps == True:
+        shark_args.save_mlir = self.save_mlir
+        shark_args.save_vmfb = self.save_vmfb
+        if self.save_temps == True:
             if dynamic == True:
                 repro_dir = f"electra_dynamic_{device}"
             else:
@@ -37,7 +44,8 @@ class ElectraModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
@@ -47,19 +55,29 @@ class ElectraModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
         assert True == compare_tensors_tf(act_out, results)
-
+
+        if self.benchmark == True:
+            shark_module.benchmark_all_csv((input),
+                                           "electra-small-discriminator",
+                                           dynamic,
+                                           device)

 class ElectraModuleTest(unittest.TestCase):

     @pytest.fixture(autouse=True)
     def configure(self, pytestconfig):
         self.module_tester = ElectraModuleTester(self)
         self.module_tester.save_temps = pytestconfig.getoption("save_temps")
+        self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
+        self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
+        self.module_tester.benchmark = pytestconfig.getoption("benchmark")
+
     @pytest.mark.xfail(reason="Upstream IREE issue, see https://github.com/google/iree/issues/9536")
     def test_module_static_cpu(self):
diff --git a/tank/tf/hf_masked_lm/funnel-transformer_test.py b/tank/tf/hf_masked_lm/funnel-transformer_test.py
index 735a9d9f..7c60ed58 100644
--- a/tank/tf/hf_masked_lm/funnel-transformer_test.py
+++ b/tank/tf/hf_masked_lm/funnel-transformer_test.py
@@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model
 from tank.model_utils_tf import compare_tensors_tf
 from shark.iree_utils import check_device_drivers
 from shark.shark_inference import SharkInference
+from shark.parser import shark_args

 import iree.compiler as ireec
 import unittest
@@ -14,14 +15,21 @@ class FunnelModuleTester:

     def __init__(
         self,
-        save_temps=False
+        save_temps=False,
+        save_mlir=False,
+        save_vmfb=False,
+        benchmark=False
     ):
         self.save_temps = save_temps
+        self.save_mlir = save_mlir
+        self.save_vmfb = save_vmfb
+        self.benchmark = benchmark

     def create_and_check_module(self, dynamic, device):
         model, input, act_out = get_causal_lm_model("funnel-transformer/small")
-        save_temps = self.save_temps
-        if save_temps == True:
+        shark_args.save_mlir = self.save_mlir
+        shark_args.save_vmfb = self.save_vmfb
+        if self.save_temps == True:
             if dynamic == True:
                 repro_dir = f"funnel_dynamic_{device}"
             else:
@@ -37,7 +44,8 @@ class FunnelModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
@@ -47,12 +55,18 @@ class FunnelModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
         assert True == compare_tensors_tf(act_out, results)

+        if self.benchmark == True:
+            shark_module.benchmark_all_csv((input),
+                                           "funnel-transformer-small",
+                                           dynamic,
+                                           device)

 class FunnelModuleTest(unittest.TestCase):

@@ -60,6 +74,9 @@ class FunnelModuleTest(unittest.TestCase):
     def configure(self, pytestconfig):
         self.module_tester = FunnelModuleTester(self)
         self.module_tester.save_temps = pytestconfig.getoption("save_temps")
+        self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
+        self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
+        self.module_tester.benchmark = pytestconfig.getoption("benchmark")

     @pytest.mark.skip(reason="funnel currently failing in the lowering passes.")
     def test_module_static_cpu(self):
diff --git a/tank/tf/hf_masked_lm/layoutlm-base-uncased_test.py b/tank/tf/hf_masked_lm/layoutlm-base-uncased_test.py
index 1088b1a6..b6bbfcf7 100644
--- a/tank/tf/hf_masked_lm/layoutlm-base-uncased_test.py
+++ b/tank/tf/hf_masked_lm/layoutlm-base-uncased_test.py
@@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model
 from tank.model_utils_tf import compare_tensors_tf
 from shark.iree_utils import check_device_drivers
 from shark.shark_inference import SharkInference
+from shark.parser import shark_args

 import iree.compiler as ireec
 import unittest
@@ -14,15 +15,22 @@ class LayoutLmModuleTester:

     def __init__(
         self,
-        save_temps=False
+        save_temps=False,
+        save_mlir=False,
+        save_vmfb=False,
+        benchmark=False
     ):
         self.save_temps = save_temps
-
+        self.save_mlir = save_mlir
+        self.save_vmfb = save_vmfb
+        self.benchmark = benchmark
+
     def create_and_check_module(self, dynamic, device):
         model, input, act_out = get_causal_lm_model(
             "microsoft/layoutlm-base-uncased")
-        save_temps = self.save_temps
-        if save_temps == True:
+        shark_args.save_mlir = self.save_mlir
+        shark_args.save_vmfb = self.save_vmfb
+        if self.save_temps == True:
             if dynamic == True:
                 repro_dir = f"layoutlm_dynamic_{device}"
             else:
@@ -38,7 +45,8 @@ class LayoutLmModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
@@ -48,12 +56,18 @@ class LayoutLmModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
         assert True == compare_tensors_tf(act_out, results)

+        if self.benchmark == True:
+            shark_module.benchmark_all_csv((input),
+                                           "layoutlm-base-uncased",
+                                           dynamic,
+                                           device)

 class LayoutLmModuleTest(unittest.TestCase):

@@ -61,6 +75,9 @@ class LayoutLmModuleTest(unittest.TestCase):
     def configure(self, pytestconfig):
         self.module_tester = LayoutLmModuleTester(self)
         self.module_tester.save_temps = pytestconfig.getoption("save_temps")
+        self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
+        self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
+        self.module_tester.benchmark = pytestconfig.getoption("benchmark")

     @pytest.mark.xfail(reason="Upstream IREE issue, see https://github.com/google/iree/issues/9536")
     def test_module_static_cpu(self):
diff --git a/tank/tf/hf_masked_lm/longformer-base-4096_test.py b/tank/tf/hf_masked_lm/longformer-base-4096_test.py
index 66f17e58..4fac8fd4 100644
--- a/tank/tf/hf_masked_lm/longformer-base-4096_test.py
+++ b/tank/tf/hf_masked_lm/longformer-base-4096_test.py
@@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model
 from tank.model_utils_tf import compare_tensors_tf
 from shark.iree_utils import check_device_drivers
 from shark.shark_inference import SharkInference
+from shark.parser import shark_args

 import iree.compiler as ireec
 import unittest
@@ -11,17 +12,24 @@
 import tempfile


 class LongFormerModuleTester:
-
+
     def __init__(
         self,
-        save_temps=False
+        save_temps=False,
+        save_mlir=False,
+        save_vmfb=False,
+        benchmark=False
     ):
         self.save_temps = save_temps
-
+        self.save_mlir = save_mlir
+        self.save_vmfb = save_vmfb
+        self.benchmark = benchmark
+
     def create_and_check_module(self, dynamic, device):
         model, input, act_out = get_causal_lm_model("allenai/longformer-base-4096")
-        save_temps = self.save_temps
-        if save_temps == True:
+        shark_args.save_mlir = self.save_mlir
+        shark_args.save_vmfb = self.save_vmfb
+        if self.save_temps == True:
             if dynamic == True:
                 repro_dir = f"longformer_dynamic_{device}"
             else:
@@ -37,7 +44,8 @@ class LongFormerModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
@@ -47,12 +55,18 @@ class LongFormerModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
         assert True == compare_tensors_tf(act_out, results)

+        if self.benchmark == True:
+            shark_module.benchmark_all_csv((input),
+                                           "longformer-base-4096",
+                                           dynamic,
+                                           device)

 class LongFormerModuleTest(unittest.TestCase):

@@ -60,6 +74,9 @@ class LongFormerModuleTest(unittest.TestCase):
     def configure(self, pytestconfig):
         self.module_tester = LongFormerModuleTester(self)
         self.module_tester.save_temps = pytestconfig.getoption("save_temps")
+        self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
+        self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
+        self.module_tester.benchmark = pytestconfig.getoption("benchmark")

     @pytest.mark.skip(
         reason="longformer currently failing in the lowering passes.")
diff --git a/tank/tf/hf_masked_lm/mobilebert-uncased_test.py b/tank/tf/hf_masked_lm/mobilebert-uncased_test.py
index 46f24ca5..aafb86df 100644
--- a/tank/tf/hf_masked_lm/mobilebert-uncased_test.py
+++ b/tank/tf/hf_masked_lm/mobilebert-uncased_test.py
@@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model
 from tank.model_utils_tf import compare_tensors_tf
 from shark.iree_utils import check_device_drivers
 from shark.shark_inference import SharkInference
+from shark.parser import shark_args

 import iree.compiler as ireec
 import unittest
@@ -14,14 +15,21 @@ class MobileBertModuleTester:

     def __init__(
         self,
-        save_temps=False
+        save_temps=False,
+        save_mlir=False,
+        save_vmfb=False,
+        benchmark=False
     ):
         self.save_temps = save_temps
+        self.save_mlir = save_mlir
+        self.save_vmfb = save_vmfb
+        self.benchmark = benchmark

     def create_and_check_module(self, dynamic, device):
         model, input, act_out = get_causal_lm_model("google/mobilebert-uncased")
-        save_temps = self.save_temps
-        if save_temps == True:
+        shark_args.save_mlir = self.save_mlir
+        shark_args.save_vmfb = self.save_vmfb
+        if self.save_temps == True:
             if dynamic == True:
                 repro_dir = f"mobilebert_dynamic_{device}"
             else:
@@ -37,7 +44,8 @@ class MobileBertModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
@@ -47,12 +55,18 @@ class MobileBertModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
         assert True == compare_tensors_tf(act_out, results)

+        if self.benchmark == True:
+            shark_module.benchmark_all_csv((input),
+                                           "mobilebert-uncased",
+                                           dynamic,
+                                           device)

 class MobileBertModuleTest(unittest.TestCase):

@@ -60,6 +74,9 @@ class MobileBertModuleTest(unittest.TestCase):
     def configure(self, pytestconfig):
         self.module_tester = MobileBertModuleTester(self)
         self.module_tester.save_temps = pytestconfig.getoption("save_temps")
+        self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
+        self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
+        self.module_tester.benchmark = pytestconfig.getoption("benchmark")

     @pytest.mark.xfail(reason="Upstream IREE issue, see https://github.com/google/iree/issues/9536")
     def test_module_static_cpu(self):
diff --git a/tank/tf/hf_masked_lm/mpnet-base_test.py b/tank/tf/hf_masked_lm/mpnet-base_test.py
index e7d0cf92..eb5895bf 100644
--- a/tank/tf/hf_masked_lm/mpnet-base_test.py
+++ b/tank/tf/hf_masked_lm/mpnet-base_test.py
@@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model
 from tank.model_utils_tf import compare_tensors_tf
 from shark.iree_utils import check_device_drivers
 from shark.shark_inference import SharkInference
+from shark.parser import shark_args

 import iree.compiler as ireec
 import unittest
@@ -14,14 +15,21 @@ class MpNetModuleTester:

     def __init__(
         self,
-        save_temps=False
+        save_temps=False,
+        save_mlir=False,
+        save_vmfb=False,
+        benchmark=False
     ):
         self.save_temps = save_temps
-
+        self.save_mlir = save_mlir
+        self.save_vmfb = save_vmfb
+        self.benchmark = benchmark
+
     def create_and_check_module(self, dynamic, device):
         model, input, act_out = get_causal_lm_model("microsoft/mpnet-base")
-        save_temps = self.save_temps
-        if save_temps == True:
+        shark_args.save_mlir = self.save_mlir
+        shark_args.save_vmfb = self.save_vmfb
+        if self.save_temps == True:
             if dynamic == True:
                 repro_dir = f"mpnet_dynamic_{device}"
             else:
@@ -37,7 +44,8 @@ class MpNetModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
@@ -47,12 +55,18 @@ class MpNetModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
         assert True == compare_tensors_tf(act_out, results)

+        if self.benchmark == True:
+            shark_module.benchmark_all_csv((input),
+                                           "mpnet-base",
+                                           dynamic,
+                                           device)

 class MpNetModuleTest(unittest.TestCase):

@@ -60,6 +74,9 @@ class MpNetModuleTest(unittest.TestCase):
     def configure(self, pytestconfig):
         self.module_tester = MpNetModuleTester(self)
         self.module_tester.save_temps = pytestconfig.getoption("save_temps")
+        self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
+        self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
+        self.module_tester.benchmark = pytestconfig.getoption("benchmark")

     @pytest.mark.xfail(reason="Upstream IREE issue, see https://github.com/google/iree/issues/9536")
     def test_module_static_cpu(self):
diff --git a/tank/tf/hf_masked_lm/rembert_test.py b/tank/tf/hf_masked_lm/rembert_test.py
index a7575a78..0d097b71 100644
--- a/tank/tf/hf_masked_lm/rembert_test.py
+++ b/tank/tf/hf_masked_lm/rembert_test.py
@@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model
 from tank.model_utils_tf import compare_tensors_tf
 from shark.iree_utils import check_device_drivers
 from shark.shark_inference import SharkInference
+from shark.parser import shark_args

 import iree.compiler as ireec
 import unittest
@@ -14,14 +15,21 @@ class RemBertModuleTester:

     def __init__(
         self,
-        save_temps=False
+        save_temps=False,
+        save_mlir=False,
+        save_vmfb=False,
+        benchmark=False
     ):
         self.save_temps = save_temps
-
+        self.save_mlir = save_mlir
+        self.save_vmfb = save_vmfb
+        self.benchmark = benchmark
+
     def create_and_check_module(self, dynamic, device):
         model, input, act_out = get_causal_lm_model("google/rembert")
-        save_temps = self.save_temps
-        if save_temps == True:
+        shark_args.save_mlir = self.save_mlir
+        shark_args.save_vmfb = self.save_vmfb
+        if self.save_temps == True:
             if dynamic == True:
                 repro_dir = f"rembert_dynamic_{device}"
             else:
@@ -37,7 +44,8 @@ class RemBertModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
@@ -47,12 +55,18 @@ class RemBertModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
         assert True == compare_tensors_tf(act_out, results)

+        if self.benchmark == True:
+            shark_module.benchmark_all_csv((input),
+                                           "rembert",
+                                           dynamic,
+                                           device)

 class RemBertModuleTest(unittest.TestCase):

@@ -60,6 +74,9 @@ class RemBertModuleTest(unittest.TestCase):
     def configure(self, pytestconfig):
         self.module_tester = RemBertModuleTester(self)
         self.module_tester.save_temps = pytestconfig.getoption("save_temps")
+        self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
+        self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
+        self.module_tester.benchmark = pytestconfig.getoption("benchmark")

     @pytest.mark.skip(reason="rembert currently failing in the lowering passes."
                      )
diff --git a/tank/tf/hf_masked_lm/roberta-base_test.py b/tank/tf/hf_masked_lm/roberta-base_test.py
index 8d923b6c..1dfc1cc8 100644
--- a/tank/tf/hf_masked_lm/roberta-base_test.py
+++ b/tank/tf/hf_masked_lm/roberta-base_test.py
@@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model
 from tank.model_utils_tf import compare_tensors_tf
 from shark.iree_utils import check_device_drivers
 from shark.shark_inference import SharkInference
+from shark.parser import shark_args

 import iree.compiler as ireec
 import unittest
@@ -14,14 +15,21 @@ class RobertaModuleTester:

     def __init__(
         self,
-        save_temps=False
+        save_temps=False,
+        save_mlir=False,
+        save_vmfb=False,
+        benchmark=False
     ):
         self.save_temps = save_temps
-
+        self.save_mlir = save_mlir
+        self.save_vmfb = save_vmfb
+        self.benchmark = benchmark
+
     def create_and_check_module(self, dynamic, device):
         model, input, act_out = get_causal_lm_model("roberta-base")
-        save_temps = self.save_temps
-        if save_temps == True:
+        shark_args.save_mlir = self.save_mlir
+        shark_args.save_vmfb = self.save_vmfb
+        if self.save_temps == True:
             if dynamic == True:
                 repro_dir = f"roberta_dynamic_{device}"
             else:
@@ -37,7 +44,8 @@ class RobertaModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
@@ -47,12 +55,18 @@ class RobertaModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
         assert True == compare_tensors_tf(act_out, results)

+        if self.benchmark == True:
+            shark_module.benchmark_all_csv((input),
+                                           "roberta-base",
+                                           dynamic,
+                                           device)

 class RobertaModuleTest(unittest.TestCase):

@@ -60,6 +74,9 @@ class RobertaModuleTest(unittest.TestCase):
     def configure(self, pytestconfig):
         self.module_tester = RobertaModuleTester(self)
         self.module_tester.save_temps = pytestconfig.getoption("save_temps")
+        self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
+        self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
+        self.module_tester.benchmark = pytestconfig.getoption("benchmark")

     @pytest.mark.xfail(reason="Upstream IREE issue, see https://github.com/google/iree/issues/9536")
     def test_module_static_cpu(self):
diff --git a/tank/tf/hf_masked_lm/tapas-base_test.py b/tank/tf/hf_masked_lm/tapas-base_test.py
index 3f1e424e..4c2473ca 100644
--- a/tank/tf/hf_masked_lm/tapas-base_test.py
+++ b/tank/tf/hf_masked_lm/tapas-base_test.py
@@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model
 from tank.model_utils_tf import compare_tensors_tf
 from shark.iree_utils import check_device_drivers
 from shark.shark_inference import SharkInference
+from shark.parser import shark_args

 import iree.compiler as ireec
 import unittest
@@ -14,14 +15,21 @@ class TapasBaseModuleTester:

     def __init__(
         self,
-        save_temps=False
+        save_temps=False,
+        save_mlir=False,
+        save_vmfb=False,
+        benchmark=False
     ):
         self.save_temps = save_temps
-
+        self.save_mlir = save_mlir
+        self.save_vmfb = save_vmfb
+        self.benchmark = benchmark
+
     def create_and_check_module(self, dynamic, device):
         model, input, act_out = get_causal_lm_model("google/tapas-base")
-        save_temps = self.save_temps
-        if save_temps == True:
+        shark_args.save_mlir = self.save_mlir
+        shark_args.save_vmfb = self.save_vmfb
+        if self.save_temps == True:
             if dynamic == True:
f"tapas-base_dynamic_{device}" else: @@ -37,7 +44,8 @@ class TapasBaseModuleTester: shark_module = SharkInference(model, (input,), device=device, dynamic=dynamic, - jit_trace=True) + jit_trace=True, + benchmark_mode=self.benchmark) shark_module.set_frontend("tensorflow") shark_module.compile() results = shark_module.forward((input)) @@ -47,12 +55,18 @@ class TapasBaseModuleTester: shark_module = SharkInference(model, (input,), device=device, dynamic=dynamic, - jit_trace=True) + jit_trace=True, + benchmark_mode=self.benchmark) shark_module.set_frontend("tensorflow") shark_module.compile() results = shark_module.forward((input)) assert True == compare_tensors_tf(act_out, results) + if self.benchmark == True: + shark_module.benchmark_all_csv((input), + "tapas-base", + dynamic, + device) class TapasBaseModuleTest(unittest.TestCase): @@ -60,6 +74,9 @@ class TapasBaseModuleTest(unittest.TestCase): def configure(self, pytestconfig): self.module_tester = TapasBaseModuleTester(self) self.module_tester.save_temps = pytestconfig.getoption("save_temps") + self.module_tester.save_mlir = pytestconfig.getoption("save_mlir") + self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb") + self.module_tester.benchmark = pytestconfig.getoption("benchmark") @pytest.mark.skip(reason="tapas currently failing in the lowering passes.") def test_module_static_cpu(self): diff --git a/tank/tf/hf_masked_lm/tiny-random-flaubert_test.py b/tank/tf/hf_masked_lm/tiny-random-flaubert_test.py index 41f9e091..d5da7bfd 100644 --- a/tank/tf/hf_masked_lm/tiny-random-flaubert_test.py +++ b/tank/tf/hf_masked_lm/tiny-random-flaubert_test.py @@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model from tank.model_utils_tf import compare_tensors_tf from shark.iree_utils import check_device_drivers from shark.shark_inference import SharkInference +from shark.parser import shark_args import iree.compiler as ireec import unittest @@ -14,14 +15,20 @@ class FlauBertModuleTester: def __init__( self, - save_temps=False + save_temps=False, + save_mlir=False, + save_vmfb=False, + benchmark=False ): self.save_temps = save_temps - + self.save_mlir = save_mlir + self.save_vmfb = save_vmfb + def create_and_check_module(self, dynamic, device): model, input, act_out = get_causal_lm_model("hf-internal-testing/tiny-random-flaubert") - save_temps = self.save_temps - if save_temps == True: + shark_args.save_mlir = self.save_mlir + shark_args.save_vmfb = self.save_vmfb + if self.save_temps == True: if dynamic == True: repro_dir = f"flaubert_dynamic_{device}" else: @@ -37,7 +44,8 @@ class FlauBertModuleTester: shark_module = SharkInference(model, (input,), device=device, dynamic=dynamic, - jit_trace=True) + jit_trace=True, + benchmark_mode=self.benchmark) shark_module.set_frontend("tensorflow") shark_module.compile() results = shark_module.forward((input)) @@ -47,12 +55,18 @@ class FlauBertModuleTester: shark_module = SharkInference(model, (input,), device=device, dynamic=dynamic, - jit_trace=True) + jit_trace=True, + benchmark_mode=self.benchmark) shark_module.set_frontend("tensorflow") shark_module.compile() results = shark_module.forward((input)) assert True == compare_tensors_tf(act_out, results) + if self.benchmark == True: + shark_module.benchmark_all_csv((input), + "tiny-random-flaubert", + dynamic, + device) class FlauBertModuleTest(unittest.TestCase): @@ -60,6 +74,9 @@ class FlauBertModuleTest(unittest.TestCase): def configure(self, pytestconfig): self.module_tester = FlauBertModuleTester(self) self.module_tester.save_temps = 
         self.module_tester.save_temps = pytestconfig.getoption("save_temps")
+        self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
+        self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
+        self.module_tester.benchmark = pytestconfig.getoption("benchmark")

     def test_module_static_cpu(self):
         dynamic = False
diff --git a/tank/tf/hf_masked_lm/xlm-roberta-base_test.py b/tank/tf/hf_masked_lm/xlm-roberta-base_test.py
index 3733c0c6..ff44ecc0 100644
--- a/tank/tf/hf_masked_lm/xlm-roberta-base_test.py
+++ b/tank/tf/hf_masked_lm/xlm-roberta-base_test.py
@@ -2,6 +2,7 @@ from masked_lm import get_causal_lm_model
 from tank.model_utils_tf import compare_tensors_tf
 from shark.iree_utils import check_device_drivers
 from shark.shark_inference import SharkInference
+from shark.parser import shark_args

 import iree.compiler as ireec
 import unittest
@@ -14,14 +15,21 @@ class XLMRobertaModuleTester:

     def __init__(
         self,
-        save_temps=False
+        save_temps=False,
+        save_mlir=False,
+        save_vmfb=False,
+        benchmark=False
     ):
         self.save_temps = save_temps
-
+        self.save_mlir = save_mlir
+        self.save_vmfb = save_vmfb
+        self.benchmark = benchmark
+
     def create_and_check_module(self, dynamic, device):
         model, input, act_out = get_causal_lm_model("xlm-roberta-base")
-        save_temps = self.save_temps
-        if save_temps == True:
+        shark_args.save_mlir = self.save_mlir
+        shark_args.save_vmfb = self.save_vmfb
+        if self.save_temps == True:
             if dynamic == True:
                 repro_dir = f"xlm_roberta_dynamic_{device}"
             else:
@@ -37,7 +44,8 @@ class XLMRobertaModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
@@ -47,12 +55,18 @@ class XLMRobertaModuleTester:
         shark_module = SharkInference(model, (input,),
                                       device=device,
                                       dynamic=dynamic,
-                                      jit_trace=True)
+                                      jit_trace=True,
+                                      benchmark_mode=self.benchmark)
         shark_module.set_frontend("tensorflow")
         shark_module.compile()
         results = shark_module.forward((input))
         assert True == compare_tensors_tf(act_out, results)

+        if self.benchmark == True:
+            shark_module.benchmark_all_csv((input),
+                                           "xlm-roberta-base",
+                                           dynamic,
+                                           device)

 class XLMRobertaModuleTest(unittest.TestCase):

@@ -60,6 +74,9 @@ class XLMRobertaModuleTest(unittest.TestCase):
     def configure(self, pytestconfig):
         self.module_tester = XLMRobertaModuleTester(self)
         self.module_tester.save_temps = pytestconfig.getoption("save_temps")
+        self.module_tester.save_mlir = pytestconfig.getoption("save_mlir")
+        self.module_tester.save_vmfb = pytestconfig.getoption("save_vmfb")
+        self.module_tester.benchmark = pytestconfig.getoption("benchmark")

     @pytest.mark.skip(reason="Test currently hangs.")
     def test_module_static_cpu(self):
diff --git a/tank/tflite/conftest.py b/tank/tflite/conftest.py
deleted file mode 100644
index 10f42293..00000000
--- a/tank/tflite/conftest.py
+++ /dev/null
@@ -1,4 +0,0 @@
-def pytest_addoption(parser):
-    # Attaches SHARK command-line arguments to the pytest machinery.
-    parser.addoption("--save_mlir", default="False", help="Pass option to save input MLIR module to /tmp/ directory.")
-    parser.addoption("--save_vmfb", default="False", help="Pass option to save input MLIR module to /tmp/ directory.")