ORT-HF Benchmark Integration (#101)

-Add HF Benchmarker class.
-Add sample to benchmark HF model.

Example:
```bash
python -m benchmarks.hf_model_benchmark --num_iterations=10 --model_name="microsoft/MiniLM-L12-H384-uncased"
```
This commit is contained in:
Stanley Winata
2022-06-07 23:49:39 -07:00
committed by GitHub
parent a9faeae794
commit 8565be9b6b
11 changed files with 221 additions and 6 deletions

View File

@@ -90,5 +90,5 @@ jobs:
- name: Validate Models
run: |
source shark.venv/bin/activate
pytest -k 'not benchmark'
pytest -k 'not benchmark' --ignore=shark/tests/test_hf_benchmark.py

4
.gitignore vendored
View File

@@ -162,3 +162,7 @@ cython_debug/
# Shark related artefacts
shark.venv/
# ORT related artefacts
cache_models/
onnx_models/

0
benchmarks/__init__.py Normal file
View File

View File

@@ -0,0 +1,22 @@
import torch
from shark.parser import parser
from benchmarks.hf_transformer import SharkHFBenchmarkRunner
parser.add_argument(
    "--model_name",
    type=str,
    required=True,
    # Fixed: "exmaple" typo and the missing closing parenthesis in the help text.
    help="Specifies name of HF model to benchmark. "
    "(For example \"microsoft/MiniLM-L12-H384-uncased\")",
)
load_args, unknown = parser.parse_known_args()

if __name__ == "__main__":
    # Benchmark the requested HuggingFace model through every available
    # runner: SHARK C, SHARK Python, ORT's PyTorch path, and ONNX Runtime.
    model_name = load_args.model_name
    # Random token ids shaped (batch=1, seq_len=128) as a stand-in input.
    test_input = torch.randint(2, (1, 128))
    shark_module = SharkHFBenchmarkRunner(model_name, (test_input,),
                                          jit_trace=True)
    shark_module.benchmark_c()
    shark_module.benchmark_python((test_input,))
    shark_module.benchmark_torch(test_input)
    shark_module.benchmark_onnx(test_input)

View File

@@ -0,0 +1,137 @@
import torch
from shark.shark_runner import SharkBenchmarkRunner
from shark.parser import shark_args
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from onnxruntime.transformers.benchmark import run_pytorch, run_tensorflow, run_onnxruntime
from onnxruntime.transformers.huggingface_models import MODELS
from onnxruntime.transformers.benchmark_helper import ConfigModifier, Precision
import os
import psutil
class OnnxFusionOptions(object):
    """Default-off container mirroring the fusion flags expected by ORT's
    transformer optimizer: every flag is False, so ORT applies its standard
    fusions with no masking options enabled."""

    def __init__(self):
        # All flags share the same default; set them in one pass instead of
        # one assignment per line.
        for flag in (
                "disable_gelu",
                "disable_layer_norm",
                "disable_attention",
                "disable_skip_layer_norm",
                "disable_embed_layer_norm",
                "disable_bias_skip_layer_norm",
                "disable_bias_gelu",
                "enable_gelu_approximation",
                "use_mask_index",
                "no_attention_mask",
        ):
            setattr(self, flag, False)
class HuggingFaceLanguage(torch.nn.Module):
    """Thin wrapper around a pretrained HF sequence-classification model
    that exposes a logits-only forward pass suitable for jit tracing."""

    def __init__(self, hf_model_name):
        super().__init__()
        # Binary classification head; attentions and hidden states are
        # disabled to keep the traced graph minimal. torchscript=True makes
        # the model return plain tuples so it can be jit-traced.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            hf_model_name,
            num_labels=2,
            output_attentions=False,
            output_hidden_states=False,
            torchscript=True,
        )

    def forward(self, tokens):
        # With torchscript=True the model returns a tuple; keep only the
        # logits tensor at index 0.
        logits = self.model.forward(tokens)[0]
        return logits
class SharkHFBenchmarkRunner(SharkBenchmarkRunner):
    # SharkBenchmarkRunner subclass that additionally runs the same HF model
    # through ORT's reference benchmark runners (PyTorch, TensorFlow, ONNX
    # Runtime) so SHARK's numbers can be compared side-by-side.
    def __init__(
        self,
        model_name: str,
        input: tuple,
        dynamic: bool = False,
        device: str = None,
        jit_trace: bool = False,
        from_aot: bool = False,
        frontend: str = "torch",
    ):
        """Build a SHARK benchmark module for `model_name`.

        model_name: HF hub id (e.g. "bert-base-uncased").
        input: tuple of example input tensors used for tracing/compilation.
        device: falls back to shark_args.device when None. "gpu" is rejected
            because ORT benchmarking OOMs there (see error below).
        Remaining flags are forwarded unchanged to SharkBenchmarkRunner.
        """
        self.device = device if device is not None else shark_args.device
        if self.device == "gpu":
            raise ValueError(
                "Currently GPU Benchmarking is not supported due to OOM from ORT."
            )
        self.model_name = model_name
        model = HuggingFaceLanguage(model_name)
        SharkBenchmarkRunner.__init__(self, model, input, dynamic, self.device,
                                      jit_trace, from_aot, frontend)

    def benchmark_c(self):
        result = run_benchmark_module(self.benchmark_cl)
        print(f"Shark-{self.frontend} C-benchmark:{result} iter/second")

    def benchmark_torch(self, inputs):
        # Benchmark via ORT's reference PyTorch runner and print iter/second.
        # `inputs` is a token tensor; its shape fixes batch/sequence length.
        use_gpu = self.device == "gpu"
        # Pass None so the model's layer count is left at its default.
        config_modifier = ConfigModifier(None)
        # Use physical (not logical) core count for the thread pool.
        num_threads = psutil.cpu_count(logical=False)
        batch_sizes = [inputs.shape[0]]
        sequence_lengths = [inputs.shape[-1]]
        cache_dir = os.path.join(".", "cache_models")
        verbose = False
        # NOTE(review): argument order follows onnxruntime.transformers.
        # benchmark.run_pytorch's positional signature — keep in sync with
        # the pinned ORT version.
        result = run_pytorch(use_gpu, [self.model_name], None, config_modifier,
                             Precision.FLOAT32, num_threads, batch_sizes,
                             sequence_lengths, shark_args.num_iterations, False,
                             cache_dir, verbose)
        print(
            f"ONNX Pytorch-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
        )

    # TODO: Currently non-functional due to TF runtime error. There might be
    # some issue with initializing TF.
    def benchmark_tf(self, inputs):
        # Benchmark via ORT's reference TensorFlow runner (same setup as
        # benchmark_torch above).
        use_gpu = self.device == "gpu"
        # Pass None so the model's layer count is left at its default.
        config_modifier = ConfigModifier(None)
        num_threads = psutil.cpu_count(logical=False)
        batch_sizes = [inputs.shape[0]]
        sequence_lengths = [inputs.shape[-1]]
        cache_dir = os.path.join(".", "cache_models")
        verbose = False
        result = run_tensorflow(use_gpu, [self.model_name], None,
                                config_modifier, Precision.FLOAT32, num_threads,
                                batch_sizes, sequence_lengths,
                                shark_args.num_iterations, cache_dir, verbose)
        print(
            f"ONNX TF-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
        )

    def benchmark_onnx(self, inputs):
        # Benchmark via ONNX Runtime itself. Only models listed in ORT's
        # huggingface_models.MODELS registry are supported; bail out early
        # (with a pointer to the registry) for anything else.
        if self.model_name not in MODELS:
            print(
                f"{self.model_name} is currently not supported in ORT's HF. Check \
https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/huggingface_models.py \
for currently supported models. Exiting benchmark ONNX.")
            return
        use_gpu = self.device == "gpu"
        num_threads = psutil.cpu_count(logical=False)
        batch_sizes = [inputs.shape[0]]
        sequence_lengths = [inputs.shape[-1]]
        cache_dir = os.path.join(".", "cache_models")
        onnx_dir = os.path.join(".", "onnx_models")
        verbose = False
        input_counts = [1]
        optimize_onnx = True
        validate_onnx = False
        disable_ort_io_binding = False
        use_raw_attention_mask = True
        model_fusion_statistics = {}
        overwrite = False
        model_source = "pt"  # Either "pt" or "tf"
        provider = None
        config_modifier = ConfigModifier(None)
        # Default fusion options (everything enabled) — see OnnxFusionOptions.
        onnx_args = OnnxFusionOptions()
        # NOTE(review): long positional call mirrors run_onnxruntime's
        # signature in the pinned ORT version — verify on ORT upgrades.
        result = run_onnxruntime(
            use_gpu, provider, [self.model_name], None, config_modifier,
            Precision.FLOAT32, num_threads, batch_sizes, sequence_lengths,
            shark_args.num_iterations, input_counts, optimize_onnx,
            validate_onnx, cache_dir, onnx_dir, verbose, overwrite,
            disable_ort_io_binding, use_raw_attention_mask,
            model_fusion_statistics, model_source, onnx_args)
        print(
            f"ONNX ORT-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
        )

View File

@@ -20,3 +20,11 @@ Pillow
# Testing and support.
lit
pyyaml
# To Enable ONNX Runtime Benchmarks
# TODO: Uncomment this when builder fixed.
# onnx
# --extra-index-url https://test.pypi.org/simple/
# ort-nightly
# coloredlogs
# sympy

View File

@@ -2,7 +2,7 @@
-f https://github.com/nod-ai/SHARK-Runtime/releases
--pre
numpy
numpy==1.22.4
torch
torchvision

View File

@@ -120,6 +120,16 @@ fi
if [[ $(uname -s) = 'Linux' ]]; then
echo "${Yellow}Linux detected.. installing importer tools"
# Modules required for ONNX/Transformer Benchmarking.
# TODO: move this to requirements.txt
$PYTHON -m pip install protobuf
$PYTHON -m pip install coloredlogs
$PYTHON -m pip install flatbuffers
$PYTHON -m pip install sympy
$PYTHON -m pip install psutil
$PYTHON -m pip install -i https://test.pypi.org/simple/ onnx-weekly
$PYTHON -m pip install -i https://test.pypi.org/simple/ ort-nightly
$PYTHON -m pip install --upgrade -r "$TD/requirements-importer.txt" -f https://github.com/nod-ai/SHARK-Runtime/releases
fi

View File

@@ -269,7 +269,6 @@ def tensor_to_type_str(input_tensors: tuple, frontend: str):
Output: list of string that represent mlir types (i.e 1x24xf64)
# TODO: Support more than floats, and ints
"""
print("front:", frontend)
list_of_type = []
for input_tensor in input_tensors:
type_string = "x".join([str(dim) for dim in input_tensor.shape])
@@ -329,7 +328,7 @@ def run_cmd(cmd):
sys.exit("Exiting program due to error running:", cmd)
def run_benchmark(benchmark_cl):
def run_benchmark_module(benchmark_cl):
"""
Run benchmark command, extract result and return iteration/seconds.

View File

@@ -19,7 +19,7 @@ from torch_mlir_e2e_test.eager_backends.refbackend import EagerModeRefBackend
from shark.iree_eager_backend import EagerModeIREELinalgOnTensorsBackend
from shark.torch_mlir_utils import get_torch_mlir_module, run_on_refbackend
from shark.iree_utils import get_results, get_iree_compiled_module, export_iree_module_to_vmfb, export_module_to_mlir_file, build_benchmark_args, run_benchmark
from shark.iree_utils import get_results, get_iree_compiled_module, export_iree_module_to_vmfb, export_module_to_mlir_file, build_benchmark_args, run_benchmark_module
import os
from shark.parser import shark_args
from tqdm import tqdm
@@ -152,7 +152,7 @@ class SharkBenchmarkRunner(SharkRunner):
return
def benchmark_c(self):
result = run_benchmark(self.benchmark_cl)
result = run_benchmark_module(self.benchmark_cl)
print(f"Shark-{self.frontend} C-benchmark:{result} iter/second")
def benchmark_python(self, inputs):

View File

@@ -0,0 +1,35 @@
import torch
from benchmarks.hf_transformer import SharkHFBenchmarkRunner
import importlib
import pytest
# Fix the RNG seed so the random token inputs are reproducible across runs.
torch.manual_seed(0)

############################# HF Benchmark Tests ####################################
# Test running benchmark module without failing.
# Shared parametrization for the benchmark tests: static shapes on CPU are
# expected to pass; the dynamic-shape case is skipped until it works.
pytest_benchmark_param = pytest.mark.parametrize(
    ('dynamic', 'device'),
    [
        pytest.param(False, 'cpu'),
        # TODO: Language models are failing for dynamic case..
        pytest.param(True, 'cpu', marks=pytest.mark.skip),
    ])
@pytest.mark.skipif(importlib.util.find_spec("onnxruntime") is None,
                    reason="Cannot find ONNXRUNTIME.")
@pytest_benchmark_param
def test_HFbench_minilm_torch(dynamic, device):
    """Smoke-test that every benchmark path (SHARK C, SHARK Python, torch,
    ONNX) runs without raising for a HF language model.

    The original wrapped the whole body in `try/except Exception: assert
    False`, which discarded the real traceback; letting the exception
    propagate fails the test the same way but keeps the diagnostics.
    """
    model_name = "bert-base-uncased"
    # Random token ids shaped (batch=1, seq_len=128).
    test_input = torch.randint(2, (1, 128))
    shark_module = SharkHFBenchmarkRunner(model_name, (test_input,),
                                          jit_trace=True,
                                          dynamic=dynamic,
                                          device=device)
    shark_module.benchmark_c()
    shark_module.benchmark_python((test_input,))
    shark_module.benchmark_torch(test_input)
    shark_module.benchmark_onnx(test_input)