mirror of
https://github.com/nod-ai/AMD-SHARK-Studio.git
synced 2026-04-03 03:00:17 -04:00
ORT-HF Benchmark Integration (#101)
-Add HF Benchmarker class. -Add sample to benchmark HF model. Example: ```bash python -m benchmarks.hf_model_benchmark --num_iterations=10 --model_name="microsoft/MiniLM-L12-H384-uncased" ```
This commit is contained in:
2
.github/workflows/test-models.yml
vendored
2
.github/workflows/test-models.yml
vendored
@@ -90,5 +90,5 @@ jobs:
|
||||
- name: Validate Models
|
||||
run: |
|
||||
source shark.venv/bin/activate
|
||||
pytest -k 'not benchmark'
|
||||
pytest -k 'not benchmark' --ignore=shark/tests/test_hf_benchmark.py
|
||||
|
||||
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -162,3 +162,7 @@ cython_debug/
|
||||
|
||||
# Shark related artefacts
|
||||
shark.venv/
|
||||
|
||||
# ORT related artefacts
|
||||
cache_models/
|
||||
onnx_models/
|
||||
|
||||
0
benchmarks/__init__.py
Normal file
0
benchmarks/__init__.py
Normal file
22
benchmarks/hf_model_benchmark.py
Normal file
22
benchmarks/hf_model_benchmark.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import torch
|
||||
from shark.parser import parser
|
||||
from benchmarks.hf_transformer import SharkHFBenchmarkRunner
|
||||
|
||||
# Register the option this script needs on the shared SHARK argument parser.
parser.add_argument(
    "--model_name",
    type=str,
    required=True,
    help=
    "Specifies name of HF model to benchmark. (For example \"microsoft/MiniLM-L12-H384-uncased\")"
)
# parse_known_args returns the parsed namespace plus the list of arguments it
# did not recognise, so flags meant for other consumers do not abort the run.
load_args, unknown = parser.parse_known_args()
|
||||
|
||||
if __name__ == "__main__":
    # Random 0/1 token ids with shape (1, 128); the same tensor feeds every
    # benchmark below so the runs are comparable.
    sample_tokens = torch.randint(2, (1, 128))
    runner = SharkHFBenchmarkRunner(
        load_args.model_name,
        (sample_tokens,),
        jit_trace=True,
    )

    # SHARK benchmarks first, then the ONNX Runtime helper benchmarks.
    runner.benchmark_c()
    runner.benchmark_python((sample_tokens,))
    runner.benchmark_torch(sample_tokens)
    runner.benchmark_onnx(sample_tokens)
|
||||
137
benchmarks/hf_transformer.py
Normal file
137
benchmarks/hf_transformer.py
Normal file
@@ -0,0 +1,137 @@
|
||||
import torch
|
||||
from shark.shark_runner import SharkBenchmarkRunner
|
||||
from shark.parser import shark_args
|
||||
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
||||
from onnxruntime.transformers.benchmark import run_pytorch, run_tensorflow, run_onnxruntime
|
||||
from onnxruntime.transformers.huggingface_models import MODELS
|
||||
from onnxruntime.transformers.benchmark_helper import ConfigModifier, Precision
|
||||
import os
|
||||
import psutil
|
||||
|
||||
|
||||
class OnnxFusionOptions:
    """Plain attribute bag of ONNX fusion switches, all initialised to False.

    An instance is passed as the last argument to ``run_onnxruntime`` by
    ``SharkHFBenchmarkRunner.benchmark_onnx``.
    """

    def __init__(self):
        # Assign the flags in a fixed order so the instance __dict__ keeps the
        # same attribute order as a hand-written sequence of assignments.
        for flag_name in (
                "disable_gelu",
                "disable_layer_norm",
                "disable_attention",
                "disable_skip_layer_norm",
                "disable_embed_layer_norm",
                "disable_bias_skip_layer_norm",
                "disable_bias_gelu",
                "enable_gelu_approximation",
                "use_mask_index",
                "no_attention_mask",
        ):
            setattr(self, flag_name, False)
|
||||
|
||||
|
||||
class HuggingFaceLanguage(torch.nn.Module):
    """torch.nn.Module wrapper around a Hugging Face sequence classifier.

    ``forward`` returns only element 0 of the wrapped model's output.
    """

    def __init__(self, hf_model_name):
        """Load the pretrained model identified by ``hf_model_name``."""
        super().__init__()
        load_options = dict(
            # The number of output labels--2 for binary classification.
            num_labels=2,
            # Whether the model returns attentions weights.
            output_attentions=False,
            # Whether the model returns all hidden-states.
            output_hidden_states=False,
            torchscript=True,
        )
        self.model = AutoModelForSequenceClassification.from_pretrained(
            hf_model_name, **load_options)

    def forward(self, tokens):
        """Run the wrapped model on ``tokens`` and return its first output."""
        model_outputs = self.model.forward(tokens)
        # presumably element 0 holds the classification logits -- TODO confirm
        return model_outputs[0]
|
||||
|
||||
|
||||
class SharkHFBenchmarkRunner(SharkBenchmarkRunner):
    """SharkRunner derived class with benchmarking capabilities for HF models.

    The named Hugging Face model is wrapped in ``HuggingFaceLanguage`` and
    handed to ``SharkBenchmarkRunner``. The extra ``benchmark_*`` methods call
    ONNX Runtime's transformer benchmark helpers (``run_pytorch``,
    ``run_tensorflow``, ``run_onnxruntime``) with the same model name and
    print the ``QPS`` value of the first result.
    """

    def __init__(
        self,
        model_name: str,
        input: tuple,
        dynamic: bool = False,
        device: str = None,
        jit_trace: bool = False,
        from_aot: bool = False,
        frontend: str = "torch",
    ):
        """Build the wrapped model and initialise the base benchmark runner.

        Args:
            model_name: Hugging Face model identifier; also stored on
                ``self.model_name`` for the ORT helper benchmarks.
            input: Tuple of example inputs forwarded to the base runner.
            dynamic: Forwarded to ``SharkBenchmarkRunner``.
            device: Target device; falls back to ``shark_args.device`` when
                ``None``.
            jit_trace: Forwarded to ``SharkBenchmarkRunner``.
            from_aot: Forwarded to ``SharkBenchmarkRunner``.
            frontend: Forwarded to ``SharkBenchmarkRunner``.

        Raises:
            ValueError: If the resolved device is ``"gpu"``.
        """
        self.device = device if device is not None else shark_args.device
        if self.device == "gpu":
            raise ValueError(
                "Currently GPU Benchmarking is not supported due to OOM from ORT."
            )
        self.model_name = model_name
        model = HuggingFaceLanguage(model_name)
        SharkBenchmarkRunner.__init__(self, model, input, dynamic, self.device,
                                      jit_trace, from_aot, frontend)

    def _ort_common_settings(self, inputs):
        """Return the settings shared by every ORT helper benchmark.

        Args:
            inputs: Object exposing ``.shape``; the first dimension is used as
                the batch size and the last one as the sequence length.

        Returns:
            Tuple ``(use_gpu, num_threads, batch_sizes, sequence_lengths,
            cache_dir)``.
        """
        use_gpu = self.device == "gpu"
        # Physical (non-logical) core count as reported by psutil.
        num_threads = psutil.cpu_count(logical=False)
        batch_sizes = [inputs.shape[0]]
        sequence_lengths = [inputs.shape[-1]]
        cache_dir = os.path.join(".", "cache_models")
        return use_gpu, num_threads, batch_sizes, sequence_lengths, cache_dir

    @staticmethod
    def _report_qps(label, result):
        """Print the ``QPS`` entry of the first result under ``label``."""
        print(
            f"ONNX {label}-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
        )

    def benchmark_torch(self, inputs):
        """Run ORT's ``run_pytorch`` helper for this model and print its QPS."""
        (use_gpu, num_threads, batch_sizes, sequence_lengths,
         cache_dir) = self._ort_common_settings(inputs)
        # Set the model's layer number to automatic.
        config_modifier = ConfigModifier(None)
        verbose = False
        # NOTE(review): the positional False after num_iterations is presumably
        # run_pytorch's torchscript flag -- confirm against the installed ORT.
        result = run_pytorch(use_gpu, [self.model_name], None, config_modifier,
                             Precision.FLOAT32, num_threads, batch_sizes,
                             sequence_lengths, shark_args.num_iterations, False,
                             cache_dir, verbose)
        self._report_qps("Pytorch", result)

    # TODO: Currently non-functional due to TF runtime error. There might be some issue with, initializing TF.
    def benchmark_tf(self, inputs):
        """Run ORT's ``run_tensorflow`` helper for this model and print its QPS."""
        (use_gpu, num_threads, batch_sizes, sequence_lengths,
         cache_dir) = self._ort_common_settings(inputs)
        # Set the model's layer number to automatic.
        config_modifier = ConfigModifier(None)
        verbose = False
        result = run_tensorflow(use_gpu, [self.model_name], None,
                                config_modifier, Precision.FLOAT32, num_threads,
                                batch_sizes, sequence_lengths,
                                shark_args.num_iterations, cache_dir, verbose)
        self._report_qps("TF", result)

    def benchmark_onnx(self, inputs):
        """Run ORT's ``run_onnxruntime`` helper for this model and print its QPS.

        Prints a notice and returns without benchmarking when the model name
        is not a key of ORT's ``MODELS`` table.
        """
        if self.model_name not in MODELS:
            # Adjacent literals instead of backslash continuations, so the
            # printed text never picks up source indentation.
            print(
                f"{self.model_name} is currently not supported in ORT's HF. "
                "Check "
                "https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/huggingface_models.py "
                "for currently supported models. Exiting benchmark ONNX.")
            return
        (use_gpu, num_threads, batch_sizes, sequence_lengths,
         cache_dir) = self._ort_common_settings(inputs)
        onnx_dir = os.path.join(".", "onnx_models")
        verbose = False
        input_counts = [1]
        optimize_onnx = True
        validate_onnx = False
        disable_ort_io_binding = False
        use_raw_attention_mask = True
        model_fusion_statistics = {}
        overwrite = False
        model_source = "pt"  # Either "pt" or "tf"
        provider = None
        config_modifier = ConfigModifier(None)
        onnx_args = OnnxFusionOptions()
        result = run_onnxruntime(
            use_gpu, provider, [self.model_name], None, config_modifier,
            Precision.FLOAT32, num_threads, batch_sizes, sequence_lengths,
            shark_args.num_iterations, input_counts, optimize_onnx,
            validate_onnx, cache_dir, onnx_dir, verbose, overwrite,
            disable_ort_io_binding, use_raw_attention_mask,
            model_fusion_statistics, model_source, onnx_args)
        self._report_qps("ORT", result)
|
||||
@@ -20,3 +20,11 @@ Pillow
|
||||
# Testing and support.
|
||||
lit
|
||||
pyyaml
|
||||
|
||||
# To Enable ONNX Runtime Benchmarks
|
||||
# TODO: Uncomment this when the builder is fixed.
|
||||
# onnx
|
||||
# --extra-index-url https://test.pypi.org/simple/
|
||||
# ort-nightly
|
||||
# coloredlogs
|
||||
# sympy
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
-f https://github.com/nod-ai/SHARK-Runtime/releases
|
||||
--pre
|
||||
|
||||
numpy
|
||||
numpy==1.22.4
|
||||
torch
|
||||
torchvision
|
||||
|
||||
|
||||
@@ -120,6 +120,16 @@ fi
|
||||
|
||||
if [[ $(uname -s) = 'Linux' ]]; then
  echo "${Yellow}Linux detected.. installing importer tools"
  # Modules required for ONNX/Transformer Benchmarking.
  # TODO: move this to requirements.txt
  # Installed one package per pip invocation, in this exact order.
  for pkg in protobuf coloredlogs flatbuffers sympy psutil; do
    $PYTHON -m pip install $pkg
  done
  # These two packages are pulled from the test PyPI index.
  for pkg in onnx-weekly ort-nightly; do
    $PYTHON -m pip install -i https://test.pypi.org/simple/ $pkg
  done

  $PYTHON -m pip install --upgrade -r "$TD/requirements-importer.txt" -f https://github.com/nod-ai/SHARK-Runtime/releases
fi
|
||||
|
||||
|
||||
@@ -269,7 +269,6 @@ def tensor_to_type_str(input_tensors: tuple, frontend: str):
|
||||
Output: list of string that represent mlir types (i.e 1x24xf64)
|
||||
# TODO: Support more than floats, and ints
|
||||
"""
|
||||
print("front:", frontend)
|
||||
list_of_type = []
|
||||
for input_tensor in input_tensors:
|
||||
type_string = "x".join([str(dim) for dim in input_tensor.shape])
|
||||
@@ -329,7 +328,7 @@ def run_cmd(cmd):
|
||||
sys.exit("Exiting program due to error running:", cmd)
|
||||
|
||||
|
||||
def run_benchmark(benchmark_cl):
|
||||
def run_benchmark_module(benchmark_cl):
|
||||
"""
|
||||
Run benchmark command, extract result and return iteration/seconds.
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ from torch_mlir_e2e_test.eager_backends.refbackend import EagerModeRefBackend
|
||||
|
||||
from shark.iree_eager_backend import EagerModeIREELinalgOnTensorsBackend
|
||||
from shark.torch_mlir_utils import get_torch_mlir_module, run_on_refbackend
|
||||
from shark.iree_utils import get_results, get_iree_compiled_module, export_iree_module_to_vmfb, export_module_to_mlir_file, build_benchmark_args, run_benchmark
|
||||
from shark.iree_utils import get_results, get_iree_compiled_module, export_iree_module_to_vmfb, export_module_to_mlir_file, build_benchmark_args, run_benchmark_module
|
||||
import os
|
||||
from shark.parser import shark_args
|
||||
from tqdm import tqdm
|
||||
@@ -152,7 +152,7 @@ class SharkBenchmarkRunner(SharkRunner):
|
||||
return
|
||||
|
||||
def benchmark_c(self):
|
||||
result = run_benchmark(self.benchmark_cl)
|
||||
result = run_benchmark_module(self.benchmark_cl)
|
||||
print(f"Shark-{self.frontend} C-benchmark:{result} iter/second")
|
||||
|
||||
def benchmark_python(self, inputs):
|
||||
|
||||
35
shark/tests/test_hf_benchmark.py
Normal file
35
shark/tests/test_hf_benchmark.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import torch
|
||||
from benchmarks.hf_transformer import SharkHFBenchmarkRunner
|
||||
import importlib
|
||||
import pytest
|
||||
|
||||
torch.manual_seed(0)
|
||||
|
||||
############################# HF Benchmark Tests ####################################
|
||||
|
||||
# Test running benchmark module without failing.
|
||||
# (dynamic, device) combinations exercised by the HF benchmark tests.
_BENCHMARK_CASES = [
    pytest.param(False, "cpu"),
    # TODO: Language models are failing for dynamic case..
    pytest.param(True, "cpu", marks=pytest.mark.skip),
]

# Reusable parametrize decorator built from the cases above.
pytest_benchmark_param = pytest.mark.parametrize(("dynamic", "device"),
                                                 _BENCHMARK_CASES)
|
||||
|
||||
@pytest.mark.skipif(importlib.util.find_spec("onnxruntime") is None,
                    reason="Cannot find ONNXRUNTIME.")
@pytest_benchmark_param
def test_HFbench_minilm_torch(dynamic, device):
    """Smoke test: every benchmark entry point runs without raising.

    No try/except wrapper is used. Any exception raised while benchmarking
    propagates to pytest, which fails the test and reports the original
    traceback instead of a bare ``assert False``.
    """
    # NOTE(review): the test name mentions MiniLM but the model benchmarked
    # here is bert-base-uncased; the name is kept so existing -k selectors
    # keep matching.
    model_name = "bert-base-uncased"
    test_input = torch.randint(2, (1, 128))
    shark_module = SharkHFBenchmarkRunner(model_name, (test_input,),
                                          jit_trace=True,
                                          dynamic=dynamic,
                                          device=device)
    shark_module.benchmark_c()
    shark_module.benchmark_python((test_input,))
    shark_module.benchmark_torch(test_input)
    shark_module.benchmark_onnx(test_input)
|
||||
Reference in New Issue
Block a user