ORT-HF Benchmark Integration (#101)

-Add HF Benchmarker class.
-Add sample to benchmark HF model.

Example:
```bash
python -m benchmarks.hf_model_benchmark --num_iterations=10 --model_name="microsoft/MiniLM-L12-H384-uncased"
```
This commit is contained in:
Stanley Winata
2022-06-07 23:49:39 -07:00
committed by GitHub
parent a9faeae794
commit 8565be9b6b
11 changed files with 221 additions and 6 deletions

View File

@@ -90,5 +90,5 @@ jobs:
- name: Validate Models
run: |
source shark.venv/bin/activate
pytest -k 'not benchmark'
pytest -k 'not benchmark' --ignore=shark/tests/test_hf_benchmark.py

4
.gitignore vendored
View File

@@ -162,3 +162,7 @@ cython_debug/
# Shark related artefacts
shark.venv/
# ORT related artefacts
cache_models/
onnx_models/

0
benchmarks/__init__.py Normal file
View File

View File

@@ -0,0 +1,22 @@
import torch
from shark.parser import parser
from benchmarks.hf_transformer import SharkHFBenchmarkRunner
parser.add_argument(
    "--model_name",
    type=str,
    required=True,
    # Fixed: "exmaple" typo and the missing closing parenthesis in the help text.
    help="Specifies name of HF model to benchmark. "
    "(For example \"microsoft/MiniLM-L12-H384-uncased\")",
)
load_args, unknown = parser.parse_known_args()

if __name__ == "__main__":
    # Benchmark the requested HuggingFace model through every available
    # runner: SHARK C, SHARK Python, ORT's PyTorch path, and ONNX Runtime.
    model_name = load_args.model_name
    # Random token ids shaped (batch=1, seq_len=128) as a stand-in input.
    test_input = torch.randint(2, (1, 128))
    shark_module = SharkHFBenchmarkRunner(model_name, (test_input,),
                                          jit_trace=True)
    shark_module.benchmark_c()
    shark_module.benchmark_python((test_input,))
    shark_module.benchmark_torch(test_input)
    shark_module.benchmark_onnx(test_input)

View File

@@ -0,0 +1,137 @@
import torch
from shark.shark_runner import SharkBenchmarkRunner
from shark.parser import shark_args
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from onnxruntime.transformers.benchmark import run_pytorch, run_tensorflow, run_onnxruntime
from onnxruntime.transformers.huggingface_models import MODELS
from onnxruntime.transformers.benchmark_helper import ConfigModifier, Precision
import os
import psutil
class OnnxFusionOptions(object):
    """Default-off container mirroring the fusion flags expected by ORT's
    transformer optimizer: every flag is False, so ORT applies its standard
    fusions with no masking options enabled."""

    def __init__(self):
        # All flags share the same default; set them in one pass instead of
        # one assignment per line.
        for flag in (
                "disable_gelu",
                "disable_layer_norm",
                "disable_attention",
                "disable_skip_layer_norm",
                "disable_embed_layer_norm",
                "disable_bias_skip_layer_norm",
                "disable_bias_gelu",
                "enable_gelu_approximation",
                "use_mask_index",
                "no_attention_mask",
        ):
            setattr(self, flag, False)
class HuggingFaceLanguage(torch.nn.Module):
    """Thin wrapper around a pretrained HF sequence-classification model
    that exposes a logits-only forward pass suitable for jit tracing."""

    def __init__(self, hf_model_name):
        super().__init__()
        # Binary classification head; attentions and hidden states are
        # disabled to keep the traced graph minimal. torchscript=True makes
        # the model return plain tuples so it can be jit-traced.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            hf_model_name,
            num_labels=2,
            output_attentions=False,
            output_hidden_states=False,
            torchscript=True,
        )

    def forward(self, tokens):
        # With torchscript=True the model returns a tuple; keep only the
        # logits tensor at index 0.
        logits = self.model.forward(tokens)[0]
        return logits
class SharkHFBenchmarkRunner(SharkBenchmarkRunner):
    # SharkBenchmarkRunner subclass that additionally runs the same HF model
    # through ORT's reference benchmark runners (PyTorch, TensorFlow, ONNX
    # Runtime) so SHARK's numbers can be compared side-by-side.
    def __init__(
        self,
        model_name: str,
        input: tuple,
        dynamic: bool = False,
        device: str = None,
        jit_trace: bool = False,
        from_aot: bool = False,
        frontend: str = "torch",
    ):
        """Build a SHARK benchmark module for `model_name`.

        model_name: HF hub id (e.g. "bert-base-uncased").
        input: tuple of example input tensors used for tracing/compilation.
        device: falls back to shark_args.device when None. "gpu" is rejected
            because ORT benchmarking OOMs there (see error below).
        Remaining flags are forwarded unchanged to SharkBenchmarkRunner.
        """
        self.device = device if device is not None else shark_args.device
        if self.device == "gpu":
            raise ValueError(
                "Currently GPU Benchmarking is not supported due to OOM from ORT."
            )
        self.model_name = model_name
        model = HuggingFaceLanguage(model_name)
        SharkBenchmarkRunner.__init__(self, model, input, dynamic, self.device,
                                      jit_trace, from_aot, frontend)

    def benchmark_c(self):
        result = run_benchmark_module(self.benchmark_cl)
        print(f"Shark-{self.frontend} C-benchmark:{result} iter/second")

    def benchmark_torch(self, inputs):
        # Benchmark via ORT's reference PyTorch runner and print iter/second.
        # `inputs` is a token tensor; its shape fixes batch/sequence length.
        use_gpu = self.device == "gpu"
        # Pass None so the model's layer count is left at its default.
        config_modifier = ConfigModifier(None)
        # Use physical (not logical) core count for the thread pool.
        num_threads = psutil.cpu_count(logical=False)
        batch_sizes = [inputs.shape[0]]
        sequence_lengths = [inputs.shape[-1]]
        cache_dir = os.path.join(".", "cache_models")
        verbose = False
        # NOTE(review): argument order follows onnxruntime.transformers.
        # benchmark.run_pytorch's positional signature — keep in sync with
        # the pinned ORT version.
        result = run_pytorch(use_gpu, [self.model_name], None, config_modifier,
                             Precision.FLOAT32, num_threads, batch_sizes,
                             sequence_lengths, shark_args.num_iterations, False,
                             cache_dir, verbose)
        print(
            f"ONNX Pytorch-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
        )

    # TODO: Currently non-functional due to TF runtime error. There might be
    # some issue with initializing TF.
    def benchmark_tf(self, inputs):
        # Benchmark via ORT's reference TensorFlow runner (same setup as
        # benchmark_torch above).
        use_gpu = self.device == "gpu"
        # Pass None so the model's layer count is left at its default.
        config_modifier = ConfigModifier(None)
        num_threads = psutil.cpu_count(logical=False)
        batch_sizes = [inputs.shape[0]]
        sequence_lengths = [inputs.shape[-1]]
        cache_dir = os.path.join(".", "cache_models")
        verbose = False
        result = run_tensorflow(use_gpu, [self.model_name], None,
                                config_modifier, Precision.FLOAT32, num_threads,
                                batch_sizes, sequence_lengths,
                                shark_args.num_iterations, cache_dir, verbose)
        print(
            f"ONNX TF-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
        )

    def benchmark_onnx(self, inputs):
        # Benchmark via ONNX Runtime itself. Only models listed in ORT's
        # huggingface_models.MODELS registry are supported; bail out early
        # (with a pointer to the registry) for anything else.
        if self.model_name not in MODELS:
            print(
                f"{self.model_name} is currently not supported in ORT's HF. Check \
https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/huggingface_models.py \
for currently supported models. Exiting benchmark ONNX.")
            return
        use_gpu = self.device == "gpu"
        num_threads = psutil.cpu_count(logical=False)
        batch_sizes = [inputs.shape[0]]
        sequence_lengths = [inputs.shape[-1]]
        cache_dir = os.path.join(".", "cache_models")
        onnx_dir = os.path.join(".", "onnx_models")
        verbose = False
        input_counts = [1]
        optimize_onnx = True
        validate_onnx = False
        disable_ort_io_binding = False
        use_raw_attention_mask = True
        model_fusion_statistics = {}
        overwrite = False
        model_source = "pt"  # Either "pt" or "tf"
        provider = None
        config_modifier = ConfigModifier(None)
        # Default fusion options (everything enabled) — see OnnxFusionOptions.
        onnx_args = OnnxFusionOptions()
        # NOTE(review): long positional call mirrors run_onnxruntime's
        # signature in the pinned ORT version — verify on ORT upgrades.
        result = run_onnxruntime(
            use_gpu, provider, [self.model_name], None, config_modifier,
            Precision.FLOAT32, num_threads, batch_sizes, sequence_lengths,
            shark_args.num_iterations, input_counts, optimize_onnx,
            validate_onnx, cache_dir, onnx_dir, verbose, overwrite,
            disable_ort_io_binding, use_raw_attention_mask,
            model_fusion_statistics, model_source, onnx_args)
        print(
            f"ONNX ORT-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
        )

View File

@@ -20,3 +20,11 @@ Pillow
# Testing and support.
lit
pyyaml
# To Enable ONNX Runtime Benchmarks
# TODO: Uncomment this when builder fixed.
# onnx
# --extra-index-url https://test.pypi.org/simple/
# ort-nightly
# coloredlogs
# sympy

View File

@@ -2,7 +2,7 @@
-f https://github.com/nod-ai/SHARK-Runtime/releases
--pre
numpy
numpy==1.22.4
torch
torchvision

View File

@@ -120,6 +120,16 @@ fi
if [[ $(uname -s) = 'Linux' ]]; then
echo "${Yellow}Linux detected.. installing importer tools"
# Modules required for ONNX/Transformer Benchmarking.
# TODO: move this to requirements.txt
$PYTHON -m pip install protobuf
$PYTHON -m pip install coloredlogs
$PYTHON -m pip install flatbuffers
$PYTHON -m pip install sympy
$PYTHON -m pip install psutil
$PYTHON -m pip install -i https://test.pypi.org/simple/ onnx-weekly
$PYTHON -m pip install -i https://test.pypi.org/simple/ ort-nightly
$PYTHON -m pip install --upgrade -r "$TD/requirements-importer.txt" -f https://github.com/nod-ai/SHARK-Runtime/releases
fi

View File

@@ -269,7 +269,6 @@ def tensor_to_type_str(input_tensors: tuple, frontend: str):
Output: list of string that represent mlir types (i.e 1x24xf64)
# TODO: Support more than floats, and ints
"""
print("front:", frontend)
list_of_type = []
for input_tensor in input_tensors:
type_string = "x".join([str(dim) for dim in input_tensor.shape])
@@ -329,7 +328,7 @@ def run_cmd(cmd):
sys.exit("Exiting program due to error running:", cmd)
def run_benchmark(benchmark_cl):
def run_benchmark_module(benchmark_cl):
"""
Run benchmark command, extract result and return iteration/seconds.

View File

@@ -19,7 +19,7 @@ from torch_mlir_e2e_test.eager_backends.refbackend import EagerModeRefBackend
from shark.iree_eager_backend import EagerModeIREELinalgOnTensorsBackend
from shark.torch_mlir_utils import get_torch_mlir_module, run_on_refbackend
from shark.iree_utils import get_results, get_iree_compiled_module, export_iree_module_to_vmfb, export_module_to_mlir_file, build_benchmark_args, run_benchmark
from shark.iree_utils import get_results, get_iree_compiled_module, export_iree_module_to_vmfb, export_module_to_mlir_file, build_benchmark_args, run_benchmark_module
import os
from shark.parser import shark_args
from tqdm import tqdm
@@ -152,7 +152,7 @@ class SharkBenchmarkRunner(SharkRunner):
return
def benchmark_c(self):
result = run_benchmark(self.benchmark_cl)
result = run_benchmark_module(self.benchmark_cl)
print(f"Shark-{self.frontend} C-benchmark:{result} iter/second")
def benchmark_python(self, inputs):

View File

@@ -0,0 +1,35 @@
import torch
from benchmarks.hf_transformer import SharkHFBenchmarkRunner
import importlib
import pytest
# Fix the RNG seed so the random token inputs are reproducible across runs.
torch.manual_seed(0)

############################# HF Benchmark Tests ####################################
# Test running benchmark module without failing.
# Shared parametrization for the benchmark tests: static shapes on CPU are
# expected to pass; the dynamic-shape case is skipped until it works.
pytest_benchmark_param = pytest.mark.parametrize(
    ('dynamic', 'device'),
    [
        pytest.param(False, 'cpu'),
        # TODO: Language models are failing for dynamic case..
        pytest.param(True, 'cpu', marks=pytest.mark.skip),
    ])
@pytest.mark.skipif(importlib.util.find_spec("onnxruntime") is None,
                    reason="Cannot find ONNXRUNTIME.")
@pytest_benchmark_param
def test_HFbench_minilm_torch(dynamic, device):
    """Smoke-test that every benchmark path (SHARK C, SHARK Python, torch,
    ONNX) runs without raising for a HF language model.

    The original wrapped the whole body in `try/except Exception: assert
    False`, which discarded the real traceback; letting the exception
    propagate fails the test the same way but keeps the diagnostics.
    """
    model_name = "bert-base-uncased"
    # Random token ids shaped (batch=1, seq_len=128).
    test_input = torch.randint(2, (1, 128))
    shark_module = SharkHFBenchmarkRunner(model_name, (test_input,),
                                          jit_trace=True,
                                          dynamic=dynamic,
                                          device=device)
    shark_module.benchmark_c()
    shark_module.benchmark_python((test_input,))
    shark_module.benchmark_torch(test_input)
    shark_module.benchmark_onnx(test_input)