import torch

from amdshark.amdshark_benchmark_runner import AMDSharkBenchmarkRunner
from amdshark.parser import amdshark_args
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from onnxruntime.transformers.benchmark import (
    run_pytorch,
    run_tensorflow,
    run_onnxruntime,
)
from onnxruntime.transformers.huggingface_models import MODELS
from onnxruntime.transformers.benchmark_helper import ConfigModifier, Precision
import os
import psutil

class OnnxFusionOptions(object):
    # Stand-in for the parsed CLI flags that onnxruntime.transformers uses to
    # configure graph fusions; passed to run_onnxruntime in benchmark_onnx below.
    def __init__(self):
        self.disable_gelu = False
        self.disable_layer_norm = False
        self.disable_attention = False
        self.disable_skip_layer_norm = False
        self.disable_embed_layer_norm = False
        self.disable_bias_skip_layer_norm = False
        self.disable_bias_gelu = False
        self.enable_gelu_approximation = False
        self.use_mask_index = False
        self.no_attention_mask = False

class HuggingFaceLanguage(torch.nn.Module):
    def __init__(self, hf_model_name):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            hf_model_name,  # The pretrained model.
            num_labels=2,  # The number of output labels: 2 for binary classification.
            output_attentions=False,  # Whether the model returns attention weights.
            output_hidden_states=False,  # Whether the model returns all hidden states.
            torchscript=True,
        )

    def forward(self, tokens):
        return self.model.forward(tokens)[0]

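# Illustrative sketch, not part of the original file: one way to build the
# token-id input consumed by the runner below, using the AutoTokenizer import
# above. The model name and text are placeholder assumptions.
def example_tokenize(hf_model_name="bert-base-uncased", text="Hello, world!"):
    tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
    # return_tensors="pt" yields a [batch, seq_len] LongTensor of token ids,
    # matching the shape-based indexing (inputs.shape[0], inputs.shape[-1])
    # done by the benchmark methods in AMDSharkHFBenchmarkRunner.
    return tokenizer(text, return_tensors="pt").input_ids
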
class AMDSharkHFBenchmarkRunner(AMDSharkBenchmarkRunner):
    # AMDSharkBenchmarkRunner derived class with HuggingFace benchmarking capabilities.
    def __init__(
        self,
        model_name: str,
        input: tuple,
        dynamic: bool = False,
        device: str = None,
        jit_trace: bool = False,
        from_aot: bool = False,
        frontend: str = "torch",
    ):
        self.device = device if device is not None else amdshark_args.device
        if self.device == "gpu":
            raise ValueError(
                "GPU benchmarking is currently not supported due to OOM from ORT."
            )
        self.model_name = model_name
        model = HuggingFaceLanguage(model_name)
        AMDSharkBenchmarkRunner.__init__(
            self,
            model,
            input,
            dynamic,
            self.device,
            jit_trace,
            from_aot,
            frontend,
        )

    def benchmark_torch(self, inputs):
        use_gpu = self.device == "gpu"
        # Set the model's layer number to automatic.
        config_modifier = ConfigModifier(None)
        num_threads = psutil.cpu_count(logical=False)
        batch_sizes = [inputs.shape[0]]
        sequence_lengths = [inputs.shape[-1]]
        cache_dir = os.path.join(".", "cache_models")
        verbose = False
        result = run_pytorch(
            use_gpu,
            [self.model_name],
            None,
            config_modifier,
            Precision.FLOAT32,
            num_threads,
            batch_sizes,
            sequence_lengths,
            amdshark_args.num_iterations,
            False,
            cache_dir,
            verbose,
        )
        print(
            f"ONNX PyTorch-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{amdshark_args.num_iterations}"
        )

    # TODO: Currently non-functional due to a TF runtime error. There might be
    # some issue with initializing TF.
    def benchmark_tf(self, inputs):
        use_gpu = self.device == "gpu"
        # Set the model's layer number to automatic.
        config_modifier = ConfigModifier(None)
        num_threads = psutil.cpu_count(logical=False)
        batch_sizes = [inputs.shape[0]]
        sequence_lengths = [inputs.shape[-1]]
        cache_dir = os.path.join(".", "cache_models")
        verbose = False
        result = run_tensorflow(
            use_gpu,
            [self.model_name],
            None,
            config_modifier,
            Precision.FLOAT32,
            num_threads,
            batch_sizes,
            sequence_lengths,
            amdshark_args.num_iterations,
            cache_dir,
            verbose,
        )
        print(
            f"ONNX TF-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{amdshark_args.num_iterations}"
        )

    def benchmark_onnx(self, inputs):
        if self.model_name not in MODELS:
            print(
                f"{self.model_name} is currently not supported in ORT's HuggingFace model list. Check "
                "https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/huggingface_models.py "
                "for currently supported models. Exiting benchmark ONNX."
            )
            return
        use_gpu = self.device == "gpu"
        num_threads = psutil.cpu_count(logical=False)
        batch_sizes = [inputs.shape[0]]
        sequence_lengths = [inputs.shape[-1]]
        cache_dir = os.path.join(".", "cache_models")
        onnx_dir = os.path.join(".", "onnx_models")
        verbose = False
        input_counts = [1]
        optimize_onnx = True
        validate_onnx = False
        disable_ort_io_binding = False
        use_raw_attention_mask = True
        model_fusion_statistics = {}
        overwrite = False
        model_source = "pt"  # Either "pt" or "tf".
        provider = None
        config_modifier = ConfigModifier(None)
        onnx_args = OnnxFusionOptions()
        result = run_onnxruntime(
            use_gpu,
            provider,
            [self.model_name],
            None,
            config_modifier,
            Precision.FLOAT32,
            num_threads,
            batch_sizes,
            sequence_lengths,
            amdshark_args.num_iterations,
            input_counts,
            optimize_onnx,
            validate_onnx,
            cache_dir,
            onnx_dir,
            verbose,
            overwrite,
            disable_ort_io_binding,
            use_raw_attention_mask,
            model_fusion_statistics,
            model_source,
            onnx_args,
        )
        print(
            f"ONNX ORT-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{amdshark_args.num_iterations}"
        )
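
# Illustrative usage sketch under stated assumptions, not part of the original
# file: wires the example_tokenize helper above into the runner. The model name
# and "cpu" device string are placeholders; amdshark_args is expected to supply
# num_iterations, and AMDSharkBenchmarkRunner handles the compile/dispatch.
if __name__ == "__main__":
    token_ids = example_tokenize("bert-base-uncased", "AMDShark benchmark smoke test.")
    runner = AMDSharkHFBenchmarkRunner(
        "bert-base-uncased",
        (token_ids,),
        dynamic=False,
        device="cpu",
        frontend="torch",
    )
    runner.benchmark_torch(token_ids)
    runner.benchmark_onnx(token_ids)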