SHARK-Studio/benchmarks/hf_transformer.py


import os

import psutil
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from amdshark.amdshark_benchmark_runner import AMDSharkBenchmarkRunner
from amdshark.parser import amdshark_args
from onnxruntime.transformers.benchmark import (
    run_pytorch,
    run_tensorflow,
    run_onnxruntime,
)
from onnxruntime.transformers.benchmark_helper import ConfigModifier, Precision
from onnxruntime.transformers.huggingface_models import MODELS


class OnnxFusionOptions(object):
    # Stand-in for the fusion/optimization flags passed to ORT's run_onnxruntime below.
    def __init__(self):
        self.disable_gelu = False
        self.disable_layer_norm = False
        self.disable_attention = False
        self.disable_skip_layer_norm = False
        self.disable_embed_layer_norm = False
        self.disable_bias_skip_layer_norm = False
        self.disable_bias_gelu = False
        self.enable_gelu_approximation = False
        self.use_mask_index = False
        self.no_attention_mask = False


class HuggingFaceLanguage(torch.nn.Module):
    def __init__(self, hf_model_name):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            hf_model_name,  # The pretrained model name.
            num_labels=2,  # The number of output labels; 2 for binary classification.
            output_attentions=False,  # Whether the model returns attention weights.
            output_hidden_states=False,  # Whether the model returns all hidden states.
            torchscript=True,
        )

    def forward(self, tokens):
        return self.model.forward(tokens)[0]
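

# Illustrative sketch, not part of the original module: the AutoTokenizer import
# above is the natural way to build the token tensor that HuggingFaceLanguage
# expects as `tokens`. The model name, sentence, and sequence length below are
# placeholder assumptions.
def _example_tokenize(hf_model_name="bert-base-uncased", max_length=128):
    tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
    # Pad to a fixed length so the benchmark sees a stable sequence length.
    return tokenizer(
        "The quick brown fox jumps over the lazy dog.",
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )["input_ids"]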


class AMDSharkHFBenchmarkRunner(AMDSharkBenchmarkRunner):
    # AMDSharkBenchmarkRunner subclass with HuggingFace benchmarking capabilities.
    def __init__(
        self,
        model_name: str,
        input: tuple,
        dynamic: bool = False,
        device: str = None,
        jit_trace: bool = False,
        from_aot: bool = False,
        frontend: str = "torch",
    ):
        self.device = device if device is not None else amdshark_args.device
        if self.device == "gpu":
            raise ValueError(
                "GPU benchmarking is currently not supported due to OOM from ORT."
            )
        self.model_name = model_name
        model = HuggingFaceLanguage(model_name)
        AMDSharkBenchmarkRunner.__init__(
            self,
            model,
            input,
            dynamic,
            self.device,
            jit_trace,
            from_aot,
            frontend,
        )

    def benchmark_torch(self, inputs):
        use_gpu = self.device == "gpu"
        # Set the model's layer number to automatic.
        config_modifier = ConfigModifier(None)
        num_threads = psutil.cpu_count(logical=False)
        batch_sizes = [inputs.shape[0]]
        sequence_lengths = [inputs.shape[-1]]
        cache_dir = os.path.join(".", "cache_models")
        verbose = False
        result = run_pytorch(
            use_gpu,
            [self.model_name],
            None,
            config_modifier,
            Precision.FLOAT32,
            num_threads,
            batch_sizes,
            sequence_lengths,
            amdshark_args.num_iterations,
            False,
            cache_dir,
            verbose,
        )
        print(
            f"ONNX Pytorch-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{amdshark_args.num_iterations}"
        )

    # TODO: Currently non-functional due to a TF runtime error. There might be
    # some issue with initializing TF.
    def benchmark_tf(self, inputs):
        use_gpu = self.device == "gpu"
        # Set the model's layer number to automatic.
        config_modifier = ConfigModifier(None)
        num_threads = psutil.cpu_count(logical=False)
        batch_sizes = [inputs.shape[0]]
        sequence_lengths = [inputs.shape[-1]]
        cache_dir = os.path.join(".", "cache_models")
        verbose = False
        result = run_tensorflow(
            use_gpu,
            [self.model_name],
            None,
            config_modifier,
            Precision.FLOAT32,
            num_threads,
            batch_sizes,
            sequence_lengths,
            amdshark_args.num_iterations,
            cache_dir,
            verbose,
        )
        print(
            f"ONNX TF-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{amdshark_args.num_iterations}"
        )

    def benchmark_onnx(self, inputs):
        if self.model_name not in MODELS:
            print(
                f"{self.model_name} is currently not supported in ORT's HuggingFace model list. Check "
                "https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/huggingface_models.py "
                "for currently supported models. Exiting benchmark ONNX."
            )
            return
        use_gpu = self.device == "gpu"
        num_threads = psutil.cpu_count(logical=False)
        batch_sizes = [inputs.shape[0]]
        sequence_lengths = [inputs.shape[-1]]
        cache_dir = os.path.join(".", "cache_models")
        onnx_dir = os.path.join(".", "onnx_models")
        verbose = False
        input_counts = [1]
        optimize_onnx = True
        validate_onnx = False
        disable_ort_io_binding = False
        use_raw_attention_mask = True
        model_fusion_statistics = {}
        overwrite = False
        model_source = "pt"  # Either "pt" or "tf".
        provider = None
        config_modifier = ConfigModifier(None)
        onnx_args = OnnxFusionOptions()
        result = run_onnxruntime(
            use_gpu,
            provider,
            [self.model_name],
            None,
            config_modifier,
            Precision.FLOAT32,
            num_threads,
            batch_sizes,
            sequence_lengths,
            amdshark_args.num_iterations,
            input_counts,
            optimize_onnx,
            validate_onnx,
            cache_dir,
            onnx_dir,
            verbose,
            overwrite,
            disable_ort_io_binding,
            use_raw_attention_mask,
            model_fusion_statistics,
            model_source,
            onnx_args,
        )
        print(
            f"ONNX ORT-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{amdshark_args.num_iterations}"
        )
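

if __name__ == "__main__":
    # Illustrative usage sketch, not part of the original module. It assumes a
    # working AMDShark + ONNX Runtime setup and uses a placeholder model name
    # that appears in ORT's MODELS list; _example_tokenize is the hypothetical
    # helper defined above.
    example_tokens = _example_tokenize("bert-base-uncased")
    benchmark = AMDSharkHFBenchmarkRunner(
        "bert-base-uncased",
        (example_tokens,),
        dynamic=False,
        device="cpu",  # Assumed device string; "gpu" is rejected by the constructor.
        frontend="torch",
    )
    benchmark.benchmark_torch(example_tokens)
    benchmark.benchmark_onnx(example_tokens)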