mirror of
https://github.com/ROCm/ROCm.git
synced 2026-04-05 03:01:17 -04:00
[FRONTEND][BACKEND] Add a performance test for reductions (#2125)
Also stop promoting integer types, as it doesn't give better perf; this will allow more vectorization opportunities in the future.
This commit is contained in:
@@ -377,9 +377,9 @@ def get_max_tensorcore_tflops(dtype, backend=None, device=None, clock_rate=None)
|
||||
assert dtype == torch.float16
|
||||
ops_per_sub_core = 256 # 2 4x4x4 Tensor Cores
|
||||
else:
|
||||
if dtype == torch.float32:
|
||||
if dtype in [torch.float32, torch.int32]:
|
||||
ops_per_sub_core = 256
|
||||
elif dtype in [torch.float16, torch.bfloat16]:
|
||||
elif dtype in [torch.float16, torch.bfloat16, torch.int16]:
|
||||
ops_per_sub_core = 512
|
||||
elif dtype in [torch.int8, tl.float8e4, tl.float8e4b15, tl.float8e5]:
|
||||
ops_per_sub_core = 1024
|
||||
|
||||
Reference in New Issue
Block a user