[FRONTEND][BACKEND] Add a performance test for reductions (#2125)

Also stop promoting integer types, as it doesn't give better perf; this
will allow more vectorization opportunities in the future.
This commit is contained in:
Thomas
2023-08-17 16:30:33 -07:00
committed by GitHub
parent 3fa6d51bc9
commit 387fc890a5
5 changed files with 75 additions and 13 deletions

View File

@@ -377,9 +377,9 @@ def get_max_tensorcore_tflops(dtype, backend=None, device=None, clock_rate=None)
assert dtype == torch.float16
ops_per_sub_core = 256 # 2 4x4x4 Tensor Cores
else:
if dtype == torch.float32:
if dtype in [torch.float32, torch.int32]:
ops_per_sub_core = 256
elif dtype in [torch.float16, torch.bfloat16]:
elif dtype in [torch.float16, torch.bfloat16, torch.int16]:
ops_per_sub_core = 512
elif dtype in [torch.int8, tl.float8e4, tl.float8e4b15, tl.float8e5]:
ops_per_sub_core = 1024