[FRONTEND][BACKEND] Add a performance test for reductions (#2125)

Also stop promoting integer types, as it doesn't give better perf; this will allow more vectorization opportunities in the future.
2026-04-05 03:01:17 -04:00 · 2023-08-17 16:30:33 -07:00
parent 3fa6d51bc9
commit 387fc890a5
5 changed files with 75 additions and 13 deletions
--- a/python/triton/testing.py
+++ b/python/triton/testing.py
@@ -377,9 +377,9 @@ def get_max_tensorcore_tflops(dtype, backend=None, device=None, clock_rate=None)
        assert dtype == torch.float16
        ops_per_sub_core = 256  # 2 4x4x4 Tensor Cores
    else:
-        if dtype == torch.float32:
+        if dtype in [torch.float32, torch.int32]:
            ops_per_sub_core = 256
-        elif dtype in [torch.float16, torch.bfloat16]:
+        elif dtype in [torch.float16, torch.bfloat16, torch.int16]:
            ops_per_sub_core = 512
        elif dtype in [torch.int8, tl.float8e4, tl.float8e4b15, tl.float8e5]:
            ops_per_sub_core = 1024