[FRONTEND] Make the performance model work for int8, tf32, and fp32 (#456)

2026-04-05 03:01:17 -04:00 · 2022-02-12 14:34:42 +08:00
parent 9b100302d3
commit a9dfdcaaa9
5 changed files with 201 additions and 69 deletions
--- a/python/triton/testing.py
+++ b/python/triton/testing.py
@@ -330,18 +330,56 @@ def get_dram_gbps(backend=None, device=None):
        device = torch.cuda.current_device()
    mem_clock_khz = _triton.runtime.memory_clock_rate(backend, device)
    bus_width = _triton.runtime.global_memory_bus_width(backend, device)
-    bw_gbps = mem_clock_khz * bus_width * 2 // 1024 // 1024 // 8  # In GB/s
+    bw_gbps = mem_clock_khz * bus_width * 2 / 1e6 / 8  # In GB/s
    return bw_gbps


-def get_max_tensorcore_tflops(backend, device):
+def get_max_tensorcore_tflops(dtype: torch.dtype, backend=None, device=None, clock_rate=None):
+    if not backend:
+        backend = _triton.runtime.backend.CUDA
+    if not device:
+        device = torch.cuda.current_device()
    num_subcores = _triton.runtime.num_sm(backend, device) * 4  # on recent GPUs
-    clock_rate = _triton.runtime.clock_rate(backend, device)  # in kHz
-    # assume fp32 += fp16*fp16
+    if not clock_rate:
+        clock_rate = _triton.runtime.clock_rate(backend, device)  # in kHz
    cc = _triton.runtime.cc(backend, device)
    if cc < 80:
+        assert dtype == torch.float16
        ops_per_sub_core = 256  # 2 4x4x4 Tensor Cores
    else:
-        ops_per_sub_core = 512
-    tflops = num_subcores * clock_rate * ops_per_sub_core / (1024 * 1024 * 1024)
+        if dtype == torch.float32:
+            ops_per_sub_core = 256
+        elif dtype in [torch.float16, torch.bfloat16]:
+            ops_per_sub_core = 512
+        elif dtype == torch.int8:
+            ops_per_sub_core = 1024
+        else:
+            raise RuntimeError("dtype not supported")
+    tflops = num_subcores * clock_rate * ops_per_sub_core * 1e-9
+    return tflops
+
+
+def get_max_simd_tflops(dtype: torch.dtype, backend=None, device=None):
+    if not backend:
+        backend = _triton.runtime.backend.CUDA
+    if not device:
+        device = torch.cuda.current_device()
+    num_subcores = _triton.runtime.num_sm(backend, device) * 4  # on recent GPUs
+    clock_rate = _triton.runtime.clock_rate(backend, device)  # in kHz
+    cc = _triton.runtime.cc(backend, device)
+    if cc < 80:
+        if dtype == torch.float32:
+            ops_per_sub_core = 32  # 2*16
+        elif dtype == torch.float16:
+            ops_per_sub_core = 64
+        else:
+            raise RuntimeError("dtype not supported")
+    else:
+        if dtype == torch.float32:
+            ops_per_sub_core = 32
+        elif dtype in [torch.float16, torch.bfloat16]:
+            ops_per_sub_core = 64
+        else:
+            raise RuntimeError("dtype not supported")
+    tflops = num_subcores * clock_rate * ops_per_sub_core * 1e-9
    return tflops