mirror of
https://github.com/ROCm/ROCm.git
synced 2026-04-05 03:01:17 -04:00
[FRONTEND] Make the performance model work for int8, tf32, and fp32 (#456)
This commit is contained in:
@@ -330,18 +330,56 @@ def get_dram_gbps(backend=None, device=None):
|
||||
device = torch.cuda.current_device()
|
||||
mem_clock_khz = _triton.runtime.memory_clock_rate(backend, device)
|
||||
bus_width = _triton.runtime.global_memory_bus_width(backend, device)
|
||||
bw_gbps = mem_clock_khz * bus_width * 2 // 1024 // 1024 // 8 # In GB/s
|
||||
bw_gbps = mem_clock_khz * bus_width * 2 / 1e6 / 8 # In GB/s
|
||||
return bw_gbps
|
||||
|
||||
|
||||
def get_max_tensorcore_tflops(backend, device):
|
||||
def get_max_tensorcore_tflops(dtype: torch.dtype, backend=None, device=None, clock_rate=None):
|
||||
if not backend:
|
||||
backend = _triton.runtime.backend.CUDA
|
||||
if not device:
|
||||
device = torch.cuda.current_device()
|
||||
num_subcores = _triton.runtime.num_sm(backend, device) * 4 # on recent GPUs
|
||||
clock_rate = _triton.runtime.clock_rate(backend, device) # in kHz
|
||||
# assume fp32 += fp16*fp16
|
||||
if not clock_rate:
|
||||
clock_rate = _triton.runtime.clock_rate(backend, device) # in kHz
|
||||
cc = _triton.runtime.cc(backend, device)
|
||||
if cc < 80:
|
||||
assert dtype == torch.float16
|
||||
ops_per_sub_core = 256 # 2 4x4x4 Tensor Cores
|
||||
else:
|
||||
ops_per_sub_core = 512
|
||||
tflops = num_subcores * clock_rate * ops_per_sub_core / (1024 * 1024 * 1024)
|
||||
if dtype == torch.float32:
|
||||
ops_per_sub_core = 256
|
||||
elif dtype in [torch.float16, torch.bfloat16]:
|
||||
ops_per_sub_core = 512
|
||||
elif dtype == torch.int8:
|
||||
ops_per_sub_core = 1024
|
||||
else:
|
||||
raise RuntimeError("dtype not supported")
|
||||
tflops = num_subcores * clock_rate * ops_per_sub_core * 1e-9
|
||||
return tflops
|
||||
|
||||
|
||||
def get_max_simd_tflops(dtype: torch.dtype, backend=None, device=None):
|
||||
if not backend:
|
||||
backend = _triton.runtime.backend.CUDA
|
||||
if not device:
|
||||
device = torch.cuda.current_device()
|
||||
num_subcores = _triton.runtime.num_sm(backend, device) * 4 # on recent GPUs
|
||||
clock_rate = _triton.runtime.clock_rate(backend, device) # in kHz
|
||||
cc = _triton.runtime.cc(backend, device)
|
||||
if cc < 80:
|
||||
if dtype == torch.float32:
|
||||
ops_per_sub_core = 32 # 2*16
|
||||
elif dtype == torch.float16:
|
||||
ops_per_sub_core = 64
|
||||
else:
|
||||
raise RuntimeError("dtype not supported")
|
||||
else:
|
||||
if dtype == torch.float32:
|
||||
ops_per_sub_core = 32
|
||||
elif dtype in [torch.float16, torch.bfloat16]:
|
||||
ops_per_sub_core = 64
|
||||
else:
|
||||
raise RuntimeError("dtype not supported")
|
||||
tflops = num_subcores * clock_rate * ops_per_sub_core * 1e-9
|
||||
return tflops
|
||||
|
||||
Reference in New Issue
Block a user