[FRONTEND] Make the performance model work for int8, tf32, and fp32 (#456)

Author: daadaada
Date: 2022-02-12 14:34:42 +08:00
Committed by: GitHub
Parent: 9b100302d3
Commit: a9dfdcaaa9

5 changed files with 201 additions and 69 deletions


@@ -330,18 +330,56 @@ def get_dram_gbps(backend=None, device=None):
         device = torch.cuda.current_device()
     mem_clock_khz = _triton.runtime.memory_clock_rate(backend, device)
     bus_width = _triton.runtime.global_memory_bus_width(backend, device)
-    bw_gbps = mem_clock_khz * bus_width * 2 // 1024 // 1024 // 8  # In GB/s
+    bw_gbps = mem_clock_khz * bus_width * 2 / 1e6 / 8  # In GB/s
     return bw_gbps
 
 
-def get_max_tensorcore_tflops(backend, device):
+def get_max_tensorcore_tflops(dtype: torch.dtype, backend=None, device=None, clock_rate=None):
     if not backend:
         backend = _triton.runtime.backend.CUDA
     if not device:
         device = torch.cuda.current_device()
     num_subcores = _triton.runtime.num_sm(backend, device) * 4  # on recent GPUs
-    clock_rate = _triton.runtime.clock_rate(backend, device)  # in kHz
-    # assume fp32 += fp16*fp16
-    ops_per_sub_core = 512
-    tflops = num_subcores * clock_rate * ops_per_sub_core / (1024 * 1024 * 1024)
+    if not clock_rate:
+        clock_rate = _triton.runtime.clock_rate(backend, device)  # in kHz
+    cc = _triton.runtime.cc(backend, device)
+    if cc < 80:
+        assert dtype == torch.float16
+        ops_per_sub_core = 256  # 2 4x4x4 Tensor Cores
+    else:
+        if dtype == torch.float32:
+            ops_per_sub_core = 256
+        elif dtype in [torch.float16, torch.bfloat16]:
+            ops_per_sub_core = 512
+        elif dtype == torch.int8:
+            ops_per_sub_core = 1024
+        else:
+            raise RuntimeError("dtype not supported")
+    tflops = num_subcores * clock_rate * ops_per_sub_core * 1e-9
     return tflops
+
+
+def get_max_simd_tflops(dtype: torch.dtype, backend=None, device=None):
+    if not backend:
+        backend = _triton.runtime.backend.CUDA
+    if not device:
+        device = torch.cuda.current_device()
+    num_subcores = _triton.runtime.num_sm(backend, device) * 4  # on recent GPUs
+    clock_rate = _triton.runtime.clock_rate(backend, device)  # in kHz
+    cc = _triton.runtime.cc(backend, device)
+    if cc < 80:
+        if dtype == torch.float32:
+            ops_per_sub_core = 32  # 2*16
+        elif dtype == torch.float16:
+            ops_per_sub_core = 64
+        else:
+            raise RuntimeError("dtype not supported")
+    else:
+        if dtype == torch.float32:
+            ops_per_sub_core = 32
+        elif dtype in [torch.float16, torch.bfloat16]:
+            ops_per_sub_core = 64
+        else:
+            raise RuntimeError("dtype not supported")
+    tflops = num_subcores * clock_rate * ops_per_sub_core * 1e-9
+    return tflops
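
The bandwidth fix replaces truncating integer division in binary units (// 1024 // 1024 // 8) with float division in decimal units (/ 1e6 / 8), and the peak-FLOPS formulas now pick ops_per_sub_core by dtype. As a sanity check, not part of the commit, the new formulas can be evaluated with published A100-SXM figures (108 SMs, 1410 MHz boost clock, 1215 MHz HBM2 clock, 5120-bit bus) in place of the _triton.runtime queries:

# Sanity check (not part of the commit): plug A100-SXM datasheet values
# into the new formulas instead of querying _triton.runtime.
num_subcores = 108 * 4   # 108 SMs, 4 sub-cores (warp schedulers) per SM
clock_rate = 1410e3      # 1410 MHz boost clock, in kHz

# Tensor-core peak for each dtype branch added by this commit (cc >= 80).
for name, ops_per_sub_core in [("fp32/tf32", 256), ("fp16/bf16", 512), ("int8", 1024)]:
    tflops = num_subcores * clock_rate * ops_per_sub_core * 1e-9
    print(name, round(tflops))   # ~156, ~312, ~624 -- matches the A100 datasheet

# CUDA-core (SIMD) fp32 peak: ops_per_sub_core = 32.
print(round(num_subcores * clock_rate * 32 * 1e-9, 1))  # 19.5 TFLOPS -- matches

# DRAM bandwidth with the fixed float formula.
mem_clock_khz = 1215e3   # 1215 MHz HBM2, in kHz
bus_width = 5120         # bits
print(mem_clock_khz * bus_width * 2 / 1e6 / 8)  # 1555.2 GB/s -- matches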
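
The new clock_rate parameter also lets callers pin a fixed clock instead of querying the current one. A minimal usage sketch, assuming these helpers are exposed through triton.testing as the hunk context suggests; on pre-Ampere cards (cc < 80) get_max_tensorcore_tflops accepts only torch.float16:

import torch
import triton

# Hypothetical calls against the new signatures from the diff above;
# backend, device, and clock_rate default to the current CUDA device.
tc_tf32 = triton.testing.get_max_tensorcore_tflops(torch.float32)  # tf32 path on cc >= 80
tc_int8 = triton.testing.get_max_tensorcore_tflops(torch.int8)     # int8 Tensor Core peak
simd_fp32 = triton.testing.get_max_simd_tflops(torch.float32)      # CUDA-core (non-TC) peak
print(tc_tf32, tc_int8, simd_fp32)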