mirror of
https://github.com/ROCm/ROCm.git
synced 2026-04-05 03:01:17 -04:00
[TESTING] cleanup (#2293)
Co-authored-by: Philippe Tillet <phil@openai.com>
This commit is contained in:
@@ -1,13 +1,10 @@
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import triton
|
||||
import triton.language as tl
|
||||
import triton.ops
|
||||
from triton.testing import get_dram_gbps, get_max_tensorcore_tflops
|
||||
from triton.testing import get_dram_gbps, get_max_tensorcore_tflops, nvsmi
|
||||
|
||||
DEVICE_NAME = {7: 'v100', 8: 'a100'}[torch.cuda.get_device_capability()[0]]
|
||||
|
||||
@@ -21,15 +18,6 @@ def print_perf(cur_ms, cur_util, ref_util):
|
||||
print(f'{cur_ms:.3f} ms \t cur: {cur_util:.3f} \t ref: {ref_util:.3f} \t dif={cur_util - ref_util:.3f}', end='\t')
|
||||
|
||||
|
||||
def nvsmi(attrs):
    """Query integer-valued attributes of GPU 0 through ``nvidia-smi``.

    Args:
        attrs: iterable of ``nvidia-smi --query-gpu`` field names,
            e.g. ``['clocks.current.sm']``.

    Returns:
        A list of ints, one per requested attribute, in the order given.

    Raises:
        subprocess.CalledProcessError: if ``nvidia-smi`` exits non-zero.
        ValueError: if a reported value is not an integer.
    """
    cmd = ['nvidia-smi', '-i', '0', '--query-gpu=' + ','.join(attrs), '--format=csv,noheader,nounits']
    out = subprocess.check_output(cmd)
    # sys.stdout can be None or be replaced by an object whose ``encoding``
    # is None (e.g. an in-memory capture stream); decoding with None raises
    # TypeError, so fall back to UTF-8 in that case.
    encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
    # int() tolerates the surrounding whitespace/newline left by the CSV output.
    return [int(field) for field in out.decode(encoding).split(',')]
|
||||
|
||||
|
||||
#######################
|
||||
# Matrix Multiplication
|
||||
#######################
|
||||
@@ -51,9 +39,9 @@ matmul_data = {
|
||||
(16, 8192, 8192): {'float16': 0.077, 'float32': 0.077, 'int8': 0.043},
|
||||
(64, 1024, 1024): {'float16': 0.018, 'float32': 0.023, 'int8': 0.017},
|
||||
(64, 4096, 4096): {'float16': 0.150, 'float32': 0.000, 'int8': 0.097},
|
||||
(64, 8192, 8192): {'float16': 0.338, 'float32': 0.000, 'int8': 0.174},
|
||||
(64, 8192, 8192): {'float16': 0.214, 'float32': 0.000, 'int8': 0.174},
|
||||
(1024, 64, 1024): {'float16': 0.029, 'float32': 0.046, 'int8': 0.017},
|
||||
(4096, 64, 4096): {'float16': 0.179, 'float32': 0.214, 'int8': 0.102},
|
||||
(4096, 64, 4096): {'float16': 0.136, 'float32': 0.214, 'int8': 0.102},
|
||||
(8192, 64, 8192): {'float16': 0.278, 'float32': 0.000, 'int8': 0.177},
|
||||
# test EVEN_K==False
|
||||
(8192, 8192, 8176): {'float16': 0.786, 'float32': 0.743, 'int8': 0.51},
|
||||
|
||||
@@ -5,14 +5,16 @@ import torch
|
||||
from .. import cdiv
|
||||
from .._C.libtriton.triton import runtime
|
||||
from ..runtime import driver
|
||||
from ..testing import get_dram_gbps, get_max_simd_tflops, get_max_tensorcore_tflops
|
||||
from ..testing import (get_dram_gbps, get_max_simd_tflops, get_max_tensorcore_tflops,
|
||||
nvsmi)
|
||||
|
||||
|
||||
def get_tensorcore_tflops(backend, device, num_ctas, num_warps, dtype):
    ''' return compute throughput in TOPS '''
    # Only up to 4 warps per CTA are counted towards occupied sub-cores.
    total_warps = num_ctas * min(num_warps, 4)
    num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4  # on recent GPUs
    # Scale the peak by the SM clock currently reported by nvidia-smi.
    # NOTE(review): nvidia-smi presumably reports this clock in MHz, while the
    # device-property clock used elsewhere in this code base is annotated as
    # kHz -- confirm which unit get_max_tensorcore_tflops expects.
    cur_sm_clock = nvsmi(['clocks.current.sm'])[0]
    # Fraction of sub-cores kept busy, times the peak tensor-core throughput.
    tflops = min(num_subcores, total_warps) / num_subcores * get_max_tensorcore_tflops(dtype, cur_sm_clock, backend, device)
    return tflops
|
||||
|
||||
|
||||
@@ -20,7 +22,8 @@ def get_simd_tflops(backend, device, num_ctas, num_warps, dtype):
|
||||
''' return compute throughput in TOPS '''
|
||||
total_warps = num_ctas * min(num_warps, 4)
|
||||
num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4 # on recent GPUs
|
||||
tflops = min(num_subcores, total_warps) / num_subcores * get_max_simd_tflops(dtype, backend, device)
|
||||
cur_sm_clock = nvsmi(['clocks.current.sm'])[0]
|
||||
tflops = min(num_subcores, total_warps) / num_subcores * get_max_simd_tflops(dtype, cur_sm_clock, backend, device)
|
||||
return tflops
|
||||
|
||||
|
||||
|
||||
@@ -368,7 +368,7 @@ def get_dram_gbps(backend=None, device=None):
|
||||
return bw_gbps
|
||||
|
||||
|
||||
def get_max_tensorcore_tflops(dtype, backend=None, device=None, clock_rate=None):
|
||||
def get_max_tensorcore_tflops(dtype, clock_rate, backend=None, device=None):
|
||||
import torch
|
||||
|
||||
from .runtime import driver
|
||||
@@ -378,8 +378,6 @@ def get_max_tensorcore_tflops(dtype, backend=None, device=None, clock_rate=None)
|
||||
device = torch.cuda.current_device()
|
||||
|
||||
num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4
|
||||
if not clock_rate:
|
||||
clock_rate = driver.utils.get_device_properties(device)["sm_clock_rate"] # in kHz
|
||||
capability = torch.cuda.get_device_capability(device)
|
||||
if capability[0] < 8:
|
||||
assert dtype == torch.float16
|
||||
@@ -423,21 +421,6 @@ def cuda_memcheck(**target_kwargs):
|
||||
return decorator
|
||||
|
||||
|
||||
def nvsmi_attr(attrs):
    """Query integer-valued attributes of GPU 0 through ``nvidia-smi``.

    Args:
        attrs: iterable of ``nvidia-smi --query-gpu`` field names,
            e.g. ``["clocks.current.sm"]``.

    Returns:
        A list of ints, one per requested attribute, in the order given.

    Raises:
        subprocess.CalledProcessError: if ``nvidia-smi`` exits non-zero.
        ValueError: if a reported value is not an integer.
    """
    cmd = [
        "nvidia-smi",
        "-i",
        "0",
        "--query-gpu=" + ",".join(attrs),
        "--format=csv,noheader,nounits",
    ]
    out = subprocess.check_output(cmd)
    # sys.stdout can be None or be replaced by an object whose ``encoding``
    # is None (e.g. an in-memory capture stream); decoding with None raises
    # TypeError, so fall back to UTF-8 in that case.
    encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
    # int() tolerates the surrounding whitespace/newline left by the CSV output.
    return [int(field) for field in out.decode(encoding).split(",")]
|
||||
|
||||
|
||||
@contextmanager
|
||||
def set_gpu_clock(ref_sm_clock=1350, ref_mem_clock=1215):
|
||||
try:
|
||||
@@ -458,8 +441,8 @@ def set_gpu_clock(ref_sm_clock=1350, ref_mem_clock=1215):
|
||||
f"--lock-memory-clocks={ref_mem_clock},{ref_mem_clock}",
|
||||
]
|
||||
)
|
||||
cur_sm_clock = nvsmi_attr(["clocks.current.sm"])[0]
|
||||
cur_mem_clock = nvsmi_attr(["clocks.current.memory"])[0]
|
||||
cur_sm_clock = nvsmi(["clocks.current.sm"])[0]
|
||||
cur_mem_clock = nvsmi(["clocks.current.memory"])[0]
|
||||
assert abs(cur_sm_clock - ref_sm_clock) < 10, f"GPU SMs must run at {ref_sm_clock} MHz"
|
||||
assert abs(cur_mem_clock - ref_mem_clock) < 10, f"GPU SMs must run at {ref_mem_clock} MHz"
|
||||
tflops = 1e-6 * 2 * 108 * 4 * 256 * ref_sm_clock
|
||||
@@ -471,7 +454,7 @@ def set_gpu_clock(ref_sm_clock=1350, ref_mem_clock=1215):
|
||||
subprocess.check_output(["nvidia-smi", "-i", "0", "-rmc"])
|
||||
|
||||
|
||||
def get_max_simd_tflops(dtype, backend=None, device=None):
|
||||
def get_max_simd_tflops(dtype, clock_rate, backend=None, device=None):
|
||||
import torch
|
||||
|
||||
from .runtime import driver
|
||||
@@ -481,7 +464,6 @@ def get_max_simd_tflops(dtype, backend=None, device=None):
|
||||
device = torch.cuda.current_device()
|
||||
|
||||
num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4
|
||||
clock_rate = driver.utils.get_device_properties(device)["sm_clock_rate"] # in kHz
|
||||
capability = torch.cuda.get_device_capability()
|
||||
if capability[0] < 8:
|
||||
if dtype == torch.float32:
|
||||
|
||||
Reference in New Issue
Block a user