mirror of
https://github.com/ROCm/ROCm.git
synced 2026-04-05 03:01:17 -04:00
[CI][TEST] update pre-commit hooks and use pre-commit for style tests in CI (#1409)
Ref issue:
- #1408

Changes:
- Add `.editorconfig`
- Add `pre-commit-hooks`:
  ```yaml
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: check-symlinks
      - id: destroyed-symlinks
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-toml
      - id: check-ast
      - id: check-added-large-files
      - id: check-merge-conflict
      - id: check-executables-have-shebangs
      - id: check-shebang-scripts-are-executable
      - id: detect-private-key
      - id: debug-statements
  ```
- Add `flake8` to the `pre-commit` config and add a `.flake8` file
- Use `pre-commit` for style tests in CI
- Run `pre-commit` and fix existing violations:
  - fix trailing spaces
  - fix end-of-files
  - fix file modes with `chmod -x`
  - run `autopep8` on existing code
  - fix `flake8` violations
This commit is contained in:
@@ -18,7 +18,12 @@ DEVICE_NAME = {7: 'v100', 8: 'a100'}[torch.cuda.get_device_capability()[0]]
|
||||
|
||||
def nvsmi(attrs):
|
||||
attrs = ','.join(attrs)
|
||||
cmd = ['nvidia-smi', '-i', '0', '--query-gpu=' + attrs, '--format=csv,noheader,nounits']
|
||||
cmd = [
|
||||
'nvidia-smi',
|
||||
'-i',
|
||||
'0',
|
||||
'--query-gpu=' + attrs,
|
||||
'--format=csv,noheader,nounits']
|
||||
out = subprocess.check_output(cmd)
|
||||
ret = out.decode(sys.stdout.encoding).split(',')
|
||||
ret = [int(x) for x in ret]
|
||||
@@ -81,11 +86,15 @@ matmul_data = {
|
||||
def test_matmul(M, N, K, dtype_str):
|
||||
if dtype_str in ['float32', 'int8'] and DEVICE_NAME != 'a100':
|
||||
pytest.skip('Only test float32 & int8 on a100')
|
||||
dtype = {'float16': torch.float16, 'float32': torch.float32, 'int8': torch.int8}[dtype_str]
|
||||
dtype = {
|
||||
'float16': torch.float16,
|
||||
'float32': torch.float32,
|
||||
'int8': torch.int8}[dtype_str]
|
||||
torch.manual_seed(0)
|
||||
ref_gpu_util = matmul_data[DEVICE_NAME][(M, N, K)][dtype_str]
|
||||
cur_sm_clock = nvsmi(['clocks.current.sm'])[0]
|
||||
max_gpu_perf = get_max_tensorcore_tflops(dtype, clock_rate=cur_sm_clock * 1e3)
|
||||
max_gpu_perf = get_max_tensorcore_tflops(
|
||||
dtype, clock_rate=cur_sm_clock * 1e3)
|
||||
if dtype == torch.int8:
|
||||
a = torch.randint(-128, 127, (M, K), dtype=dtype, device='cuda')
|
||||
b = torch.randint(-128, 127, (N, K), dtype=dtype, device='cuda')
|
||||
@@ -93,11 +102,17 @@ def test_matmul(M, N, K, dtype_str):
|
||||
else:
|
||||
a = torch.randn((M, K), dtype=dtype, device='cuda')
|
||||
b = torch.randn((K, N), dtype=dtype, device='cuda')
|
||||
fn = lambda: triton.ops.matmul(a, b)
|
||||
|
||||
def fn():
|
||||
return triton.ops.matmul(a, b)
|
||||
ms = triton.testing.do_bench(fn, percentiles=None, warmup=100, rep=300)
|
||||
cur_gpu_perf = 2. * M * N * K / ms * 1e-9
|
||||
cur_gpu_util = cur_gpu_perf / max_gpu_perf
|
||||
torch.testing.assert_allclose(cur_gpu_util, ref_gpu_util, atol=0.01, rtol=0.05)
|
||||
torch.testing.assert_allclose(
|
||||
cur_gpu_util,
|
||||
ref_gpu_util,
|
||||
atol=0.01,
|
||||
rtol=0.05)
|
||||
|
||||
|
||||
#######################
|
||||
@@ -148,12 +163,20 @@ def test_elementwise(N):
|
||||
z = torch.empty((N, ), dtype=torch.float16, device='cuda')
|
||||
x = torch.randn_like(z)
|
||||
y = torch.randn_like(z)
|
||||
grid = lambda args: (triton.cdiv(N, args['BLOCK_SIZE']), )
|
||||
fn = lambda: _add[grid](x, y, z, N, BLOCK_SIZE=1024)
|
||||
|
||||
def grid(args):
|
||||
return (triton.cdiv(N, args['BLOCK_SIZE']), )
|
||||
|
||||
def fn():
|
||||
return _add[grid](x, y, z, N, BLOCK_SIZE=1024)
|
||||
ms = triton.testing.do_bench(fn, percentiles=None, warmup=100, rep=500)
|
||||
cur_gpu_perf = 3. * N * z.element_size() / ms * 1e-6
|
||||
cur_gpu_util = cur_gpu_perf / max_gpu_perf
|
||||
torch.testing.assert_allclose(cur_gpu_util, ref_gpu_util, atol=0.01, rtol=0.05)
|
||||
torch.testing.assert_allclose(
|
||||
cur_gpu_util,
|
||||
ref_gpu_util,
|
||||
atol=0.01,
|
||||
rtol=0.05)
|
||||
|
||||
#######################
|
||||
# Flash-Attention
|
||||
@@ -175,20 +198,34 @@ def test_flash_attention(Z, H, N_CTX, D_HEAD, mode, dtype_str):
|
||||
is_backward = mode == 'backward'
|
||||
capability = torch.cuda.get_device_capability()
|
||||
if capability[0] < 8:
|
||||
pytest.skip("Flash attention only supported for compute capability < 80")
|
||||
pytest.skip(
|
||||
"Flash attention only supported for compute capability < 80")
|
||||
torch.manual_seed(20)
|
||||
dtype = {'float16': torch.float16, 'float32': torch.float32, 'int8': torch.int8}[dtype_str]
|
||||
dtype = {
|
||||
'float16': torch.float16,
|
||||
'float32': torch.float32,
|
||||
'int8': torch.int8}[dtype_str]
|
||||
# init data
|
||||
q = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2).requires_grad_()
|
||||
k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.4, std=0.2).requires_grad_()
|
||||
v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.3, std=0.2).requires_grad_()
|
||||
q = torch.empty(
|
||||
(Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(
|
||||
mean=0.1, std=0.2).requires_grad_()
|
||||
k = torch.empty(
|
||||
(Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(
|
||||
mean=0.4, std=0.2).requires_grad_()
|
||||
v = torch.empty(
|
||||
(Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(
|
||||
mean=0.3, std=0.2).requires_grad_()
|
||||
sm_scale = 0.2
|
||||
# benchmark
|
||||
fn = lambda: triton.ops.attention(q, k, v, sm_scale)
|
||||
|
||||
def fn():
|
||||
return triton.ops.attention(q, k, v, sm_scale)
|
||||
if is_backward:
|
||||
o = fn()
|
||||
do = torch.randn_like(o)
|
||||
fn = lambda: o.backward(do, retain_graph=True)
|
||||
|
||||
def fn():
|
||||
return o.backward(do, retain_graph=True)
|
||||
ms = triton.testing.do_bench(fn, percentiles=None, warmup=100, rep=500)
|
||||
# compute flops
|
||||
flops_per_matmul = 2. * Z * H * N_CTX * N_CTX * D_HEAD * 0.5
|
||||
@@ -198,7 +235,13 @@ def test_flash_attention(Z, H, N_CTX, D_HEAD, mode, dtype_str):
|
||||
cur_gpu_perf = total_flops / ms * 1e-9
|
||||
# maximum flops
|
||||
cur_sm_clock = nvsmi(['clocks.current.sm'])[0]
|
||||
max_gpu_perf = get_max_tensorcore_tflops(dtype, clock_rate=cur_sm_clock * 1e3)
|
||||
max_gpu_perf = get_max_tensorcore_tflops(
|
||||
dtype, clock_rate=cur_sm_clock * 1e3)
|
||||
cur_gpu_util = cur_gpu_perf / max_gpu_perf
|
||||
ref_gpu_util = flash_attention_data[DEVICE_NAME][(Z, H, N_CTX, D_HEAD, mode, dtype_str)]
|
||||
torch.testing.assert_allclose(cur_gpu_util, ref_gpu_util, atol=0.01, rtol=0.05)
|
||||
ref_gpu_util = flash_attention_data[DEVICE_NAME][(
|
||||
Z, H, N_CTX, D_HEAD, mode, dtype_str)]
|
||||
torch.testing.assert_allclose(
|
||||
cur_gpu_util,
|
||||
ref_gpu_util,
|
||||
atol=0.01,
|
||||
rtol=0.05)
|
||||
|
||||
Reference in New Issue
Block a user