mirror of
https://github.com/ROCm/ROCm.git
synced 2026-04-05 03:01:17 -04:00
[CI] No longer install triton in editable mode to run tests (#1476)
This commit is contained in:
26
.github/workflows/integration-tests.yml
vendored
26
.github/workflows/integration-tests.yml
vendored
@@ -13,7 +13,7 @@ concurrency:
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/master' }}
|
||||
|
||||
env:
|
||||
TRITON_USE_ASSERT_ENABLED_LLVM: 'TRUE'
|
||||
TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
|
||||
|
||||
jobs:
|
||||
Runner-Preparation:
|
||||
@@ -71,25 +71,27 @@ jobs:
|
||||
if: ${{ env.BACKEND != 'ROCM'}}
|
||||
run: |
|
||||
cd python
|
||||
python3 -m pip install --upgrade pip
|
||||
python3 -m pip install cmake==3.24
|
||||
python3 -m pip install -vvv -e '.[tests]'
|
||||
python3 -m pip install --no-build-isolation -vvv '.[tests]'
|
||||
|
||||
- name: Install Triton on ROCM
|
||||
if: ${{ env.BACKEND == 'ROCM'}}
|
||||
run: |
|
||||
cd python
|
||||
python3 -m pip install --upgrade pip
|
||||
python3 -m pip install cmake==3.24
|
||||
python3 -m pip install torch==1.13.1 --index-url https://download.pytorch.org/whl/rocm5.2
|
||||
python3 -m pip install -vvv -e '.[tests]'
|
||||
python3 -m pip install --no-build-isolation -vvv '.[tests]'
|
||||
|
||||
- name: Run lit tests
|
||||
if: ${{ env.BACKEND != 'ROCM'}}
|
||||
run: |
|
||||
python3 -m pip install lit
|
||||
cd python
|
||||
LIT_TEST_DIR="build/$(ls build)/test"
|
||||
LIT_TEST_DIR="build/$(ls build | grep -i temp)/test"
|
||||
if [ ! -d "${LIT_TEST_DIR}" ]; then
|
||||
echo "Not found '${LIT_TEST_DIR}'. Did you change an installation method?" ; exit -1
|
||||
echo "Coult not find '${LIT_TEST_DIR}'" ; exit -1
|
||||
fi
|
||||
lit -v "${LIT_TEST_DIR}"
|
||||
|
||||
@@ -99,19 +101,19 @@ jobs:
|
||||
cd python/test/unit
|
||||
python3 -m pytest
|
||||
|
||||
- name: Run CXX unittests
|
||||
if: ${{ env.BACKEND != 'ROCM'}}
|
||||
run: |
|
||||
cd python
|
||||
cd "build/$(ls build | grep -i temp)"
|
||||
ctest
|
||||
|
||||
- name: Run python tests on ROCM
|
||||
if: ${{ env.BACKEND == 'ROCM'}}
|
||||
run: |
|
||||
cd python/test/unit/language
|
||||
python3 -m pytest --capture=tee-sys -rfs --verbose "test_core.py::test_empty_kernel"
|
||||
|
||||
- name: Run CXX unittests
|
||||
if: ${{ env.BACKEND != 'ROCM'}}
|
||||
run: |
|
||||
cd python
|
||||
cd "build/$(ls build)"
|
||||
ctest
|
||||
|
||||
- name: Regression tests
|
||||
if: ${{ contains(matrix.runner, 'A100') }}
|
||||
run: |
|
||||
|
||||
@@ -16,6 +16,11 @@ DEVICE_NAME = {7: 'v100', 8: 'a100'}[torch.cuda.get_device_capability()[0]]
|
||||
#######################
|
||||
|
||||
|
||||
def print_perf(cur_ms, cur_util, ref_util):
|
||||
# print on the same line cur_ms, cur_util and ref_util with 3 decimal places
|
||||
print(f'{cur_ms:.3f} ms \t cur: {cur_util:.3f} \t ref: {ref_util:.3f} \t dif={cur_util - ref_util:.3f}', end='\t')
|
||||
|
||||
|
||||
def nvsmi(attrs):
|
||||
attrs = ','.join(attrs)
|
||||
cmd = ['nvidia-smi', '-i', '0', '--query-gpu=' + attrs, '--format=csv,noheader,nounits']
|
||||
@@ -55,21 +60,21 @@ matmul_data = {
|
||||
# A100 in the CI server is slow-ish for some reason.
|
||||
# On some other servers, we are getting about 90% peak for 8kx8x8k float16
|
||||
'a100': {
|
||||
(512, 512, 512): {'float16': 0.08, 'float32': 0.13, 'int8': 0.05},
|
||||
(1024, 1024, 1024): {'float16': 0.33, 'float32': 0.35, 'int8': 0.169},
|
||||
(2048, 2048, 2048): {'float16': 0.62, 'float32': 0.57, 'int8': 0.34},
|
||||
(4096, 4096, 4096): {'float16': 0.81, 'float32': 0.75, 'int8': 0.46},
|
||||
(8192, 8192, 8192): {'float16': 0.77, 'float32': 0.85, 'int8': 0.51},
|
||||
(512, 512, 512): {'float16': 0.084, 'float32': 0.13, 'int8': 0.05},
|
||||
(1024, 1024, 1024): {'float16': 0.332, 'float32': 0.35, 'int8': 0.169},
|
||||
(2048, 2048, 2048): {'float16': 0.641, 'float32': 0.57, 'int8': 0.34},
|
||||
(4096, 4096, 4096): {'float16': 0.785, 'float32': 0.75, 'int8': 0.46},
|
||||
(8192, 8192, 8192): {'float16': 0.805, 'float32': 0.85, 'int8': 0.51},
|
||||
# tall-skinny
|
||||
(16, 1024, 1024): {'float16': 0.0077, 'float32': 0.0127, 'int8': 0.005},
|
||||
(16, 4096, 4096): {'float16': 0.0363, 'float32': 0.0457, 'int8': 0.0259},
|
||||
(16, 4096, 4096): {'float16': 0.044, 'float32': 0.0457, 'int8': 0.0259},
|
||||
(16, 8192, 8192): {'float16': 0.07, 'float32': 0.0648, 'int8': 0.0431},
|
||||
(64, 1024, 1024): {'float16': 0.0271, 'float32': 0.0509, 'int8': 0.0169},
|
||||
(64, 4096, 4096): {'float16': 0.16, 'float32': 0.162, 'int8': 0.097},
|
||||
(64, 8192, 8192): {'float16': 0.30, 'float32': 0.257, 'int8': 0.174},
|
||||
(1024, 64, 1024): {'float16': 0.037, 'float32': 0.0458, 'int8': 0.017},
|
||||
(64, 1024, 1024): {'float16': 0.030, 'float32': 0.0509, 'int8': 0.0169},
|
||||
(64, 4096, 4096): {'float16': 0.163, 'float32': 0.162, 'int8': 0.097},
|
||||
(64, 8192, 8192): {'float16': 0.285, 'float32': 0.257, 'int8': 0.174},
|
||||
(1024, 64, 1024): {'float16': 0.033, 'float32': 0.0458, 'int8': 0.017},
|
||||
(4096, 64, 4096): {'float16': 0.16, 'float32': 0.177, 'int8': 0.102},
|
||||
(8192, 64, 8192): {'float16': 0.25, 'float32': 0.230, 'int8': 0.177},
|
||||
(8192, 64, 8192): {'float16': 0.254, 'float32': 0.230, 'int8': 0.177},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,9 +99,10 @@ def test_matmul(M, N, K, dtype_str):
|
||||
a = torch.randn((M, K), dtype=dtype, device='cuda')
|
||||
b = torch.randn((K, N), dtype=dtype, device='cuda')
|
||||
fn = lambda: triton.ops.matmul(a, b)
|
||||
ms = triton.testing.do_bench(fn, warmup=100, rep=300)
|
||||
ms = triton.testing.do_bench(fn, return_mode="min", warmup=100, rep=300)
|
||||
cur_gpu_perf = 2. * M * N * K / ms * 1e-9
|
||||
cur_gpu_util = cur_gpu_perf / max_gpu_perf
|
||||
print_perf(ms, cur_gpu_util, ref_gpu_util)
|
||||
triton.testing.assert_close(cur_gpu_util, ref_gpu_util, atol=0.01, rtol=0.05)
|
||||
|
||||
|
||||
@@ -129,12 +135,12 @@ elementwise_data = {
|
||||
1024 * 65536: 0.939,
|
||||
},
|
||||
'a100': {
|
||||
1024 * 16: 0.008,
|
||||
1024 * 64: 0.034,
|
||||
1024 * 16: 0.010,
|
||||
1024 * 64: 0.040,
|
||||
1024 * 256: 0.132,
|
||||
1024 * 1024: 0.352,
|
||||
1024 * 4096: 0.580,
|
||||
1024 * 16384: 0.782,
|
||||
1024 * 1024: 0.353,
|
||||
1024 * 4096: 0.605,
|
||||
1024 * 16384: 0.758,
|
||||
1024 * 65536: 0.850,
|
||||
}
|
||||
}
|
||||
@@ -150,9 +156,10 @@ def test_elementwise(N):
|
||||
y = torch.randn_like(z)
|
||||
grid = lambda args: (triton.cdiv(N, args['BLOCK_SIZE']), )
|
||||
fn = lambda: _add[grid](x, y, z, N, BLOCK_SIZE=1024)
|
||||
ms = triton.testing.do_bench(fn, warmup=100, rep=500)
|
||||
ms = triton.testing.do_bench(fn, return_mode="min", warmup=100, rep=500)
|
||||
cur_gpu_perf = 3. * N * z.element_size() / ms * 1e-6
|
||||
cur_gpu_util = cur_gpu_perf / max_gpu_perf
|
||||
print_perf(ms, cur_gpu_util, ref_gpu_util)
|
||||
triton.testing.assert_close(cur_gpu_util, ref_gpu_util, atol=0.01, rtol=0.05)
|
||||
|
||||
#######################
|
||||
@@ -189,7 +196,7 @@ def test_flash_attention(Z, H, N_CTX, D_HEAD, mode, dtype_str):
|
||||
o = fn()
|
||||
do = torch.randn_like(o)
|
||||
fn = lambda: o.backward(do, retain_graph=True)
|
||||
ms = triton.testing.do_bench(fn, warmup=100, rep=500)
|
||||
ms = triton.testing.do_bench(fn, return_mode="min", warmup=100, rep=500)
|
||||
# compute flops
|
||||
flops_per_matmul = 2. * Z * H * N_CTX * N_CTX * D_HEAD * 0.5
|
||||
total_flops = 2 * flops_per_matmul
|
||||
@@ -201,4 +208,5 @@ def test_flash_attention(Z, H, N_CTX, D_HEAD, mode, dtype_str):
|
||||
max_gpu_perf = get_max_tensorcore_tflops(dtype, clock_rate=cur_sm_clock * 1e3)
|
||||
cur_gpu_util = cur_gpu_perf / max_gpu_perf
|
||||
ref_gpu_util = flash_attention_data[DEVICE_NAME][(Z, H, N_CTX, D_HEAD, mode, dtype_str)]
|
||||
print_perf(ms, cur_gpu_util, ref_gpu_util)
|
||||
triton.testing.assert_close(cur_gpu_util, ref_gpu_util, atol=0.01, rtol=0.05)
|
||||
|
||||
Reference in New Issue
Block a user