From 773d5b60bfb83411993c7b499413e42e7fc43dbd Mon Sep 17 00:00:00 2001 From: chenyu Date: Mon, 11 Nov 2024 18:11:18 -0500 Subject: [PATCH] beam benchmark tests (#7638) * beam benchmark tests * lower AMD number somehow * less flaky --- .github/workflows/benchmark.yml | 8 +++-- test/external/speed_v_theoretical.py | 51 ++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 test/external/speed_v_theoretical.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 22e18f4f6f..ad49ddbdcd 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -170,8 +170,10 @@ jobs: run: NV=1 PTX=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_ptx.txt - name: Run Tensor Core GEMM (NV) run: NV=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_nv.txt - - name: Run Tensor Core GEMM (NV) with BEAM - run: BEAM=4 NV=1 HALF=1 IGNORE_BEAM_CACHE=1 DEBUG=2 python3 extra/gemm/simple_matmul.py + # - name: Run Tensor Core GEMM (NV) with BEAM + # run: BEAM=4 NV=1 HALF=1 IGNORE_BEAM_CACHE=1 DEBUG=2 python3 extra/gemm/simple_matmul.py + - name: Test speed vs theoretical + run: NV=1 IGNORE_BEAM_CACHE=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py - name: Run Stable Diffusion run: NV=1 python3 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd.txt - name: Run SDXL @@ -343,6 +345,8 @@ jobs: AMD=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded - name: Run Tensor Core GEMM (AMD) run: AMD=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_amd.txt + - name: Test speed vs theoretical + run: AMD=1 IGNORE_BEAM_CACHE=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py # TODO: AMD compiler bug causes this to fail #- name: Fuzz Padded Tensor Core GEMM # run: HSA=1 M_START=12 M_STOP=20 M_STEP=1 N_START=12 N_STOP=20 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 
HALF=1 TC_OPT=2 DEBUG=2 python3 ./extra/gemm/fuzz_matmul.py
diff --git a/test/external/speed_v_theoretical.py b/test/external/speed_v_theoretical.py
new file mode 100644
index 0000000000..9806019aff
--- /dev/null
+++ b/test/external/speed_v_theoretical.py
@@ -0,0 +1,51 @@
+import unittest, time
+from tinygrad import Tensor, TinyJit, Device
+from tinygrad.helpers import Context, DEBUG
+
+class TestKernelSpeed(unittest.TestCase):
+  """Check that a half-precision matmul run under BEAM=3 reaches a minimum TFLOPS on the current backend."""
+  def _test_matmul(self, M, N=None, K=None, nv=None, amd=None):
+    """Time (MxK) @ (KxN) ten times and assert the fastest run beats the current device's target.
+
+    N and K default to M. nv / amd are the minimum TFLOPS required when Device.DEFAULT is "NV" / "AMD";
+    the test is skipped (rather than passing without any assertion) when the current device has no target.
+    """
+    # skip up front: with no target there is nothing to assert, and assertGreater against None raises TypeError
+    target = {"NV": nv, "AMD": amd}.get(Device.DEFAULT)
+    if target is None: self.skipTest(f"no speed target for {Device.DEFAULT}")
+    if N is None: N = M
+    if K is None: K = M
+
+    @TinyJit
+    def f(a, b): return (a @ b).realize()
+
+    tms = []
+    with Context(BEAM=3):
+      for _ in range(10):
+        # inputs are created with BEAM and DEBUG switched off, and before the timer starts
+        with Context(BEAM=0, DEBUG=0):
+          a = Tensor.rand(M, K, dtype="half").realize()
+          b = Tensor.rand(K, N, dtype="half").realize()
+          Device.default.synchronize()
+        st = time.perf_counter()
+        _c = f(a, b)
+        Device.default.synchronize()
+        tms.append(time.perf_counter() - st)
+
+    # fastest of the 10 runs; presumably the first ones also pay for JIT capture / BEAM search
+    tm = min(tms)
+    tflops = 2 * M * N * K / tm / 1e12
+    if DEBUG >= 1: print(f"{tm=}", f"{tflops=}", f"target: {target}", sep="\n")
+    self.assertGreater(tflops, target)
+
+  # TODO: smaller ones have other overhead in synchronize
+  # TODO: AMD number can be better (perf level?)
+  def test_gemm_1024(self): self._test_matmul(1024, nv=9, amd=5)
+  def test_gemm_2048(self): self._test_matmul(2048, nv=50, amd=20)
+  def test_gemm_4096(self): self._test_matmul(4096, nv=95, amd=30)
+  def test_gemm_8192(self): self._test_matmul(8192, nv=130, amd=50)
+
+  # TODO: add gemv, which is memory bound
+
+if __name__ == '__main__':
+  unittest.main()