From 773d5b60bfb83411993c7b499413e42e7fc43dbd Mon Sep 17 00:00:00 2001
From: chenyu <chenyu@fastmail.com>
Date: Mon, 11 Nov 2024 18:11:18 -0500
Subject: [PATCH] beam benchmark tests (#7638)

* beam benchmark tests

* lower AMD number somehow

* less flaky
---
 .github/workflows/benchmark.yml      |  8 +++--
 test/external/speed_v_theoretical.py | 51 ++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 2 deletions(-)
 create mode 100644 test/external/speed_v_theoretical.py

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 22e18f4f6f..ad49ddbdcd 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -170,8 +170,10 @@ jobs:
       run: NV=1 PTX=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_ptx.txt
     - name: Run Tensor Core GEMM (NV)
       run: NV=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_nv.txt
-    - name: Run Tensor Core GEMM (NV) with BEAM
-      run: BEAM=4 NV=1 HALF=1 IGNORE_BEAM_CACHE=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
+    # - name: Run Tensor Core GEMM (NV) with BEAM
+    #   run: BEAM=4 NV=1 HALF=1 IGNORE_BEAM_CACHE=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
+    - name: Test speed vs theoretical
+      run: NV=1 IGNORE_BEAM_CACHE=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py
     - name: Run Stable Diffusion
       run: NV=1 python3 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd.txt
     - name: Run SDXL
@@ -343,6 +345,8 @@ jobs:
         AMD=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
     - name: Run Tensor Core GEMM (AMD)
       run: AMD=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_amd.txt
+    - name: Test speed vs theoretical
+      run: AMD=1 IGNORE_BEAM_CACHE=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py
     # TODO: AMD compiler bug causes this to fail
     #- name: Fuzz Padded Tensor Core GEMM
     #  run: HSA=1 M_START=12 M_STOP=20 M_STEP=1 N_START=12 N_STOP=20 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 DEBUG=2 python3 ./extra/gemm/fuzz_matmul.py
diff --git a/test/external/speed_v_theoretical.py b/test/external/speed_v_theoretical.py
new file mode 100644
index 0000000000..9806019aff
--- /dev/null
+++ b/test/external/speed_v_theoretical.py
@@ -0,0 +1,51 @@
+import unittest, time
+from tinygrad import Tensor, TinyJit, Device
+from tinygrad.helpers import Context, DEBUG
+
+class TestKernelSpeed(unittest.TestCase):  # asserts that measured half-precision matmul TFLOPS exceed per-backend floors
+  def _test_matmul(self, M, N=None, K=None, nv=None, amd=None):  # nv / amd: TFLOPS the result must exceed on that backend
+    # times (MxK) @ (KxN) with half-precision inputs; N and K default to M, giving a square matmul
+    @TinyJit
+    def f(a, b): return (a @ b).realize()
+
+    if N is None: N = M
+    if K is None: K = M
+    tms = []  # wall-clock seconds of each timed call
+    with Context(BEAM=3):
+      for _ in range(10):
+        with Context(BEAM=0, DEBUG=0):  # build fresh random inputs with BEAM and DEBUG overridden to 0, outside the timed region
+          a = Tensor.rand(M, K, dtype="half").realize()
+          b = Tensor.rand(K, N, dtype="half").realize()
+        Device.default.synchronize()  # presumably drains pending device work before the clock starts
+        st = time.perf_counter()
+        _c = f(a, b)
+        Device.default.synchronize()  # presumably waits for the matmul to finish so it is included in the timing
+        tms.append(time.perf_counter() - st)
+
+    ops = 2 * M * N * K  # matmul FLOP count: one multiply and one add per (m, n, k) term
+    tm = min(tms)  # best (fastest) of the 10 timed calls
+    tflops = ops / tm / 1e12
+
+    if DEBUG >= 1:
+      print(f"{tm=}")
+      print(f"{tflops=}")
+
+    if Device.DEFAULT == "NV":  # only NV and AMD are checked; on any other default device nothing is asserted
+      if DEBUG >=1: print(f"target: {nv}")
+      self.assertGreater(tflops, nv)
+    if Device.DEFAULT == "AMD":
+      if DEBUG >=1: print(f'target: {amd}')
+      self.assertGreater(tflops, amd)
+
+  # TODO: smaller sizes have additional overhead in synchronize
+  # TODO: AMD numbers could be better (perf level?)
+  def test_gemm_1024(self): self._test_matmul(1024, nv=9, amd=5)
+  def test_gemm_2048(self): self._test_matmul(2048, nv=50, amd=20)
+  def test_gemm_4096(self): self._test_matmul(4096, nv=95, amd=30)
+  def test_gemm_8192(self): self._test_matmul(8192, nv=130, amd=50)
+
+  # TODO: add gemv, which is memory-bound
+
+
+if __name__ == '__main__':  # lets the file be run directly as a script, in addition to via pytest
+  unittest.main()
\ No newline at end of file