remove ACCUM_FP32 in simple_matmul.py (#3045)

* remove ACCUM_FP32 in simple_matmul.py

accumulation for half inputs is always done in float32

* move test llama compile speed to metal
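
A minimal sketch of the claim (not part of the commit; assumes a recent tinygrad with top-level Tensor/dtypes imports): with half inputs, a plain matmul matches the removed explicit float32-accumulation path, which is why the ACCUM_FP32 branch was redundant.

import numpy as np
from tinygrad import Tensor, dtypes

N = 64
a = Tensor.rand(N, N, dtype=dtypes.half).realize()
b = Tensor.rand(N, N, dtype=dtypes.half).realize()

# plain matmul: the reduce already accumulates in float32
c = (a @ b).realize()

# the removed ACCUM_FP32 branch: cast to float32 before the reduce
c_fp32 = (a.reshape(N, 1, N) * b.permute(1, 0).reshape(1, N, N)).float().sum(axis=2).realize()

# both paths agree within half-precision rounding of the inputs
np.testing.assert_allclose(c.numpy(), c_fp32.numpy(), atol=1e-4, rtol=3e-2)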
Author: chenyu
Date: 2024-01-08 17:37:57 -05:00
Committed by: GitHub
Parent: 47d67da830
Commit: 1d730b8853
2 changed files with 4 additions and 3 deletions

.github/workflows/test.yml

@@ -254,8 +254,6 @@ jobs:
 #    run: npm install puppeteer
 #  - name: Run WEBGPU Efficientnet
 #    run: node test/web/test_webgpu.js
-#  - name: Test LLaMA compile speed
-#    run: PYTHONPATH="." METAL=1 python test/external/external_test_speed_llama.py
   testmetal:
     name: Metal Tests
@@ -291,6 +289,8 @@ jobs:
       run: METAL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py
     - name: Test tensor core ops
       run: METAL=1 TC=2 DEBUG=3 python test/test_ops.py TestOps.test_big_gemm
+    - name: Test LLaMA compile speed
+      run: PYTHONPATH="." METAL=1 python test/external/external_test_speed_llama.py
   testhipcompilation:
     name: HIP Compilation Tests

extra/gemm/simple_matmul.py

@@ -8,7 +8,8 @@ a, b = Tensor.rand(N, N, dtype=dtype_in).realize(), Tensor.rand(N, N, dtype=dtype_in).realize()
 for i in range(CNT):
   if i > 0 and getenv("RAND", 0) != 0:
     a, b = Tensor.rand(N, N, dtype=dtype_in).realize(), Tensor.rand(N, N, dtype=dtype_in).realize()
-  c = (a.reshape(N, 1, N) * b.permute(1,0).reshape(1, N, N)).float().sum(axis=2).realize() if getenv("ACCUM_FP32") else (a @ b).realize()
+  # NOTE: accumulation is in float32
+  c = (a @ b).realize()
   comp = a.numpy().astype(np.float32) @ b.numpy().astype(np.float32)
   nc = c.numpy()
   np.testing.assert_allclose(nc, comp, atol=1e-4, rtol=3e-2)
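
A hypothetical invocation after this change (only RAND is visible in the hunk above; HALF, N, and CNT are assumed to be the script's other getenv knobs): the check against the float32 numpy reference should pass for half inputs too, since accumulation is always float32.

# hypothetical: the HALF/N/CNT names are assumptions, not shown in this diff
HALF=1 N=4096 CNT=8 RAND=1 METAL=1 python extra/gemm/simple_matmul.py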