tf32 tc for nv and ptx (#8635)

Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
2026-02-17 10:02:00 -05:00 · 2025-01-17 22:43:57 -03:00
parent 5afb0a4a81
commit d2234e308a
7 changed files with 23 additions and 11 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -172,12 +172,13 @@ jobs:
      run: NV=1 python test/external/external_benchmark_multitensor_allreduce.py
    - name: Test tensor cores
      run: |
-        NV=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
-        PTX=1 NV=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
+        NV=1 ALLOW_TF32=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
+        PTX=1 ALLOW_TF32=1 NV=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
    - name: Run Tensor Core GEMM (CUDA)
      run: |
        CUDA=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
        CUDA=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_bfloat16.txt
+        CUDA=1 ALLOW_TF32=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_tf32.txt
    - name: Run Tensor Core GEMM (PTX)
      run: NV=1 PTX=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_ptx.txt
    - name: Run Tensor Core GEMM (NV)
@@ -226,6 +227,7 @@ jobs:
          torch_speed.txt
          matmul.txt
          matmul_bfloat16.txt
+          matmul_tf32.txt
          matmul_ptx.txt
          matmul_nv.txt
          sd.txt