diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index dcac986702..265971b24a 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -172,12 +172,13 @@ jobs:
       run: NV=1 python test/external/external_benchmark_multitensor_allreduce.py
     - name: Test tensor cores
       run: |
-        NV=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
-        PTX=1 NV=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
+        NV=1 ALLOW_TF32=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
+        PTX=1 ALLOW_TF32=1 NV=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
     - name: Run Tensor Core GEMM (CUDA)
       run: |
         CUDA=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
         CUDA=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_bfloat16.txt
+        CUDA=1 ALLOW_TF32=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_tf32.txt
     - name: Run Tensor Core GEMM (PTX)
       run: NV=1 PTX=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_ptx.txt
     - name: Run Tensor Core GEMM (NV)
@@ -226,6 +227,7 @@ jobs:
           torch_speed.txt
           matmul.txt
           matmul_bfloat16.txt
+          matmul_tf32.txt
           matmul_ptx.txt
           matmul_nv.txt
           sd.txt
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 417b222f50..3dcf2d544e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -166,15 +166,16 @@ jobs:
     - name: Test emulated CUDA tensor cores
      run: |
        DEBUG=2 EMULATE_CUDA=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16
+       DEBUG=2 EMULATE_CUDA=1 ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm
        DEBUG=2 EMULATE_CUDA_SM75=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16
-       PYTHONPATH="." DEBUG=2 EMULATE_CUDA=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
+       PYTHONPATH="." DEBUG=2 EMULATE_CUDA=1 ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
    - name: Test emulated INTEL OpenCL tensor cores
      run: DEBUG=2 EMULATE_INTEL=1 FORWARD_ONLY=1 PYTHON=1 HALF=1 N=64 python3 ./extra/gemm/simple_matmul.py
    - name: Full test tensor cores
      run: |
        PYTHONPATH=. DEBUG=2 EMULATE_METAL=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
        PYTHONPATH=. DEBUG=2 EMULATE_AMD=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
-       PYTHONPATH=. DEBUG=2 EMULATE_CUDA=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
+       PYTHONPATH=. DEBUG=2 EMULATE_CUDA=1 ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
        PYTHONPATH=. DEBUG=2 EMULATE_INTEL=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
        PYTHONPATH=. DEBUG=2 AMX=1 EMULATE_AMX=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
    - name: Test tensor cores (TC=3)
diff --git a/docs/env_vars.md b/docs/env_vars.md
index 7540b91146..99b063e8d1 100644
--- a/docs/env_vars.md
+++ b/docs/env_vars.md
@@ -47,4 +47,5 @@ PTX | [1] | enable the specialized [PTX](https://docs.nvi
 PROFILE | [1] | enable profiling. This feature is supported in NV, AMD, QCOM and METAL backends.
 VISIBLE_DEVICES | [list[int]]| restricts the NV/AMD devices that are available. The format is a comma-separated list of identifiers (indexing starts with 0).
 JIT | [0-2] | 0=disabled, 1=[jit enabled](quickstart.md#jit) (default), 2=jit enabled, but graphs are disabled
-VIZ | [1] | 0=disabled, 1=[viz enabled](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/viz)
\ No newline at end of file
+VIZ | [1] | 0=disabled, 1=[viz enabled](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/viz)
+ALLOW_TF32 | [1] | enable TensorFloat-32 tensor cores on Ampere or newer GPUs.
\ No newline at end of file
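Since `ALLOW_TF32` is read with `getenv` inside `CUDARenderer`'s class body (see the cstyle.py hunk below), it must be in the environment before tinygrad is imported. A minimal sketch of exercising the new path from Python; the tensor sizes are illustrative and not part of this diff:

```python
# Sketch: drive the new TF32 path. ALLOW_TF32 is read via getenv() when
# CUDARenderer's class body executes, so set it before importing tinygrad.
import os
os.environ["ALLOW_TF32"] = "1"

from tinygrad import Tensor

# A plain float32 matmul; on an sm_80+ (Ampere or newer) GPU this can now
# be lowered to the (8,16,8) TF32 tensor core instead of the FP32 path.
a, b = Tensor.rand(256, 256), Tensor.rand(256, 256)
print(a.matmul(b).numpy())
```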
diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index cd274b6aaf..002256bbaa 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -1982,7 +1982,7 @@ class TestKernelOpts(unittest.TestCase):
     Tensor.manual_seed(1552)
     for tc in Device[Device.DEFAULT].renderer.tensor_cores:
       # bf16 buffer returns float32 numpy outputs so test would fail. testing opt with half suffices.
-      if tc.dtype_in == dtypes.bfloat16: continue
+      if tc.dtype_in != dtypes.half and tc.dtype_out != dtypes.half: continue
       a, b = Tensor.rand(N, N, dtype=tc.dtype_in), Tensor.rand(N, N, dtype=tc.dtype_in)
       r = a.matmul(b, acc_dtype=tc.dtype_out)
       (atol, rtol) = ((0.25, 0.01) if tc.dtype_out == dtypes.half else (3e-2, 1e-3)) if tc.dtype_in == dtypes.half else (1e-4, 1e-4)
@@ -2009,7 +2009,7 @@ class TestKernelOpts(unittest.TestCase):
     Tensor.manual_seed(1552)
     for tc in Device[Device.DEFAULT].renderer.tensor_cores:
       # bf16 buffer returns float32 numpy outputs so test would fail. testing opt with half suffices.
-      if tc.dtype_in == dtypes.bfloat16: continue
+      if tc.dtype_in != dtypes.half and tc.dtype_out != dtypes.half: continue
       a, b = Tensor.rand(N, N, dtype=tc.dtype_in), Tensor.rand(N, N, dtype=tc.dtype_in)
       r = a.matmul(b, acc_dtype=tc.dtype_out)
       (atol, rtol) = ((0.25, 0.01) if tc.dtype_out == dtypes.half else (3e-2, 1e-3)) if tc.dtype_in == dtypes.half else (1e-4, 1e-4)
diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py
index 5171839cef..4f2e604f05 100644
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -303,8 +303,11 @@ class CUDARenderer(CStyleLanguage):
     swizzle=(((6,7,2,3,4),(0,1,9,5,10,8)), ((6,7,9,0,1),(2,3,4,10,5,8)))) for di,do in [(dtypes.half,dtypes.float), (dtypes.bfloat16,dtypes.float)]]
   tc_8168_f16 = [TensorCore(dims=(8,16,8), threads=32, elements_per_thread=(4,2,4), dtype_in=dtypes.half, dtype_out=dtypes.float, opts=cuda_tc_opts,
     swizzle=(((6,7,2,3,4),(0,1,8,5,9)), ((6,7,8,0,1),(2,3,4,9,5))))]
+  tc_8168_tf32 = [TensorCore(dims=(8,16,8), threads=32, elements_per_thread=(4,2,4), dtype_in=dtypes.float, dtype_out=dtypes.float, opts=cuda_tc_opts,
+    swizzle=(((5,6,2,3,4),(0,1,8,9,7)), ((5,6,8,0,1),(2,3,4,9,7))))]
   tc_sm80 = tc_81616 + tc_8168_f16
+  if getenv("ALLOW_TF32", 0): tc_sm80 += tc_8168_tf32
   tc_sm75 = tc_8168_f16
 
   def __init__(self, arch:str): self.tensor_cores, self.arch = CUDARenderer.tc_sm80 if int(arch[3:]) >= 80 else CUDARenderer.tc_sm75 if int(arch[3:]) >= 75 else [], arch
 
@@ -340,7 +343,7 @@ class CUDARenderer(CStyleLanguage):
     if any(dt.scalar() == dtypes.bfloat16 for dt in used_dtypes): prefix.append("#include <cuda_bf16.h>")
     prefix += [self.render_vector_prefix(dt) for dt in used_dtypes if dt.count in (4,8) and dt.scalar() in {dtypes.half, dtypes.bfloat16}]
 
-    dt_map = { dtypes.half: "f16", dtypes.bfloat16: "bf16" }
+    dt_map = { dtypes.float: "tf32", dtypes.half: "f16", dtypes.bfloat16: "bf16" }
     for name, (N, M, K), dtype_in, dtype_out, _, _, upcast_axes, _ in dedup([uop.arg for uop in uops if uop.op is Ops.WMMA]):
       upcast_sizes = [prod(size for _, size in upcast) for upcast in upcast_axes]
       wmma_dtypes = [self.render_dtype(dtype.vec(size)) for dtype, size in zip([dtype_in, dtype_in, dtype_out], upcast_sizes)]
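For orientation: `dims=(8,16,8)` is `(N, M, K)` in tinygrad's convention (the same order `render_wmma` unpacks below), so the new core is consistent with PTX's `m16n8k8` TF32 MMA shape, where each of the 32 threads in a warp holds 4 A elements, 2 B elements, and 4 accumulators. A quick sanity check of that arithmetic, as a sketch:

```python
# Sanity check: one warp covers whole (N=8, M=16, K=8) tiles with the
# declared elements_per_thread=(4,2,4) split.
N, M, K = 8, 16, 8
threads = 32                          # one warp
a_elems, b_elems, c_elems = 4, 2, 4   # elements_per_thread

assert M * K == threads * a_elems   # A tile: 16*8 = 32*4
assert K * N == threads * b_elems   # B tile: 8*8  = 32*2
assert M * N == threads * c_elems   # C tile: 16*8 = 32*4
print("fragment counts match the m16n8k8 layout")
```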
diff --git a/tinygrad/renderer/ptx.py b/tinygrad/renderer/ptx.py
index 7e91b99071..c63868fe98 100644
--- a/tinygrad/renderer/ptx.py
+++ b/tinygrad/renderer/ptx.py
@@ -60,7 +60,7 @@ def render_wmma(ctx: "PTXRenderer", x: UOp):
   assert ctx.wmma_r, "registry values for wmma must be populated"
   _, (N, M, K), dtype_in, _, _, _, upcast_axes, _ = x.arg
   n_operands = tuple(prod(sz for _, sz in upc)*dtype_in.itemsize//4 for upc in upcast_axes[:2])
-  dt_map = { dtypes.half: "f16" }
+  dt_map = { dtypes.half: "f16", dtypes.float: "tf32" }
   _i = 0
   for vv in x.src[:2]:
     for i in range(0, len(ctx.r[vv]), (elems_per_reg := 4//dtype_in.itemsize)):
@@ -124,7 +124,7 @@ class PTXRenderer(Renderer):
   device = "CUDA"
   suffix = "PTX"
   global_max, local_max, shared_max = CUDARenderer.global_max, CUDARenderer.local_max, CUDARenderer.shared_max
-  tc_sm80 = [tc for tc in CUDARenderer.tc_sm80 if tc.dtype_in == dtypes.half]
+  tc_sm80 = [tc for tc in CUDARenderer.tc_sm80 if tc.dtype_in in [dtypes.half, dtypes.float]]
   code_for_op = asm_for_op
   extra_matcher = ptx_matcher
   def __init__(self, arch:str, device="CUDA"):
diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py
index 4961104492..c95e237f09 100644
--- a/tinygrad/runtime/ops_python.py
+++ b/tinygrad/runtime/ops_python.py
@@ -154,11 +154,16 @@ class PythonProgram:
             def b_elem(x, col, k, goff): return x[k%2 + (k//8)*2][goff + (k//2)%4 + col*4]
             ul[i] = wmma_helper(32, 16, 8, 4, 4, a_elem, b_elem, c_map)
-          elif arg[1] == (8,16,8):
+          elif arg[1] == (8,16,8) and arg[2] == dtypes.half:
             def a_elem(x, k, row, goff): return x[k%2 + (row//8)*2][goff + k//2 + (row%8)*4]
             def b_elem(x, col, k, goff): return x[k%2][goff + k//2 + col*4]
             ul[i] = wmma_helper(32, 8, 4, 2, 4, a_elem, b_elem, c_map)
+          elif arg[1] == (8,16,8) and arg[2] == dtypes.float:
+            def a_elem(x, k, row, goff): return x[(k//4)*2 + row//8][goff + k%4 + (row%8)*4]
+            def b_elem(x, col, k, goff): return x[k//4][goff + k%4 + col*4]
+            ul[i] = wmma_helper(32, 8, 4, 2, 4, a_elem, b_elem, c_map)
+          else: raise NotImplementedError(f"unimplemented tensor core {arg}")
         elif arg[4] == "INTEL":
           # A (16 elements on 8 threads)
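One caveat on the emulated path above: the Python backend multiplies exact float32 values, whereas real TF32 hardware first rounds each input to a 10-bit mantissa (accumulation stays full fp32). A sketch of that input rounding; the round-half-up tie-breaking here is an illustrative assumption (NVIDIA documents round-to-nearest):

```python
# Sketch: TF32's input rounding on real hardware (NOT performed by the
# Python emulator in this diff). fp32 keeps 23 mantissa bits, TF32 keeps
# 10, so the low 13 bits are rounded away before the multiply.
import struct

def to_tf32(x: float) -> float:
    (bits,) = struct.unpack("<I", struct.pack("<f", x))
    bits = (bits + 0x1000) & 0xFFFFE000  # round half up, clear low 13 bits
    return struct.unpack("<f", struct.pack("<I", bits))[0]

print(to_tf32(1.0000001))  # -> 1.0: the difference is below TF32 precision
```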