diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 631c541dcb..603d44898f 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -123,6 +123,8 @@ jobs: CUDA=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_bfloat16.txt - name: Run Tensor Core GEMM (PTX) run: CUDA=1 PTX=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_ptx.txt + - name: Run Tensor Core GEMM (NV) + run: NV=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_nv.txt - name: Fuzz Padded Tensor Core GEMM(CUDA) run: CUDA=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py - name: Fuzz Padded Tensor Core GEMM(PTX) @@ -160,6 +162,7 @@ jobs: matmul.txt matmul_bfloat16.txt matmul_ptx.txt + matmul_nv.txt llama_unjitted.txt llama_jitted.txt llama_beam.txt diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py index 1b155eade5..1fe5b1e29f 100644 --- a/tinygrad/codegen/kernel.py +++ b/tinygrad/codegen/kernel.py @@ -59,6 +59,7 @@ tensor_cores: Dict[str, List[TensorCore]] = { "CUDA": [TensorCore(dims=(8,16,16), threads=[(0,2),(0,2),(1,2),(1,2),(0,2)], thread_local_sizes=[[2,2,2],[2,2],[2,2]], thread_local_aliases=[ [[0],[-2],[5],[0],[0],[-1,1,2,-3],[3,4]], [[5],[0],[0],[4],[3],[-1,1,2,-2],[0]], [[2],[-2],[5],[1],[-1],[0],[3,4]] ], dtype_in=di, dtype_out=do) for (di, do) in ([(dtypes.half, dtypes.float)] if getenv("PTX") else [(dtypes.half, dtypes.float), (dtypes.bfloat16, dtypes.float)])], # noqa: E501 } tensor_cores["AMD"] = tensor_cores["HSA"] +tensor_cores["NV"] = tensor_cores["CUDA"] class LocalBuffer(NamedTuple): name: str