diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 5c8b36bf6b..9b043fda38 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -110,12 +110,12 @@ jobs: run: CUDA=1 python3 test/external/external_model_benchmark.py - name: Test speed vs torch run: CUDA=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt - - name: Run Tensor Core GEMM(CUDA) + - name: Run Tensor Core GEMM (CUDA) run: | CUDA=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt CUDA=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_bfloat16.txt - - name: Run Tensor Core GEMM(PTX) - run: CUDA=1 PTX=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt + - name: Run Tensor Core GEMM (PTX) + run: CUDA=1 PTX=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_ptx.txt - name: Run LLaMA run: | CUDA=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt @@ -148,6 +148,7 @@ jobs: torch_speed.txt matmul.txt matmul_bfloat16.txt + matmul_ptx.txt llama_unjitted.txt llama_jitted.txt llama_beam.txt @@ -192,8 +193,10 @@ jobs: # run: | # python3 -c "import torch; print(torch.__version__)" # LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt - - name: Run Tensor Core GEMM + - name: Run Tensor Core GEMM (HSA) run: HSA=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt + - name: Run Tensor Core GEMM (KFD) + run: KFD=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_kfd.txt - name: Run Stable Diffusion run: HSA=1 python3 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd.txt - name: Run LLaMA 7B @@ -229,6 +232,7 @@ jobs: gpt2_unjitted.txt gpt2_jitted.txt matmul.txt + matmul_kfd.txt sd.txt mixtral.txt diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py index 291c1882c6..bd539a3791 100644 --- a/tinygrad/codegen/kernel.py +++ b/tinygrad/codegen/kernel.py @@ -53,6 +53,7 @@ tensor_cores: Dict[str, List[TensorCore]] = { "HSA": [TensorCore(dims=(16,16,16), threads=[(0,8),(0,2),(1,2)], thread_local_sizes=[[16],[16],[4,2]], thread_local_aliases=[ [[2],[0],[0],[-1],[1]], [[0],[2],[1],[-1],[0]], [[-2],[2],[1],[0],[3,-1]] ], dtype_in=di, dtype_out=do) for (di, do) in [(dtypes.half, dtypes.float), (dtypes.half, dtypes.half)]], # noqa: E501 "CUDA": [TensorCore(dims=(8,16,16), threads=[(0,2),(0,2),(1,2),(1,2),(0,2)], thread_local_sizes=[[2,2,2],[2,2],[2,2]], thread_local_aliases=[ [[0],[-2],[5],[0],[0],[-1,1,2,-3],[3,4]], [[5],[0],[0],[4],[3],[-1,1,2,-2],[0]], [[2],[-2],[5],[1],[-1],[0],[3,4]] ], dtype_in=di, dtype_out=do) for (di, do) in ([(dtypes.half, dtypes.float)] if getenv("PTX") else [(dtypes.half, dtypes.float), (dtypes.bfloat16, dtypes.float)])], # noqa: E501 } +tensor_cores["KFD"] = tensor_cores["HSA"] class LocalBuffer(NamedTuple): name: str