diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0c4df8fa12..16e2beacbd 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -46,16 +46,20 @@ jobs:
         PYTHONPATH=. DEBUG=2 EMULATE_AMD=1 FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=1 python3 ./extra/gemm/simple_matmul.py
     - name: Test emulated CUDA tensor cores
       run: DEBUG=2 EMULATE_CUDA=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16
+    - name: Test emulated INTEL OpenCL tensor cores
+      run: DEBUG=2 EMULATE_INTEL=1 FORWARD_ONLY=1 PYTHON=1 HALF=1 N=64 python3 ./extra/gemm/simple_matmul.py
     - name: Full test tensor cores
       run: |
         PYTHONPATH=. DEBUG=2 EMULATE_METAL=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
         PYTHONPATH=. DEBUG=2 EMULATE_AMD=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
         PYTHONPATH=. DEBUG=2 EMULATE_CUDA=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
+        PYTHONPATH=. DEBUG=2 EMULATE_INTEL=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
     - name: Test tensor cores (TC=3)
       run: |
         TC=3 DEBUG=3 EMULATE_METAL=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm
         TC=3 PYTHONPATH=. DEBUG=3 EMULATE_AMD=1 FORWARD_ONLY=1 PYTHON=1 N=16 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py
         TC=3 DEBUG=3 EMULATE_CUDA=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16
+        TC=3 PYTHONPATH=. DEBUG=3 EMULATE_INTEL=1 FORWARD_ONLY=1 PYTHON=1 N=16 HALF=1 python3 ./extra/gemm/simple_matmul.py
     - name: Test dtype with Python emulator
       run: DEBUG=1 PYTHONPATH=. PYTHON=1 python3 -m pytest -n=auto test/test_dtype.py test/test_dtype_alu.py
     - name: Test ops with Python emulator
diff --git a/extra/gemm/intel_xmx.py b/extra/gemm/intel_xmx.py
index e72e7899bf..8ec478e5f6 100644
--- a/extra/gemm/intel_xmx.py
+++ b/extra/gemm/intel_xmx.py
@@ -26,9 +26,9 @@ __kernel void test(__global float* data0, const __global int* data1, const __glo
 """))
 #with open("/tmp/test.elf", "wb") as f: f.write(prog.lib)
 
-a = Buffer("GPU", 8, dtypes.float32)
-b = Buffer("GPU", 0x10, dtypes.float16)
-c = Buffer("GPU", 8*0x10, dtypes.float16)
+a = Buffer("GPU", 8, dtypes.float32).allocate()
+b = Buffer("GPU", 0x10, dtypes.float16).allocate()
+c = Buffer("GPU", 8*0x10, dtypes.float16).allocate()
 
 row = np.array([1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8], np.float16)
 mat = np.random.random((8, 0x10)).astype(np.float16)
diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index 44b039e479..087e34039e 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -894,7 +894,7 @@ class TestLinearizer(unittest.TestCase):
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   def test_tensor_cores(self):
     for tc in Device[Device.DEFAULT].renderer.tensor_cores:
-      if getenv("EMULATE_CUDA") and (tc.dtype_in == dtypes.bfloat16 or tc.dtype_out == dtypes.bfloat16): continue
+      if (getenv("EMULATE_CUDA") or getenv("EMULATE_INTEL")) and (tc.dtype_in == dtypes.bfloat16 or tc.dtype_out == dtypes.bfloat16): continue
       helper_tc_allclose(tc.dims[0], tc.dims[1], tc.dims[2], tc.dtype_in, tc.dtype_out, axis=0, tc_opt=0)
 
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py
index beba8cd358..45aa83fa95 100644
--- a/tinygrad/codegen/kernel.py
+++ b/tinygrad/codegen/kernel.py
@@ -328,7 +328,7 @@ class Kernel:
         for (tc_dim, tc_amt) in tc.threads: self.apply_opt(Opt(OptOps.LOCAL, tc_opts.axes[tc_dim], tc_amt), append_opt=False)
         for i, sz in enumerate([prod(x) for x in [[x[1] for x in tc.threads if x[0]==dim] for dim in range(2)]]): # upcast non-local'd N, M
           if tc.dims[i] > sz: self.apply_opt(Opt(OptOps.UPCAST, tc_opts.axes[i], tc.dims[i]//sz), append_opt=False)
-      elif self.opts.device == "METAL":
+      elif self.opts.device == "METAL" or self.opts.suffix == "INTEL":
         self.apply_opt(Opt(OptOps.UNROLL, tc_opts.axes[2]-self.first_reduce, tc.dims[2]), append_opt=False)
         for i, sz in enumerate([prod(x) for x in [[x[1] for x in tc.threads if x[0]==dim] for dim in range(2)]]): # upcast non-local'd N, M
           if tc.dims[i] > sz: self.apply_opt(Opt(OptOps.UPCAST, tc_opts.axes[i], tc.dims[i]//sz), append_opt=False)
@@ -676,11 +676,15 @@ class Kernel:
                                     ((1,1), (1,0), (0,2), (0,3), (0,4)), ((1,3), (1,4), (1,2), (0,0), (0,1), (1,5)))
         fix_st2 = functools.partial(fix_st, (2,2,2,2,2), (8,2,2,2), (2,2,2,2,2,2),
                                     ((1,1), (1,0), (1,5), (0,0), (0,1)), ((0,4), (0,2), (1,4), (0,3), (1,3), (1,2)))
+      elif self.opts.suffix == "INTEL":
+        reduce_axes, upcast_axes = [0], [[(0, 16)], [(0, 16)], [(1, 8)]]
+        fix_st1 = functools.partial(fix_st, (8,), (16,8), (8,2,8), ((1,0),), ((1,2), (1,1), (0,0)))
+        fix_st2 = None
       else: raise RuntimeError("unsupported device for tensor cores")
 
       assert apply_to_st is None, "double tensor core? not supported"
 
-      wmma_arg = (str(tc), tc.dims, tc.dtype_in, tc.dtype_out, self.opts.device,
+      wmma_arg = (str(tc), tc.dims, tc.dtype_in, tc.dtype_out, self.opts.device, prod(t[1] for t in tc.threads),
                   tuple(tuple((self.first_upcast+ax, sz) for ax, sz in up) for up in upcast_axes),
                   tuple(self.first_upcast+ax for ax in reduce_axes))
       if self.use_tensor_cores >= 2:
diff --git a/tinygrad/codegen/uops.py b/tinygrad/codegen/uops.py
index 6cea149cb9..1693b2b81d 100644
--- a/tinygrad/codegen/uops.py
+++ b/tinygrad/codegen/uops.py
@@ -289,5 +289,5 @@ def flops_mem(uops:List[UOp], ignore_indexing=False) -> Tuple[sint, sint]:
       flops += (mults * (2 if u.arg == TernaryOps.MULACC else 1)) * u.dtype.count
     elif u.op is UOps.WMMA and u not in dont_count:
       assert u.arg[1] is not None
-      flops += 2 * prod(u.arg[1]) // 32 * mults
+      flops += 2 * prod(u.arg[1]) // u.arg[5] * mults
   return flops, mem
diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py
index 60b5fb5b05..322ec8fede 100644
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -223,14 +223,32 @@ class OpenCLRenderer(CStyleLanguage):
   float4 = "(float4)"
   code_for_workitem = {"g": lambda x: f"get_group_id({x})", "l": lambda x: f"get_local_id({x})", "i": lambda x: f"get_global_id({x})"}
   uses_vload = True
-  type_map = { dtypes.uint8: "uchar", dtypes.uint32: "uint", dtypes.uint16: "ushort", dtypes.uint64: "ulong" }
+  type_map = { dtypes.uint8: "uchar", dtypes.uint32: "uint", dtypes.uint16: "ushort", dtypes.uint64: "ulong", dtypes.bfloat16: "ushort" }
   def render_cast(self, x, var_dtype, bitcast=False) -> str:
     return f"as_{self.render_dtype(var_dtype)}({x})" if bitcast else super().render_cast(x, var_dtype)
 
   def render_kernel(self, function_name, kernel, bufs, uops, prefix=None) -> str:
-    if any(uop.dtype == dtypes.half for uop in uops): prefix = ["#pragma OPENCL EXTENSION cl_khr_fp16 : enable"]
+    if any(uop.dtype == dtypes.half for uop in uops): prefix = (["#pragma OPENCL EXTENSION cl_khr_fp16 : enable"] + (prefix or []))
     return super().render_kernel(function_name, kernel, bufs, uops, prefix)
 
+class IntelRenderer(OpenCLRenderer):
+  device, suffix, kernel_prefix = "GPU", "INTEL", "__attribute__((intel_reqd_sub_group_size(8)))\n" + "__kernel "
+  tensor_cores = [TensorCore(dims=(8,8,16), threads=[(0,8)], dtype_in=di, dtype_out=do) for di, do in [(dtypes.half, dtypes.float), (dtypes.bfloat16, dtypes.float)]] # noqa: E501
+  def render_dtype(self, var_dtype:DType) -> str:
+    return f"ushort{var_dtype.count}" if "bfloat16" in var_dtype.name else super().render_dtype(var_dtype)
+  def render_cast(self, x, var_dtype, bitcast=False, from_dtype=None) -> str:
+    return f"intel_convert_bfloat16_as_ushort({x[0]})" if (var_dtype, from_dtype) == (dtypes.bfloat16, dtypes.float) else \
+      (f"intel_convert_as_bfloat16_float({x[0]})" if (var_dtype, from_dtype) == (dtypes.float, dtypes.bfloat16) else \
+      super().render_cast(x, var_dtype, bitcast))
+
+  def render_kernel(self, function_name, kernel, bufs, uops, prefix=None) -> str:
+    prefix = []
+    for arg in dedup([uop.arg for uop in uops if uop.op is UOps.WMMA]):
+      dt_in = ("ushort", "bf16") if arg[2] == dtypes.bfloat16 else (arg[2].name, "f16")
+      prefix.append(f"""{arg[3].name}8 __{arg[0]}({dt_in[0]}16 a, {dt_in[0]}16 b, {arg[3].name}8 c) {{
+    return intel_sub_group_{dt_in[1]}_{dt_in[1]}_matrix_mad_k16(as_int8(a), as_int8(b), c);\n}}""")
+    return super().render_kernel(function_name, kernel, bufs, uops, prefix or None)
+
 class MetalRenderer(CStyleLanguage):
   device = "METAL"
   shared_max = 32768
diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py
index 379b067311..6b756dc7ec 100644
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -2,8 +2,8 @@ from __future__ import annotations
 from typing import Tuple, Optional, List, cast
 import ctypes, functools, hashlib
 from tinygrad.runtime.autogen import opencl as cl
-from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
-from tinygrad.renderer.cstyle import OpenCLRenderer
+from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG, getenv
+from tinygrad.renderer.cstyle import OpenCLRenderer, IntelRenderer
 from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler, CompileError
 
 # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
@@ -95,9 +95,11 @@ class CLDevice(Compiled):
     self.context = checked(cl.clCreateContext(None, 1, self.device_id, cl.clCreateContext.argtypes[3](), None, status := ctypes.c_int32()), status)
     self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, status), status)
     self.pending_copyin: List[memoryview] = []
+    self.device_exts = (cl.clGetDeviceInfo(self.device_id, cl.CL_DEVICE_EXTENSIONS, 4096, ctypes.byref(buf := ctypes.create_string_buffer(4096)), ctypes.byref(total := ctypes.c_size_t())), ctypes.string_at(buf, size=total.value).decode())[1] # noqa: E501
     compile_key = hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()
-    super().__init__(device, CLAllocator(self), OpenCLRenderer(), CLCompiler(self, f"compile_cl_{compile_key}"), functools.partial(CLProgram, self))
+    renderer = IntelRenderer() if "cl_intel_subgroup_matrix_multiply_accumulate" in self.device_exts and getenv("INTEL") else OpenCLRenderer()
+    super().__init__(device, CLAllocator(self), renderer, CLCompiler(self, f"compile_cl_{compile_key}"), functools.partial(CLProgram, self))
 
   def synchronize(self):
     check(cl.clFinish(self.queue))
     self.pending_copyin.clear()
diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py
index 46244a3426..d0598a67cd 100644
--- a/tinygrad/runtime/ops_python.py
+++ b/tinygrad/runtime/ops_python.py
@@ -10,7 +10,7 @@ from tinygrad.device import Compiled, Compiler, Allocator
 from tinygrad.codegen.uops import UOps, UOp
 from tinygrad.ops import BinaryOps, TernaryOps, exec_alu, truncate
 from tinygrad.renderer import Renderer
-from tinygrad.renderer.cstyle import CUDARenderer, MetalRenderer, AMDRenderer
+from tinygrad.renderer.cstyle import CUDARenderer, MetalRenderer, AMDRenderer, IntelRenderer
 
 def _load(m, i):
   if i < 0 or i >= len(m): raise IndexError(f"load out of bounds, size is {len(m)} and access is {i}")
@@ -171,6 +171,14 @@ class PythonProgram:
         # (i, j), C, D (4 elements on 32 threads)
         def c_map(lane, elem): return ((elem%2)+(lane%4)*2, (lane//4)+(elem//2)*8)
         ul[i] = wmma_helper(32, 16, 8, 4, 4, a_elem, b_elem, c_map)
+      elif arg[4] == "INTEL":
+        # A (16 elements on 8 threads)
+        def a_elem(x, i, j, goff): return x[i%2+j*2][goff+i//2]
+        # B (16 elements on 8 threads)
+        def b_elem(x, i, j, goff): return x[j][goff+i]
+        # C, D (8 elements on 8 threads)
+        def c_map(lane, elem): return (lane, elem)
+        ul[i] = wmma_helper(8, 16, 16, 16, 8, a_elem, b_elem, c_map)
       else: raise NotImplementedError(f"unimplemented tensor core {arg}")
     elif uop is UOps.ALU:
       assert all_same([len(x) for x in inp]), f"{[len(x) for x in inp]} doesn't match on {arg}"
@@ -186,6 +194,7 @@ class PythonRenderer(Renderer):
     if getenv("EMULATE_METAL"): self.device, self.tensor_cores = "METAL", MetalRenderer.tensor_cores
     if getenv("EMULATE_AMD"): self.device, self.tensor_cores = "AMD", AMDRenderer.tensor_cores
     if getenv("EMULATE_CUDA"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tensor_cores
+    if getenv("EMULATE_INTEL"): self.device, self.suffix, self.tensor_cores = "INTEL", "INTEL", IntelRenderer.tensor_cores
 
   def render(self, name:str, uops:List[UOp]) -> str:
     lops = [(u.op, u.dtype, [uops.index(v) for v in u.src], u.arg) for u in uops]