Intel XMX Tensor Core Support (#5622)

* fixed xmx demo * i think i'm invoking the DPAS but it's slow * compiler build arg to stop register spilling, indicated where to fix flop counter * don't mind this * do NOT mind me * do not mind me * do not view * i will add bf16 later * in process of figuring out tc fields * we figured out the fields!!! * added check for cl device vendor, added seperate IntelRenderer * remove tc thread_local_aliases * cleaning debris before draft pr * edits for linter * deduping and checking device extensions * i will find more line reductions in other places * before merge upstream * double grf size in compiler to fix register spilling (bandaid), device checking changes * tc python emulation * fixed emulation * tests for emulated intel tensor core * TC=0, 1 working on upstream, fixed perf * test * debris * check for specialized cl device when we canonicalize device * bf16 support, tc=3 test added * address tests * revert half2 loads on intel tc, cleanup * linter * fold_expanded revert * lint, whitespace fix * cuda bf16 (only one with bf16) is skipped in test tensor cores, so i will skip for intel bf16 too * make line shorter, no need for noqa E501 * removed device intel * fix python emulation --------- Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
2026-02-16 09:37:11 -05:00 · 2024-08-16 11:19:21 -05:00
parent f82ecd8802
commit 38fb1e14a2
8 changed files with 50 additions and 13 deletions
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -2,8 +2,8 @@ from __future__ import annotations
 from typing import Tuple, Optional, List, cast
 import ctypes, functools, hashlib
 from tinygrad.runtime.autogen import opencl as cl
-from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
-from tinygrad.renderer.cstyle import OpenCLRenderer
+from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG, getenv
+from tinygrad.renderer.cstyle import OpenCLRenderer, IntelRenderer
 from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler, CompileError

 # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
@@ -95,9 +95,11 @@ class CLDevice(Compiled):
    self.context = checked(cl.clCreateContext(None, 1, self.device_id, cl.clCreateContext.argtypes[3](), None, status := ctypes.c_int32()), status)
    self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, status), status)
    self.pending_copyin: List[memoryview] = []
+    self.device_exts = (cl.clGetDeviceInfo(self.device_id, cl.CL_DEVICE_EXTENSIONS, 4096, ctypes.byref(buf := ctypes.create_string_buffer(4096)), ctypes.byref(total := ctypes.c_size_t())), ctypes.string_at(buf, size=total.value).decode())[1]  # noqa: E501

    compile_key = hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()
-    super().__init__(device, CLAllocator(self), OpenCLRenderer(), CLCompiler(self, f"compile_cl_{compile_key}"), functools.partial(CLProgram, self))
+    renderer = IntelRenderer() if "cl_intel_subgroup_matrix_multiply_accumulate" in self.device_exts and getenv("INTEL") else OpenCLRenderer()
+    super().__init__(device, CLAllocator(self), renderer, CLCompiler(self, f"compile_cl_{compile_key}"), functools.partial(CLProgram, self))
  def synchronize(self):
    check(cl.clFinish(self.queue))
    self.pending_copyin.clear()