mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-02-16 09:37:11 -05:00
Intel XMX Tensor Core Support (#5622)
* fixed xmx demo
* i think i'm invoking the DPAS but it's slow
* compiler build arg to stop register spilling, indicated where to fix flop counter
* don't mind this
* do NOT mind me
* do not mind me
* do not view
* i will add bf16 later
* in process of figuring out tc fields
* we figured out the fields!!!
* added check for cl device vendor, added separate IntelRenderer
* remove tc thread_local_aliases
* cleaning debris before draft pr
* edits for linter
* deduping and checking device extensions
* i will find more line reductions in other places
* before merge upstream
* double grf size in compiler to fix register spilling (bandaid), device checking changes
* tc python emulation
* fixed emulation
* tests for emulated intel tensor core
* TC=0, 1 working on upstream, fixed perf
* test
* debris
* check for specialized cl device when we canonicalize device
* bf16 support, tc=3 test added
* address tests
* revert half2 loads on intel tc, cleanup
* linter
* fold_expanded revert
* lint, whitespace fix
* cuda bf16 (only one with bf16) is skipped in test tensor cores, so i will skip for intel bf16 too
* make line shorter, no need for noqa E501
* removed device intel
* fix python emulation

---------

Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
This commit is contained in:
@@ -2,8 +2,8 @@ from __future__ import annotations
|
||||
from typing import Tuple, Optional, List, cast
|
||||
import ctypes, functools, hashlib
|
||||
from tinygrad.runtime.autogen import opencl as cl
|
||||
from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
|
||||
from tinygrad.renderer.cstyle import OpenCLRenderer
|
||||
from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG, getenv
|
||||
from tinygrad.renderer.cstyle import OpenCLRenderer, IntelRenderer
|
||||
from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler, CompileError
|
||||
|
||||
# see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
|
||||
@@ -95,9 +95,11 @@ class CLDevice(Compiled):
|
||||
self.context = checked(cl.clCreateContext(None, 1, self.device_id, cl.clCreateContext.argtypes[3](), None, status := ctypes.c_int32()), status)
|
||||
self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, status), status)
|
||||
self.pending_copyin: List[memoryview] = []
|
||||
self.device_exts = (cl.clGetDeviceInfo(self.device_id, cl.CL_DEVICE_EXTENSIONS, 4096, ctypes.byref(buf := ctypes.create_string_buffer(4096)), ctypes.byref(total := ctypes.c_size_t())), ctypes.string_at(buf, size=total.value).decode())[1] # noqa: E501
|
||||
|
||||
compile_key = hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()
|
||||
super().__init__(device, CLAllocator(self), OpenCLRenderer(), CLCompiler(self, f"compile_cl_{compile_key}"), functools.partial(CLProgram, self))
|
||||
renderer = IntelRenderer() if "cl_intel_subgroup_matrix_multiply_accumulate" in self.device_exts and getenv("INTEL") else OpenCLRenderer()
|
||||
super().__init__(device, CLAllocator(self), renderer, CLCompiler(self, f"compile_cl_{compile_key}"), functools.partial(CLProgram, self))
|
||||
def synchronize(self):
|
||||
check(cl.clFinish(self.queue))
|
||||
self.pending_copyin.clear()
|
||||
|
||||
Reference in New Issue
Block a user