Intel XMX Tensor Core Support (#5622)

* fixed xmx demo

* i think i'm invoking the DPAS but it's slow

* compiler build arg to stop register spilling, indicated where to fix flop counter

* don't mind this

* do NOT mind me

* do not mind me

* do not view

* i will add bf16 later

* in process of figuring out tc fields

* we figured out the fields!!!

* added check for cl device vendor, added seperate IntelRenderer

* remove tc thread_local_aliases

* cleaning debris before draft pr

* edits for linter

* deduping and checking device extensions

* i will find more line reductions in other places

* before merge upstream

* double grf size in compiler to fix register spilling (bandaid), device checking changes

* tc python emulation

* fixed emulation

* tests for emulated intel tensor core

* TC=0, 1 working on upstream, fixed perf

* test

* debris

* check for specialized cl device when we canonicalize device

* bf16 support, tc=3 test added

* address tests

* revert half2 loads on intel tc, cleanup

* linter

* fold_expanded revert

* lint, whitespace fix

* cuda bf16 (only one with bf16) is skipped in test tensor cores, so i will skip for intel bf16 too

* make line shorter, no need for noqa E501

* removed device intel

* fix python emulation

---------

Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
This commit is contained in:
CaltropHungerton
2024-08-16 11:19:21 -05:00
committed by GitHub
parent f82ecd8802
commit 38fb1e14a2
8 changed files with 50 additions and 13 deletions

View File

@@ -2,8 +2,8 @@ from __future__ import annotations
from typing import Tuple, Optional, List, cast
import ctypes, functools, hashlib
from tinygrad.runtime.autogen import opencl as cl
from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
from tinygrad.renderer.cstyle import OpenCLRenderer
from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG, getenv
from tinygrad.renderer.cstyle import OpenCLRenderer, IntelRenderer
from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler, CompileError
# see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
@@ -95,9 +95,11 @@ class CLDevice(Compiled):
self.context = checked(cl.clCreateContext(None, 1, self.device_id, cl.clCreateContext.argtypes[3](), None, status := ctypes.c_int32()), status)
self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, status), status)
self.pending_copyin: List[memoryview] = []
self.device_exts = (cl.clGetDeviceInfo(self.device_id, cl.CL_DEVICE_EXTENSIONS, 4096, ctypes.byref(buf := ctypes.create_string_buffer(4096)), ctypes.byref(total := ctypes.c_size_t())), ctypes.string_at(buf, size=total.value).decode())[1] # noqa: E501
compile_key = hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()
super().__init__(device, CLAllocator(self), OpenCLRenderer(), CLCompiler(self, f"compile_cl_{compile_key}"), functools.partial(CLProgram, self))
renderer = IntelRenderer() if "cl_intel_subgroup_matrix_multiply_accumulate" in self.device_exts and getenv("INTEL") else OpenCLRenderer()
super().__init__(device, CLAllocator(self), renderer, CLCompiler(self, f"compile_cl_{compile_key}"), functools.partial(CLProgram, self))
def synchronize(self):
check(cl.clFinish(self.queue))
self.pending_copyin.clear()