mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-10 07:28:15 -05:00
@@ -7,7 +7,7 @@ from tinygrad.tensor import _to_np_dtype
|
||||
from tinygrad.uop.ops import Ops
|
||||
from tinygrad.dtype import DType
|
||||
from tinygrad.device import is_dtype_supported
|
||||
from tinygrad.helpers import AMX, AMD_LLVM, CPU_LLVM
|
||||
from tinygrad.helpers import AMX, AMD_LLVM, CPU_LLVM, Context
|
||||
from test.helpers import slow
|
||||
from tinygrad.engine.realize import CompiledRunner, get_program
|
||||
from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
|
||||
@@ -49,13 +49,14 @@ def helper_tc_allclose(N:int, M:int, K:int, dtype_in:DType, dtype_out:DType, axi
|
||||
assert len([x for x in prg.p.uops[-1].arg.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included"
|
||||
prg.exec(bufs)
|
||||
if dtype_in == dtypes.half: tc_atol, tc_rtol = 1e-2, 1e-3
|
||||
elif dtype_in == dtypes.bfloat16: tc_atol, tc_rtol = 1e-2, 1e-2
|
||||
elif dtype_in == dtypes.bfloat16: tc_atol, tc_rtol = (1e-1, 2e-2) if dtype_out == dtypes.bfloat16 else (1e-2, 1e-2)
|
||||
else: tc_atol, tc_rtol = 5e-3, 1e-4
|
||||
c = bufs[0].numpy().reshape((M,N))
|
||||
np.testing.assert_allclose(c, np_a @ np_b, atol=tc_atol, rtol=tc_rtol)
|
||||
|
||||
class TestTensorCores(unittest.TestCase):
|
||||
# TODO: don't skip bf16 for real device (METAL, AMD)
|
||||
@Context(ALLOW_TF32=1)
|
||||
@unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
|
||||
def test_tensor_cores(self):
|
||||
for tc in Device[Device.DEFAULT].renderer.tensor_cores:
|
||||
@@ -63,6 +64,7 @@ class TestTensorCores(unittest.TestCase):
|
||||
# for AMX, tc.dims[2] == 1 so reduceop is None thus tensor_cores are not triggered
|
||||
helper_tc_allclose(tc.dims[0], tc.dims[1], 2 if AMX else tc.dims[2], tc.dtype_in, tc.dtype_out, axis=0, tc_opt=0)
|
||||
|
||||
@Context(ALLOW_TF32=1)
|
||||
@unittest.skipIf(Device.DEFAULT == "PYTHON", "not generated on EMULATED device")
|
||||
@unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
|
||||
def test_tensor_cores_codegen(self):
|
||||
@@ -81,6 +83,7 @@ class TestTensorCores(unittest.TestCase):
|
||||
else:
|
||||
assert "__WMMA_" in prg.src
|
||||
|
||||
@Context(ALLOW_TF32=1)
|
||||
@unittest.skipIf((Device.DEFAULT == "AMD") or (Device.DEFAULT == "PYTHON" and Device.default.renderer.device == "AMD"), "broken for AMD")
|
||||
@unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
|
||||
def test_tensor_cores_padded(self):
|
||||
@@ -98,6 +101,7 @@ class TestTensorCores(unittest.TestCase):
|
||||
if not is_dtype_supported(tc.dtype_in) or not is_dtype_supported(tc.dtype_out): continue
|
||||
helper_tc_allclose(tc.dims[0]+(pad:=1), tc.dims[1]+pad, tc.dims[2]+pad, tc.dtype_in, tc.dtype_out, tc_opt=2)
|
||||
|
||||
@Context(ALLOW_TF32=1)
|
||||
@unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
|
||||
def test_tensor_cores_padded_uops(self):
|
||||
for tc in Device[Device.DEFAULT].renderer.tensor_cores:
|
||||
@@ -119,6 +123,7 @@ class TestTensorCores(unittest.TestCase):
|
||||
if not AMX: # AMX tc.dims[2] == 1
|
||||
helper_tc_ensure_uops_and_opts_count(tc.dims[0], tc.dims[1], tc.dims[2]//8, tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=False)
|
||||
|
||||
@Context(ALLOW_TF32=1)
|
||||
@unittest.skipIf(Device.DEFAULT == "PYTHON", "not generated on EMULATED device")
|
||||
@slow
|
||||
@unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
|
||||
@@ -149,6 +154,7 @@ class TestTensorCores(unittest.TestCase):
|
||||
if golden_result is None: golden_result = np.frombuffer(real_bufs[0].as_buffer(), _to_np_dtype(real_bufs[0].dtype))
|
||||
np.testing.assert_allclose(result, golden_result, atol=0.1, rtol=0.2)
|
||||
|
||||
@Context(ALLOW_TF32=1)
|
||||
@unittest.skipIf(Device.DEFAULT == "PYTHON", "slow on EMULATED device")
|
||||
@unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
|
||||
def test_tensor_cores_unroll_phi(self):
|
||||
@@ -161,6 +167,7 @@ class TestTensorCores(unittest.TestCase):
|
||||
if u.op is Ops.WMMA:
|
||||
assert u.src[-1].src[0].op != Ops.STORE
|
||||
|
||||
@Context(ALLOW_TF32=1)
|
||||
@unittest.skipIf(Device.DEFAULT == "PYTHON", "slow on EMULATED device")
|
||||
@unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
|
||||
@unittest.skipIf(Device.DEFAULT in {"CPU"}, "CPU does not support using a different type for accumulation")
|
||||
@@ -175,6 +182,7 @@ class TestTensorCores(unittest.TestCase):
|
||||
#assert u.src[-1].dtype == dtypes.float.vec(prod(tc.thread_local_sizes[2]))
|
||||
assert u.src[-1].src[0].op != Ops.STORE
|
||||
|
||||
@Context(ALLOW_TF32=1)
|
||||
@unittest.skipIf(Device.DEFAULT == "PYTHON", "slow on EMULATED device")
|
||||
@unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
|
||||
@unittest.skipIf(Device.DEFAULT in {"CPU"}, "CPU does not support using a different type for accumulation")
|
||||
|
||||
@@ -246,7 +246,9 @@ class NVProgram(HCQProgram):
|
||||
elif typ == 0x39: image[apply_image_offset+4:apply_image_offset+8] = struct.pack('<I', (self.lib_gpu.va_addr + rel_sym_offset) >> 32)
|
||||
else: raise RuntimeError(f"unknown NV reloc {typ}")
|
||||
|
||||
self.cbuf_0 = [0] * (cbuf0_size // 4)
|
||||
# Minimum cbuf_0 size for driver params: Blackwell needs index 223 (224 entries), older GPUs need index 11 (12 entries)
|
||||
min_cbuf0_entries = 224 if dev.iface.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A else 12
|
||||
self.cbuf_0 = [0] * max(cbuf0_size // 4, min_cbuf0_entries)
|
||||
|
||||
# Ensure device has enough local memory to run the program
|
||||
self.dev._ensure_has_local_memory(self.lcmem_usage)
|
||||
@@ -503,6 +505,8 @@ class PCIIface(PCIIfaceBase):
|
||||
gpus:ClassVar[list[str]] = []
|
||||
|
||||
def __init__(self, dev, dev_id):
|
||||
# PCIIface's MAP_FIXED mmap will overwrite UVM allocations made by NVKIface, so don't try PCIIface if kernel driver was already used.
|
||||
if NVKIface.root is not None: raise RuntimeError("Cannot use PCIIface after NVKIface has been initialized (would corrupt UVM memory)")
|
||||
super().__init__(dev, dev_id, vendor=0x10de, devices=[(0xff00, [0x2200, 0x2400, 0x2500, 0x2600, 0x2700, 0x2800, 0x2b00, 0x2c00, 0x2d00, 0x2f00])],
|
||||
bars=[0, 1], vram_bar=1, va_start=NVMemoryManager.va_allocator.base, va_size=NVMemoryManager.va_allocator.size)
|
||||
if not OSX: System.reserve_hugepages(64)
|
||||
|
||||
Reference in New Issue
Block a user