fix tests for NV (#13744)

* small fix

* min diff

* bfloat16 out
George Hotz
2025-12-18 13:20:21 -04:00
committed by GitHub
parent 77191fb744
commit fa40df972f
2 changed files with 15 additions and 3 deletions

View File

@@ -7,7 +7,7 @@ from tinygrad.tensor import _to_np_dtype
 from tinygrad.uop.ops import Ops
 from tinygrad.dtype import DType
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import AMX, AMD_LLVM, CPU_LLVM
+from tinygrad.helpers import AMX, AMD_LLVM, CPU_LLVM, Context
 from test.helpers import slow
 from tinygrad.engine.realize import CompiledRunner, get_program
 from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
@@ -49,13 +49,14 @@ def helper_tc_allclose(N:int, M:int, K:int, dtype_in:DType, dtype_out:DType, axi
   assert len([x for x in prg.p.uops[-1].arg.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included"
   prg.exec(bufs)
   if dtype_in == dtypes.half: tc_atol, tc_rtol = 1e-2, 1e-3
-  elif dtype_in == dtypes.bfloat16: tc_atol, tc_rtol = 1e-2, 1e-2
+  elif dtype_in == dtypes.bfloat16: tc_atol, tc_rtol = (1e-1, 2e-2) if dtype_out == dtypes.bfloat16 else (1e-2, 1e-2)
   else: tc_atol, tc_rtol = 5e-3, 1e-4
   c = bufs[0].numpy().reshape((M,N))
   np.testing.assert_allclose(c, np_a @ np_b, atol=tc_atol, rtol=tc_rtol)
 
 class TestTensorCores(unittest.TestCase):
   # TODO: don't skip bf16 for real device (METAL, AMD)
+  @Context(ALLOW_TF32=1)
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   def test_tensor_cores(self):
     for tc in Device[Device.DEFAULT].renderer.tensor_cores:
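
The loosened tolerances for a bfloat16 output follow from the format itself: bfloat16 keeps only 8 mantissa bits, so rounding the accumulator back to bf16 adds a relative error of roughly 2^-9 per element on top of the input rounding. A quick numpy sketch of that effect (bf16 emulated by truncating float32 bits; the 16x16x16 tile and standard-normal data are illustrative assumptions, not the helper's actual inputs):

import numpy as np

def round_to_bf16(x: np.ndarray) -> np.ndarray:
  # emulate bfloat16 by round-to-nearest-even truncation of a float32 to its top 16 bits
  b = x.astype(np.float32).view(np.uint32)
  b = (b + 0x7FFF + ((b >> 16) & 1)) & 0xFFFF0000
  return b.view(np.float32)

rng = np.random.default_rng(0)
a = round_to_bf16(rng.standard_normal((16, 16), dtype=np.float32))
b = round_to_bf16(rng.standard_normal((16, 16), dtype=np.float32))
acc = a @ b                           # float32 accumulation of bf16 inputs
out_bf16 = round_to_bf16(acc)         # result stored back as bfloat16, as in the new test case
print(np.abs(out_bf16 - acc).max())   # a few 1e-2 for values of this scale; atol=1e-1, rtol=2e-2 leaves headroom
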
@@ -63,6 +64,7 @@ class TestTensorCores(unittest.TestCase):
       # for AMX, tc.dims[2] == 1 so reduceop is None thus tensor_cores are not triggered
       helper_tc_allclose(tc.dims[0], tc.dims[1], 2 if AMX else tc.dims[2], tc.dtype_in, tc.dtype_out, axis=0, tc_opt=0)
 
+  @Context(ALLOW_TF32=1)
   @unittest.skipIf(Device.DEFAULT == "PYTHON", "not generated on EMULATED device")
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   def test_tensor_cores_codegen(self):
@@ -81,6 +83,7 @@ class TestTensorCores(unittest.TestCase):
       else:
         assert "__WMMA_" in prg.src
 
+  @Context(ALLOW_TF32=1)
   @unittest.skipIf((Device.DEFAULT == "AMD") or (Device.DEFAULT == "PYTHON" and Device.default.renderer.device == "AMD"), "broken for AMD")
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   def test_tensor_cores_padded(self):
@@ -98,6 +101,7 @@ class TestTensorCores(unittest.TestCase):
       if not is_dtype_supported(tc.dtype_in) or not is_dtype_supported(tc.dtype_out): continue
       helper_tc_allclose(tc.dims[0]+(pad:=1), tc.dims[1]+pad, tc.dims[2]+pad, tc.dtype_in, tc.dtype_out, tc_opt=2)
 
+  @Context(ALLOW_TF32=1)
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   def test_tensor_cores_padded_uops(self):
     for tc in Device[Device.DEFAULT].renderer.tensor_cores:
@@ -119,6 +123,7 @@ class TestTensorCores(unittest.TestCase):
       if not AMX: # AMX tc.dims[2] == 1
         helper_tc_ensure_uops_and_opts_count(tc.dims[0], tc.dims[1], tc.dims[2]//8, tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=False)
 
+  @Context(ALLOW_TF32=1)
   @unittest.skipIf(Device.DEFAULT == "PYTHON", "not generated on EMULATED device")
   @slow
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
@@ -149,6 +154,7 @@ class TestTensorCores(unittest.TestCase):
       if golden_result is None: golden_result = np.frombuffer(real_bufs[0].as_buffer(), _to_np_dtype(real_bufs[0].dtype))
       np.testing.assert_allclose(result, golden_result, atol=0.1, rtol=0.2)
 
+  @Context(ALLOW_TF32=1)
   @unittest.skipIf(Device.DEFAULT == "PYTHON", "slow on EMULATED device")
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   def test_tensor_cores_unroll_phi(self):
@@ -161,6 +167,7 @@ class TestTensorCores(unittest.TestCase):
       if u.op is Ops.WMMA:
         assert u.src[-1].src[0].op != Ops.STORE
 
+  @Context(ALLOW_TF32=1)
   @unittest.skipIf(Device.DEFAULT == "PYTHON", "slow on EMULATED device")
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   @unittest.skipIf(Device.DEFAULT in {"CPU"}, "CPU does not support using a different type for accumulation")
@@ -175,6 +182,7 @@ class TestTensorCores(unittest.TestCase):
         #assert u.src[-1].dtype == dtypes.float.vec(prod(tc.thread_local_sizes[2]))
         assert u.src[-1].src[0].op != Ops.STORE
 
+  @Context(ALLOW_TF32=1)
   @unittest.skipIf(Device.DEFAULT == "PYTHON", "slow on EMULATED device")
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   @unittest.skipIf(Device.DEFAULT in {"CPU"}, "CPU does not support using a different type for accumulation")
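
Every test above also gains `@Context(ALLOW_TF32=1)`, a tinygrad helper that temporarily overrides context variables for the duration of the decorated test (here, presumably letting the float32 tensor-core variants take the TF32 path on NV) and restores them afterwards. The general mechanism is a `contextlib.ContextDecorator`; a self-contained sketch with a plain dict standing in for tinygrad's context variables (`_settings`, `Ctx`, `run_test` are hypothetical names, not tinygrad's implementation):

from contextlib import ContextDecorator

_settings = {"ALLOW_TF32": 0}          # stand-in for tinygrad's global context variables

class Ctx(ContextDecorator):
  def __init__(self, **overrides): self.overrides, self.saved = overrides, {}
  def __enter__(self):
    for k, v in self.overrides.items(): self.saved[k], _settings[k] = _settings[k], v
    return self
  def __exit__(self, *exc):
    _settings.update(self.saved)       # always restore, even if the test raised
    return False

@Ctx(ALLOW_TF32=1)                     # same usage shape as @Context(ALLOW_TF32=1) above
def run_test(): assert _settings["ALLOW_TF32"] == 1

run_test()
assert _settings["ALLOW_TF32"] == 0    # back to the default once the test returns
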

View File

@@ -246,7 +246,9 @@ class NVProgram(HCQProgram):
       elif typ == 0x39: image[apply_image_offset+4:apply_image_offset+8] = struct.pack('<I', (self.lib_gpu.va_addr + rel_sym_offset) >> 32)
       else: raise RuntimeError(f"unknown NV reloc {typ}")
-    self.cbuf_0 = [0] * (cbuf0_size // 4)
+    # Minimum cbuf_0 size for driver params: Blackwell needs index 223 (224 entries), older GPUs need index 11 (12 entries)
+    min_cbuf0_entries = 224 if dev.iface.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A else 12
+    self.cbuf_0 = [0] * max(cbuf0_size // 4, min_cbuf0_entries)
     # Ensure device has enough local memory to run the program
     self.dev._ensure_has_local_memory(self.lcmem_usage)
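
The NV runtime change pads constant buffer 0 so that driver-parameter writes at fixed u32 indices (223 on Blackwell, 11 on older GPUs) can never land past the end of the list, even when the compiled kernel itself declares a smaller cbuf0. The same arithmetic as a standalone helper (`cbuf0_entries` is a hypothetical name for illustration):

def cbuf0_entries(cbuf0_size_bytes: int, blackwell: bool) -> int:
  # the list holds u32 slots; pad it up to the highest driver-param index + 1
  min_entries = 224 if blackwell else 12
  return max(cbuf0_size_bytes // 4, min_entries)

assert cbuf0_entries(0x160, blackwell=False) == 0x160 // 4   # kernel's own size already suffices
assert cbuf0_entries(32, blackwell=True) == 224              # padded so index 223 stays in bounds
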
@@ -503,6 +505,8 @@ class PCIIface(PCIIfaceBase):
   gpus:ClassVar[list[str]] = []
 
   def __init__(self, dev, dev_id):
+    # PCIIface's MAP_FIXED mmap will overwrite UVM allocations made by NVKIface, so don't try PCIIface if kernel driver was already used.
+    if NVKIface.root is not None: raise RuntimeError("Cannot use PCIIface after NVKIface has been initialized (would corrupt UVM memory)")
     super().__init__(dev, dev_id, vendor=0x10de, devices=[(0xff00, [0x2200, 0x2400, 0x2500, 0x2600, 0x2700, 0x2800, 0x2b00, 0x2c00, 0x2d00, 0x2f00])],
       bars=[0, 1], vram_bar=1, va_start=NVMemoryManager.va_allocator.base, va_size=NVMemoryManager.va_allocator.size)
     if not OSX: System.reserve_hugepages(64)