fix tests for NV (#13744)

* small fix

* min diff

* bfloat16 out
George Hotz
2025-12-18 13:20:21 -04:00
committed by GitHub
parent 77191fb744
commit fa40df972f
2 changed files with 15 additions and 3 deletions

View File

@@ -7,7 +7,7 @@ from tinygrad.tensor import _to_np_dtype
 from tinygrad.uop.ops import Ops
 from tinygrad.dtype import DType
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import AMX, AMD_LLVM, CPU_LLVM
+from tinygrad.helpers import AMX, AMD_LLVM, CPU_LLVM, Context
 from test.helpers import slow
 from tinygrad.engine.realize import CompiledRunner, get_program
 from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
@@ -49,13 +49,14 @@ def helper_tc_allclose(N:int, M:int, K:int, dtype_in:DType, dtype_out:DType, axi
   assert len([x for x in prg.p.uops[-1].arg.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included"
   prg.exec(bufs)
   if dtype_in == dtypes.half: tc_atol, tc_rtol = 1e-2, 1e-3
-  elif dtype_in == dtypes.bfloat16: tc_atol, tc_rtol = 1e-2, 1e-2
+  elif dtype_in == dtypes.bfloat16: tc_atol, tc_rtol = (1e-1, 2e-2) if dtype_out == dtypes.bfloat16 else (1e-2, 1e-2)
   else: tc_atol, tc_rtol = 5e-3, 1e-4
   c = bufs[0].numpy().reshape((M,N))
   np.testing.assert_allclose(c, np_a @ np_b, atol=tc_atol, rtol=tc_rtol)
 
 class TestTensorCores(unittest.TestCase):
   # TODO: don't skip bf16 for real device (METAL, AMD)
+  @Context(ALLOW_TF32=1)
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   def test_tensor_cores(self):
     for tc in Device[Device.DEFAULT].renderer.tensor_cores:
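
The loosened tolerances for a bfloat16 output follow from the format itself: bfloat16 keeps only 8 mantissa bits, so rounding the accumulator back to bf16 adds a relative error of roughly 2^-9 per element on top of the input rounding. A quick numpy sketch of that effect (bf16 emulated by truncating float32 bits; the 16x16x16 tile and standard-normal data are illustrative assumptions, not the helper's actual inputs):

import numpy as np

def round_to_bf16(x: np.ndarray) -> np.ndarray:
  # emulate bfloat16 by round-to-nearest-even truncation of a float32 to its top 16 bits
  b = x.astype(np.float32).view(np.uint32)
  b = (b + 0x7FFF + ((b >> 16) & 1)) & 0xFFFF0000
  return b.view(np.float32)

rng = np.random.default_rng(0)
a = round_to_bf16(rng.standard_normal((16, 16), dtype=np.float32))
b = round_to_bf16(rng.standard_normal((16, 16), dtype=np.float32))
acc = a @ b                           # float32 accumulation of bf16 inputs
out_bf16 = round_to_bf16(acc)         # result stored back as bfloat16, as in the new test case
print(np.abs(out_bf16 - acc).max())   # a few 1e-2 for values of this scale; atol=1e-1, rtol=2e-2 leaves headroom
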
@@ -63,6 +64,7 @@ class TestTensorCores(unittest.TestCase):
       # for AMX, tc.dims[2] == 1 so reduceop is None thus tensor_cores are not triggered
       helper_tc_allclose(tc.dims[0], tc.dims[1], 2 if AMX else tc.dims[2], tc.dtype_in, tc.dtype_out, axis=0, tc_opt=0)
 
+  @Context(ALLOW_TF32=1)
   @unittest.skipIf(Device.DEFAULT == "PYTHON", "not generated on EMULATED device")
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   def test_tensor_cores_codegen(self):
@@ -81,6 +83,7 @@ class TestTensorCores(unittest.TestCase):
       else:
         assert "__WMMA_" in prg.src
 
+  @Context(ALLOW_TF32=1)
   @unittest.skipIf((Device.DEFAULT == "AMD") or (Device.DEFAULT == "PYTHON" and Device.default.renderer.device == "AMD"), "broken for AMD")
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   def test_tensor_cores_padded(self):
@@ -98,6 +101,7 @@ class TestTensorCores(unittest.TestCase):
       if not is_dtype_supported(tc.dtype_in) or not is_dtype_supported(tc.dtype_out): continue
       helper_tc_allclose(tc.dims[0]+(pad:=1), tc.dims[1]+pad, tc.dims[2]+pad, tc.dtype_in, tc.dtype_out, tc_opt=2)
 
+  @Context(ALLOW_TF32=1)
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   def test_tensor_cores_padded_uops(self):
     for tc in Device[Device.DEFAULT].renderer.tensor_cores:
@@ -119,6 +123,7 @@ class TestTensorCores(unittest.TestCase):
       if not AMX: # AMX tc.dims[2] == 1
         helper_tc_ensure_uops_and_opts_count(tc.dims[0], tc.dims[1], tc.dims[2]//8, tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=False)
 
+  @Context(ALLOW_TF32=1)
   @unittest.skipIf(Device.DEFAULT == "PYTHON", "not generated on EMULATED device")
   @slow
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
@@ -149,6 +154,7 @@ class TestTensorCores(unittest.TestCase):
       if golden_result is None: golden_result = np.frombuffer(real_bufs[0].as_buffer(), _to_np_dtype(real_bufs[0].dtype))
       np.testing.assert_allclose(result, golden_result, atol=0.1, rtol=0.2)
 
+  @Context(ALLOW_TF32=1)
   @unittest.skipIf(Device.DEFAULT == "PYTHON", "slow on EMULATED device")
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   def test_tensor_cores_unroll_phi(self):
@@ -161,6 +167,7 @@ class TestTensorCores(unittest.TestCase):
       if u.op is Ops.WMMA:
         assert u.src[-1].src[0].op != Ops.STORE
 
+  @Context(ALLOW_TF32=1)
   @unittest.skipIf(Device.DEFAULT == "PYTHON", "slow on EMULATED device")
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   @unittest.skipIf(Device.DEFAULT in {"CPU"}, "CPU does not support using a different type for accumulation")
@@ -175,6 +182,7 @@ class TestTensorCores(unittest.TestCase):
         #assert u.src[-1].dtype == dtypes.float.vec(prod(tc.thread_local_sizes[2]))
         assert u.src[-1].src[0].op != Ops.STORE
 
+  @Context(ALLOW_TF32=1)
   @unittest.skipIf(Device.DEFAULT == "PYTHON", "slow on EMULATED device")
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   @unittest.skipIf(Device.DEFAULT in {"CPU"}, "CPU does not support using a different type for accumulation")
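
Every test above also gains `@Context(ALLOW_TF32=1)`, a tinygrad helper that temporarily overrides context variables for the duration of the decorated test (here, presumably letting the float32 tensor-core variants take the TF32 path on NV) and restores them afterwards. The general mechanism is a `contextlib.ContextDecorator`; a self-contained sketch with a plain dict standing in for tinygrad's context variables (`_settings`, `Ctx`, `run_test` are hypothetical names, not tinygrad's implementation):

from contextlib import ContextDecorator

_settings = {"ALLOW_TF32": 0}          # stand-in for tinygrad's global context variables

class Ctx(ContextDecorator):
  def __init__(self, **overrides): self.overrides, self.saved = overrides, {}
  def __enter__(self):
    for k, v in self.overrides.items(): self.saved[k], _settings[k] = _settings[k], v
    return self
  def __exit__(self, *exc):
    _settings.update(self.saved)       # always restore, even if the test raised
    return False

@Ctx(ALLOW_TF32=1)                     # same usage shape as @Context(ALLOW_TF32=1) above
def run_test(): assert _settings["ALLOW_TF32"] == 1

run_test()
assert _settings["ALLOW_TF32"] == 0    # back to the default once the test returns
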

View File

@@ -246,7 +246,9 @@ class NVProgram(HCQProgram):
       elif typ == 0x39: image[apply_image_offset+4:apply_image_offset+8] = struct.pack('<I', (self.lib_gpu.va_addr + rel_sym_offset) >> 32)
       else: raise RuntimeError(f"unknown NV reloc {typ}")
-    self.cbuf_0 = [0] * (cbuf0_size // 4)
+    # Minimum cbuf_0 size for driver params: Blackwell needs index 223 (224 entries), older GPUs need index 11 (12 entries)
+    min_cbuf0_entries = 224 if dev.iface.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A else 12
+    self.cbuf_0 = [0] * max(cbuf0_size // 4, min_cbuf0_entries)
     # Ensure device has enough local memory to run the program
     self.dev._ensure_has_local_memory(self.lcmem_usage)
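
The NV runtime change pads constant buffer 0 so that driver-parameter writes at fixed u32 indices (223 on Blackwell, 11 on older GPUs) can never land past the end of the list, even when the compiled kernel itself declares a smaller cbuf0. The same arithmetic as a standalone helper (`cbuf0_entries` is a hypothetical name for illustration):

def cbuf0_entries(cbuf0_size_bytes: int, blackwell: bool) -> int:
  # the list holds u32 slots; pad it up to the highest driver-param index + 1
  min_entries = 224 if blackwell else 12
  return max(cbuf0_size_bytes // 4, min_entries)

assert cbuf0_entries(0x160, blackwell=False) == 0x160 // 4   # kernel's own size already suffices
assert cbuf0_entries(32, blackwell=True) == 224              # padded so index 223 stays in bounds
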
@@ -503,6 +505,8 @@ class PCIIface(PCIIfaceBase):
   gpus:ClassVar[list[str]] = []
 
   def __init__(self, dev, dev_id):
+    # PCIIface's MAP_FIXED mmap will overwrite UVM allocations made by NVKIface, so don't try PCIIface if kernel driver was already used.
+    if NVKIface.root is not None: raise RuntimeError("Cannot use PCIIface after NVKIface has been initialized (would corrupt UVM memory)")
     super().__init__(dev, dev_id, vendor=0x10de, devices=[(0xff00, [0x2200, 0x2400, 0x2500, 0x2600, 0x2700, 0x2800, 0x2b00, 0x2c00, 0x2d00, 0x2f00])],
       bars=[0, 1], vram_bar=1, va_start=NVMemoryManager.va_allocator.base, va_size=NVMemoryManager.va_allocator.size)
     if not OSX: System.reserve_hugepages(64)