Mesa: freedreno (#12746)

* ir3 init

* got a program

* 1 + 1 works

* use isa_disasm instead of shader_disasm

* wip

* matmul works

* works on py3.14

* fix const loading

* skip QCOM failing tests

* cleanup

* args actually work

* add compile-only tests

* fix typo and install tinymesa

* IR3 NULL backend

* (float32) images work

* autogen fix

* fix compile only test

* typo

* mypy happy

* compile-only uses py3.14

* bump mesa

* unify qcom disassembler

* float16 works

* disasm shows in viz

* save a line

* add real del

* variable workgroup sizes

* simplify diff

* bump line count

* properly set wgsz

* regen mesa

* no preamble

* bump lines
Author: Christopher Milan
Date: 2025-12-08 11:02:08 -08:00
Committed by: GitHub
Parent: 947c6eefc3
Commit: 1c16b6e082
11 changed files with 2077 additions and 72 deletions
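The diff to test/test_ops.py below applies one pattern throughout: when the default device is the new IR3 NULL backend and EMULATE is unset, kernels are compiled but never executed, so every check that reads values back (.numpy() / .item()) is either gated behind "if not COMPILE_ONLY:" or skipped with @unittest.skipIf(COMPILE_ONLY, ...). A minimal sketch of that gate, using only names that appear in the diff; the test bodies are illustrative and not taken from the commit:

import unittest
from tinygrad import Tensor, Device
from tinygrad.helpers import EMULATE

# Compile-only: a NULL default device with no EMULATE target means kernels are
# built but never run, so results cannot be read back.
COMPILE_ONLY = Device.DEFAULT == "NULL" and not EMULATE

class GatingSketch(unittest.TestCase):
  @unittest.skipIf(COMPILE_ONLY, "test requires runtime")
  def test_needs_runtime(self):
    # .item() forces execution, which compile-only mode cannot provide
    self.assertEqual(Tensor([1, 2]).sum().item(), 3)

  def test_codegen_only(self):
    out = (Tensor.ones(8, 8) @ Tensor.ones(8, 8)).realize()  # codegen still runs
    if not COMPILE_ONLY:  # inline gate, as used for the argmax/argmin/topk checks
      self.assertEqual(out.numpy()[0, 0], 8.0)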


@@ -2,7 +2,7 @@ import time, math, unittest, functools, platform, warnings
 import numpy as np
 from typing import List, Callable
 import torch
-from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, CPU_LLVM, CPU_LVP, AMD_LLVM
+from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, CPU_LLVM, CPU_LVP, AMD_LLVM, EMULATE
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.device import is_dtype_supported
@@ -16,6 +16,7 @@ if CI:
 
 FORWARD_ONLY = getenv("FORWARD_ONLY", 0)
 PRINT_TENSORS = getenv("PRINT_TENSORS", 0)
+COMPILE_ONLY = Device.DEFAULT == "NULL" and not EMULATE
 
 def slow_test(test_func):
   return unittest.skipIf(getenv("SKIP_SLOW_TEST"), "Skipping slow test")(test_func)
@@ -38,6 +39,7 @@ def helper_test_op(shps, torch_fxn, tinygrad_fxn=None, atol=1e-6, rtol=1e-3, gra
   tinygrad_fp = time.monotonic() - st
 
   def compare(s, tinygrad_output, torch_output, atol, rtol):
+    if COMPILE_ONLY: return
     if PRINT_TENSORS: print(s, tinygrad_output, torch_output)
     try:
       assert tinygrad_output.shape == torch_output.shape, f"shape mismatch: tinygrad={tinygrad_output.shape} | torch={torch_output.shape}"
@@ -421,8 +423,9 @@ class TestOps(unittest.TestCase):
   def test_isinf(self):
     val = [float('-inf'), 0., float('inf'), float('nan'), 1.1]
     helper_test_op(None, torch.isinf, Tensor.isinf, vals=[val], forward_only=True)
-    np.testing.assert_equal(Tensor(val).isinf(detect_positive=True, detect_negative=False).numpy(), [False, False, True, False, False])
-    np.testing.assert_equal(Tensor(val).isinf(detect_positive=False, detect_negative=True).numpy(), [True, False, False, False, False])
+    if not COMPILE_ONLY:
+      np.testing.assert_equal(Tensor(val).isinf(detect_positive=True, detect_negative=False).numpy(), [False, False, True, False, False])
+      np.testing.assert_equal(Tensor(val).isinf(detect_positive=False, detect_negative=True).numpy(), [True, False, False, False, False])
 
   def test_isnan(self):
     helper_test_op(None, torch.isnan, Tensor.isnan, vals=[[float('-inf'), 0., float('inf'), float('nan'), 1.1]], forward_only=True)
@@ -594,7 +597,7 @@ class TestOps(unittest.TestCase):
     helper_test_op(None, lambda x: x//2, forward_only=True, vals=[[3, 4, 5]])
     helper_test_op(None, functools.partial(torch.div, rounding_mode="trunc"), Tensor.idiv, forward_only=True,
                    vals=[[-4, 7, 5, 4, -7, 8], [2, -3, 8, -2, 3, 5]])
-    if is_dtype_supported(dtypes.uint64):
+    if is_dtype_supported(dtypes.uint64) and not COMPILE_ONLY:
       x = Tensor(2**64 - 1, dtype=dtypes.uint64).idiv(1)
       np.testing.assert_equal(x.numpy(), 2**64 - 1)
 
@@ -679,6 +682,7 @@ class TestOps(unittest.TestCase):
     # float to power of int
     helper_test_op(None, lambda x: 0.7**x, vals=[[-2,-1,0,1,2,3]], forward_only=True)
 
+  @unittest.skipIf(COMPILE_ONLY, "test requires runtime")
   def test_pow_const_direct(self):
     # x ** c
     def get_tiny_gradient(x, c):
@@ -1088,8 +1092,9 @@ class TestOps(unittest.TestCase):
     # check if it returns the first index for multiple occurences
     helper_test_op(None, lambda x: x.argmax().type(torch.int32), lambda x: x.argmax(), forward_only=True, vals=[[2, 2]])
     helper_test_op(None, lambda x: x.argmax().type(torch.int32), lambda x: x.argmax(), forward_only=True, vals=[[1, 2, 2]])
-    np.testing.assert_equal(Tensor([2,2]).argmax().numpy(), 0)
-    np.testing.assert_equal(Tensor([1,2,2]).argmax().numpy(), 1)
+    if not COMPILE_ONLY:
+      np.testing.assert_equal(Tensor([2,2]).argmax().numpy(), 0)
+      np.testing.assert_equal(Tensor([1,2,2]).argmax().numpy(), 1)
     helper_test_op([(10,20)], lambda x: x.argmax().type(torch.int32), lambda x: x.argmax(), forward_only=True)
     helper_test_op([(10,20)], lambda x: x.argmax(0, False).type(torch.int32), lambda x: x.argmax(0, False), forward_only=True)
     helper_test_op([(10,20)], lambda x: x.argmax(1, False).type(torch.int32), lambda x: x.argmax(1, False), forward_only=True)
@@ -1107,8 +1112,9 @@ class TestOps(unittest.TestCase):
     # check if it returns the first index for multiple occurences
     helper_test_op(None, lambda x: x.argmin().type(torch.int32), lambda x: x.argmin(), forward_only=True, vals=[[2, 2]])
     helper_test_op(None, lambda x: x.argmin().type(torch.int32), lambda x: x.argmin(), forward_only=True, vals=[[3, 2, 2]])
-    np.testing.assert_equal(Tensor([2,2]).argmin().numpy(), 0)
-    np.testing.assert_equal(Tensor([3,2,2]).argmin().numpy(), 1)
+    if not COMPILE_ONLY:
+      np.testing.assert_equal(Tensor([2,2]).argmin().numpy(), 0)
+      np.testing.assert_equal(Tensor([3,2,2]).argmin().numpy(), 1)
     helper_test_op([(10,20)], lambda x: x.argmin().type(torch.int32), lambda x: x.argmin(), forward_only=True)
     helper_test_op([(10,20)], lambda x: x.argmin(0, False).type(torch.int32), lambda x: x.argmin(0, False), forward_only=True)
     helper_test_op([(10,20)], lambda x: x.argmin(1, False).type(torch.int32), lambda x: x.argmin(1, False), forward_only=True)
@@ -1156,12 +1162,13 @@ class TestOps(unittest.TestCase):
                      lambda x: x.topk(4, dim, largest, sorted_).indices.type(torch.int32),
                      lambda x: x.topk(4, dim, largest, sorted_)[1], forward_only=True)
     # repeated values
-    value, indices = Tensor([1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]).topk(3)
-    np.testing.assert_equal(value.numpy(), [1, 1, 1])
-    np.testing.assert_equal(indices.numpy(), [0, 1, 3])
-    value, indices = Tensor([1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]).topk(3, largest=False)
-    np.testing.assert_equal(value.numpy(), [0, 0, 0])
-    np.testing.assert_equal(indices.numpy(), [2, 4, 6])
+    if not COMPILE_ONLY:
+      value, indices = Tensor([1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]).topk(3)
+      np.testing.assert_equal(value.numpy(), [1, 1, 1])
+      np.testing.assert_equal(indices.numpy(), [0, 1, 3])
+      value, indices = Tensor([1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]).topk(3, largest=False)
+      np.testing.assert_equal(value.numpy(), [0, 0, 0])
+      np.testing.assert_equal(indices.numpy(), [2, 4, 6])
     self.helper_test_exception([(4)], lambda x: x.topk(5), expected=(RuntimeError, ValueError))
 
   @slow_test
@@ -1313,6 +1320,7 @@ class TestOps(unittest.TestCase):
     helper_test_op(None, lambda x,y: x.matmul(y), lambda x,y: x@y, vals=[np.eye(8).astype(np.float32), np.eye(8).astype(np.float32)])
   @unittest.skipIf(CI and Device.DEFAULT in ["NV", "CL", "CUDA"] or (Device.DEFAULT == "CPU" and CPU_LLVM) or IMAGE
                    or (Device.DEFAULT == "WEBGPU" and platform.system() == "Windows"), "not supported on these in CI/IMAGE")
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "not precise enough")
   def test_gemm_fp16(self):
     helper_test_op([(64,64), (64,64)], lambda x,y: x.half().matmul(y.half()), atol=5e-3, rtol=5e-3, grad_atol=5e-3, grad_rtol=5e-3)
   def test_gemm(self):
@@ -1723,6 +1731,7 @@ class TestOps(unittest.TestCase):
     helper_test_op([(7,5,10)], lambda x: x[1:5:2, 3, ::4])
     helper_test_op([(7,5,10)], lambda x: x[1:5:2, None, None, 3, None, ::4])
 
+  @unittest.skipIf(COMPILE_ONLY, "test requires runtime")
   def test_slice_negative_strides(self):
     # Torch doesn't support slicing with negative steps
     a = np.random.randn(10, 10, 10).astype(np.float32)
@@ -2752,6 +2761,7 @@ class TestOps(unittest.TestCase):
     n = Tensor([1, float("nan")]).max().numpy()
     assert math.isnan(n.item()), f"{n.item()} is not nan"
 
+  @unittest.skipIf(COMPILE_ONLY, "test requires runtime")
   def test_inf_where(self):
     x = Tensor.full((3, 3), float("inf"))
     n = (x < 0).where(x, 1).numpy()
@@ -3168,6 +3178,7 @@ class TestOps(unittest.TestCase):
 
   @unittest.skipIf((getenv("MOCKGPU") or Device.DEFAULT == "PYTHON"), "very slow on MOCKGPU because reduce does not fold")
   @unittest.skipIf(Device.DEFAULT == "WEBGPU", "webgpu runtime issue")
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "QCOM fails with: Resource deadlock avoided")
   def test_masked_select(self):
     helper_test_op([(32, 10)], lambda x: x.masked_select(x>0.5), lambda x: x.masked_select(x>0.5), forward_only=True)
     helper_test_op([(32, 10)], lambda x: x.masked_select(torch.tensor(True)), lambda x: x.masked_select(Tensor(True)), forward_only=True)
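For local reproduction, a hedged sketch of how this compile-only path might be driven. The NULL=1 device-selection environment variable is an assumption based on the usual tinygrad convention of naming the device to force Device.DEFAULT; it is not stated anywhere in this diff:

# Sketch only: assumes the default device can be forced to NULL via an env var
# named after the device (set before tinygrad is imported), and that EMULATE is
# unset so the COMPILE_ONLY gate defined in the diff above evaluates to True.
import os
os.environ["NULL"] = "1"

from tinygrad import Tensor, Device
assert Device.DEFAULT == "NULL"

# Kernels are scheduled and compiled for the target but never executed;
# value checks (.numpy() / .item()) are deliberately avoided here.
(Tensor.ones(4, 4) @ Tensor.ones(4, 4)).realize()

Under such an environment, running test/test_ops.py exercises codegen for every op while compare() returns early and the runtime-only tests are skipped.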