Mesa: freedreno (#12746)

* ir3 init

* got a program

* 1 + 1 works

* use isa_disasm instead of shader_disasm

* wip

* matmul works

* works on py3.14

* fix const loading

* skip QCOM failing tests

* cleanup

* args actually work

* add compile-only tests

* fix typo and install tinymesa

* IR3 NULL backend

* (float32) images work

* autogen fix

* fix compile only test

* typo

* mypy happy

* compile-only uses py3.14

* bump mesa

* unify qcom disassembler

* float16 works

* disasm shows in viz

* save a line

* add real del

* variable workgroup sizes

* simplify diff

* bump line count

* properly set wgsz

* regen mesa

* no preamble

* bump lines
Author: Christopher Milan
Date: 2025-12-08 11:02:08 -08:00
Committed by: GitHub
Parent: 947c6eefc3
Commit: 1c16b6e082
11 changed files with 2077 additions and 72 deletions
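The diff to test/test_ops.py below applies one pattern throughout: when the default device is the new IR3 NULL backend and EMULATE is unset, kernels are compiled but never executed, so every check that reads values back (.numpy() / .item()) is either gated behind "if not COMPILE_ONLY:" or skipped with @unittest.skipIf(COMPILE_ONLY, ...). A minimal sketch of that gate, using only names that appear in the diff; the test bodies are illustrative and not taken from the commit:

import unittest
from tinygrad import Tensor, Device
from tinygrad.helpers import EMULATE

# Compile-only: a NULL default device with no EMULATE target means kernels are
# built but never run, so results cannot be read back.
COMPILE_ONLY = Device.DEFAULT == "NULL" and not EMULATE

class GatingSketch(unittest.TestCase):
  @unittest.skipIf(COMPILE_ONLY, "test requires runtime")
  def test_needs_runtime(self):
    # .item() forces execution, which compile-only mode cannot provide
    self.assertEqual(Tensor([1, 2]).sum().item(), 3)

  def test_codegen_only(self):
    out = (Tensor.ones(8, 8) @ Tensor.ones(8, 8)).realize()  # codegen still runs
    if not COMPILE_ONLY:  # inline gate, as used for the argmax/argmin/topk checks
      self.assertEqual(out.numpy()[0, 0], 8.0)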


@@ -2,7 +2,7 @@ import time, math, unittest, functools, platform, warnings
 import numpy as np
 from typing import List, Callable
 import torch
-from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, CPU_LLVM, CPU_LVP, AMD_LLVM
+from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, CPU_LLVM, CPU_LVP, AMD_LLVM, EMULATE
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.device import is_dtype_supported
@@ -16,6 +16,7 @@ if CI:
 
 FORWARD_ONLY = getenv("FORWARD_ONLY", 0)
 PRINT_TENSORS = getenv("PRINT_TENSORS", 0)
+COMPILE_ONLY = Device.DEFAULT == "NULL" and not EMULATE
 
 def slow_test(test_func):
   return unittest.skipIf(getenv("SKIP_SLOW_TEST"), "Skipping slow test")(test_func)
@@ -38,6 +39,7 @@ def helper_test_op(shps, torch_fxn, tinygrad_fxn=None, atol=1e-6, rtol=1e-3, gra
   tinygrad_fp = time.monotonic() - st
 
   def compare(s, tinygrad_output, torch_output, atol, rtol):
+    if COMPILE_ONLY: return
     if PRINT_TENSORS: print(s, tinygrad_output, torch_output)
     try:
       assert tinygrad_output.shape == torch_output.shape, f"shape mismatch: tinygrad={tinygrad_output.shape} | torch={torch_output.shape}"
@@ -421,8 +423,9 @@ class TestOps(unittest.TestCase):
   def test_isinf(self):
     val = [float('-inf'), 0., float('inf'), float('nan'), 1.1]
     helper_test_op(None, torch.isinf, Tensor.isinf, vals=[val], forward_only=True)
-    np.testing.assert_equal(Tensor(val).isinf(detect_positive=True, detect_negative=False).numpy(), [False, False, True, False, False])
-    np.testing.assert_equal(Tensor(val).isinf(detect_positive=False, detect_negative=True).numpy(), [True, False, False, False, False])
+    if not COMPILE_ONLY:
+      np.testing.assert_equal(Tensor(val).isinf(detect_positive=True, detect_negative=False).numpy(), [False, False, True, False, False])
+      np.testing.assert_equal(Tensor(val).isinf(detect_positive=False, detect_negative=True).numpy(), [True, False, False, False, False])
 
   def test_isnan(self):
     helper_test_op(None, torch.isnan, Tensor.isnan, vals=[[float('-inf'), 0., float('inf'), float('nan'), 1.1]], forward_only=True)
@@ -594,7 +597,7 @@ class TestOps(unittest.TestCase):
     helper_test_op(None, lambda x: x//2, forward_only=True, vals=[[3, 4, 5]])
     helper_test_op(None, functools.partial(torch.div, rounding_mode="trunc"), Tensor.idiv, forward_only=True,
                    vals=[[-4, 7, 5, 4, -7, 8], [2, -3, 8, -2, 3, 5]])
-    if is_dtype_supported(dtypes.uint64):
+    if is_dtype_supported(dtypes.uint64) and not COMPILE_ONLY:
       x = Tensor(2**64 - 1, dtype=dtypes.uint64).idiv(1)
       np.testing.assert_equal(x.numpy(), 2**64 - 1)
 
@@ -679,6 +682,7 @@ class TestOps(unittest.TestCase):
     # float to power of int
     helper_test_op(None, lambda x: 0.7**x, vals=[[-2,-1,0,1,2,3]], forward_only=True)
 
+  @unittest.skipIf(COMPILE_ONLY, "test requires runtime")
   def test_pow_const_direct(self):
     # x ** c
     def get_tiny_gradient(x, c):
@@ -1088,8 +1092,9 @@ class TestOps(unittest.TestCase):
     # check if it returns the first index for multiple occurences
     helper_test_op(None, lambda x: x.argmax().type(torch.int32), lambda x: x.argmax(), forward_only=True, vals=[[2, 2]])
     helper_test_op(None, lambda x: x.argmax().type(torch.int32), lambda x: x.argmax(), forward_only=True, vals=[[1, 2, 2]])
-    np.testing.assert_equal(Tensor([2,2]).argmax().numpy(), 0)
-    np.testing.assert_equal(Tensor([1,2,2]).argmax().numpy(), 1)
+    if not COMPILE_ONLY:
+      np.testing.assert_equal(Tensor([2,2]).argmax().numpy(), 0)
+      np.testing.assert_equal(Tensor([1,2,2]).argmax().numpy(), 1)
     helper_test_op([(10,20)], lambda x: x.argmax().type(torch.int32), lambda x: x.argmax(), forward_only=True)
     helper_test_op([(10,20)], lambda x: x.argmax(0, False).type(torch.int32), lambda x: x.argmax(0, False), forward_only=True)
     helper_test_op([(10,20)], lambda x: x.argmax(1, False).type(torch.int32), lambda x: x.argmax(1, False), forward_only=True)
@@ -1107,8 +1112,9 @@ class TestOps(unittest.TestCase):
     # check if it returns the first index for multiple occurences
     helper_test_op(None, lambda x: x.argmin().type(torch.int32), lambda x: x.argmin(), forward_only=True, vals=[[2, 2]])
     helper_test_op(None, lambda x: x.argmin().type(torch.int32), lambda x: x.argmin(), forward_only=True, vals=[[3, 2, 2]])
-    np.testing.assert_equal(Tensor([2,2]).argmin().numpy(), 0)
-    np.testing.assert_equal(Tensor([3,2,2]).argmin().numpy(), 1)
+    if not COMPILE_ONLY:
+      np.testing.assert_equal(Tensor([2,2]).argmin().numpy(), 0)
+      np.testing.assert_equal(Tensor([3,2,2]).argmin().numpy(), 1)
     helper_test_op([(10,20)], lambda x: x.argmin().type(torch.int32), lambda x: x.argmin(), forward_only=True)
     helper_test_op([(10,20)], lambda x: x.argmin(0, False).type(torch.int32), lambda x: x.argmin(0, False), forward_only=True)
     helper_test_op([(10,20)], lambda x: x.argmin(1, False).type(torch.int32), lambda x: x.argmin(1, False), forward_only=True)
@@ -1156,12 +1162,13 @@ class TestOps(unittest.TestCase):
                      lambda x: x.topk(4, dim, largest, sorted_).indices.type(torch.int32),
                      lambda x: x.topk(4, dim, largest, sorted_)[1], forward_only=True)
     # repeated values
-    value, indices = Tensor([1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]).topk(3)
-    np.testing.assert_equal(value.numpy(), [1, 1, 1])
-    np.testing.assert_equal(indices.numpy(), [0, 1, 3])
-    value, indices = Tensor([1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]).topk(3, largest=False)
-    np.testing.assert_equal(value.numpy(), [0, 0, 0])
-    np.testing.assert_equal(indices.numpy(), [2, 4, 6])
+    if not COMPILE_ONLY:
+      value, indices = Tensor([1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]).topk(3)
+      np.testing.assert_equal(value.numpy(), [1, 1, 1])
+      np.testing.assert_equal(indices.numpy(), [0, 1, 3])
+      value, indices = Tensor([1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]).topk(3, largest=False)
+      np.testing.assert_equal(value.numpy(), [0, 0, 0])
+      np.testing.assert_equal(indices.numpy(), [2, 4, 6])
     self.helper_test_exception([(4)], lambda x: x.topk(5), expected=(RuntimeError, ValueError))
 
   @slow_test
@@ -1313,6 +1320,7 @@ class TestOps(unittest.TestCase):
     helper_test_op(None, lambda x,y: x.matmul(y), lambda x,y: x@y, vals=[np.eye(8).astype(np.float32), np.eye(8).astype(np.float32)])
   @unittest.skipIf(CI and Device.DEFAULT in ["NV", "CL", "CUDA"] or (Device.DEFAULT == "CPU" and CPU_LLVM) or IMAGE
                    or (Device.DEFAULT == "WEBGPU" and platform.system() == "Windows"), "not supported on these in CI/IMAGE")
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "not precise enough")
   def test_gemm_fp16(self):
     helper_test_op([(64,64), (64,64)], lambda x,y: x.half().matmul(y.half()), atol=5e-3, rtol=5e-3, grad_atol=5e-3, grad_rtol=5e-3)
   def test_gemm(self):
@@ -1723,6 +1731,7 @@ class TestOps(unittest.TestCase):
     helper_test_op([(7,5,10)], lambda x: x[1:5:2, 3, ::4])
     helper_test_op([(7,5,10)], lambda x: x[1:5:2, None, None, 3, None, ::4])
 
+  @unittest.skipIf(COMPILE_ONLY, "test requires runtime")
   def test_slice_negative_strides(self):
     # Torch doesn't support slicing with negative steps
     a = np.random.randn(10, 10, 10).astype(np.float32)
@@ -2752,6 +2761,7 @@ class TestOps(unittest.TestCase):
     n = Tensor([1, float("nan")]).max().numpy()
     assert math.isnan(n.item()), f"{n.item()} is not nan"
 
+  @unittest.skipIf(COMPILE_ONLY, "test requires runtime")
   def test_inf_where(self):
     x = Tensor.full((3, 3), float("inf"))
     n = (x < 0).where(x, 1).numpy()
@@ -3168,6 +3178,7 @@ class TestOps(unittest.TestCase):
 
   @unittest.skipIf((getenv("MOCKGPU") or Device.DEFAULT == "PYTHON"), "very slow on MOCKGPU because reduce does not fold")
   @unittest.skipIf(Device.DEFAULT == "WEBGPU", "webgpu runtime issue")
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "QCOM fails with: Resource deadlock avoided")
   def test_masked_select(self):
     helper_test_op([(32, 10)], lambda x: x.masked_select(x>0.5), lambda x: x.masked_select(x>0.5), forward_only=True)
     helper_test_op([(32, 10)], lambda x: x.masked_select(torch.tensor(True)), lambda x: x.masked_select(Tensor(True)), forward_only=True)
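For local reproduction, a hedged sketch of how this compile-only path might be driven. The NULL=1 device-selection environment variable is an assumption based on the usual tinygrad convention of naming the device to force Device.DEFAULT; it is not stated anywhere in this diff:

# Sketch only: assumes the default device can be forced to NULL via an env var
# named after the device (set before tinygrad is imported), and that EMULATE is
# unset so the COMPILE_ONLY gate defined in the diff above evaluates to True.
import os
os.environ["NULL"] = "1"

from tinygrad import Tensor, Device
assert Device.DEFAULT == "NULL"

# Kernels are scheduled and compiled for the target but never executed;
# value checks (.numpy() / .item()) are deliberately avoided here.
(Tensor.ones(4, 4) @ Tensor.ones(4, 4)).realize()

Under such an environment, running test/test_ops.py exercises codegen for every op while compare() returns early and the runtime-only tests are skipped.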