mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-08 22:48:25 -05:00
Mesa: freedreno (#12746)
* ir3 init * got a program * 1 + 1 works * use isa_disasm instead of shader_disasm * wip * matmul works * works on py3.14 * fix const loading * skip QCOM failing tests * cleanup * args actually work * add compile-only tests * fix typo and install tinymesa * IR3 NULL backend * (float32) images work * autogen fix * fix compile only test * typo * mypy happy * compile-only uses py3.14 * bump mesa * unify qcom disassembler * float16 works * disasm shows in viz * save a line * add real del * variable workgroup sizes * simplify diff * bump line count * properly set wgsz * regen mesa * no preamble * bump lines
This commit is contained in:
committed by
GitHub
parent
947c6eefc3
commit
1c16b6e082
2
.github/actions/setup-tinygrad/action.yml
vendored
2
.github/actions/setup-tinygrad/action.yml
vendored
@@ -298,7 +298,7 @@ runs:
|
||||
- name: Install mesa (linux)
|
||||
if: inputs.mesa == 'true' && runner.os == 'Linux'
|
||||
shell: bash
|
||||
run: sudo curl -fL https://github.com/sirhcm/tinymesa/releases/download/tinymesa-32dc66c/libtinymesa_cpu-mesa-25.2.4-linux-amd64.so -o /usr/lib/libtinymesa_cpu.so
|
||||
run: sudo curl -fL https://github.com/sirhcm/tinymesa/releases/download/v1/libtinymesa_cpu-mesa-25.2.7-linux-amd64.so -o /usr/lib/libtinymesa_cpu.so
|
||||
- name: Install mesa (macOS)
|
||||
if: inputs.mesa == 'true' && runner.os == 'macOS'
|
||||
shell: bash
|
||||
|
||||
33
.github/workflows/test.yml
vendored
33
.github/workflows/test.yml
vendored
@@ -289,8 +289,8 @@ jobs:
|
||||
python extra/optimization/extract_dataset.py
|
||||
gzip -c /tmp/sops > extra/datasets/sops.gz
|
||||
#DEBUG=1 MIN_ASTS=1 python extra/optimization/get_action_space.py
|
||||
- name: Repo line count < 19000 lines
|
||||
run: MAX_LINE_COUNT=19000 python sz.py
|
||||
- name: Repo line count < 19150 lines
|
||||
run: MAX_LINE_COUNT=19150 python sz.py
|
||||
|
||||
spec:
|
||||
strategy:
|
||||
@@ -972,3 +972,32 @@ jobs:
|
||||
run: |
|
||||
python -c "from tinygrad import Device; assert Device.DEFAULT == {'LLVM':'CPU'}.get(x:='${{ matrix.backend }}'.upper(), x), Device.DEFAULT"
|
||||
python -m pytest -n=auto test/test_tiny.py test/test_ops.py --durations=20
|
||||
|
||||
# ****** Compile-only Tests ******
|
||||
|
||||
compiletests:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
backend: [ir3]
|
||||
name: Compile-only (${{ matrix.backend }})
|
||||
runs-on: ubuntu-24.04
|
||||
timeout-minutes: 15
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v4
|
||||
- name: Setup Environment
|
||||
uses: ./.github/actions/setup-tinygrad
|
||||
with:
|
||||
key: compile-${{ matrix.backend }}
|
||||
deps: testing_minimal
|
||||
mesa: ${{ matrix.backend == 'ir3' && 'true' }}
|
||||
python-version: '3.14'
|
||||
- name: Set env
|
||||
shell: bash
|
||||
run: printf "NULL=1\n${{ matrix.backend == 'ir3' && 'NULL_IR3=1' }}" >> $GITHUB_ENV
|
||||
- name: Run test_ops
|
||||
shell: bash
|
||||
run: |
|
||||
python -c "from tinygrad import Device; assert Device.DEFAULT == 'NULL'"
|
||||
python -m pytest -n=auto test/test_ops.py --durations=20
|
||||
|
||||
@@ -2,7 +2,7 @@ import time, math, unittest, functools, platform, warnings
|
||||
import numpy as np
|
||||
from typing import List, Callable
|
||||
import torch
|
||||
from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, CPU_LLVM, CPU_LVP, AMD_LLVM
|
||||
from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, CPU_LLVM, CPU_LVP, AMD_LLVM, EMULATE
|
||||
from tinygrad import Tensor, Device, dtypes
|
||||
from tinygrad.tensor import _to_np_dtype
|
||||
from tinygrad.device import is_dtype_supported
|
||||
@@ -16,6 +16,7 @@ if CI:
|
||||
|
||||
FORWARD_ONLY = getenv("FORWARD_ONLY", 0)
|
||||
PRINT_TENSORS = getenv("PRINT_TENSORS", 0)
|
||||
COMPILE_ONLY = Device.DEFAULT == "NULL" and not EMULATE
|
||||
|
||||
def slow_test(test_func):
|
||||
return unittest.skipIf(getenv("SKIP_SLOW_TEST"), "Skipping slow test")(test_func)
|
||||
@@ -38,6 +39,7 @@ def helper_test_op(shps, torch_fxn, tinygrad_fxn=None, atol=1e-6, rtol=1e-3, gra
|
||||
tinygrad_fp = time.monotonic() - st
|
||||
|
||||
def compare(s, tinygrad_output, torch_output, atol, rtol):
|
||||
if COMPILE_ONLY: return
|
||||
if PRINT_TENSORS: print(s, tinygrad_output, torch_output)
|
||||
try:
|
||||
assert tinygrad_output.shape == torch_output.shape, f"shape mismatch: tinygrad={tinygrad_output.shape} | torch={torch_output.shape}"
|
||||
@@ -421,6 +423,7 @@ class TestOps(unittest.TestCase):
|
||||
def test_isinf(self):
|
||||
val = [float('-inf'), 0., float('inf'), float('nan'), 1.1]
|
||||
helper_test_op(None, torch.isinf, Tensor.isinf, vals=[val], forward_only=True)
|
||||
if not COMPILE_ONLY:
|
||||
np.testing.assert_equal(Tensor(val).isinf(detect_positive=True, detect_negative=False).numpy(), [False, False, True, False, False])
|
||||
np.testing.assert_equal(Tensor(val).isinf(detect_positive=False, detect_negative=True).numpy(), [True, False, False, False, False])
|
||||
|
||||
@@ -594,7 +597,7 @@ class TestOps(unittest.TestCase):
|
||||
helper_test_op(None, lambda x: x//2, forward_only=True, vals=[[3, 4, 5]])
|
||||
helper_test_op(None, functools.partial(torch.div, rounding_mode="trunc"), Tensor.idiv, forward_only=True,
|
||||
vals=[[-4, 7, 5, 4, -7, 8], [2, -3, 8, -2, 3, 5]])
|
||||
if is_dtype_supported(dtypes.uint64):
|
||||
if is_dtype_supported(dtypes.uint64) and not COMPILE_ONLY:
|
||||
x = Tensor(2**64 - 1, dtype=dtypes.uint64).idiv(1)
|
||||
np.testing.assert_equal(x.numpy(), 2**64 - 1)
|
||||
|
||||
@@ -679,6 +682,7 @@ class TestOps(unittest.TestCase):
|
||||
# float to power of int
|
||||
helper_test_op(None, lambda x: 0.7**x, vals=[[-2,-1,0,1,2,3]], forward_only=True)
|
||||
|
||||
@unittest.skipIf(COMPILE_ONLY, "test requires runtime")
|
||||
def test_pow_const_direct(self):
|
||||
# x ** c
|
||||
def get_tiny_gradient(x, c):
|
||||
@@ -1088,6 +1092,7 @@ class TestOps(unittest.TestCase):
|
||||
# check if it returns the first index for multiple occurences
|
||||
helper_test_op(None, lambda x: x.argmax().type(torch.int32), lambda x: x.argmax(), forward_only=True, vals=[[2, 2]])
|
||||
helper_test_op(None, lambda x: x.argmax().type(torch.int32), lambda x: x.argmax(), forward_only=True, vals=[[1, 2, 2]])
|
||||
if not COMPILE_ONLY:
|
||||
np.testing.assert_equal(Tensor([2,2]).argmax().numpy(), 0)
|
||||
np.testing.assert_equal(Tensor([1,2,2]).argmax().numpy(), 1)
|
||||
helper_test_op([(10,20)], lambda x: x.argmax().type(torch.int32), lambda x: x.argmax(), forward_only=True)
|
||||
@@ -1107,6 +1112,7 @@ class TestOps(unittest.TestCase):
|
||||
# check if it returns the first index for multiple occurences
|
||||
helper_test_op(None, lambda x: x.argmin().type(torch.int32), lambda x: x.argmin(), forward_only=True, vals=[[2, 2]])
|
||||
helper_test_op(None, lambda x: x.argmin().type(torch.int32), lambda x: x.argmin(), forward_only=True, vals=[[3, 2, 2]])
|
||||
if not COMPILE_ONLY:
|
||||
np.testing.assert_equal(Tensor([2,2]).argmin().numpy(), 0)
|
||||
np.testing.assert_equal(Tensor([3,2,2]).argmin().numpy(), 1)
|
||||
helper_test_op([(10,20)], lambda x: x.argmin().type(torch.int32), lambda x: x.argmin(), forward_only=True)
|
||||
@@ -1156,6 +1162,7 @@ class TestOps(unittest.TestCase):
|
||||
lambda x: x.topk(4, dim, largest, sorted_).indices.type(torch.int32),
|
||||
lambda x: x.topk(4, dim, largest, sorted_)[1], forward_only=True)
|
||||
# repeated values
|
||||
if not COMPILE_ONLY:
|
||||
value, indices = Tensor([1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]).topk(3)
|
||||
np.testing.assert_equal(value.numpy(), [1, 1, 1])
|
||||
np.testing.assert_equal(indices.numpy(), [0, 1, 3])
|
||||
@@ -1313,6 +1320,7 @@ class TestOps(unittest.TestCase):
|
||||
helper_test_op(None, lambda x,y: x.matmul(y), lambda x,y: x@y, vals=[np.eye(8).astype(np.float32), np.eye(8).astype(np.float32)])
|
||||
@unittest.skipIf(CI and Device.DEFAULT in ["NV", "CL", "CUDA"] or (Device.DEFAULT == "CPU" and CPU_LLVM) or IMAGE
|
||||
or (Device.DEFAULT == "WEBGPU" and platform.system() == "Windows"), "not supported on these in CI/IMAGE")
|
||||
@unittest.skipIf(Device.DEFAULT == "QCOM", "not precise enough")
|
||||
def test_gemm_fp16(self):
|
||||
helper_test_op([(64,64), (64,64)], lambda x,y: x.half().matmul(y.half()), atol=5e-3, rtol=5e-3, grad_atol=5e-3, grad_rtol=5e-3)
|
||||
def test_gemm(self):
|
||||
@@ -1723,6 +1731,7 @@ class TestOps(unittest.TestCase):
|
||||
helper_test_op([(7,5,10)], lambda x: x[1:5:2, 3, ::4])
|
||||
helper_test_op([(7,5,10)], lambda x: x[1:5:2, None, None, 3, None, ::4])
|
||||
|
||||
@unittest.skipIf(COMPILE_ONLY, "test requires runtime")
|
||||
def test_slice_negative_strides(self):
|
||||
# Torch doesn't support slicing with negative steps
|
||||
a = np.random.randn(10, 10, 10).astype(np.float32)
|
||||
@@ -2752,6 +2761,7 @@ class TestOps(unittest.TestCase):
|
||||
n = Tensor([1, float("nan")]).max().numpy()
|
||||
assert math.isnan(n.item()), f"{n.item()} is not nan"
|
||||
|
||||
@unittest.skipIf(COMPILE_ONLY, "test requires runtime")
|
||||
def test_inf_where(self):
|
||||
x = Tensor.full((3, 3), float("inf"))
|
||||
n = (x < 0).where(x, 1).numpy()
|
||||
@@ -3168,6 +3178,7 @@ class TestOps(unittest.TestCase):
|
||||
|
||||
@unittest.skipIf((getenv("MOCKGPU") or Device.DEFAULT == "PYTHON"), "very slow on MOCKGPU because reduce does not fold")
|
||||
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "webgpu runtime issue")
|
||||
@unittest.skipIf(Device.DEFAULT == "QCOM", "QCOM fails with: Resource deadlock avoided")
|
||||
def test_masked_select(self):
|
||||
helper_test_op([(32, 10)], lambda x: x.masked_select(x>0.5), lambda x: x.masked_select(x>0.5), forward_only=True)
|
||||
helper_test_op([(32, 10)], lambda x: x.masked_select(torch.tensor(True)), lambda x: x.masked_select(Tensor(True)), forward_only=True)
|
||||
|
||||
@@ -366,7 +366,7 @@ def is_dtype_supported(dtype:DType, device:str|None=None) -> bool:
|
||||
if device in ["CUDA", "NV"]: return not CI
|
||||
if device == "CPU" and CPU_LLVM: return OSX
|
||||
if device == "PYTHON": return sys.version_info >= (3, 12)
|
||||
if dtype == dtypes.float64: return device != "METAL" and not (OSX and device == "CL")
|
||||
if dtype == dtypes.float64: return device not in {"METAL", "QCOM"} and not (OSX and device == "CL") and not getenv("NULL_IR3")
|
||||
return True
|
||||
|
||||
if PROFILE:
|
||||
|
||||
@@ -186,8 +186,9 @@ EMULATE = ContextVar("EMULATE", "")
|
||||
CPU_COUNT = ContextVar("CPU_COUNT", max(1, len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity") else (os.cpu_count() or 1)))
|
||||
# Compilers
|
||||
CPU_LLVM, CPU_LVP, AMD_LLVM = ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0), ContextVar("AMD_LLVM", 0)
|
||||
NV_PTX, CUDA_PTX, NV_NAK = ContextVar("NV_PTX", 0), ContextVar("CUDA_PTX", 0), ContextVar("NV_NAK", 0)
|
||||
NV_PTX, CUDA_PTX, NV_NAK, QCOM_IR3 = ContextVar("NV_PTX", 0), ContextVar("CUDA_PTX", 0), ContextVar("NV_NAK", 0), ContextVar("QCOM_IR3", 0)
|
||||
AMD_CC, CPU_CC, NV_CC, CUDA_CC = ContextVar("AMD_CC", ""), ContextVar("CPU_CC", ""), ContextVar("NV_CC", ""), ContextVar("CUDA_CC", "")
|
||||
QCOM_CC = ContextVar("QCOM_CC", "")
|
||||
# VIZ implies PROFILE, but you can run PROFILE without VIZ
|
||||
VIZ = ContextVar("VIZ", 0)
|
||||
PROFILE = ContextVar("PROFILE", VIZ.value)
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
from typing import Callable, cast, Any
|
||||
from tinygrad.dtype import AddrSpace, DType, PtrDType, dtypes
|
||||
from tinygrad.dtype import AddrSpace, DType, PtrDType, ImageDType, dtypes
|
||||
from tinygrad.helpers import DEBUG, OSX, unwrap, charptr
|
||||
from tinygrad.renderer import Renderer
|
||||
from tinygrad.renderer.cstyle import CUDARenderer
|
||||
from tinygrad.uop.ops import GroupOp, Ops, UOp, PatternMatcher, UPat, range_str
|
||||
from tinygrad.runtime.autogen import mesa
|
||||
import base64, ctypes, ctypes.util, struct, functools, inspect, contextlib
|
||||
import base64, ctypes, ctypes.util, struct, functools, inspect, contextlib, itertools
|
||||
|
||||
def g(s:str): return getattr(mesa, s)
|
||||
def nsrc(d:mesa.nir_def) -> mesa.nir_src: return mesa.nir_src(ssa=ctypes.pointer(d))
|
||||
@@ -49,7 +49,7 @@ def nir_instr(nc=1, bs=lambda: None, intrins=None, srcs=None, has_def=True, df=N
|
||||
if has_def: mesa.nir_def_init(instr.contents.instr, getattr(instr.contents, "def"), go(nc), go(bs))
|
||||
for k, v in go(intrins or {}).items():
|
||||
idx = mesa.nir_intrinsic_infos[instr.contents.intrinsic.value].index_map[g(f"NIR_INTRINSIC_{k}")]
|
||||
assert idx > 0
|
||||
assert idx > 0, "invalid intrinsic. mesa version mismatch?"
|
||||
instr.contents.const_index[idx - 1] = go(v)
|
||||
for i, src in enumerate(go(srcs or [])): ctypes.cast(instr.contents.src, ctypes.POINTER(mesa.nir_src))[i] = go(src)
|
||||
for k,v in {k:vcomp for k,v in contents.items() if (vcomp:=go(v)) is not None}.items(): setattr(instr.contents, k, go(v))
|
||||
@@ -67,11 +67,16 @@ def nchannel(b:mesa.nir_builder, src:mesa.nir_def, c:int):
|
||||
ctypes.cast(mov.contents.src, ctypes.POINTER(mesa.nir_alu_src))[0] = alu_src
|
||||
return mov
|
||||
|
||||
def nimm_set(imm:mesa.nir_def, x, dtype:DType):
|
||||
instr = ctypes.cast(imm.parent_instr, ctypes.POINTER(mesa.nir_load_const_instr))
|
||||
struct.pack_into(unwrap(dtype.fmt), (ctypes.c_ubyte * dtype.itemsize).from_address(ctypes.addressof(instr.contents.value)), 0, x)
|
||||
|
||||
@nir_instr(nc=1, bs=lambda dtype: 1 if dtype == dtypes.bool else dtype.itemsize * 8)
|
||||
def nimm(b:mesa.nir_builder, x, dtype:DType) -> mesa.nir_def:
|
||||
instr = mesa.nir_load_const_instr_create(b.shader, 1, 1 if dtype == dtypes.bool else dtype.itemsize * 8)
|
||||
struct.pack_into(unwrap(dtype.fmt), (ctypes.c_ubyte * dtype.itemsize).from_address(ctypes.addressof(instr.contents.value)), 0, x)
|
||||
nimm_set(getattr((instr:=mesa.nir_load_const_instr_create(b.shader, 1, 1 if dtype==dtypes.bool else dtype.itemsize * 8)).contents, "def"), x, dtype)
|
||||
return instr
|
||||
@nir_instr(nc=1, bs=lambda dtype: 1 if dtype == dtypes.bool else dtype.itemsize * 8)
|
||||
def nundef(b, dtype): return mesa.nir_undef_instr_create(b.shader, 1, 1 if dtype == dtypes.bool else dtype.itemsize * 8)
|
||||
|
||||
deref_var = nir_instr(nc=1, bs=32, modes=lambda var:var.data.mode, type=lambda var:var.type, var=lambda var:ctypes.pointer(var))( # pylint: disable=W0108
|
||||
lambda b, var: mesa.nir_deref_instr_create(b.shader, mesa.nir_deref_type_var))
|
||||
@@ -87,6 +92,8 @@ nload = nir_instr(nc=lambda dtype:dtype.count, bs=lambda dtype:dtype.itemsize*8/
|
||||
|
||||
ngid = nir_instr(nc=3, bs=32)(lambda b: mesa.nir_intrinsic_instr_create(b.shader, mesa.nir_intrinsic_load_workgroup_id))
|
||||
nlid = nir_instr(nc=3, bs=32)(lambda b: mesa.nir_intrinsic_instr_create(b.shader, mesa.nir_intrinsic_load_local_invocation_id))
|
||||
ngsz = nir_instr(nc=3, bs=32)(lambda b: mesa.nir_intrinsic_instr_create(b.shader, mesa.nir_intrinsic_load_workgroup_size))
|
||||
def nid(b): return nalu(b, "iadd", nalu(b, "imul", ngid(b), ngsz(b)), nlid(b))
|
||||
|
||||
nbarrier = nir_instr(has_def=False, intrins={"EXECUTION_SCOPE":mesa.SCOPE_WORKGROUP})(
|
||||
lambda b: mesa.nir_intrinsic_instr_create(b.shader, mesa.nir_intrinsic_barrier))
|
||||
@@ -123,16 +130,16 @@ class NIRRenderer(Renderer):
|
||||
(UPat(Ops.STORE, src=(UPat(), UPat(dtype=dtypes.bool)), name="x", allow_any_len=True),
|
||||
lambda x: x.replace(src=x.src[0:1] + (x.src[1].cast(dtypes.uint8),) + x.src[2:])),
|
||||
# load/store use pointer arithmetic, and the cast does nothing
|
||||
(UPat(Ops.INDEX, src=(UPat.var("buf"), UPat.var("off")), allow_any_len=True, name="x"),
|
||||
lambda x,buf,off: x.replace(src=(buf,off.cast(dtypes.long))+x.src[2:]) if buf.dtype.addrspace != AddrSpace.REG and off.op != Ops.CAST else None),
|
||||
(UPat(Ops.INDEX, src=(UPat.var("buf"), UPat.var("off")), allow_any_len=True, name="x"), lambda x,buf,off: x.replace(
|
||||
src=(buf,off.cast(dtypes.long))+x.src[2:]) if buf.dtype.addrspace != AddrSpace.REG and off.op not in (Ops.CAST, Ops.VECTORIZE) else None),
|
||||
(UPat(Ops.CAST, name="x"), lambda x: x.src[0] if isinstance(x.dtype, PtrDType) or x.src[0].dtype == dtypes.void else None),
|
||||
])
|
||||
|
||||
def_rewrite = PatternMatcher([
|
||||
(UPat(Ops.CONST, name="x"), lambda ctx,x: nimm(ctx.b, x.arg, x.dtype)),
|
||||
(UPat(Ops.DEFINE_GLOBAL, name="x"), lambda ctx,x: ctx.param(ctx.b, x.dtype, 8)),
|
||||
(UPat(Ops.DEFINE_VAR, name="x"), lambda ctx,x: ctx.param(ctx.b, x.dtype, 4)),
|
||||
(UPat(Ops.SPECIAL, name="x"), lambda ctx,x: nchannel(ctx.b, ngid(ctx.b) if x.arg[0] == 'g' else nlid(ctx.b), int(x.arg[-1]))),
|
||||
(UPat(Ops.DEFINE_GLOBAL, name="x"), lambda ctx,x: ctx.param(ctx.b, x, 8)),
|
||||
(UPat(Ops.DEFINE_VAR, name="x"), lambda ctx,x: ctx.param(ctx.b, x, 4)),
|
||||
(UPat(Ops.SPECIAL, name="x"), lambda ctx,x: nchannel(ctx.b, {'g':ngid, 'l':nlid, 'i': nid}[x.arg[0]](ctx.b), int(x.arg[-1]))),
|
||||
(UPat(Ops.STORE, src=(UPat(Ops.INDEX, src=(UPat.var("buf"),UPat.var("off")), allow_any_len=True), UPat.var("val")), allow_any_len=True, name="x"),
|
||||
lambda ctx,x,buf,off,val: nstore(ctx.b, buf.ptrdtype.addrspace, nidx(ctx.b, ctx.r[buf], ctx.r[off], buf.dtype), ctx.r[val], val.dtype)),
|
||||
(UPat(Ops.LOAD, src=(UPat(Ops.INDEX, src=(UPat.var("buf"), UPat.var("off"), UPat.var("gate"))), UPat.var("alt")), allow_any_len=True, name="x"),
|
||||
@@ -158,9 +165,11 @@ class NIRRenderer(Renderer):
|
||||
|
||||
@property
|
||||
def nir_options(self): raise NotImplementedError("needs nir_options")
|
||||
def param(self, b:mesa.nir_builder, dtype:DType, sz:int) -> mesa.nir_def: raise NotImplementedError("needs param")
|
||||
def param(self, b:mesa.nir_builder, x, sz:int) -> mesa.nir_def: raise NotImplementedError("needs param")
|
||||
def prerender(self, uops:list[UOp]):
|
||||
self.b = mesa.nir_builder_init_simple_shader(mesa.MESA_SHADER_COMPUTE, mesa.nir_shader_compiler_options.from_buffer_copy(self.nir_options), None)
|
||||
self.b.shader.contents.info.workgroup_size_variable = any([u.op == Ops.SPECIAL and u.arg[0] == 'i' for u in uops])
|
||||
def postrender(self, uops:list[UOp]): pass
|
||||
|
||||
def render(self, uops:list[UOp]):
|
||||
self.prerender(uops)
|
||||
@@ -193,6 +202,7 @@ class NIRRenderer(Renderer):
|
||||
else:
|
||||
if (d:=self.def_rewrite.rewrite(u, ctx=self)) is None: raise RuntimeError(f"failed to render {u.op} srcs {[x.dtype for x in u.src]}")
|
||||
self.r[u] = cast(mesa.nir_def, d)
|
||||
self.postrender(uops)
|
||||
|
||||
mesa.nir_validate_shader(self.b.shader, b"after render")
|
||||
if DEBUG >= 4: mesa.nir_print_shader(self.b.shader, ctypes.POINTER(mesa.struct__IO_FILE).in_dll(ctypes.CDLL(ctypes.util.find_library('c')),
|
||||
@@ -206,22 +216,23 @@ class NIRRenderer(Renderer):
|
||||
|
||||
return ret
|
||||
|
||||
class NAKRenderer(NIRRenderer):
|
||||
device = "NV"
|
||||
class NIRRendererWithOpts(NIRRenderer):
|
||||
def __init__(self, dev=None, nir_options=None):
|
||||
self.dev, self._nir_options = dev, nir_options
|
||||
super().__init__()
|
||||
|
||||
def __reduce__(self): return NAKRenderer, (None, self.nir_options,)
|
||||
def __reduce__(self): return self.__class__, (None, self.nir_options)
|
||||
|
||||
@property
|
||||
def nir_options(self):
|
||||
if self._nir_options is None: self._nir_options = self.dev.compiler.nir_options
|
||||
return self._nir_options
|
||||
|
||||
class NAKRenderer(NIRRendererWithOpts):
|
||||
device = "NV"
|
||||
param = nir_instr(nc=1, num_components=1, bs=lambda sz:sz*8, also=lambda self,sz: setattr(self, "param_idx", self.param_idx + sz),
|
||||
intrins={"ALIGN_MUL":lambda sz:sz}, srcs=lambda self,b: [nsrc(nimm(b, 0, dtypes.int)), nsrc(nimm(b, self.param_idx, dtypes.int))])(
|
||||
lambda self, b, dtype, sz: mesa.nir_intrinsic_instr_create(b.shader, mesa.nir_intrinsic_ldc_nv))
|
||||
lambda self, b, x, sz: mesa.nir_intrinsic_instr_create(b.shader, mesa.nir_intrinsic_ldc_nv))
|
||||
|
||||
class LVPRenderer(NIRRenderer):
|
||||
device = "CPU"
|
||||
@@ -232,9 +243,55 @@ class LVPRenderer(NIRRenderer):
|
||||
|
||||
param = nir_instr(nc=1, bs=lambda sz: sz * 8, num_components=1, intrins={"ALIGN_MUL":lambda sz: sz, "RANGE":lambda self: self.param_sz},
|
||||
srcs=lambda b, self: [nsrc(nimm(b, 0, dtypes.int)), nsrc(nimm(b, self.param_idx, dtypes.int))], also=lambda self, sz:
|
||||
setattr(self, "param_idx", self.param_idx+sz))(lambda self, b, dtype, sz: mesa.nir_intrinsic_instr_create(b.shader, mesa.nir_intrinsic_load_ubo))
|
||||
setattr(self, "param_idx", self.param_idx+sz))(lambda self,b,x,sz: mesa.nir_intrinsic_instr_create(b.shader, mesa.nir_intrinsic_load_ubo))
|
||||
|
||||
def prerender(self, uops:list[UOp]):
|
||||
super().prerender(uops)
|
||||
self.param_sz = sum([8 if u.op == Ops.DEFINE_GLOBAL else u.dtype.itemsize for u in uops if u.op in (Ops.DEFINE_GLOBAL, Ops.DEFINE_VAR)])
|
||||
|
||||
# FIXME: this should be a rewrite rule
|
||||
def tovec(b, coord): return nalu(b, "vec4", nchannel(b, coord, 0), nchannel(b, coord, 1), nundef(b, dtypes.int), nundef(b, dtypes.int))
|
||||
def nfloat(dtype): return mesa.nir_type_float16 if dtype == dtypes.half else mesa.nir_type_float32
|
||||
nstore_img = nir_instr(has_def=False, df=lambda img:img, num_components=lambda val:val.num_components,
|
||||
intrins=lambda dtype:{'IMAGE_DIM':mesa.GLSL_SAMPLER_DIM_2D, 'ACCESS':mesa.ACCESS_CAN_REORDER, 'SRC_TYPE':nfloat(dtype)},
|
||||
srcs=lambda b,img,coord,val:[nsrc(x) for x in [img, tovec(b, coord), nundef(b, dtypes.int), val, nimm(b, 0, dtypes.int)]])(
|
||||
lambda b,img,coord,val,dtype:mesa.nir_intrinsic_instr_create(b.shader,g("nir_intrinsic_image_store")))
|
||||
|
||||
_nload_img = nir_instr(intrins=lambda dtype:{'IMAGE_DIM':mesa.GLSL_SAMPLER_DIM_2D, 'ACCESS':mesa.ACCESS_CAN_REORDER, 'DEST_TYPE':nfloat(dtype)},
|
||||
nc=4, bs=32, num_components=4, srcs=lambda b,img,coord:[nsrc(x) for x in [img, tovec(b, coord), nundef(b, dtypes.int), nimm(b, 0, dtypes.int)]])(
|
||||
lambda b,img,coord,dtype: mesa.nir_intrinsic_instr_create(b.shader, g("nir_intrinsic_image_load")))
|
||||
|
||||
class IR3Renderer(NIRRendererWithOpts):
|
||||
device = "QCOM"
|
||||
|
||||
def nload_img(ctx,img,coord):
|
||||
ctx.texs.add(img)
|
||||
return _nload_img(ctx.b, ctx.r[img], ctx.r[coord], img.dtype)
|
||||
|
||||
def_rewrite = PatternMatcher([
|
||||
(UPat(Ops.STORE, src=(UPat.var('img').index(UPat.var('coord', dtypes.int.vec(2)), allow_any_len=True), UPat.var("val")),
|
||||
allow_any_len=True), lambda ctx,img,coord,val: nstore_img(ctx.b, ctx.r[img], ctx.r[coord], ctx.r[val], val.dtype)),
|
||||
(UPat(Ops.LOAD, src=(UPat.var('img').index(UPat.var('coord', dtypes.int.vec(2)), UPat.var("gate")), UPat.var("alt"))),
|
||||
lambda ctx,img,coord,alt,gate: if_phi(ctx.b, ctx.r[gate], lambda: ctx.nload_img(img, coord), lambda: ctx.r[alt])),
|
||||
(UPat(Ops.LOAD, src=(UPat.var('img').index(UPat.var('coord', dtypes.int.vec(2))),)), nload_img),
|
||||
]) + NIRRenderer.def_rewrite
|
||||
|
||||
_param = LVPRenderer.param
|
||||
def _param_img(self, x):
|
||||
self.img_idx += 1
|
||||
return nimm(self.b, self.img_idx - 1, dtypes.int)
|
||||
|
||||
def param(self, b, x, sz): return self._param_img(x) if isinstance(x.dtype, ImageDType) else self._param(b, x, sz)
|
||||
|
||||
def prerender(self, uops:list[UOp]):
|
||||
super().prerender(uops)
|
||||
self.texs:set[UOp] = set()
|
||||
self.uops, self.ibo_idx, self.img_idx = uops, 0, 0
|
||||
self.param_sz = sum([8 if u.op == Ops.DEFINE_GLOBAL else u.dtype.itemsize for u in uops if u.op in (Ops.DEFINE_GLOBAL, Ops.DEFINE_VAR)])
|
||||
|
||||
def postrender(self, uops:list[UOp]):
|
||||
bufs, texs, imgs = [u for u in uops if u.op == Ops.DEFINE_GLOBAL], itertools.count().__next__, itertools.count().__next__
|
||||
for b in filter(lambda b: isinstance(b.dtype, ImageDType), bufs): nimm_set(self.r[b], texs() if b in self.texs else imgs(), dtypes.int)
|
||||
|
||||
self.b.shader.contents.info.num_ubos = len([u for u in bufs if not isinstance(u.dtype, ImageDType)])
|
||||
self.b.shader.contents.info.num_images = texs() + imgs()
|
||||
|
||||
@@ -113,17 +113,20 @@ def __getattr__(nm):
|
||||
*[f"{{}}/src/nouveau/{s}.h" for s in ["headers/nv_device_info", "compiler/nak"]],
|
||||
*[f"{{}}/src/gallium/auxiliary/gallivm/lp_bld{s}.h" for s in ["", "_passmgr", "_misc", "_type", "_init", "_nir", "_struct", "_jit_types",
|
||||
"_flow", "_const"]],
|
||||
"{}/src/compiler/glsl_types.h", "{}/src/util/blob.h", "{}/src/util/ralloc.h", "{}/gen/builtin_types.h", "{}/gen/a6xx.xml.h",
|
||||
"{}/gen/adreno_pm4.xml.h", "{}/gen/a6xx_enums.xml.h", "{}/gen/a6xx_descriptors.xml.h"], args=lambda:[
|
||||
*[f"{{}}/src/freedreno/{s}.h" for s in ["common/freedreno_dev_info", "ir3/ir3_compiler", "ir3/ir3_shader", "ir3/ir3_nir"]],
|
||||
"{}/src/compiler/glsl_types.h", "{}/src/util/blob.h", "{}/src/util/ralloc.h", "{}/gen/ir3-isa.h", "{}/gen/builtin_types.h",
|
||||
"{}/gen/a6xx.xml.h", "{}/gen/adreno_pm4.xml.h", "{}/gen/a6xx_enums.xml.h", "{}/gen/a6xx_descriptors.xml.h"], args=lambda:[
|
||||
"-DHAVE_ENDIAN_H", "-DHAVE_STRUCT_TIMESPEC", "-DHAVE_PTHREAD", "-DHAVE_FUNC_ATTRIBUTE_PACKED", "-I{}/src", "-I{}/include", "-I{}/gen",
|
||||
"-I{}/src/compiler/nir", "-I{}/src/gallium/auxiliary", "-I{}/src/gallium/include", f"-I{system('llvm-config-20 --includedir')}"],
|
||||
"-I{}/src/compiler/nir", "-I{}/src/gallium/auxiliary", "-I{}/src/gallium/include", "-I{}/src/freedreno/common",
|
||||
f"-I{system('llvm-config-20 --includedir')}"],
|
||||
preprocess=lambda path: subprocess.run("\n".join(["mkdir -p gen/util/format", "python3 src/compiler/builtin_types_h.py gen/builtin_types.h",
|
||||
"python3 src/compiler/isaspec/decode.py --xml src/freedreno/isa/ir3.xml --out-c /dev/null --out-h gen/ir3-isa.h",
|
||||
"python3 src/util/format/u_format_table.py src/util/format/u_format.yaml --enums > gen/util/format/u_format_gen.h",
|
||||
*["python3 src/freedreno/registers/gen_header.py --rnn src/freedreno/registers/ --xml " +
|
||||
f"src/freedreno/registers/adreno/{s}.xml c-defines > gen/{s}.xml.h" for s in ["a6xx", "adreno_pm4", "a6xx_enums", "a6xx_descriptors"]],
|
||||
*[f"python3 src/compiler/{s}_h.py > gen/{s.split('/')[-1]}.h" for s in ["nir/nir_opcodes", "nir/nir_builder_opcodes"]],
|
||||
*[f"python3 src/compiler/nir/nir_{s}_h.py --outdir gen" for s in ["intrinsics", "intrinsics_indices"]]]), cwd=path, shell=True, check=True),
|
||||
tarball="https://gitlab.freedesktop.org/mesa/mesa/-/archive/mesa-25.2.4/mesa-25.2.4.tar.gz",
|
||||
tarball="https://gitlab.freedesktop.org/mesa/mesa/-/archive/mesa-25.2.7/mesa-25.2.7.tar.gz",
|
||||
prolog=["import gzip, base64", "from tinygrad.helpers import OSX"], epilog=lambda path: [system(f"{root}/extra/mesa/lvp_nir_options.sh {path}")])
|
||||
case "libclang":
|
||||
return load("libclang", ["os.getenv('LIBCLANG_PATH', find_library('clang-20'))"],
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -5,6 +5,8 @@ from tinygrad.renderer.cstyle import Renderer, CStyleLanguage
|
||||
from tinygrad.renderer.llvmir import AMDLLVMRenderer
|
||||
from tinygrad.uop.ops import Ops
|
||||
from tinygrad.helpers import cpu_profile, EMULATE
|
||||
from tinygrad.renderer.nir import IR3Renderer
|
||||
from tinygrad.runtime.support.compiler_mesa import IR3Compiler
|
||||
|
||||
class NullRenderer(CStyleLanguage):
|
||||
device = "NULL"
|
||||
@@ -37,4 +39,6 @@ class NullDevice(Compiled):
|
||||
case "AMD_RDNA4": renderer = functools.partial(AMDLLVMRenderer, "gfx1201")
|
||||
case "": renderer = NullRenderer
|
||||
case _: raise RuntimeError(f"can't EMULATE device: {EMULATE.value}")
|
||||
super().__init__(device, NullAllocator(self), CompilerSet([CompilerPair(renderer, Compiler)]), functools.partial(NullProgram, device), NullGraph)
|
||||
compilers = CompilerSet([CompilerPair(renderer, Compiler),
|
||||
CompilerPair(functools.partial(IR3Renderer, self), functools.partial(IR3Compiler, 0x6030001))]) # adreno 630
|
||||
super().__init__(device, NullAllocator(self), compilers, functools.partial(NullProgram, device), NullGraph)
|
||||
|
||||
@@ -9,7 +9,10 @@ from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface
|
||||
from tinygrad.runtime.autogen import kgsl, mesa
|
||||
from tinygrad.runtime.ops_cl import CLCompiler, CLDevice
|
||||
from tinygrad.renderer.cstyle import QCOMRenderer
|
||||
from tinygrad.renderer.nir import IR3Renderer
|
||||
from tinygrad.runtime.support.compiler_mesa import IR3Compiler
|
||||
from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, prod, fromimport, cpu_profile, lo32, PROFILE, suppress_finalizing
|
||||
from tinygrad.helpers import flatten, QCOM_IR3, QCOM_CC
|
||||
from tinygrad.runtime.support.system import System
|
||||
if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import
|
||||
|
||||
@@ -34,9 +37,11 @@ def pkt7_hdr(opcode: int, cnt: int): return mesa.CP_TYPE7_PKT | cnt & 0x3FFF | p
|
||||
|
||||
def pkt4_hdr(reg: int, cnt: int): return mesa.CP_TYPE4_PKT | cnt & 0x7F | parity(cnt) << 7 | (reg & 0x3FFFF) << 8 | parity(reg) << 27
|
||||
|
||||
def _read_lib(lib, off) -> int: return struct.unpack("I", lib[off:off+4])[0]
|
||||
class QCOMCompiler(CLCompiler):
|
||||
def __init__(self, device:str=""): super().__init__(CLDevice(device), 'compile_qcom')
|
||||
def disassemble(self, lib:bytes): fromimport('extra.disassemblers.adreno', 'disasm')(lib)
|
||||
def disassemble(self, lib:bytes):
|
||||
fromimport('tinygrad.runtime.support.compiler_mesa', 'disas_adreno')(lib[(ofs:=_read_lib(lib, 0xc0)):ofs+_read_lib(lib, 0x100)])
|
||||
|
||||
class QCOMSignal(HCQSignal):
|
||||
def __init__(self, *args, **kwargs): super().__init__(*args, **{**kwargs, 'timestamp_divider': 19.2})
|
||||
@@ -120,9 +125,9 @@ class QCOMComputeQueue(HWQueue):
|
||||
self.reg(mesa.REG_A6XX_SP_UPDATE_CNTL, 0x0)
|
||||
self.reg(mesa.REG_A6XX_SP_CS_TSIZE, qreg.a6xx_sp_cs_tsize(0x80)) # is this right? mesa uses 1
|
||||
self.reg(mesa.REG_A6XX_SP_CS_USIZE, qreg.a6xx_sp_cs_usize(0x40)) # mesa also uses 1
|
||||
self.reg(mesa.REG_A6XX_SP_MODE_CNTL, qreg.a6xx_sp_mode_cntl(isammode=mesa.ISAMMODE_CL))
|
||||
self.reg(mesa.REG_A6XX_SP_MODE_CNTL, qreg.a6xx_sp_mode_cntl(isammode=mesa.ISAMMODE_GL if prg.NIR else mesa.ISAMMODE_CL))
|
||||
self.reg(mesa.REG_A6XX_SP_PERFCTR_SHADER_MASK, qreg.a6xx_sp_perfctr_shader_mask(cs=True))
|
||||
self.reg(mesa.REG_A6XX_TPL1_MODE_CNTL, qreg.a6xx_tpl1_mode_cntl(isammode=mesa.ISAMMODE_CL))
|
||||
self.reg(mesa.REG_A6XX_TPL1_MODE_CNTL, qreg.a6xx_tpl1_mode_cntl(isammode=mesa.ISAMMODE_GL if prg.NIR else mesa.ISAMMODE_CL))
|
||||
self.reg(mesa.REG_A6XX_TPL1_DBG_ECO_CNTL, 0)
|
||||
self.cmd(mesa.CP_WAIT_FOR_IDLE)
|
||||
|
||||
@@ -138,6 +143,7 @@ class QCOMComputeQueue(HWQueue):
|
||||
qreg.a6xx_sp_cs_pvt_mem_param(memsizeperitem=prg.pvtmem_size_per_item), *data64_le(prg.dev._stack.va_addr),
|
||||
qreg.a6xx_sp_cs_pvt_mem_size(totalpvtmemsize=prg.pvtmem_size_total))
|
||||
|
||||
if prg.NIR and prg.wgsz != 0xfc: to_mv(args_state.buf.va_addr + prg.wgsz * 4, 12)[:] = struct.pack("III", *local_size)
|
||||
self.cmd(mesa.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=mesa.ST_CONSTANTS, state_src=mesa.SS6_INDIRECT,
|
||||
state_block=mesa.SB6_CS_SHADER, num_unit=1024 // 4),
|
||||
*data64_le(args_state.buf.va_addr))
|
||||
@@ -150,20 +156,20 @@ class QCOMComputeQueue(HWQueue):
|
||||
self.reg(mesa.REG_A6XX_SP_CS_PVT_MEM_STACK_OFFSET, qreg.a6xx_sp_cs_pvt_mem_stack_offset(prg.hw_stack_offset))
|
||||
self.reg(mesa.REG_A6XX_SP_CS_INSTR_SIZE, qreg.a6xx_sp_cs_instr_size(prg.image_size // 4))
|
||||
|
||||
if args_state.prg.samp_cnt > 0:
|
||||
if prg.samp_cnt > 0:
|
||||
self.cmd(mesa.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=mesa.ST_SHADER, state_src=mesa.SS6_INDIRECT,
|
||||
state_block=mesa.SB6_CS_TEX, num_unit=args_state.prg.samp_cnt),
|
||||
*data64_le(args_state.buf.va_addr + args_state.prg.samp_off))
|
||||
self.reg(mesa.REG_A6XX_SP_CS_SAMPLER_BASE, *data64_le(args_state.buf.va_addr + args_state.prg.samp_off))
|
||||
self.reg(mesa.REG_A6XX_TPL1_CS_BORDER_COLOR_BASE, *data64_le(prg.dev.border_color_buf.va_addr))
|
||||
|
||||
if args_state.prg.tex_cnt > 0:
|
||||
if prg.tex_cnt > 0:
|
||||
self.cmd(mesa.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=mesa.ST_CONSTANTS, state_src=mesa.SS6_INDIRECT,
|
||||
state_block=mesa.SB6_CS_TEX, num_unit=min(16, args_state.prg.tex_cnt)),
|
||||
*data64_le(args_state.buf.va_addr + args_state.prg.tex_off))
|
||||
self.reg(mesa.REG_A6XX_SP_CS_TEXMEMOBJ_BASE, *data64_le(args_state.buf.va_addr + args_state.prg.tex_off))
|
||||
|
||||
if args_state.prg.ibo_cnt > 0:
|
||||
if prg.ibo_cnt > 0:
|
||||
self.cmd(mesa.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=mesa.ST6_UAV, state_src=mesa.SS6_INDIRECT,
|
||||
state_block=mesa.SB6_CS_SHADER, num_unit=args_state.prg.ibo_cnt),
|
||||
*data64_le(args_state.buf.va_addr + args_state.prg.ibo_off))
|
||||
@@ -171,7 +177,15 @@ class QCOMComputeQueue(HWQueue):
|
||||
|
||||
self.reg(mesa.REG_A6XX_SP_CS_CONFIG,
|
||||
qreg.a6xx_sp_cs_config(enabled=True, nsamp=args_state.prg.samp_cnt, ntex=args_state.prg.tex_cnt, nuav=args_state.prg.ibo_cnt))
|
||||
self.cmd(mesa.CP_RUN_OPENCL, 0)
|
||||
|
||||
if prg.NIR:
|
||||
self.reg(mesa.REG_A6XX_SP_CS_CONST_CONFIG_0,
|
||||
qreg.a6xx_sp_cs_const_config_0(wgidconstid=prg.wgid, wgsizeconstid=prg.wgsz, wgoffsetconstid=0xfc, localidregid=prg.lid),
|
||||
qreg.a6xx_sp_cs_wge_cntl(linearlocalidregid=0xfc, threadsize=mesa.THREAD64))
|
||||
self.cmd(mesa.CP_EXEC_CS, 0,
|
||||
qreg.cp_exec_cs_1(ngroups_x=global_size[0]), qreg.cp_exec_cs_2(ngroups_y=global_size[1]), qreg.cp_exec_cs_3(_ngroups_z=global_size[2]))
|
||||
else: self.cmd(mesa.CP_RUN_OPENCL, 0)
|
||||
|
||||
self._cache_flush(write_back=True, invalidate=False, sync=False, memsync=False)
|
||||
return self
|
||||
|
||||
@@ -195,11 +209,45 @@ class QCOMArgsState(HCQArgsState):
|
||||
|
||||
for i, v in enumerate(vals): self.bind_sints_to_buf(v, buf=self.buf, fmt='I', offset=self.args_info[i].offset)
|
||||
|
||||
class IR3ArgsState(HCQArgsState):
|
||||
def __init__(self, buf:HCQBuffer, prg:QCOMProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
|
||||
super().__init__(buf, prg, bufs, vals=vals)
|
||||
ctypes.memset(cast(int, self.buf.va_addr), 0, prg.kernargs_alloc_size)
|
||||
to_mv(self.buf.va_addr + prg.imm_off, len(prg.imm_vals))[:] = prg.imm_vals
|
||||
|
||||
ubos, uavs = [b for b in bufs if b.texture_info is None], [b for b in bufs if b.texture_info is not None]
|
||||
ibos, texs = (uavs, []) if prg.tex_cnt == 0 else (uavs[:-prg.tex_cnt], uavs[-prg.tex_cnt:]) # textures are at the end
|
||||
|
||||
if prg.samp_cnt > 0: to_mv(self.buf.va_addr + prg.samp_off, len(prg.samplers) * 4).cast('I')[:] = array.array('I', prg.samplers)
|
||||
self.bind_sints_to_buf(*[b.va_addr for b in ubos], buf=self.buf, fmt='Q', offset=prg.buf_off)
|
||||
self.bind_sints_to_buf(*vals, buf=self.buf, fmt='I', offset=prg.buf_off + len(ubos) * 8)
|
||||
self.bind_sints_to_buf(*flatten([b.texture_info.desc + ([0] * 8) for b in texs]), buf=self.buf, fmt='I', offset=prg.tex_off)
|
||||
self.bind_sints_to_buf(*flatten([b.texture_info.ibo + ([0] * 8) for b in ibos]), buf=self.buf, fmt='I', offset=prg.ibo_off)
|
||||
|
||||
class QCOMProgram(HCQProgram):
|
||||
def __init__(self, dev: QCOMDevice, name: str, lib: bytes):
|
||||
self.dev: QCOMDevice = dev
|
||||
self.name, self.lib = name, lib
|
||||
self._parse_lib()
|
||||
self.name, self.lib, self.NIR = name, lib, isinstance(dev.compiler, IR3Compiler)
|
||||
|
||||
if self.NIR:
|
||||
from tinygrad.runtime.autogen import mesa
|
||||
v, cs, self.imm_vals, self.image = IR3Compiler.unpack_lib(lib)
|
||||
self.prg_offset, self.brnchstck, self.image_size, self.pvtmem, self.shmem = 0, v.branchstack, v.info.size, v.pvtmem_size, v.shared_size
|
||||
self.wgsz = alloc.offset_vec4 * 4 + 8 if (alloc:=cs.allocs.consts[mesa.IR3_CONST_ALLOC_DRIVER_PARAMS]).size_vec4 else 0xfc
|
||||
|
||||
self.wgid, self.lid = v.cs.work_group_id, v.cs.local_invocation_id # register ids
|
||||
self.buf_off, self.imm_off = cs.ubo_state.range[0].offset, cs.allocs.max_const_offset_vec4 * 16
|
||||
|
||||
# see https://elixir.bootlin.com/mesa/mesa-25.3.0/source/src/freedreno/ir3/ir3_shader.h#L525
|
||||
# and https://elixir.bootlin.com/mesa/mesa-25.3.0/source/src/freedreno/ir3/ir3_compiler_nir.c#L5389
|
||||
self.samp_cnt, self.tex_cnt, self.ibo_cnt = (nt:=v.image_mapping.num_tex), nt, v.num_uavs - nt
|
||||
# IR3 outputs a sampler for every texture (https://elixir.bootlin.com/mesa/mesa-25.3.0/source/src/freedreno/ir3/ir3_compiler_nir.c#L1714)
|
||||
self.samplers = [qreg.a6xx_tex_samp_0(wrap_s=(clamp_mode:=mesa.A6XX_TEX_CLAMP_TO_BORDER), wrap_t=clamp_mode, wrap_r=clamp_mode),
|
||||
qreg.a6xx_tex_samp_1(unnorm_coords=True, cubemapseamlessfiltoff=True), 0, 0] * self.samp_cnt
|
||||
|
||||
self.tex_off, self.ibo_off, self.samp_off = 2048, 2048 + 0x40 * self.tex_cnt, 2048 + 0x40 * (self.tex_cnt + self.ibo_cnt)
|
||||
self.fregs, self.hregs = v.info.max_reg + 1, v.info.max_half_reg + 1
|
||||
else: self._parse_lib()
|
||||
|
||||
self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, buf_spec:=BufferSpec(cpu_access=True, nolru=True))
|
||||
to_mv(cast(int, self.lib_gpu.va_addr), self.image_size)[:] = self.image
|
||||
@@ -211,8 +259,8 @@ class QCOMProgram(HCQProgram):
|
||||
self.max_threads = min(1024, ((384 * 32) // (max(1, (self.fregs + round_up(self.hregs, 2) // 2)) * 128)) * 128)
|
||||
dev._ensure_stack_size(self.hw_stack_offset * 4)
|
||||
|
||||
kernargs_alloc_size = round_up(2048 + (self.tex_cnt + self.ibo_cnt) * 0x40 + self.samp_cnt * 0x10, 0x100)
|
||||
super().__init__(QCOMArgsState, self.dev, self.name, kernargs_alloc_size=kernargs_alloc_size)
|
||||
kernargs_alloc_size = round_up(2048 + (self.tex_cnt + self.ibo_cnt) * 0x40 + len(self.samplers) * 4, 0x100)
|
||||
super().__init__(IR3ArgsState if self.NIR else QCOMArgsState, self.dev, self.name, kernargs_alloc_size=kernargs_alloc_size)
|
||||
weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)
|
||||
|
||||
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
|
||||
@@ -222,27 +270,26 @@ class QCOMProgram(HCQProgram):
|
||||
return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)
|
||||
|
||||
def _parse_lib(self):
|
||||
def _read_lib(off) -> int: return struct.unpack("I", self.lib[off:off+4])[0]
|
||||
|
||||
# Extract image binary
|
||||
self.image_size = _read_lib(0x100)
|
||||
self.image = bytearray(self.lib[(image_offset:=_read_lib(0xc0)):image_offset+self.image_size])
|
||||
self.image_size = _read_lib(self.lib, 0x100)
|
||||
self.image = bytearray(self.lib[(image_offset:=_read_lib(self.lib, 0xc0)):image_offset+self.image_size])
|
||||
|
||||
# Parse image descriptors
|
||||
image_desc_off = _read_lib(0x110)
|
||||
self.prg_offset, self.brnchstck = _read_lib(image_desc_off+0xc4), _read_lib(image_desc_off+0x108) // 2
|
||||
self.pvtmem, self.shmem = _read_lib(image_desc_off+0xc8), _read_lib(image_desc_off+0xd8)
|
||||
image_desc_off = _read_lib(self.lib, 0x110)
|
||||
self.prg_offset, self.brnchstck = _read_lib(self.lib, image_desc_off+0xc4), _read_lib(self.lib, image_desc_off+0x108) // 2
|
||||
self.pvtmem, self.shmem = _read_lib(self.lib, image_desc_off+0xc8), _read_lib(self.lib, image_desc_off+0xd8)
|
||||
|
||||
# Fill up constants and buffers info
|
||||
self.buf_info, self.consts_info = [], []
|
||||
|
||||
# Collect sampler info.
|
||||
self.samp_cnt = samp_cnt_in_file = _read_lib(image_desc_off + 0xdc)
|
||||
self.samp_cnt = samp_cnt_in_file = _read_lib(self.lib, image_desc_off + 0xdc)
|
||||
assert self.samp_cnt <= 1, "Up to one sampler supported"
|
||||
if self.samp_cnt:
|
||||
self.samp_cnt += 1
|
||||
self.samplers = [qreg.a6xx_tex_samp_0(wrap_s=(clamp_mode:=mesa.A6XX_TEX_CLAMP_TO_BORDER), wrap_t=clamp_mode, wrap_r=clamp_mode),
|
||||
qreg.a6xx_tex_samp_1(unnorm_coords=True, cubemapseamlessfiltoff=True), 0, 0, 0, 0, 0, 0]
|
||||
else: self.samplers = []
|
||||
|
||||
# Collect kernel arguments (buffers) info.
|
||||
bdoff = round_up(image_desc_off + 0x158 + len(self.name), 4) + 8 * samp_cnt_in_file
|
||||
@@ -260,16 +307,16 @@ class QCOMProgram(HCQProgram):
|
||||
if x.type is BUFTYPE_IBO: x.offset, cur_ibo_off = cur_ibo_off, cur_ibo_off + 0x40
|
||||
elif x.type is BUFTYPE_TEX: x.offset, cur_tex_off = cur_tex_off, cur_tex_off + 0x40
|
||||
|
||||
if _read_lib(0xb0) != 0: # check if we have constants.
|
||||
cdoff = _read_lib(0xac)
|
||||
if _read_lib(self.lib, 0xb0) != 0: # check if we have constants.
|
||||
cdoff = _read_lib(self.lib, 0xac)
|
||||
while cdoff + 40 <= image_offset:
|
||||
cnst, offset_words, _, is32 = struct.unpack("I", self.lib[cdoff:cdoff+4])[0], *struct.unpack("III", self.lib[cdoff+16:cdoff+28])
|
||||
self.consts_info.append((cnst, offset_words * (sz_bytes:=(2 << is32)), sz_bytes))
|
||||
cdoff += 40
|
||||
|
||||
# Registers info
|
||||
reg_desc_off = _read_lib(0x34)
|
||||
self.fregs, self.hregs = _read_lib(reg_desc_off + 0x14), _read_lib(reg_desc_off + 0x18)
|
||||
reg_desc_off = _read_lib(self.lib, 0x34)
|
||||
self.fregs, self.hregs = _read_lib(self.lib, reg_desc_off + 0x14), _read_lib(self.lib, reg_desc_off + 0x18)
|
||||
|
||||
class QCOMTextureInfo:
|
||||
def __init__(self, pitch:int, real_stride:int, desc:list[int], ibo:list[int]):
|
||||
@@ -354,8 +401,10 @@ class QCOMDevice(HCQCompiled):
|
||||
if PROFILE and self.gpu_id[:2] < (7, 3):
|
||||
System.write_sysfs("/sys/class/kgsl/kgsl-3d0/idle_timer", value="4000000000", msg="Failed to disable suspend mode", expected="4294967276")
|
||||
|
||||
super().__init__(device, QCOMAllocator(self), CompilerSet([CompilerPair(QCOMRenderer, functools.partial(QCOMCompiler, device))]),
|
||||
functools.partial(QCOMProgram, self), QCOMSignal, functools.partial(QCOMComputeQueue, self), None)
|
||||
compilers = CompilerSet(ctrl_var=QCOM_CC, cset=[CompilerPair(QCOMRenderer, functools.partial(QCOMCompiler, device)),
|
||||
CompilerPair(functools.partial(IR3Renderer, self), functools.partial(IR3Compiler, info.chip_id), QCOM_IR3)])
|
||||
super().__init__(device, QCOMAllocator(self), compilers, functools.partial(QCOMProgram, self), QCOMSignal,
|
||||
functools.partial(QCOMComputeQueue, self), None)
|
||||
|
||||
def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False) -> HCQBuffer:
|
||||
flags |= flag("KGSL_MEMALIGN", alignment_hint:=12) | kgsl.KGSL_MEMFLAGS_USE_CPU_MAP
|
||||
|
||||
@@ -1,11 +1,16 @@
|
||||
import base64, ctypes, pathlib, tempfile, hashlib
|
||||
import base64, ctypes, pathlib, tempfile, hashlib, sys
|
||||
from tinygrad.device import Compiler
|
||||
from tinygrad.helpers import cpu_objdump, system
|
||||
from tinygrad.helpers import cpu_objdump, system, data64
|
||||
from tinygrad.runtime.autogen import mesa
|
||||
from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, expect, cerr
|
||||
try: from tinygrad.runtime.autogen import llvm
|
||||
except (ImportError, FileNotFoundError): llvm = None #type:ignore[assignment]
|
||||
|
||||
def rzalloc(typ, ctx=None, **kwargs):
|
||||
s = ctypes.cast(mesa.rzalloc_size(ctypes.cast(ctx, ctypes.c_void_p), ctypes.sizeof(typ)), ctypes.POINTER(typ))
|
||||
for k,v in kwargs.items(): setattr(s.contents, k, v)
|
||||
return s
|
||||
|
||||
def deserialize(enc_src, opts):
|
||||
blobreader = mesa.struct_blob_reader()
|
||||
mesa.blob_reader_init(blobreader, src:=base64.b64decode(enc_src), len(src))
|
||||
@@ -84,3 +89,56 @@ class NAKCompiler(NIRCompiler):
|
||||
with open(fn, "wb") as f: f.write(lib[ctypes.sizeof(mesa.struct_nak_shader_info):])
|
||||
print(system(f"nvdisasm -b SM{self.arch[3:]} {fn}"))
|
||||
except Exception as e: print("Failed to generate SASS", str(e), "Make sure your PATH contains nvdisasm binary of compatible version.")
|
||||
|
||||
def disas_adreno(lib:bytes, gpu_id=630):
|
||||
with tempfile.TemporaryFile('w+', buffering=1) as tf:
|
||||
@ctypes.CFUNCTYPE(None, ctypes.c_void_p, ctypes.c_uint32, ctypes.c_void_p)
|
||||
def hd(data, n, instr):
|
||||
fst, snd = data64(ctypes.cast(instr, ctypes.POINTER(ctypes.c_uint64)).contents.value)
|
||||
print(f"{n:04} [{fst:08x}_{snd:08x}] ", end="", flush=True, file=tf)
|
||||
|
||||
ctypes.CDLL(None).setlinebuf(fp:=ctypes.cast(ctypes.CDLL(None).fdopen(tf.fileno(), b"w"), ctypes.POINTER(mesa.struct__IO_FILE)))
|
||||
mesa.ir3_isa_disasm(lib, len(lib), fp, mesa.struct_isa_decode_options(gpu_id, True, 0, True, pre_instr_cb=hd))
|
||||
tf.seek(0)
|
||||
print(tf.read())
|
||||
|
||||
class IR3Compiler(NIRCompiler):
|
||||
def __init__(self, chip_id, cache_key="ir3"):
|
||||
assert sys.version_info >= (3,14), "IR3 requires python 3.14's bitfield fixes"
|
||||
self.dev_id = mesa.struct_fd_dev_id(((chip_id >> 24) & 0xFF) * 100 + ((chip_id >> 16) & 0xFF) * 10 + ((chip_id >> 8) & 0xFF), chip_id)
|
||||
self.cc = mesa.ir3_compiler_create(None, self.dev_id, mesa.fd_dev_info(self.dev_id),
|
||||
mesa.struct_ir3_compiler_options(disable_cache=True)).contents
|
||||
self.cc.has_preamble = False
|
||||
self.nir_options = bytes(mesa.ir3_get_compiler_options(self.cc).contents)
|
||||
super().__init__(f"compile_{cache_key}")
|
||||
|
||||
def __del__(self):
|
||||
mesa.ir3_compiler_destroy(self.cc)
|
||||
super().__del__()
|
||||
|
||||
def __reduce__(self): return IR3Compiler, (self.dev_id.chip_id,)
|
||||
|
||||
# ir3_shader_variant info: https://elixir.bootlin.com/mesa/mesa-25.3.0/source/src/freedreno/ir3/ir3_shader.c#L1099
|
||||
def compile(self, src) -> bytes:
|
||||
nir_shader = deserialize(src, self.nir_options)
|
||||
mesa.ir3_nir_lower_io_vars_to_temporaries(nir_shader)
|
||||
mesa.ir3_finalize_nir(self.cc, mesa.struct_ir3_shader_nir_options(), nir_shader)
|
||||
shader = rzalloc(mesa.struct_ir3_shader, compiler=ctypes.pointer(self.cc), type=mesa.MESA_SHADER_COMPUTE, nir=nir_shader).contents
|
||||
mesa.ir3_nir_post_finalize(shader)
|
||||
v = rzalloc(mesa.struct_ir3_shader_variant, type=shader.type, compiler=ctypes.pointer(self.cc), key=mesa.struct_ir3_shader_key()).contents
|
||||
v.const_state, shader.variants, shader.variant_count = rzalloc(mesa.struct_ir3_const_state, ctypes.pointer(v)), ctypes.pointer(v), 1
|
||||
v.num_uavs = (info:=nir_shader.contents.info).num_ssbos + info.num_images
|
||||
assert not mesa.ir3_compile_shader_nir(self.cc, shader, v), "compilation failed"
|
||||
lib = ctypes.cast(mesa.ir3_shader_assemble(v), ctypes.POINTER(ctypes.c_uint32))
|
||||
# NB: bytes(v) means the pointers in v are no longer safe! a custom __reduce__ that supports pointers for c.Struct would make this simpler
|
||||
ret = bytes(v) + bytes(v.const_state.contents) + ctypes.string_at(v.imm_state.values, v.imm_state.count * 4) + ctypes.string_at(lib, v.info.size)
|
||||
mesa.ralloc_free(ctypes.pointer(v))
|
||||
return ret
|
||||
|
||||
@staticmethod
|
||||
def unpack_lib(lib: bytes) -> tuple[mesa.struct_ir3_shader_variant, mesa.struct_ir3_const_state, bytes, bytes]:
|
||||
shifted = lib[ctypes.sizeof(v:=mesa.struct_ir3_shader_variant.from_buffer_copy(lib)):]
|
||||
shifted = shifted[ctypes.sizeof(cs:=mesa.struct_ir3_const_state.from_buffer_copy(shifted)):]
|
||||
return v, cs, shifted[:v.imm_state.count * 4], shifted[v.imm_state.count * 4:]
|
||||
|
||||
def disassemble(self, lib: bytes): disas_adreno(self.unpack_lib(lib)[3], self.dev_id.gpu_id)
|
||||
|
||||
Reference in New Issue
Block a user