From fc4faed0b2437f3247dd2a7ecb122eec3d1e96e0 Mon Sep 17 00:00:00 2001 From: Christopher Milan Date: Mon, 29 Dec 2025 14:42:28 -0800 Subject: [PATCH] Revert "NIR: new-style compilers (#13875)" (#13888) This reverts commit 72236bbd3d5c1589e8d48a6e087fc79f5bb8ac61. --- tinygrad/renderer/nir.py | 35 ++++++++++++----------- tinygrad/runtime/ops_cpu.py | 5 ++-- tinygrad/runtime/ops_null.py | 6 ++-- tinygrad/runtime/ops_nv.py | 5 ++-- tinygrad/runtime/ops_qcom.py | 7 +++-- tinygrad/runtime/support/compiler_cpu.py | 8 +++--- tinygrad/runtime/support/compiler_mesa.py | 30 +++++++++++++------ 7 files changed, 59 insertions(+), 37 deletions(-) diff --git a/tinygrad/renderer/nir.py b/tinygrad/renderer/nir.py index d5e184e0d6..4e0c8e859a 100644 --- a/tinygrad/renderer/nir.py +++ b/tinygrad/renderer/nir.py @@ -1,6 +1,6 @@ from typing import Callable, cast, Any from tinygrad.dtype import AddrSpace, DType, PtrDType, ImageDType, dtypes -from tinygrad.helpers import DEBUG, OSX, unwrap, charptr, fromimport +from tinygrad.helpers import DEBUG, OSX, unwrap, charptr from tinygrad.renderer import Renderer from tinygrad.renderer.cstyle import CUDARenderer from tinygrad.uop.ops import GroupOp, Ops, UOp, PatternMatcher, UPat, range_str @@ -115,8 +115,7 @@ def nidx(b:mesa.nir_builder, buf, off, dtype, gate=None) -> mesa.nir_def: return if_phi(b, gate, f, lambda: buf) if gate is not None else f() class NIRRenderer(Renderer): - suffix = "NIR" - nir_options: bytes + suffix = "NAK" global_max, local_max, shared_max = CUDARenderer.global_max, CUDARenderer.local_max, CUDARenderer.shared_max code_for_op = {**{k:lambda:None for k in u_aop.keys()}, **{k:lambda:None for k in s_aop.keys()}, **{k:lambda:None for k in f_aop.keys()}} @@ -159,14 +158,13 @@ class NIRRenderer(Renderer): (UPat(Ops.ENDIF, name="x"), lambda ctx,x: (lambda _: mesa.nir_def())(mesa.nir_pop_if(ctx.b, ctx.r[x.src[0]]))) ]) - def __init__(self, compiler): - self.compiler = compiler - if hasattr(self.compiler, "nir_options"): self.nir_options = self.compiler.nir_options - mesa.glsl_type_singleton_init_or_ref() + def __init__(self): mesa.glsl_type_singleton_init_or_ref() def __del__(self): with contextlib.suppress(AttributeError): mesa.glsl_type_singleton_decref() + @property + def nir_options(self): raise NotImplementedError("needs nir_options") def param(self, b:mesa.nir_builder, x, sz:int) -> mesa.nir_def: raise NotImplementedError("needs param") def prerender(self, uops:list[UOp]): self.b = mesa.nir_builder_init_simple_shader(mesa.MESA_SHADER_COMPUTE, mesa.nir_shader_compiler_options.from_buffer_copy(self.nir_options), None) @@ -218,11 +216,20 @@ class NIRRenderer(Renderer): return ret -class NAKRenderer(NIRRenderer): +class NIRRendererWithOpts(NIRRenderer): + def __init__(self, dev=None, nir_options=None): + self.dev, self._nir_options = dev, nir_options + super().__init__() + + def __reduce__(self): return self.__class__, (None, self.nir_options) + + @property + def nir_options(self): + if self._nir_options is None: self._nir_options = self.dev.compiler.nir_options + return self._nir_options + +class NAKRenderer(NIRRendererWithOpts): device = "NV" - - def __init__(self, arch, warps_per_sm): super().__init__(fromimport("tinygrad.runtime.support.compiler_mesa", "NAKCompiler")(arch, warps_per_sm)) - param = nir_instr(nc=1, num_components=1, bs=lambda sz:sz*8, also=lambda self,sz: setattr(self, "param_idx", self.param_idx + sz), intrins={"ALIGN_MUL":lambda sz:sz}, srcs=lambda self,b: [nsrc(nimm(b, 0, dtypes.int)), nsrc(nimm(b, self.param_idx, dtypes.int))])( lambda self, b, x, sz: mesa.nir_intrinsic_instr_create(b.shader, mesa.nir_intrinsic_ldc_nv)) @@ -234,8 +241,6 @@ class LVPRenderer(NIRRenderer): global_max = (1, 0, 0) nir_options = mesa.lvp_nir_options - def __init__(self): super().__init__(fromimport("tinygrad.runtime.support.compiler_mesa", "LVPCompiler")()) - param = nir_instr(nc=1, bs=lambda sz: sz * 8, num_components=1, intrins={"ALIGN_MUL":lambda sz: sz, "RANGE":lambda self: self.param_sz}, srcs=lambda b, self: [nsrc(nimm(b, 0, dtypes.int)), nsrc(nimm(b, self.param_idx, dtypes.int))], also=lambda self, sz: setattr(self, "param_idx", self.param_idx+sz))(lambda self,b,x,sz: mesa.nir_intrinsic_instr_create(b.shader, mesa.nir_intrinsic_load_ubo)) @@ -256,11 +261,9 @@ _nload_img = nir_instr(intrins=lambda dtype:{'IMAGE_DIM':mesa.GLSL_SAMPLER_DIM_2 nc=4, bs=32, num_components=4, srcs=lambda b,img,coord:[nsrc(x) for x in [img, tovec(b, coord), nundef(b, dtypes.int), nimm(b, 0, dtypes.int)]])( lambda b,img,coord,dtype: mesa.nir_intrinsic_instr_create(b.shader, g("nir_intrinsic_image_load"))) -class IR3Renderer(NIRRenderer): +class IR3Renderer(NIRRendererWithOpts): device = "QCOM" - def __init__(self, chip_id): super().__init__(fromimport("tinygrad.runtime.support.compiler_mesa", "IR3Compiler")(chip_id)) - def nload_img(ctx,img,coord): ctx.texs.add(img) return _nload_img(ctx.b, ctx.r[img], ctx.r[coord], img.dtype) diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py index 9affe333c2..2e76328e22 100644 --- a/tinygrad/runtime/ops_cpu.py +++ b/tinygrad/runtime/ops_cpu.py @@ -9,6 +9,7 @@ from tinygrad.renderer.cstyle import ClangJITRenderer from tinygrad.renderer.llvmir import LLVMRenderer from tinygrad.renderer.nir import LVPRenderer from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler +from tinygrad.runtime.support.compiler_mesa import LVPCompiler from tinygrad.runtime.support.elf import jit_loader from tinygrad.uop.ops import sint @@ -71,7 +72,7 @@ class CPUProgram(HCQProgram): except OSError: pass def __init__(self, dev, name:str, lib:bytes): - LVP = isinstance(dev.renderer, LVPRenderer) + LVP = isinstance(dev.compiler, LVPCompiler) if sys.platform == "win32": # mypy doesn't understand when WIN is used here PAGE_EXECUTE_READWRITE, MEM_COMMIT, MEM_RESERVE = 0x40, 0x1000, 0x2000 ctypes.windll.kernel32.VirtualAlloc.restype = ctypes.c_void_p @@ -136,5 +137,5 @@ class CPUDevice(HCQCompiled): self.tasks:queue.Queue = queue.Queue() CPUWorker(self, self.tasks, thread_id=0).start() compilers = CompilerSet([CompilerPair(ClangJITRenderer, None), CompilerPair(LLVMRenderer, CPULLVMCompiler, ctrl_var=CPU_LLVM), - CompilerPair(LVPRenderer, None, ctrl_var=CPU_LVP)], ctrl_var=CPU_CC) + CompilerPair(LVPRenderer, LVPCompiler, ctrl_var=CPU_LVP)], ctrl_var=CPU_CC) super().__init__(device, CPUAllocator(self), compilers, functools.partial(CPUProgram, self), CPUSignal, CPUComputeQueue) diff --git a/tinygrad/runtime/ops_null.py b/tinygrad/runtime/ops_null.py index d77773654c..14f5dc63c0 100644 --- a/tinygrad/runtime/ops_null.py +++ b/tinygrad/runtime/ops_null.py @@ -6,6 +6,7 @@ from tinygrad.renderer.llvmir import AMDLLVMRenderer from tinygrad.uop.ops import Ops from tinygrad.helpers import cpu_profile, EMULATE, NULL_IR3, NULL_NAK from tinygrad.renderer.nir import IR3Renderer, NAKRenderer +from tinygrad.runtime.support.compiler_mesa import IR3Compiler, NAKCompiler class NullRenderer(CStyleLanguage): device = "NULL" @@ -38,6 +39,7 @@ class NullDevice(Compiled): case "AMD_RDNA4": renderer = functools.partial(AMDLLVMRenderer, "gfx1201") case "": renderer = NullRenderer case _: raise RuntimeError(f"can't EMULATE device: {EMULATE.value}") - compilers = CompilerSet([CompilerPair(renderer, Compiler), CompilerPair(functools.partial(IR3Renderer, 0x6030001), None, NULL_IR3), # adreno 630 - CompilerPair(functools.partial(NAKRenderer, "sm_120", 48), None, NULL_NAK)]) # 5090 + compilers = CompilerSet([CompilerPair(renderer, Compiler), + CompilerPair(functools.partial(IR3Renderer, self), functools.partial(IR3Compiler, 0x6030001), NULL_IR3), # adreno 630 + CompilerPair(functools.partial(NAKRenderer, self), functools.partial(NAKCompiler, "sm_120", 48), NULL_NAK)]) # 5090 super().__init__(device, NullAllocator(self), compilers, functools.partial(NullProgram, device), NullGraph) diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index e2eb712151..083e03ddf1 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -11,6 +11,7 @@ from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, pr from tinygrad.renderer.ptx import PTXRenderer from tinygrad.renderer.cstyle import NVRenderer from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, NVPTXCompiler, NVCompiler +from tinygrad.runtime.support.compiler_mesa import NAKCompiler from tinygrad.runtime.autogen import nv_570, nv_580, pci, mesa from tinygrad.runtime.support.elf import elf_loader from tinygrad.runtime.support.nv.nvdev import NVDev, NVMemoryManager @@ -215,7 +216,7 @@ class NVProgram(HCQProgram): self.dev, self.name, self.lib = dev, name, lib self.constbufs: dict[int, tuple[int, int]] = {0: (0, 0x160)} # dict[constbuf index, tuple[va_addr, size]] - if (NAK:=isinstance(dev.renderer, NAKRenderer)): + if (NAK:=isinstance(dev.compiler, NAKCompiler)): image, self.cbuf_0 = memoryview(bytearray(lib[ctypes.sizeof(info:=mesa.struct_nak_shader_info.from_buffer_copy(lib)):])), [] self.regs_usage, self.shmem_usage, self.lcmem_usage = info.num_gprs, round_up(info.cs.smem_size, 128), round_up(info.slm_size, 16) elif MOCKGPU: image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore @@ -585,7 +586,7 @@ class NVDevice(HCQCompiled[HCQSignal]): cucc, ptxcc = (CUDACompiler, PTXCompiler) if MOCKGPU else (NVCompiler, NVPTXCompiler) compilers = CompilerSet(ctrl_var=NV_CC, cset=[CompilerPair(functools.partial(NVRenderer, self.arch),functools.partial(cucc, self.arch)), CompilerPair(functools.partial(PTXRenderer, self.arch, device="NV"), functools.partial(ptxcc, self.arch), NV_PTX), - CompilerPair(functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), None, NV_NAK)]) + CompilerPair(functools.partial(NAKRenderer, dev=self), functools.partial(NAKCompiler, self.arch, self.max_warps_per_sm), NV_NAK)]) super().__init__(device, NVAllocator(self), compilers, functools.partial(NVProgram, self), HCQSignal, NVComputeQueue, NVCopyQueue) self._setup_gpfifos() diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index fd3f88aef3..b1284bbd1e 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -10,6 +10,7 @@ from tinygrad.runtime.autogen import kgsl, mesa from tinygrad.runtime.ops_cl import CLCompiler, CLDevice from tinygrad.renderer.cstyle import QCOMRenderer from tinygrad.renderer.nir import IR3Renderer +from tinygrad.runtime.support.compiler_mesa import IR3Compiler from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, prod, fromimport, cpu_profile, lo32, PROFILE, suppress_finalizing from tinygrad.helpers import flatten, QCOM_IR3, QCOM_CC from tinygrad.runtime.support.system import System @@ -226,10 +227,10 @@ class IR3ArgsState(HCQArgsState): class QCOMProgram(HCQProgram): def __init__(self, dev: QCOMDevice, name: str, lib: bytes): self.dev: QCOMDevice = dev - self.name, self.lib, self.NIR = name, lib, isinstance(dev.renderer, IR3Renderer) + self.name, self.lib, self.NIR = name, lib, isinstance(dev.compiler, IR3Compiler) if self.NIR: - from tinygrad.runtime.support.compiler_mesa import IR3Compiler + from tinygrad.runtime.autogen import mesa v, cs, self.imm_vals, self.image = IR3Compiler.unpack_lib(lib) self.prg_offset, self.brnchstck, self.image_size, self.pvtmem, self.shmem = 0, v.branchstack, v.info.size, v.pvtmem_size, v.shared_size self.wgsz = alloc.offset_vec4 * 4 + 8 if (alloc:=cs.allocs.consts[mesa.IR3_CONST_ALLOC_DRIVER_PARAMS]).size_vec4 else 0xfc @@ -401,7 +402,7 @@ class QCOMDevice(HCQCompiled): System.write_sysfs("/sys/class/kgsl/kgsl-3d0/idle_timer", value="4000000000", msg="Failed to disable suspend mode", expected="4294967276") compilers = CompilerSet(ctrl_var=QCOM_CC, cset=[CompilerPair(QCOMRenderer, functools.partial(QCOMCompiler, device)), - CompilerPair(functools.partial(IR3Renderer, info.chip_id), None, QCOM_IR3)]) + CompilerPair(functools.partial(IR3Renderer, self), functools.partial(IR3Compiler, info.chip_id), QCOM_IR3)]) super().__init__(device, QCOMAllocator(self), compilers, functools.partial(QCOMProgram, self), QCOMSignal, functools.partial(QCOMComputeQueue, self), None) diff --git a/tinygrad/runtime/support/compiler_cpu.py b/tinygrad/runtime/support/compiler_cpu.py index 8b11f3af8e..ce64ee09ce 100644 --- a/tinygrad/runtime/support/compiler_cpu.py +++ b/tinygrad/runtime/support/compiler_cpu.py @@ -29,7 +29,7 @@ def expect(x, err, ret=None): class LLVMCompiler(Compiler): jit = True target_arch = {'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86', 'riscv64': 'riscv64'}[platform.machine()] - def __init__(self, processor:str, feats:str, cache_key=None): + def __init__(self, processor:str, feats:str): for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{self.target_arch}{component}')() triple = {'AArch64': b'aarch64-none-unknown-elf', 'X86': b'x86_64-none-unknown-elf', 'AMDGPU': b'amdgcn-amd-amdhsa'}[self.target_arch] @@ -59,7 +59,7 @@ class LLVMCompiler(Compiler): self.diag_msgs.append(msg) self.handle_diag = handle_diag llvm.LLVMContextSetDiagnosticHandler(self.context, handle_diag, None) - super().__init__(cache_key or f"compile_llvm_{processor}_{feats}{'_jit' if self.jit else ''}{'_opt' if opt else ''}") + super().__init__(f"compile_llvm_{processor}_{feats}{'_jit' if self.jit else ''}{'_opt' if opt else ''}") def __del__(self): llvm.LLVMDisposePassBuilderOptions(self.pbo) @@ -83,7 +83,7 @@ class LLVMCompiler(Compiler): def disassemble(self, lib:bytes): capstone_flatdump(lib) class CPULLVMCompiler(LLVMCompiler): - def __init__(self, cache_key=None): + def __init__(self): # +reserve-x18 here does the same thing as -ffixed-x18 in ops_cpu.py, see comments there for why it's needed on arm osx cpu, feats = ctypes.string_at(llvm.LLVMGetHostCPUName()), (b'+reserve-x18,' if OSX else b'') + ctypes.string_at(llvm.LLVMGetHostCPUFeatures()) - super().__init__(cpu.decode(), feats.decode(), cache_key) + super().__init__(cpu.decode(), feats.decode()) diff --git a/tinygrad/runtime/support/compiler_mesa.py b/tinygrad/runtime/support/compiler_mesa.py index 204867ca74..7b8c810690 100644 --- a/tinygrad/runtime/support/compiler_mesa.py +++ b/tinygrad/runtime/support/compiler_mesa.py @@ -4,8 +4,6 @@ from tinygrad.helpers import cpu_objdump, system, data64 from tinygrad.runtime.autogen import mesa, llvm from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, expect, cerr -# NB: compilers assume mesa's glsl type cache is managed externally with mesa.glsl_type_singleton_init_or_ref() and mesa.glsl_type_singleton_decref() - def rzalloc(typ, ctx=None, **kwargs): s = ctypes.cast(mesa.rzalloc_size(ctypes.cast(ctx, ctypes.c_void_p), ctypes.sizeof(typ)), ctypes.POINTER(typ)) for k,v in kwargs.items(): setattr(s.contents, k, v) @@ -16,8 +14,20 @@ def deserialize(enc_src, opts): mesa.blob_reader_init(blobreader, src:=base64.b64decode(enc_src), len(src)) return mesa.nir_deserialize(None, ctypes.cast(opts, ctypes.POINTER(mesa.nir_shader_compiler_options)), blobreader) -class LVPCompiler(CPULLVMCompiler): - def __init__(self, cache_key="lvp"): CPULLVMCompiler.__init__(self, cache_key=f"compile_{cache_key}") +class NIRCompiler(Compiler): + def __init__(self, cache_key): + mesa.glsl_type_singleton_init_or_ref() + super().__init__(cache_key) + def __del__(self): mesa.glsl_type_singleton_decref() + +class LVPCompiler(CPULLVMCompiler, NIRCompiler): + def __init__(self, cache_key="lvp"): + CPULLVMCompiler.__init__(self) + NIRCompiler.__init__(self, f"compile_{cache_key}") + + def __del__(self): + NIRCompiler.__del__(self) + CPULLVMCompiler.__del__(self) def compile(self, src) -> bytes: shader, ctx = deserialize(src, mesa.lvp_nir_options), llvm.LLVMGetGlobalContext() @@ -50,14 +60,16 @@ class LVPCompiler(CPULLVMCompiler): def disassemble(self, lib: bytes): cpu_objdump(lib) -class NAKCompiler(Compiler): +class NAKCompiler(NIRCompiler): def __init__(self, arch, warps_per_sm, cache_key="nak"): self.arch, self.warps_per_sm = arch, warps_per_sm self.cc = mesa.nak_compiler_create(mesa.struct_nv_device_info(sm=int(arch[3:]), max_warps_per_mp=warps_per_sm)) self.nir_options = bytes(mesa.nak_nir_options(self.cc).contents) super().__init__(f"compile_{cache_key}_{arch}") - def __del__(self): mesa.nak_compiler_destroy(self.cc) + def __del__(self): + mesa.nak_compiler_destroy(self.cc) + super().__del__() def __reduce__(self): return NAKCompiler, (self.arch, self.warps_per_sm) @@ -88,7 +100,7 @@ def disas_adreno(lib:bytes, gpu_id=630): tf.seek(0) print(tf.read()) -class IR3Compiler(Compiler): +class IR3Compiler(NIRCompiler): def __init__(self, chip_id, cache_key="ir3"): assert sys.version_info >= (3,14), "IR3 requires python 3.14's bitfield fixes" self.dev_id = mesa.struct_fd_dev_id(((chip_id >> 24) & 0xFF) * 100 + ((chip_id >> 16) & 0xFF) * 10 + ((chip_id >> 8) & 0xFF), chip_id) @@ -98,7 +110,9 @@ class IR3Compiler(Compiler): self.nir_options = bytes(mesa.ir3_get_compiler_options(self.cc).contents) super().__init__(f"compile_{cache_key}") - def __del__(self): mesa.ir3_compiler_destroy(self.cc) + def __del__(self): + mesa.ir3_compiler_destroy(self.cc) + super().__del__() def __reduce__(self): return IR3Compiler, (self.dev_id.chip_id,)