From ebbaad6bfd6466da9126a6df2f27d889009dcd34 Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Fri, 2 Jan 2026 15:25:15 +0300 Subject: [PATCH] am: enable all sdma engines (#13970) --- tinygrad/runtime/support/am/amdev.py | 12 +++++++----- tinygrad/runtime/support/am/ip.py | 16 +++++++++------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/tinygrad/runtime/support/am/amdev.py b/tinygrad/runtime/support/am/amdev.py index bf163742b3..78d363ee78 100644 --- a/tinygrad/runtime/support/am/amdev.py +++ b/tinygrad/runtime/support/am/amdev.py @@ -1,6 +1,6 @@ from __future__ import annotations import ctypes, collections, dataclasses, functools, hashlib, array -from tinygrad.helpers import mv_address, getenv, DEBUG, fetch +from tinygrad.helpers import mv_address, getenv, DEBUG, fetch, lo32, hi32 from tinygrad.runtime.autogen.am import am from tinygrad.runtime.support.hcq import MMIOInterface from tinygrad.runtime.support.amd import AMDReg, import_module, import_asic_regs @@ -55,7 +55,8 @@ class AMFirmware: # SDMA firmware blob, hdr = self.load_fw(f"sdma_{fmt_ver(am.SDMA0_HWIP)}.bin", versioned_header="struct_sdma_firmware_header") if hdr.header.header_version_major == 1: - self.descs += [self.desc(blob, hdr.header.ucode_array_offset_bytes, hdr.header.ucode_size_bytes, am.GFX_FW_TYPE_SDMA0)] + self.descs += [self.desc(blob, hdr.header.ucode_array_offset_bytes, hdr.header.ucode_size_bytes, am.GFX_FW_TYPE_SDMA0, + am.GFX_FW_TYPE_SDMA1, am.GFX_FW_TYPE_SDMA2, am.GFX_FW_TYPE_SDMA3)] elif hdr.header.header_version_major == 2: self.descs += [self.desc(blob, hdr.ctl_ucode_offset, hdr.ctl_ucode_size_bytes, am.GFX_FW_TYPE_SDMA_UCODE_TH1)] self.descs += [self.desc(blob, hdr.header.ucode_array_offset_bytes, hdr.ctx_ucode_size_bytes, am.GFX_FW_TYPE_SDMA_UCODE_TH0)] @@ -250,10 +251,11 @@ class AMDev(PCIDevImplBase): self.reg("regBIF_BX_PF0_RSMU_DATA").write(val) def indirect_wreg_pcie(self, reg:int, val:int, aid:int=0): - self.reg("regBIF_BX0_PCIE_INDEX2").write(reg * 4 + ((((aid & 0b11) << 32) | (1 << 34)) if aid > 0 else 0)) - self.reg("regBIF_BX0_PCIE_INDEX2").read() + reg_addr = reg * 4 + ((((aid & 0b11) << 32) | (1 << 34)) if aid > 0 else 0) + self.reg("regBIF_BX0_PCIE_INDEX2").write(lo32(reg_addr)) + if reg_addr >> 32: self.reg("regBIF_BX0_PCIE_INDEX2_HI").write(hi32(reg_addr) & 0xff) self.reg("regBIF_BX0_PCIE_DATA2").write(val) - self.reg("regBIF_BX0_PCIE_DATA2").read() + if reg_addr >> 32: self.reg("regBIF_BX0_PCIE_INDEX2_HI").write(0) def _read_vram(self, addr, size) -> bytes: assert addr % 4 == 0 and size % 4 == 0, f"Invalid address {addr:#x} or size {size:#x}" diff --git a/tinygrad/runtime/support/am/ip.py b/tinygrad/runtime/support/am/ip.py index de093fcfd3..fdf28a9ce7 100644 --- a/tinygrad/runtime/support/am/ip.py +++ b/tinygrad/runtime/support/am/ip.py @@ -25,12 +25,12 @@ class AM_SOC(AM_IP): def set_clockgating_state(self): if self.adev.ip_ver[am.HDP_HWIP] >= (5,2,1): self.adev.regHDP_MEM_POWER_CTRL.update(atomic_mem_power_ctrl_en=1, atomic_mem_power_ds_en=1) - def doorbell_enable(self, port, awid=0, awaddr_31_28_value=0, offset=0, size=0): + def doorbell_enable(self, port, awid=0, awaddr_31_28_value=0, offset=0, size=0, aid=0): reg = self.adev.reg(f"{'regGDC_S2A0_S2A' if self.adev.ip_ver[am.GC_HWIP] >= (12,0,0) else 'regS2A'}_DOORBELL_ENTRY_{port}_CTRL") val = reg.encode(**{f"s2a_doorbell_port{port}_enable":1, f"s2a_doorbell_port{port}_awid":awid, f"s2a_doorbell_port{port}_range_size":size, f"s2a_doorbell_port{port}_awaddr_31_28_value":awaddr_31_28_value, f"s2a_doorbell_port{port}_range_offset":offset}) - if self.adev.ip_ver[am.NBIO_HWIP] in {(7,9,0), (7,9,1)}: self.adev.indirect_wreg_pcie(reg.addr[0], val) + if self.adev.ip_ver[am.NBIO_HWIP] in {(7,9,0), (7,9,1)}: self.adev.indirect_wreg_pcie(reg.addr[0], val, aid=aid) else: reg.write(val) class AM_GMC(AM_IP): @@ -432,9 +432,12 @@ class AM_SDMA(AM_IP): **({'utc_l1_enable':1} if self.adev.ip_ver[am.SDMA0_HWIP] <= (5,2,0) else {}), inst=inst) if self.adev.ip_ver[am.NBIO_HWIP] in {(7,9,0), (7,9,1)}: - for i in range(16): self.adev.reg(f"regDOORBELL0_CTRL_ENTRY_{i+1}").write(**{f"bif_doorbell{i+1}_range_size_entry":4, - f"bif_doorbell{i+1}_range_offset_entry":(am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0 + i * 0xA) * 2}) - self.adev.soc.doorbell_enable(port=2, awid=0xe, awaddr_31_28_value=0x1, offset=0xe, size=4) + for aid_id in range(4): + for dev_inst, (port, awid, offset, awaddr) in enumerate([(1, 0xe, 0xe, 0x1), (2, 0x8, 0x8, 0x2), (5, 0x9, 0x9, 0x8), (6, 0xa, 0xa, 0x9)]): + entry = dev_inst + 1 + 4 * aid_id + self.adev.reg(f"regDOORBELL0_CTRL_ENTRY_{entry}").write(**{f"bif_doorbell{entry}_range_size_entry": 20, + f"bif_doorbell{entry}_range_offset_entry": (am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0 + (entry - 1) * 0xA) * 2}) + self.adev.soc.doorbell_enable(port=port, awid=awid, awaddr_31_28_value=awaddr, offset=offset, size=4, aid=aid_id) else: self.adev.soc.doorbell_enable(port=2, awid=0xe, awaddr_31_28_value=0x3, offset=am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0*2, size=4) def fini_hw(self): @@ -448,8 +451,7 @@ class AM_SDMA(AM_IP): self.adev.regGRBM_SOFT_RESET.write(0x0) def setup_ring(self, ring_addr:int, ring_size:int, rptr_addr:int, wptr_addr:int, idx:int) -> tuple[int, int]: - assert idx <= 3, "only 4 SDMA queues supported in am" - pipe, queue = idx // 4, idx % 4 + pipe, queue = idx % 4, idx // 4 reg, inst = ("regSDMA_GFX", pipe+queue*4) if self.adev.ip_ver[am.SDMA0_HWIP][:2] == (4,4) else (f"regSDMA{pipe}_QUEUE{queue}", 0) doorbell = am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0 + (pipe+queue*4) * 0xA self.sdma_reginst.append((reg, inst))