From ba674256806d20094e7af6232bc7c95474eca67e Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Fri, 13 Feb 2026 11:22:32 +0300 Subject: [PATCH] am: reset mi300 with pm4 (#14727) --- tinygrad/runtime/support/am/amdev.py | 4 ++-- tinygrad/runtime/support/am/ip.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tinygrad/runtime/support/am/amdev.py b/tinygrad/runtime/support/am/amdev.py index da31286868..e5eaacc05a 100644 --- a/tinygrad/runtime/support/am/amdev.py +++ b/tinygrad/runtime/support/am/amdev.py @@ -193,7 +193,7 @@ class AMDev(PCIDevImplBase): if DEBUG >= 2: print(f"am {self.devfmt}: boot done") def init_sw(self, smi_dev=False): - self.smi_dev, self.is_err_state = smi_dev, False + self.smi_dev, self.is_err_state, self.has_aql_queue = smi_dev, False, False # Memory manager & firmware self.mm = AMMemoryManager(self, self.vram_size - self.reserved_vram_size, boot_size=(32 << 20), pt_t=AMPageTableEntry, va_shifts=[12, 21, 30, 39], @@ -226,7 +226,7 @@ class AMDev(PCIDevImplBase): self.reg("regSCRATCH_REG6").write(self.is_err_state) # set finalized state. def recover(self) -> bool: - if self.is_hive() or not self.is_err_state: return False # TODO: support mi300 + if (self.has_aql_queue and self.is_hive()) or not self.is_err_state: return False # TODO: support aql queue recovery on hive if DEBUG >= 2: print(f"am {self.devfmt}: Start recovery") self.ih.interrupt_handler() self.gfx.reset_mec() diff --git a/tinygrad/runtime/support/am/ip.py b/tinygrad/runtime/support/am/ip.py index eeefca2202..4ef90d3297 100644 --- a/tinygrad/runtime/support/am/ip.py +++ b/tinygrad/runtime/support/am/ip.py @@ -291,6 +291,7 @@ class AM_GFX(AM_IP): self._enable_mec() def setup_ring(self, ring_addr:int, ring_size:int, rptr_addr:int, wptr_addr:int, eop_addr:int, eop_size:int, idx:int, aql:bool) -> tuple[int, int]: + self.adev.has_aql_queue |= aql pipe, queue, doorbell = idx // 4, idx % 4, am.AMDGPU_NAVI10_DOORBELL_MEC_RING0 self._grbm_select(me=1, pipe=pipe, queue=queue, inst=0) restore_queue = aql and self.xccs > 1 and self.adev.partial_boot and (self.adev.regCP_HQD_ACTIVE.read(inst=0) & 1)