mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
am: reset mi300 with pm4 (#14727)
This commit is contained in:
@@ -193,7 +193,7 @@ class AMDev(PCIDevImplBase):
|
||||
if DEBUG >= 2: print(f"am {self.devfmt}: boot done")
|
||||
|
||||
def init_sw(self, smi_dev=False):
|
||||
self.smi_dev, self.is_err_state = smi_dev, False
|
||||
self.smi_dev, self.is_err_state, self.has_aql_queue = smi_dev, False, False
|
||||
|
||||
# Memory manager & firmware
|
||||
self.mm = AMMemoryManager(self, self.vram_size - self.reserved_vram_size, boot_size=(32 << 20), pt_t=AMPageTableEntry, va_shifts=[12, 21, 30, 39],
|
||||
@@ -226,7 +226,7 @@ class AMDev(PCIDevImplBase):
|
||||
self.reg("regSCRATCH_REG6").write(self.is_err_state) # set finalized state.
|
||||
|
||||
def recover(self) -> bool:
|
||||
if self.is_hive() or not self.is_err_state: return False # TODO: support mi300
|
||||
if (self.has_aql_queue and self.is_hive()) or not self.is_err_state: return False # TODO: support aql queue recovery on hive
|
||||
if DEBUG >= 2: print(f"am {self.devfmt}: Start recovery")
|
||||
self.ih.interrupt_handler()
|
||||
self.gfx.reset_mec()
|
||||
|
||||
@@ -291,6 +291,7 @@ class AM_GFX(AM_IP):
|
||||
self._enable_mec()
|
||||
|
||||
def setup_ring(self, ring_addr:int, ring_size:int, rptr_addr:int, wptr_addr:int, eop_addr:int, eop_size:int, idx:int, aql:bool) -> tuple[int, int]:
|
||||
self.adev.has_aql_queue |= aql
|
||||
pipe, queue, doorbell = idx // 4, idx % 4, am.AMDGPU_NAVI10_DOORBELL_MEC_RING0
|
||||
self._grbm_select(me=1, pipe=pipe, queue=queue, inst=0)
|
||||
restore_queue = aql and self.xccs > 1 and self.adev.partial_boot and (self.adev.regCP_HQD_ACTIVE.read(inst=0) & 1)
|
||||
|
||||
Reference in New Issue
Block a user