am: enable perfmon (#13013)

* am: enable perfmon

* try

* msg
This commit is contained in:
nimlgen
2025-10-30 22:28:36 +08:00
committed by GitHub
parent 985b6eb95f
commit 4d7a7096c9
3 changed files with 9 additions and 6 deletions

View File

@@ -98,8 +98,10 @@ if __name__ == "__main__":
return rocprof.ROCPROFILER_THREAD_TRACE_DECODER_STATUS_SUCCESS
rocprof.rocprof_trace_decoder_parse_data(copy_cb, trace_cb, isa_cb, None)
print('SQTT:', ROCParseCtx.wave_events.keys())
try:
rocprof.rocprof_trace_decoder_parse_data(copy_cb, trace_cb, isa_cb, None)
print('SQTT:', ROCParseCtx.wave_events.keys())
except Exception as e: print("Error in sqtt decoder:", e)
for ev in pmc_events:
print(f"PMC Event: dev={ev.device} kern={ev.kern}")
@@ -108,5 +110,5 @@ if __name__ == "__main__":
view = memoryview(ev.blob).cast('Q')
print(f"\t{s.name}")
for inst, se_idx, sa_idx, wgp_idx in itertools.product(range(s.inst), range(s.se), range(s.sa), range(s.wgp)):
print(f"\t\tInst {inst} SE {se_idx} SA {sa_idx} WGP {wgp_idx}: {view[ptr]}")
print(f"\t\tInst {inst} SE {se_idx} SA {sa_idx} WGP {wgp_idx}: {view[ptr]:#x}")
ptr += 1

View File

@@ -761,7 +761,7 @@ class PCIIface(PCIIfaceBase):
self._setup_adev(self.pci_dev)
self.pci_dev.write_config(pci.PCI_COMMAND, self.pci_dev.read_config(pci.PCI_COMMAND, 2) | pci.PCI_COMMAND_MASTER, 2)
def is_in_profile_mode(self): return False
def is_in_profile_mode(self): return True
def _setup_adev(self, pci_dev:PCIDevice, dma_regions:list[tuple[int, MMIOInterface]]|None=None):
self.dev_impl:AMDev = AMDev(pci_dev, dma_regions)
@@ -897,7 +897,7 @@ class AMDDevice(HCQCompiled):
self.pmc_enabled = PROFILE and PMC > 0
if self.pmc_enabled:
if self.target[0] not in {11}: raise RuntimeError(f'PMC are not supported on gc:{self.target}')
if not self.iface.is_in_profile_mode(): raise RuntimeError("PMC requires stable power state: AMD_IFACE=KFD and `amd-smi set -l stable_std`")
if not self.iface.is_in_profile_mode(): raise RuntimeError("PMC requires stable power state: run `amd-smi set -l stable_std` for KFD iface")
self.pmc_sched:list[PMCSample] = []
self.pmc_counters = import_pmc(self.target)
@@ -908,6 +908,7 @@ class AMDDevice(HCQCompiled):
cast(AMDComputeQueue, self.hw_compute_queue_t()).pmc_start([self.pmc_counters[k] for k in PMC_COUNTERS]).submit(self)
self.pmc_buffer = self.allocator.alloc(self.pmc_sched[-1].off + self.pmc_sched[-1].size, BufferSpec(nolru=True, uncached=True))
self.allocator._copyin(self.pmc_buffer, memoryview(bytearray(self.pmc_buffer.size))) # zero pmc buffers, some counters have only lo part.
# SQTT is disabled by default because of its runtime overhead and large trace files (~200 MB for creating two 4096x4096 tensors with Tensor.full() and matmul-ing them)
self.sqtt_enabled = PROFILE and SQTT > 0

View File

@@ -272,7 +272,7 @@ class AM_GFX(AM_IP):
self.adev.regSDMA0_RLC_CGCG_CTRL.update(cgcg_int_enable=1)
self.adev.regSDMA1_RLC_CGCG_CTRL.update(cgcg_int_enable=1)
self.adev.regRLC_CGTT_MGCG_OVERRIDE.update(perfmon_clock_state=0, gfxip_fgcg_override=0, gfxip_repeater_fgcg_override=0,
self.adev.regRLC_CGTT_MGCG_OVERRIDE.update(perfmon_clock_state=1, gfxip_fgcg_override=0, gfxip_repeater_fgcg_override=0,
grbm_cgtt_sclk_override=0, rlc_cgtt_sclk_override=0, gfxip_mgcg_override=0, gfxip_cgls_override=0, gfxip_cgcg_override=0)
self.adev.regRLC_SAFE_MODE.write(message=0, cmd=1)