From 4d7a7096c9d628cffdc29c05873d149faab8eaf6 Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Thu, 30 Oct 2025 22:28:36 +0800 Subject: [PATCH] am: enable perfmon (#13013) * am: enable perfmon * try * msg --- extra/sqtt/roc.py | 8 +++++--- tinygrad/runtime/ops_amd.py | 5 +++-- tinygrad/runtime/support/am/ip.py | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/extra/sqtt/roc.py b/extra/sqtt/roc.py index 2c5dc8b17f..5f494f715b 100644 --- a/extra/sqtt/roc.py +++ b/extra/sqtt/roc.py @@ -98,8 +98,10 @@ if __name__ == "__main__": return rocprof.ROCPROFILER_THREAD_TRACE_DECODER_STATUS_SUCCESS - rocprof.rocprof_trace_decoder_parse_data(copy_cb, trace_cb, isa_cb, None) - print('SQTT:', ROCParseCtx.wave_events.keys()) + try: + rocprof.rocprof_trace_decoder_parse_data(copy_cb, trace_cb, isa_cb, None) + print('SQTT:', ROCParseCtx.wave_events.keys()) + except Exception as e: print("Error in sqtt decoder:", e) for ev in pmc_events: print(f"PMC Event: dev={ev.device} kern={ev.kern}") @@ -108,5 +110,5 @@ if __name__ == "__main__": view = memoryview(ev.blob).cast('Q') print(f"\t{s.name}") for inst, se_idx, sa_idx, wgp_idx in itertools.product(range(s.inst), range(s.se), range(s.sa), range(s.wgp)): - print(f"\t\tInst {inst} SE {se_idx} SA {sa_idx} WGP {wgp_idx}: {view[ptr]}") + print(f"\t\tInst {inst} SE {se_idx} SA {sa_idx} WGP {wgp_idx}: {view[ptr]:#x}") ptr += 1 diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index d736a13abc..70be6c277c 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -761,7 +761,7 @@ class PCIIface(PCIIfaceBase): self._setup_adev(self.pci_dev) self.pci_dev.write_config(pci.PCI_COMMAND, self.pci_dev.read_config(pci.PCI_COMMAND, 2) | pci.PCI_COMMAND_MASTER, 2) - def is_in_profile_mode(self): return False + def is_in_profile_mode(self): return True def _setup_adev(self, pci_dev:PCIDevice, dma_regions:list[tuple[int, MMIOInterface]]|None=None): self.dev_impl:AMDev = AMDev(pci_dev, dma_regions) @@ -897,7 +897,7 @@ class AMDDevice(HCQCompiled): self.pmc_enabled = PROFILE and PMC > 0 if self.pmc_enabled: if self.target[0] not in {11}: raise RuntimeError(f'PMC are not supported on gc:{self.target}') - if not self.iface.is_in_profile_mode(): raise RuntimeError("PMC requires stable power state: AMD_IFACE=KFD and `amd-smi set -l stable_std`") + if not self.iface.is_in_profile_mode(): raise RuntimeError("PMC requires stable power state: run `amd-smi set -l stable_std` for KFD iface") self.pmc_sched:list[PMCSample] = [] self.pmc_counters = import_pmc(self.target) @@ -908,6 +908,7 @@ class AMDDevice(HCQCompiled): cast(AMDComputeQueue, self.hw_compute_queue_t()).pmc_start([self.pmc_counters[k] for k in PMC_COUNTERS]).submit(self) self.pmc_buffer = self.allocator.alloc(self.pmc_sched[-1].off + self.pmc_sched[-1].size, BufferSpec(nolru=True, uncached=True)) + self.allocator._copyin(self.pmc_buffer, memoryview(bytearray(self.pmc_buffer.size))) # zero pmc buffers, some counters have only lo part. # SQTT is disabled by default because of runtime overhead and big file sizes (~200mb to Tensor.full() two 4096x4096 tensors and matmul them) self.sqtt_enabled = PROFILE and SQTT > 0 diff --git a/tinygrad/runtime/support/am/ip.py b/tinygrad/runtime/support/am/ip.py index 8916ab362b..39a897d79d 100644 --- a/tinygrad/runtime/support/am/ip.py +++ b/tinygrad/runtime/support/am/ip.py @@ -272,7 +272,7 @@ class AM_GFX(AM_IP): self.adev.regSDMA0_RLC_CGCG_CTRL.update(cgcg_int_enable=1) self.adev.regSDMA1_RLC_CGCG_CTRL.update(cgcg_int_enable=1) - self.adev.regRLC_CGTT_MGCG_OVERRIDE.update(perfmon_clock_state=0, gfxip_fgcg_override=0, gfxip_repeater_fgcg_override=0, + self.adev.regRLC_CGTT_MGCG_OVERRIDE.update(perfmon_clock_state=1, gfxip_fgcg_override=0, gfxip_repeater_fgcg_override=0, grbm_cgtt_sclk_override=0, rlc_cgtt_sclk_override=0, gfxip_mgcg_override=0, gfxip_cgls_override=0, gfxip_cgcg_override=0) self.adev.regRLC_SAFE_MODE.write(message=0, cmd=1)