mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-09 15:08:02 -05:00
@@ -98,8 +98,10 @@ if __name__ == "__main__":
|
||||
|
||||
return rocprof.ROCPROFILER_THREAD_TRACE_DECODER_STATUS_SUCCESS
|
||||
|
||||
rocprof.rocprof_trace_decoder_parse_data(copy_cb, trace_cb, isa_cb, None)
|
||||
print('SQTT:', ROCParseCtx.wave_events.keys())
|
||||
try:
|
||||
rocprof.rocprof_trace_decoder_parse_data(copy_cb, trace_cb, isa_cb, None)
|
||||
print('SQTT:', ROCParseCtx.wave_events.keys())
|
||||
except Exception as e: print("Error in sqtt decoder:", e)
|
||||
|
||||
for ev in pmc_events:
|
||||
print(f"PMC Event: dev={ev.device} kern={ev.kern}")
|
||||
@@ -108,5 +110,5 @@ if __name__ == "__main__":
|
||||
view = memoryview(ev.blob).cast('Q')
|
||||
print(f"\t{s.name}")
|
||||
for inst, se_idx, sa_idx, wgp_idx in itertools.product(range(s.inst), range(s.se), range(s.sa), range(s.wgp)):
|
||||
print(f"\t\tInst {inst} SE {se_idx} SA {sa_idx} WGP {wgp_idx}: {view[ptr]}")
|
||||
print(f"\t\tInst {inst} SE {se_idx} SA {sa_idx} WGP {wgp_idx}: {view[ptr]:#x}")
|
||||
ptr += 1
|
||||
|
||||
@@ -761,7 +761,7 @@ class PCIIface(PCIIfaceBase):
|
||||
self._setup_adev(self.pci_dev)
|
||||
self.pci_dev.write_config(pci.PCI_COMMAND, self.pci_dev.read_config(pci.PCI_COMMAND, 2) | pci.PCI_COMMAND_MASTER, 2)
|
||||
|
||||
def is_in_profile_mode(self): return False
|
||||
def is_in_profile_mode(self): return True
|
||||
|
||||
def _setup_adev(self, pci_dev:PCIDevice, dma_regions:list[tuple[int, MMIOInterface]]|None=None):
|
||||
self.dev_impl:AMDev = AMDev(pci_dev, dma_regions)
|
||||
@@ -897,7 +897,7 @@ class AMDDevice(HCQCompiled):
|
||||
self.pmc_enabled = PROFILE and PMC > 0
|
||||
if self.pmc_enabled:
|
||||
if self.target[0] not in {11}: raise RuntimeError(f'PMC are not supported on gc:{self.target}')
|
||||
if not self.iface.is_in_profile_mode(): raise RuntimeError("PMC requires stable power state: AMD_IFACE=KFD and `amd-smi set -l stable_std`")
|
||||
if not self.iface.is_in_profile_mode(): raise RuntimeError("PMC requires stable power state: run `amd-smi set -l stable_std` for KFD iface")
|
||||
|
||||
self.pmc_sched:list[PMCSample] = []
|
||||
self.pmc_counters = import_pmc(self.target)
|
||||
@@ -908,6 +908,7 @@ class AMDDevice(HCQCompiled):
|
||||
|
||||
cast(AMDComputeQueue, self.hw_compute_queue_t()).pmc_start([self.pmc_counters[k] for k in PMC_COUNTERS]).submit(self)
|
||||
self.pmc_buffer = self.allocator.alloc(self.pmc_sched[-1].off + self.pmc_sched[-1].size, BufferSpec(nolru=True, uncached=True))
|
||||
self.allocator._copyin(self.pmc_buffer, memoryview(bytearray(self.pmc_buffer.size))) # zero pmc buffers, some counters have only lo part.
|
||||
|
||||
# SQTT is disabled by default because of runtime overhead and big file sizes (~200mb to Tensor.full() two 4096x4096 tensors and matmul them)
|
||||
self.sqtt_enabled = PROFILE and SQTT > 0
|
||||
|
||||
@@ -272,7 +272,7 @@ class AM_GFX(AM_IP):
|
||||
self.adev.regSDMA0_RLC_CGCG_CTRL.update(cgcg_int_enable=1)
|
||||
self.adev.regSDMA1_RLC_CGCG_CTRL.update(cgcg_int_enable=1)
|
||||
|
||||
self.adev.regRLC_CGTT_MGCG_OVERRIDE.update(perfmon_clock_state=0, gfxip_fgcg_override=0, gfxip_repeater_fgcg_override=0,
|
||||
self.adev.regRLC_CGTT_MGCG_OVERRIDE.update(perfmon_clock_state=1, gfxip_fgcg_override=0, gfxip_repeater_fgcg_override=0,
|
||||
grbm_cgtt_sclk_override=0, rlc_cgtt_sclk_override=0, gfxip_mgcg_override=0, gfxip_cgls_override=0, gfxip_cgcg_override=0)
|
||||
|
||||
self.adev.regRLC_SAFE_MODE.write(message=0, cmd=1)
|
||||
|
||||
Reference in New Issue
Block a user