diff --git a/extra/sqtt/roc.py b/extra/sqtt/roc.py index 221a3ecb45..2c5dc8b17f 100644 --- a/extra/sqtt/roc.py +++ b/extra/sqtt/roc.py @@ -1,9 +1,9 @@ -import ctypes, pathlib, argparse, pickle, re, functools, dataclasses +import ctypes, pathlib, argparse, pickle, re, functools, dataclasses, itertools from extra.sqtt.rocprof import rocprof from extra.sqtt.disasm import comgr_get_address_table from tinygrad.helpers import temp, DEBUG from tinygrad.device import ProfileEvent, ProfileProgramEvent -from tinygrad.runtime.ops_amd import ProfileSQTTEvent +from tinygrad.runtime.ops_amd import ProfileSQTTEvent, ProfilePMCEvent @dataclasses.dataclass class InstInfo: @@ -56,9 +56,11 @@ if __name__ == "__main__": with args.profile.open("rb") as f: profile = pickle.load(f) sqtt_events:list[ProfileSQTTEvent] = [] + pmc_events:list[ProfilePMCEvent] = [] prog_events:list[ProfileProgramEvent] = [] for e in profile: if isinstance(e, ProfileSQTTEvent): sqtt_events.append(e) + if isinstance(e, ProfilePMCEvent): pmc_events.append(e) if isinstance(e, ProfileProgramEvent) and e.device.startswith("AMD"): prog_events.append(e) ROCParseCtx = _ROCParseCtx(sqtt_events, prog_events) @@ -97,4 +99,14 @@ if __name__ == "__main__": return rocprof.ROCPROFILER_THREAD_TRACE_DECODER_STATUS_SUCCESS rocprof.rocprof_trace_decoder_parse_data(copy_cb, trace_cb, isa_cb, None) - print(ROCParseCtx.wave_events.keys()) + print('SQTT:', ROCParseCtx.wave_events.keys()) + + for ev in pmc_events: + print(f"PMC Event: dev={ev.device} kern={ev.kern}") + ptr = 0 + for s in ev.sched: + view = memoryview(ev.blob).cast('Q') + print(f"\t{s.name}") + for inst, se_idx, sa_idx, wgp_idx in itertools.product(range(s.inst), range(s.se), range(s.sa), range(s.wgp)): + print(f"\t\tInst {inst} SE {se_idx} SA {sa_idx} WGP {wgp_idx}: {view[ptr]}") + ptr += 1 diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index 3211b76ee4..300f32c14b 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -1,13 +1,13 @@ from __future__ import annotations from typing import cast, ClassVar -import os, ctypes, struct, hashlib, functools, importlib, mmap, errno, array, contextlib, sys, weakref, itertools +import os, ctypes, struct, hashlib, functools, importlib, mmap, errno, array, contextlib, sys, weakref, itertools, collections assert sys.platform != 'win32' from dataclasses import dataclass from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, FileIOInterface from tinygrad.runtime.support.hcq import MMIOInterface, BumpAllocator, hcq_filter_visible_devices from tinygrad.uop.ops import sint from tinygrad.device import Compiled, DMAFdRef, BufferSpec, CompilerPairT -from tinygrad.helpers import getenv, round_up, data64_le, DEBUG, PROFILE, ProfileEvent, suppress_finalizing, lo32, hi32, colored +from tinygrad.helpers import getenv, round_up, data64_le, DEBUG, PROFILE, ProfileEvent, suppress_finalizing, lo32, hi32, colored, prod from tinygrad.renderer.cstyle import AMDRenderer from tinygrad.renderer.llvmir import AMDLLVMRenderer from tinygrad.runtime.autogen import kfd, hsa, pci, sqtt @@ -15,11 +15,11 @@ from tinygrad.runtime.autogen.am import am from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler from tinygrad.runtime.support.elf import elf_loader from tinygrad.runtime.support.am.amdev import AMDev, AMMemoryManager -from tinygrad.runtime.support.amd import AMDReg, AMDIP, import_module, import_soc, import_ip_offsets +from tinygrad.runtime.support.amd import AMDReg, AMDIP, import_module, import_soc, import_ip_offsets, import_pmc from tinygrad.runtime.support.system import System, PCIIfaceBase, PCIAllocationMeta, PCIDevice, USBPCIDevice, MAP_FIXED, MAP_NORESERVE if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import -SQTT = getenv("SQTT", 0) +SQTT, PMC = getenv("SQTT", 0), getenv("PMC", 0) EVENT_INDEX_PARTIAL_FLUSH = 4 # based on a comment in nvd.h WAIT_REG_MEM_FUNCTION_EQ = 3 # == WAIT_REG_MEM_FUNCTION_NEQ = 4 # != @@ -30,6 +30,12 @@ AQL_HDR = (1 << hsa.HSA_PACKET_HEADER_BARRIER) | (hsa.HSA_FENCE_SCOPE_SYSTEM << @dataclass(frozen=True) class ProfileSQTTEvent(ProfileEvent): device:str; se:int; props:dict; blob:bytes; itrace:bool # noqa: E702 +@dataclass(frozen=True) +class PMCSample: name:str; block:str; inst:int; se:int; sa:int; wgp:int; off:int; size:int; reg:str # noqa: E702 + +@dataclass(frozen=True) +class ProfilePMCEvent(ProfileEvent): device:str; kern:str; sched:list[PMCSample]; blob:bytes # noqa: E702 + class AMDSignal(HCQSignal): def __init__(self, *args, **kwargs): super().__init__(*args, **{**kwargs, 'timestamp_divider': 100}) @@ -69,6 +75,9 @@ class AMDComputeQueue(HWQueue): def set_grbm_broadcast(self): self.wreg(self.gc.regGRBM_GFX_INDEX, **{f'{f}_broadcast_writes': 1 for f in ['se', 'sh' if self.dev.target[0] == 9 else 'sa', 'instance']}) def set_grbm_se(self, se): self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, instance_broadcast_writes=1) + def set_grbm_inst(self, n): + self.wreg(self.gc.regGRBM_GFX_INDEX, **{f'{f}_broadcast_writes': 1 for f in ['se', 'sh' if self.dev.target[0] == 9 else 'sa']}, instance_index=n) + def set_grbm_se_sh_wgp(self, se, sa, wgp): self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, sa_index=sa, instance_index=wgp << 2) def wait_reg_mem(self, value, mask=0xffffffff, mem=None, reg=None, reg_done=0, op=WAIT_REG_MEM_FUNCTION_GEQ): wrm_info_dw = self.pm4.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | self.pm4.WAIT_REG_MEM_OPERATION(int(mem is None and reg_done > 0)) \ @@ -126,6 +135,48 @@ class AMDComputeQueue(HWQueue): self.wreg(self.gc.regSPI_CONFIG_CNTL, ps_pkr_priority_cntl=3, exp_priority_order=3, gpr_write_priority=0x2c688, enable_sqg_bop_events=int(tracing), enable_sqg_top_events=int(tracing)) + ### PMC ### + + def pmc_reset_counters(self, en=True): + self.set_grbm_broadcast() + self.wreg(self.gc.regCP_PERFMON_CNTL, perfmon_state=0) + if en: self.wreg(self.gc.regCP_PERFMON_CNTL, perfmon_state=1) + return self + + def pmc_start(self, counters): + self.pmc_reset_counters(en=False) + self.wreg(self.gc.regSQ_PERFCOUNTER_CTRL, cs_en=1, ps_en=1, gs_en=1, hs_en=1) + self.wreg(self.gc.regSQ_PERFCOUNTER_CTRL2, force_en=1, vmid_en=0xffff) + + out_off = 0 + block2pid:dict[str, itertools.count] = collections.defaultdict(lambda: itertools.count()) + for name,block,idx in counters: + inst_cnt, se_cnt, sa_cnt, wgp_cnt = (32, 1, 1, 1) if block != "SQ" else (1, self.dev.se_cnt, 2, self.dev.iface.props['cu_per_simd_array'] // 2) + reg, out_off = f'reg{block}_PERFCOUNTER{next(block2pid[block])}', out_off + (rec_size:=prod((inst_cnt, se_cnt, sa_cnt, wgp_cnt)) * 8) + self.wreg(getattr(self.gc, f'{reg}_SELECT'), idx) + self.dev.pmc_sched.append(PMCSample(name, block, inst_cnt, se_cnt, sa_cnt, wgp_cnt, out_off-rec_size, rec_size, reg)) + + self.wreg(self.gc.regCOMPUTE_PERFCOUNT_ENABLE, 1) + return self.pmc_reset_counters(en=True) + + def pmc_read(self, buf, sched): + self.set_grbm_broadcast() + self.wreg(self.gc.regCP_PERFMON_CNTL, perfmon_state=1, perfmon_sample_enable=1) # read counters + + for s in sched: + offset = itertools.count(s.off, step=8) + + for inst, se_idx, sa_idx, wgp_idx in itertools.product(range(s.inst), range(s.se), range(s.sa), range(s.wgp)): + if s.inst > 1: self.set_grbm_inst(inst) + else: self.set_grbm_se_sh_wgp(se_idx, sa_idx, wgp_idx) + + # Copy counter to memory (src_sel = perf, dst_sel = tc_l2) + lo, hi = getattr(self.gc, f'{s.reg}_LO'), getattr(self.gc, f'{s.reg}_HI', None) + self.pkt3(self.pm4.PACKET3_COPY_DATA, 2 << 8 | 4, lo.addr[0], 0, *data64_le(buf.va_addr+(loff:=next(offset)))) + if hi is not None: self.pkt3(self.pm4.PACKET3_COPY_DATA, 2 << 8 | 4, hi.addr[0], 0, *data64_le(buf.va_addr+loff+4)) + + return self.pmc_reset_counters(en=True) + ### SQTT ### def sqtt_setup_exec(self, prg, global_size): @@ -520,6 +571,15 @@ class AMDProgram(HCQProgram): base=self.lib_gpu.va_addr) weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec) + def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False): + res = super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait) + if self.dev.pmc_enabled: + cast(AMDComputeQueue, self.dev.hw_compute_queue_t()).pmc_read(self.dev.pmc_buffer, self.dev.pmc_sched) \ + .signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev) + self.dev.allocator._copyout(pmc_buf:=memoryview(bytearray(self.dev.pmc_buffer.size)), self.dev.pmc_buffer) + Compiled.profile_events += [ProfilePMCEvent(self.dev.device, self.name, self.dev.pmc_sched, bytes(pmc_buf))] + return res + class AMDAllocator(HCQAllocator['AMDDevice']): def __init__(self, dev:AMDDevice): super().__init__(dev, copy_bufs=getattr(dev.iface, 'copy_bufs', None), max_copyout_size=0x1000 if dev.is_usb() else None) @@ -581,7 +641,8 @@ class KFDIface: self.gpu_id = int(FileIOInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read()) self.props = {(p:=l.split())[0]: int(p[1]) for l in FileIOInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()} - ip_base = f"/sys/class/drm/renderD{self.props['drm_render_minor']}/device/ip_discovery/die/0" + self.dev_sysfs_path = f"/sys/class/drm/renderD{self.props['drm_render_minor']}/device" + ip_base = f"{self.dev_sysfs_path}/ip_discovery/die/0" id2ip = {am.GC_HWID: am.GC_HWIP, am.SDMA0_HWID: am.SDMA0_HWIP, am.NBIF_HWID: am.NBIF_HWIP} ip_hw = [(id2ip[int(hwid)], int(hwid)) for hwid in FileIOInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip] self.ip_versions = {ip:tuple(int(FileIOInterface(f'{ip_base}/{hw}/0/{part}').read()) for part in ['major','minor','revision']) for ip,hw in ip_hw} @@ -689,6 +750,8 @@ class KFDIface: raise RuntimeError("\n".join(report)) + def is_in_profile_mode(self): return FileIOInterface(f'{self.dev_sysfs_path}/power_dpm_force_performance_level').read() == 'profile_standard\n' + class PCIIface(PCIIfaceBase): gpus:ClassVar[list[str]] = [] @@ -698,14 +761,16 @@ class PCIIface(PCIIfaceBase): self._setup_adev(self.pci_dev) self.pci_dev.write_config(pci.PCI_COMMAND, self.pci_dev.read_config(pci.PCI_COMMAND, 2) | pci.PCI_COMMAND_MASTER, 2) + def is_in_profile_mode(self): return False + def _setup_adev(self, pci_dev:PCIDevice, dma_regions:list[tuple[int, MMIOInterface]]|None=None): self.dev_impl:AMDev = AMDev(pci_dev, dma_regions) self.ip_versions = self.dev_impl.ip_ver gfxver = int(f"{self.dev_impl.ip_ver[am.GC_HWIP][0]:02d}{self.dev_impl.ip_ver[am.GC_HWIP][1]:02d}{self.dev_impl.ip_ver[am.GC_HWIP][2]:02d}") array_count = self.dev_impl.gc_info.gc_num_sa_per_se * self.dev_impl.gc_info.gc_num_se - simd_count = 2 * array_count * (self.dev_impl.gc_info.gc_num_wgp0_per_sa + self.dev_impl.gc_info.gc_num_wgp1_per_sa) - self.props = {'simd_count': 2 * simd_count, 'simd_per_cu': 2, 'array_count': array_count, 'gfx_target_version': gfxver, + self.props = {'cu_per_simd_array': (cu_per_sa:=2 * (self.dev_impl.gc_info.gc_num_wgp0_per_sa + self.dev_impl.gc_info.gc_num_wgp1_per_sa)), + 'simd_count': 2 * cu_per_sa * array_count, 'simd_per_cu': 2, 'array_count': array_count, 'gfx_target_version': gfxver, 'max_slots_scratch_cu': self.dev_impl.gc_info.gc_max_scratch_slots_per_cu, 'max_waves_per_simd': self.dev_impl.gc_info.gc_max_waves_per_simd, 'simd_arrays_per_engine': self.dev_impl.gc_info.gc_num_sa_per_se, 'lds_size_in_kb': self.dev_impl.gc_info.gc_lds_size} @@ -829,6 +894,21 @@ class AMDDevice(HCQCompiled): self.max_private_segment_size = 0 self._ensure_has_local_memory(128) # set default scratch size to 128 bytes per thread + self.pmc_enabled = PROFILE and PMC > 0 + if self.pmc_enabled: + if self.target[0] not in {11}: raise RuntimeError(f'PMC are not supported on gc:{self.target}') + if not self.iface.is_in_profile_mode(): raise RuntimeError("PMC requires stable power state: AMD_IFACE=KFD and `amd-smi set -l stable_std`") + + self.pmc_sched:list[PMCSample] = [] + self.pmc_counters = import_pmc(self.target) + + # validate counters + for k in (PMC_COUNTERS:=getenv("PMC_COUNTERS", "GL2C_HIT,GL2C_MISS,SQC_LDS_IDX_ACTIVE,SQC_LDS_BANK_CONFLICT").split(",")): + if k not in self.pmc_counters: raise RuntimeError(f"PMC counter {k} is not supported. Available: {','.join(self.pmc_counters.keys())}") + + cast(AMDComputeQueue, self.hw_compute_queue_t()).pmc_start([self.pmc_counters[k] for k in PMC_COUNTERS]).submit(self) + self.pmc_buffer = self.allocator.alloc(self.pmc_sched[-1].off + self.pmc_sched[-1].size, BufferSpec(nolru=True, uncached=True)) + # SQTT is disabled by default because of runtime overhead and big file sizes (~200mb to Tensor.full() two 4096x4096 tensors and matmul them) self.sqtt_enabled = PROFILE and SQTT > 0 if self.sqtt_enabled: @@ -838,7 +918,7 @@ class AMDDevice(HCQCompiled): f"ppfeaturemask={(ppfeaturemask&~0x8000):#x} (current {ppfeaturemask=:#x} & ~PP_GFXOFF_MASK) to amdgpu module parameters\n" "For more information read https://github.com/tinygrad/tinygrad/blob/master/extra/sqtt/README.md") SQTT_BUFFER_SIZE = getenv("SQTT_BUFFER_SIZE", 256) # in mb, per shader engine - self.sqtt_buffers = [self.allocator.alloc(SQTT_BUFFER_SIZE*1024*1024, BufferSpec(nolru=True, uncached=True)) for _ in range(self.se_cnt)] + self.sqtt_buffers = [self.allocator.alloc(SQTT_BUFFER_SIZE << 20, BufferSpec(nolru=True, uncached=True)) for _ in range(self.se_cnt)] self.sqtt_itrace_se_mask = getenv("SQTT_ITRACE_SE_MASK", -1 if SQTT >= 2 else (1 << 1)) # se bitmask: -1 enable all, 0 disable all self.sqtt_next_cmd_id = itertools.count(0) cast(AMDComputeQueue, self.hw_compute_queue_t()).sqtt_start(self.sqtt_buffers, self.sqtt_itrace_se_mask).submit(self) diff --git a/tinygrad/runtime/support/amd.py b/tinygrad/runtime/support/amd.py index 81f9873af4..0a64867181 100644 --- a/tinygrad/runtime/support/amd.py +++ b/tinygrad/runtime/support/amd.py @@ -27,9 +27,8 @@ class AMDIP: def __getattr__(self, name:str): if name in self.regs: return self.regs[name] - - # NOTE: gfx10 gc registers always start with mm, no reg prefix - return self.regs[name.replace('reg', 'mm')] + if (name10:=name.replace('reg', 'mm')) in self.regs: return self.regs[name10] + raise AttributeError(f"{self.name.upper()} has no register {name}") def fixup_ip_version(ip:str, version:tuple[int, ...]) -> list[tuple[int, ...]]: # override versions @@ -64,6 +63,10 @@ def import_soc(ip): def import_ip_offsets(ip): return type("IPOFF", (object,), import_header(f"include/{('sienna_cichlid' if ip[0] > 9 else 'vega20')}_ip_offset.h")) +def import_pmc(ip) -> dict[str, tuple[str, str, int]]: + m = re.search(r'(.*?)', header_download("rocprofiler/src/core/counters/basic/gfx_metrics.xml", url=ROCM_URL), re.S) + return {n:(n,b,int(e)) for n,b,e in re.findall(r' dict[str, AMDReg]: def _split_name(name): return name[:(pos:=next((i for i,c in enumerate(name) if c.isupper()), len(name)))], name[pos:] def _extract_regs(txt):