amd: pmc for gfx9 (#13036)

* amd: pmc for gfx9

* xcc

* vmid mask

* ugh

* tiny

* minor

* sorryg
This commit is contained in:
nimlgen
2025-11-01 04:26:34 +08:00
committed by GitHub
parent f6786c1bfd
commit a23226e61e
2 changed files with 31 additions and 21 deletions

View File

@@ -109,6 +109,6 @@ if __name__ == "__main__":
for s in ev.sched:
view = memoryview(ev.blob).cast('Q')
print(f"\t{s.name}")
for inst, se_idx, sa_idx, wgp_idx in itertools.product(range(s.inst), range(s.se), range(s.sa), range(s.wgp)):
print(f"\t\tInst {inst} SE {se_idx} SA {sa_idx} WGP {wgp_idx}: {view[ptr]:#x}")
for xcc, inst, se_idx, sa_idx, wgp_idx in itertools.product(range(s.xcc), range(s.inst), range(s.se), range(s.sa), range(s.wgp)):
print(f"\t\tXCC {xcc} Inst {inst} SE {se_idx} SA {sa_idx} WGP {wgp_idx}: {view[ptr]:#x}")
ptr += 1

View File

@@ -31,7 +31,7 @@ AQL_HDR = (1 << hsa.HSA_PACKET_HEADER_BARRIER) | (hsa.HSA_FENCE_SCOPE_SYSTEM <<
class ProfileSQTTEvent(ProfileEvent): device:str; se:int; props:dict; blob:bytes; itrace:bool # noqa: E702
@dataclass(frozen=True)
class PMCSample: name:str; block:str; inst:int; se:int; sa:int; wgp:int; off:int; size:int; reg:str # noqa: E702
class PMCSample: name:str; block:str; xcc:int; inst:int; se:int; sa:int; wgp:int; off:int; size:int; reg:str # noqa: E702
@dataclass(frozen=True)
class ProfilePMCEvent(ProfileEvent): device:str; kern:str; sched:list[PMCSample]; blob:bytes # noqa: E702
@@ -79,6 +79,7 @@ class AMDComputeQueue(HWQueue):
def set_grbm_se_sh(self, se, sh):
self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, **{f'{"sh" if self.dev.target[0] == 9 else "sa"}_index':sh}, instance_broadcast_writes=1)
def set_grbm_se_sh_wgp(self, se, sh, wgp): self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, sa_index=sh, instance_index=wgp << 2)
def set_grbm_se(self, se): self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, sh_broadcast_writes=1, instance_broadcast_writes=1)
def wait_reg_mem(self, value, mask=0xffffffff, mem=None, reg=None, reg_done=0, op=WAIT_REG_MEM_FUNCTION_GEQ):
wrm_info_dw = self.pm4.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | self.pm4.WAIT_REG_MEM_OPERATION(int(mem is None and reg_done > 0)) \
@@ -146,18 +147,22 @@ class AMDComputeQueue(HWQueue):
def pmc_start(self, counters):
self.pmc_reset_counters(en=False)
self.wreg(self.gc.regSQ_PERFCOUNTER_CTRL, cs_en=1, ps_en=1, gs_en=1, hs_en=1)
self.wreg(self.gc.regSQ_PERFCOUNTER_CTRL2, force_en=1, vmid_en=0xffff)
self.wreg(self.gc.regSQ_PERFCOUNTER_CTRL, cs_en=1, ps_en=1, gs_en=1, hs_en=1, **({'vmid_mask':0xffff} if (gfx9:=self.dev.target[0] == 9) else {}))
if self.dev.target[0] >= 11: self.wreg(self.gc.regSQ_PERFCOUNTER_CTRL2, force_en=1, vmid_en=0xffff)
out_off = 0
end_off = 0
block2pid:dict[str, itertools.count] = collections.defaultdict(lambda: itertools.count())
for name,block,idx in counters:
inst_cnt, se_cnt, sa_cnt, wgp_cnt = {"GRBM": (1, 1, 1, 1), "GL2C": (32, 1, 1, 1),
"SQ": (1, self.dev.se_cnt, 2, self.dev.iface.props['cu_per_simd_array'] // 2)}[block]
reg, out_off = f'reg{block}_PERFCOUNTER{next(block2pid[block])}', out_off + (rec_size:=prod((inst_cnt, se_cnt, sa_cnt, wgp_cnt)) * 8)
self.wreg(getattr(self.gc, f'{reg}_SELECT'), idx)
self.dev.pmc_sched.append(PMCSample(name, block, inst_cnt, se_cnt, sa_cnt, wgp_cnt, out_off-rec_size, rec_size, reg))
# sq block on gfx11+ goes down to wgps
inst_cnt, se_cnt, sa_cnt, wgp_cnt = {"GRBM": (1, 1, 1, 1), "GL2C": (32, 1, 1, 1), "TCC": (16, 1, 1, 1),
"SQ": (1, self.dev.se_cnt // self.dev.xccs) + ((1, 1) if gfx9 else (2, self.dev.iface.props['cu_per_simd_array'] // 2))}[block]
end_off += (rec_size:=prod((self.dev.xccs, inst_cnt, se_cnt, sa_cnt, wgp_cnt)) * 8)
self.wreg(getattr(self.gc, (reg:=f'reg{block}_PERFCOUNTER{next(block2pid[block])}') + '_SELECT'), perf_sel=idx,
**({'simd_mask':0xf, 'sqc_bank_mask':0xf, 'sqc_client_mask':0xf} if gfx9 and block == "SQ" else {}))
self.dev.pmc_sched.append(PMCSample(name, block, self.dev.xccs, inst_cnt, se_cnt, sa_cnt, wgp_cnt, end_off-rec_size, rec_size, reg))
if gfx9: self.wreg(self.gc.regSQ_PERFCOUNTER_MASK, sh0_mask=0xffff, sh1_mask=0xffff)
self.wreg(self.gc.regCOMPUTE_PERFCOUNT_ENABLE, 1)
return self.pmc_reset_counters(en=True)
@@ -168,14 +173,17 @@ class AMDComputeQueue(HWQueue):
for s in sched:
offset = itertools.count(s.off, step=8)
for inst, se_idx, sa_idx, wgp_idx in itertools.product(range(s.inst), range(s.se), range(s.sa), range(s.wgp)):
if s.inst > 1: self.set_grbm_inst(inst)
else: self.set_grbm_se_sh_wgp(se_idx, sa_idx, wgp_idx)
for xcc in range(s.xcc):
with self.pred_exec(xcc_mask=1 << xcc):
for inst, se_idx, sa_idx, wgp_idx in itertools.product(range(s.inst), range(s.se), range(s.sa), range(s.wgp)):
if s.inst > 1: self.set_grbm_inst(inst)
elif self.dev.target[0] == 9: self.set_grbm_se(se_idx)
else: self.set_grbm_se_sh_wgp(se_idx, sa_idx, wgp_idx)
# Copy counter to memory (src_sel = perf, dst_sel = tc_l2)
lo, hi = getattr(self.gc, f'{s.reg}_LO'), getattr(self.gc, f'{s.reg}_HI', None)
self.pkt3(self.pm4.PACKET3_COPY_DATA, 2 << 8 | 4, lo.addr[0], 0, *data64_le(buf.va_addr+(loff:=next(offset))))
if hi is not None: self.pkt3(self.pm4.PACKET3_COPY_DATA, 2 << 8 | 4, hi.addr[0], 0, *data64_le(buf.va_addr+loff+4))
# Copy counter to memory (src_sel = perf, dst_sel = tc_l2)
lo, hi = getattr(self.gc, f'{s.reg}_LO'), getattr(self.gc, f'{s.reg}_HI', None)
self.pkt3(self.pm4.PACKET3_COPY_DATA, (2 << 8) | 4, lo.addr[0], 0, *data64_le(buf.va_addr+(loff:=next(offset))))
if hi is not None: self.pkt3(self.pm4.PACKET3_COPY_DATA, (2 << 8) | 4, hi.addr[0], 0, *data64_le(buf.va_addr+loff+4))
return self.pmc_reset_counters(en=True)
@@ -752,7 +760,8 @@ class KFDIface:
raise RuntimeError("\n".join(report))
def is_in_profile_mode(self): return FileIOInterface(f'{self.dev_sysfs_path}/power_dpm_force_performance_level').read()[:16] == 'profile_standard'
def is_in_profile_mode(self):
return self.dev.target[0] == 9 or FileIOInterface(f'{self.dev_sysfs_path}/power_dpm_force_performance_level').read()[:16] == 'profile_standard'
class PCIIface(PCIIfaceBase):
gpus:ClassVar[list[str]] = []
@@ -898,14 +907,15 @@ class AMDDevice(HCQCompiled):
self.pmc_enabled = PROFILE and PMC > 0
if self.pmc_enabled:
if self.target[0] not in {11, 12}: raise RuntimeError(f'PMC are not supported on gc:{self.target}')
if self.target[0] not in {9, 11, 12}: raise RuntimeError(f'PMC are not supported on gc:{self.target}')
if not self.iface.is_in_profile_mode(): raise RuntimeError("PMC requires stable power state: run `amd-smi set -l stable_std` for KFD iface")
self.pmc_sched:list[PMCSample] = []
self.pmc_counters = import_pmc(self.target)
# validate counters
for k in (PMC_COUNTERS:=getenv("PMC_COUNTERS", "GL2C_HIT,GL2C_MISS,SQC_LDS_IDX_ACTIVE,SQC_LDS_BANK_CONFLICT").split(",")):
pmc_default = "TCC_HIT,TCC_MISS,SQ_LDS_BANK_CONFLICT" if self.target[0] == 9 else "GL2C_HIT,GL2C_MISS,SQC_LDS_IDX_ACTIVE,SQC_LDS_BANK_CONFLICT"
for k in (PMC_COUNTERS:=getenv("PMC_COUNTERS", pmc_default).split(",")):
if k not in self.pmc_counters: raise RuntimeError(f"PMC counter {k} is not supported. Available: {','.join(self.pmc_counters.keys())}")
cast(AMDComputeQueue, self.hw_compute_queue_t()).pmc_start([(k, *self.pmc_counters[k]) for k in PMC_COUNTERS]).submit(self)