mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-08 22:48:25 -05:00
amd: pmc for gfx9 (#13036)
* amd: pmc for gfx9
* xcc
* vmid mask
* ugh
* tiny
* minor
* sorryg
This commit is contained in:
@@ -109,6 +109,6 @@ if __name__ == "__main__":
|
||||
for s in ev.sched:
|
||||
view = memoryview(ev.blob).cast('Q')
|
||||
print(f"\t{s.name}")
|
||||
for inst, se_idx, sa_idx, wgp_idx in itertools.product(range(s.inst), range(s.se), range(s.sa), range(s.wgp)):
|
||||
print(f"\t\tInst {inst} SE {se_idx} SA {sa_idx} WGP {wgp_idx}: {view[ptr]:#x}")
|
||||
for xcc, inst, se_idx, sa_idx, wgp_idx in itertools.product(range(s.xcc), range(s.inst), range(s.se), range(s.sa), range(s.wgp)):
|
||||
print(f"\t\tXCC {xcc} Inst {inst} SE {se_idx} SA {sa_idx} WGP {wgp_idx}: {view[ptr]:#x}")
|
||||
ptr += 1
|
||||
|
||||
@@ -31,7 +31,7 @@ AQL_HDR = (1 << hsa.HSA_PACKET_HEADER_BARRIER) | (hsa.HSA_FENCE_SCOPE_SYSTEM <<
|
||||
class ProfileSQTTEvent(ProfileEvent): device:str; se:int; props:dict; blob:bytes; itrace:bool # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class PMCSample: name:str; block:str; inst:int; se:int; sa:int; wgp:int; off:int; size:int; reg:str # noqa: E702
|
||||
class PMCSample: name:str; block:str; xcc:int; inst:int; se:int; sa:int; wgp:int; off:int; size:int; reg:str # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfilePMCEvent(ProfileEvent): device:str; kern:str; sched:list[PMCSample]; blob:bytes # noqa: E702
|
||||
@@ -79,6 +79,7 @@ class AMDComputeQueue(HWQueue):
|
||||
def set_grbm_se_sh(self, se, sh):
|
||||
self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, **{f'{"sh" if self.dev.target[0] == 9 else "sa"}_index':sh}, instance_broadcast_writes=1)
|
||||
def set_grbm_se_sh_wgp(self, se, sh, wgp): self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, sa_index=sh, instance_index=wgp << 2)
|
||||
def set_grbm_se(self, se): self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, sh_broadcast_writes=1, instance_broadcast_writes=1)
|
||||
|
||||
def wait_reg_mem(self, value, mask=0xffffffff, mem=None, reg=None, reg_done=0, op=WAIT_REG_MEM_FUNCTION_GEQ):
|
||||
wrm_info_dw = self.pm4.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | self.pm4.WAIT_REG_MEM_OPERATION(int(mem is None and reg_done > 0)) \
|
||||
@@ -146,18 +147,22 @@ class AMDComputeQueue(HWQueue):
|
||||
|
||||
def pmc_start(self, counters):
|
||||
self.pmc_reset_counters(en=False)
|
||||
self.wreg(self.gc.regSQ_PERFCOUNTER_CTRL, cs_en=1, ps_en=1, gs_en=1, hs_en=1)
|
||||
self.wreg(self.gc.regSQ_PERFCOUNTER_CTRL2, force_en=1, vmid_en=0xffff)
|
||||
self.wreg(self.gc.regSQ_PERFCOUNTER_CTRL, cs_en=1, ps_en=1, gs_en=1, hs_en=1, **({'vmid_mask':0xffff} if (gfx9:=self.dev.target[0] == 9) else {}))
|
||||
if self.dev.target[0] >= 11: self.wreg(self.gc.regSQ_PERFCOUNTER_CTRL2, force_en=1, vmid_en=0xffff)
|
||||
|
||||
out_off = 0
|
||||
end_off = 0
|
||||
block2pid:dict[str, itertools.count] = collections.defaultdict(lambda: itertools.count())
|
||||
for name,block,idx in counters:
|
||||
inst_cnt, se_cnt, sa_cnt, wgp_cnt = {"GRBM": (1, 1, 1, 1), "GL2C": (32, 1, 1, 1),
|
||||
"SQ": (1, self.dev.se_cnt, 2, self.dev.iface.props['cu_per_simd_array'] // 2)}[block]
|
||||
reg, out_off = f'reg{block}_PERFCOUNTER{next(block2pid[block])}', out_off + (rec_size:=prod((inst_cnt, se_cnt, sa_cnt, wgp_cnt)) * 8)
|
||||
self.wreg(getattr(self.gc, f'{reg}_SELECT'), idx)
|
||||
self.dev.pmc_sched.append(PMCSample(name, block, inst_cnt, se_cnt, sa_cnt, wgp_cnt, out_off-rec_size, rec_size, reg))
|
||||
# sq block on gfx11+ goes down to wgps
|
||||
inst_cnt, se_cnt, sa_cnt, wgp_cnt = {"GRBM": (1, 1, 1, 1), "GL2C": (32, 1, 1, 1), "TCC": (16, 1, 1, 1),
|
||||
"SQ": (1, self.dev.se_cnt // self.dev.xccs) + ((1, 1) if gfx9 else (2, self.dev.iface.props['cu_per_simd_array'] // 2))}[block]
|
||||
|
||||
end_off += (rec_size:=prod((self.dev.xccs, inst_cnt, se_cnt, sa_cnt, wgp_cnt)) * 8)
|
||||
self.wreg(getattr(self.gc, (reg:=f'reg{block}_PERFCOUNTER{next(block2pid[block])}') + '_SELECT'), perf_sel=idx,
|
||||
**({'simd_mask':0xf, 'sqc_bank_mask':0xf, 'sqc_client_mask':0xf} if gfx9 and block == "SQ" else {}))
|
||||
self.dev.pmc_sched.append(PMCSample(name, block, self.dev.xccs, inst_cnt, se_cnt, sa_cnt, wgp_cnt, end_off-rec_size, rec_size, reg))
|
||||
|
||||
if gfx9: self.wreg(self.gc.regSQ_PERFCOUNTER_MASK, sh0_mask=0xffff, sh1_mask=0xffff)
|
||||
self.wreg(self.gc.regCOMPUTE_PERFCOUNT_ENABLE, 1)
|
||||
return self.pmc_reset_counters(en=True)
|
||||
|
||||
@@ -168,14 +173,17 @@ class AMDComputeQueue(HWQueue):
|
||||
for s in sched:
|
||||
offset = itertools.count(s.off, step=8)
|
||||
|
||||
for inst, se_idx, sa_idx, wgp_idx in itertools.product(range(s.inst), range(s.se), range(s.sa), range(s.wgp)):
|
||||
if s.inst > 1: self.set_grbm_inst(inst)
|
||||
else: self.set_grbm_se_sh_wgp(se_idx, sa_idx, wgp_idx)
|
||||
for xcc in range(s.xcc):
|
||||
with self.pred_exec(xcc_mask=1 << xcc):
|
||||
for inst, se_idx, sa_idx, wgp_idx in itertools.product(range(s.inst), range(s.se), range(s.sa), range(s.wgp)):
|
||||
if s.inst > 1: self.set_grbm_inst(inst)
|
||||
elif self.dev.target[0] == 9: self.set_grbm_se(se_idx)
|
||||
else: self.set_grbm_se_sh_wgp(se_idx, sa_idx, wgp_idx)
|
||||
|
||||
# Copy counter to memory (src_sel = perf, dst_sel = tc_l2)
|
||||
lo, hi = getattr(self.gc, f'{s.reg}_LO'), getattr(self.gc, f'{s.reg}_HI', None)
|
||||
self.pkt3(self.pm4.PACKET3_COPY_DATA, 2 << 8 | 4, lo.addr[0], 0, *data64_le(buf.va_addr+(loff:=next(offset))))
|
||||
if hi is not None: self.pkt3(self.pm4.PACKET3_COPY_DATA, 2 << 8 | 4, hi.addr[0], 0, *data64_le(buf.va_addr+loff+4))
|
||||
# Copy counter to memory (src_sel = perf, dst_sel = tc_l2)
|
||||
lo, hi = getattr(self.gc, f'{s.reg}_LO'), getattr(self.gc, f'{s.reg}_HI', None)
|
||||
self.pkt3(self.pm4.PACKET3_COPY_DATA, (2 << 8) | 4, lo.addr[0], 0, *data64_le(buf.va_addr+(loff:=next(offset))))
|
||||
if hi is not None: self.pkt3(self.pm4.PACKET3_COPY_DATA, (2 << 8) | 4, hi.addr[0], 0, *data64_le(buf.va_addr+loff+4))
|
||||
|
||||
return self.pmc_reset_counters(en=True)
|
||||
|
||||
@@ -752,7 +760,8 @@ class KFDIface:
|
||||
|
||||
raise RuntimeError("\n".join(report))
|
||||
|
||||
def is_in_profile_mode(self): return FileIOInterface(f'{self.dev_sysfs_path}/power_dpm_force_performance_level').read()[:16] == 'profile_standard'
|
||||
def is_in_profile_mode(self):
|
||||
return self.dev.target[0] == 9 or FileIOInterface(f'{self.dev_sysfs_path}/power_dpm_force_performance_level').read()[:16] == 'profile_standard'
|
||||
|
||||
class PCIIface(PCIIfaceBase):
|
||||
gpus:ClassVar[list[str]] = []
|
||||
@@ -898,14 +907,15 @@ class AMDDevice(HCQCompiled):
|
||||
|
||||
self.pmc_enabled = PROFILE and PMC > 0
|
||||
if self.pmc_enabled:
|
||||
if self.target[0] not in {11, 12}: raise RuntimeError(f'PMC are not supported on gc:{self.target}')
|
||||
if self.target[0] not in {9, 11, 12}: raise RuntimeError(f'PMC are not supported on gc:{self.target}')
|
||||
if not self.iface.is_in_profile_mode(): raise RuntimeError("PMC requires stable power state: run `amd-smi set -l stable_std` for KFD iface")
|
||||
|
||||
self.pmc_sched:list[PMCSample] = []
|
||||
self.pmc_counters = import_pmc(self.target)
|
||||
|
||||
# validate counters
|
||||
for k in (PMC_COUNTERS:=getenv("PMC_COUNTERS", "GL2C_HIT,GL2C_MISS,SQC_LDS_IDX_ACTIVE,SQC_LDS_BANK_CONFLICT").split(",")):
|
||||
pmc_default = "TCC_HIT,TCC_MISS,SQ_LDS_BANK_CONFLICT" if self.target[0] == 9 else "GL2C_HIT,GL2C_MISS,SQC_LDS_IDX_ACTIVE,SQC_LDS_BANK_CONFLICT"
|
||||
for k in (PMC_COUNTERS:=getenv("PMC_COUNTERS", pmc_default).split(",")):
|
||||
if k not in self.pmc_counters: raise RuntimeError(f"PMC counter {k} is not supported. Available: {','.join(self.pmc_counters.keys())}")
|
||||
|
||||
cast(AMDComputeQueue, self.hw_compute_queue_t()).pmc_start([(k, *self.pmc_counters[k]) for k in PMC_COUNTERS]).submit(self)
|
||||
|
||||
Reference in New Issue
Block a user