amd: props in device not sqtt (#13106)

* amd: props in device not sqtt

* fix

* f

* fix

* fix
This commit is contained in:
nimlgen
2025-11-05 23:43:20 +08:00
committed by GitHub
parent 757ceab2a2
commit eff80beeed
6 changed files with 11 additions and 11 deletions

View File

@@ -154,6 +154,7 @@ class RGP:
if device not in device_events: raise RuntimeError(f"Device {device} not found in profile, devices in profile: {', '.join(device_events.keys())} ")
device_event = device_events[device]
sqtt_events = [x for x in profile if isinstance(x, ProfileSQTTEvent) and x.device == device_event.device]
device_props = device_event.props
# merge events per SE
merged_sqtt_events:dict[int, ProfileSQTTEvent] = {}
for ev in sqtt_events:
@@ -164,12 +165,10 @@ class RGP:
se=ev.se,
itrace=merged_sqtt_events[ev.se].itrace or ev.itrace,
blob=merged_sqtt_events[ev.se].blob + ev.blob,
props=ev.props,
)
sqtt_events = list(merged_sqtt_events.values())
if len(sqtt_events) == 0: raise RuntimeError(f"Device {device_event.device} doesn't contain SQTT data")
device_props = sqtt_events[0].props
gfx_ver = device_props['gfx_target_version'] // 10000
gfx_iplvl = getattr(sqtt, f"SQTT_GFXIP_LEVEL_GFXIP_{device_props['gfx_target_version']//10000}_{(device_props['gfx_target_version']//100)%100}",
getattr(sqtt, f"SQTT_GFXIP_LEVEL_GFXIP_{device_props['gfx_target_version']//10000}", None))

View File

@@ -57,7 +57,8 @@ class _ROCParseCtx:
self.inst_execs:dict[tuple[str, int, int, int], list[InstExec]] = {}
for prog in prog_evs:
for addr, info in llvm_disasm(dev_evs[prog.device].arch, unwrap(prog.lib)).items():
arch = "gfx%d%x%x" % ((trgt:=dev_evs[prog.device].props['gfx_target_version']) // 10000, (trgt // 100) % 100, trgt % 100)
for addr, info in llvm_disasm(arch, unwrap(prog.lib)).items():
self.disasms[unwrap(prog.base) + addr] = info
self.addr2prg[unwrap(prog.base) + addr] = prog

View File

@@ -32,7 +32,7 @@ def get_sqtt(asm:list[str], l:int=1, g:int=1) -> list[InstExec]:
k = Tensor.custom_kernel(Tensor.empty(1), fxn=fxn)[0]
# exec and decode sqtt
k.realize()
rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", arch=dev.device_info())])
rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", props=dev.device_props())])
assert len(rctx.inst_execs) > 0, "empty sqtt output"
return list(rctx.inst_execs.values())[0][:-1]
@@ -83,7 +83,7 @@ class TestTiming(unittest.TestCase):
diff_hw_reg = Tensor.empty(1, dtype=dtypes.ulong)
diff_hw_reg = Tensor.custom_kernel(diff_hw_reg, fxn=sleep_kernel)[0]
diff_hw_reg.realize()
rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", arch=dev.device_info())])
rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", props=dev.device_props())])
diff_sqtt = list(rctx.inst_execs.values())[0][2]
self.assertEqual(diff_sqtt.dur, diff_hw_reg.item()-1) # 1 cycle for reading the counter register

View File

@@ -54,7 +54,7 @@ atexit.register(lambda: [Device[dn].finalize() for dn in Device._opened_devices]
@dataclass(frozen=True)
class ProfileDeviceEvent(ProfileEvent):
device:str; comp_tdiff:decimal.Decimal=decimal.Decimal(0); copy_tdiff:decimal.Decimal=decimal.Decimal(0); arch:str="" # noqa: E702
device:str; comp_tdiff:decimal.Decimal=decimal.Decimal(0); copy_tdiff:decimal.Decimal=decimal.Decimal(0); props:dict[str,Any]|None=None # noqa: E702
@dataclass(frozen=True)
class ProfileProgramEvent(ProfileEvent): device:str; name:str; lib:bytes|None; base:int|None # noqa: E702

View File

@@ -28,7 +28,7 @@ AQL_HDR = (1 << hsa.HSA_PACKET_HEADER_BARRIER) | (hsa.HSA_FENCE_SCOPE_SYSTEM <<
| (hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE)
@dataclass(frozen=True)
class ProfileSQTTEvent(ProfileEvent): device:str; se:int; props:dict; blob:bytes; itrace:bool # noqa: E702
class ProfileSQTTEvent(ProfileEvent): device:str; se:int; blob:bytes; itrace:bool # noqa: E702
@dataclass(frozen=True)
class PMCSample: name:str; block:str; xcc:int; inst:int; se:int; sa:int; wgp:int; off:int; size:int; regsample:str # noqa: E702
@@ -605,7 +605,7 @@ class AMDProgram(HCQProgram):
self.dev.allocator._copyout(sqtt_mv:=memoryview(bytearray(wptr)), buf)
resbuf = (struct.pack('<Q', 0x11 | (4 << 13) | (0xf << 16) | (se << 24)) + bytes(sqtt_mv)) if self.dev.target[0] == 9 else bytes(sqtt_mv)
Compiled.profile_events += [ProfileSQTTEvent(self.dev.device, se, self.dev.iface.props, resbuf, bool((SQTT_ITRACE_SE_MASK.value >> se) & 1))]
Compiled.profile_events += [ProfileSQTTEvent(self.dev.device, se, resbuf, bool((SQTT_ITRACE_SE_MASK.value >> se) & 1))]
return res
class AMDAllocator(HCQAllocator['AMDDevice']):
@@ -999,4 +999,4 @@ class AMDDevice(HCQCompiled):
def on_device_hang(self): self.iface.on_device_hang()
def device_info(self): return self.arch
def device_props(self): return self.iface.props

View File

@@ -409,7 +409,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
for dev in HCQCompiled.peer_groups[pg]: cast(HCQAllocator, dev.allocator).map(alc)
return self.signal_t(base_buf=HCQCompiled.signal_pool[pg].pop(), owner=self, **kwargs)
def device_info(self) -> str: return "" # to be overridden if needed
def device_props(self) -> dict[str,Any]: return {} # to be overridden if needed. dict keys are backend dependent.
def _at_profile_finalize(self):
self.synchronize() # Expect device to be synchronized
@@ -424,7 +424,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
gpu2cpu_compute_time_diff = statistics.median([_sync(self, self.hw_compute_queue_t) for _ in range(40)])
if self.hw_copy_queue_t is None: gpu2cpu_copy_time_diff = decimal.Decimal(0)
else: gpu2cpu_copy_time_diff = statistics.median([_sync(self, self.hw_copy_queue_t) for _ in range(40)])
Compiled.profile_events += [ProfileDeviceEvent(self.device, gpu2cpu_compute_time_diff, gpu2cpu_copy_time_diff, arch=self.device_info())]
Compiled.profile_events += [ProfileDeviceEvent(self.device, gpu2cpu_compute_time_diff, gpu2cpu_copy_time_diff, props=self.device_props())]
def _wrap_timeline_signal(self):
self.timeline_signal, self._shadow_timeline_signal, self.timeline_value = self._shadow_timeline_signal, self.timeline_signal, 1