mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-09 15:08:02 -05:00
amd: props in device not sqtt (#13106)
* amd: props in device not sqtt * fix * f * fix * fix
This commit is contained in:
@@ -154,6 +154,7 @@ class RGP:
|
||||
if device not in device_events: raise RuntimeError(f"Device {device} not found in profile, devices in profile: {', '.join(device_events.keys())} ")
|
||||
device_event = device_events[device]
|
||||
sqtt_events = [x for x in profile if isinstance(x, ProfileSQTTEvent) and x.device == device_event.device]
|
||||
device_props = device_event.props
|
||||
# merge events per SE
|
||||
merged_sqtt_events:dict[int, ProfileSQTTEvent] = {}
|
||||
for ev in sqtt_events:
|
||||
@@ -164,12 +165,10 @@ class RGP:
|
||||
se=ev.se,
|
||||
itrace=merged_sqtt_events[ev.se].itrace or ev.itrace,
|
||||
blob=merged_sqtt_events[ev.se].blob + ev.blob,
|
||||
props=ev.props,
|
||||
)
|
||||
sqtt_events = list(merged_sqtt_events.values())
|
||||
|
||||
if len(sqtt_events) == 0: raise RuntimeError(f"Device {device_event.device} doesn't contain SQTT data")
|
||||
device_props = sqtt_events[0].props
|
||||
gfx_ver = device_props['gfx_target_version'] // 10000
|
||||
gfx_iplvl = getattr(sqtt, f"SQTT_GFXIP_LEVEL_GFXIP_{device_props['gfx_target_version']//10000}_{(device_props['gfx_target_version']//100)%100}",
|
||||
getattr(sqtt, f"SQTT_GFXIP_LEVEL_GFXIP_{device_props['gfx_target_version']//10000}", None))
|
||||
|
||||
@@ -57,7 +57,8 @@ class _ROCParseCtx:
|
||||
self.inst_execs:dict[tuple[str, int, int, int], list[InstExec]] = {}
|
||||
|
||||
for prog in prog_evs:
|
||||
for addr, info in llvm_disasm(dev_evs[prog.device].arch, unwrap(prog.lib)).items():
|
||||
arch = "gfx%d%x%x" % ((trgt:=dev_evs[prog.device].props['gfx_target_version']) // 10000, (trgt // 100) % 100, trgt % 100)
|
||||
for addr, info in llvm_disasm(arch, unwrap(prog.lib)).items():
|
||||
self.disasms[unwrap(prog.base) + addr] = info
|
||||
self.addr2prg[unwrap(prog.base) + addr] = prog
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ def get_sqtt(asm:list[str], l:int=1, g:int=1) -> list[InstExec]:
|
||||
k = Tensor.custom_kernel(Tensor.empty(1), fxn=fxn)[0]
|
||||
# exec and decode sqtt
|
||||
k.realize()
|
||||
rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", arch=dev.device_info())])
|
||||
rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", props=dev.device_props())])
|
||||
assert len(rctx.inst_execs) > 0, "empty sqtt output"
|
||||
return list(rctx.inst_execs.values())[0][:-1]
|
||||
|
||||
@@ -83,7 +83,7 @@ class TestTiming(unittest.TestCase):
|
||||
diff_hw_reg = Tensor.empty(1, dtype=dtypes.ulong)
|
||||
diff_hw_reg = Tensor.custom_kernel(diff_hw_reg, fxn=sleep_kernel)[0]
|
||||
diff_hw_reg.realize()
|
||||
rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", arch=dev.device_info())])
|
||||
rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", props=dev.device_props())])
|
||||
diff_sqtt = list(rctx.inst_execs.values())[0][2]
|
||||
self.assertEqual(diff_sqtt.dur, diff_hw_reg.item()-1) # 1 cycle for reading the counter register
|
||||
|
||||
|
||||
@@ -54,7 +54,7 @@ atexit.register(lambda: [Device[dn].finalize() for dn in Device._opened_devices]
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfileDeviceEvent(ProfileEvent):
|
||||
device:str; comp_tdiff:decimal.Decimal=decimal.Decimal(0); copy_tdiff:decimal.Decimal=decimal.Decimal(0); arch:str="" # noqa: E702
|
||||
device:str; comp_tdiff:decimal.Decimal=decimal.Decimal(0); copy_tdiff:decimal.Decimal=decimal.Decimal(0); props:dict[str,Any]|None=None # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfileProgramEvent(ProfileEvent): device:str; name:str; lib:bytes|None; base:int|None # noqa: E702
|
||||
|
||||
@@ -28,7 +28,7 @@ AQL_HDR = (1 << hsa.HSA_PACKET_HEADER_BARRIER) | (hsa.HSA_FENCE_SCOPE_SYSTEM <<
|
||||
| (hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE)
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfileSQTTEvent(ProfileEvent): device:str; se:int; props:dict; blob:bytes; itrace:bool # noqa: E702
|
||||
class ProfileSQTTEvent(ProfileEvent): device:str; se:int; blob:bytes; itrace:bool # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class PMCSample: name:str; block:str; xcc:int; inst:int; se:int; sa:int; wgp:int; off:int; size:int; regsample:str # noqa: E702
|
||||
@@ -605,7 +605,7 @@ class AMDProgram(HCQProgram):
|
||||
|
||||
self.dev.allocator._copyout(sqtt_mv:=memoryview(bytearray(wptr)), buf)
|
||||
resbuf = (struct.pack('<Q', 0x11 | (4 << 13) | (0xf << 16) | (se << 24)) + bytes(sqtt_mv)) if self.dev.target[0] == 9 else bytes(sqtt_mv)
|
||||
Compiled.profile_events += [ProfileSQTTEvent(self.dev.device, se, self.dev.iface.props, resbuf, bool((SQTT_ITRACE_SE_MASK.value >> se) & 1))]
|
||||
Compiled.profile_events += [ProfileSQTTEvent(self.dev.device, se, resbuf, bool((SQTT_ITRACE_SE_MASK.value >> se) & 1))]
|
||||
return res
|
||||
|
||||
class AMDAllocator(HCQAllocator['AMDDevice']):
|
||||
@@ -999,4 +999,4 @@ class AMDDevice(HCQCompiled):
|
||||
|
||||
def on_device_hang(self): self.iface.on_device_hang()
|
||||
|
||||
def device_info(self): return self.arch
|
||||
def device_props(self): return self.iface.props
|
||||
|
||||
@@ -409,7 +409,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
for dev in HCQCompiled.peer_groups[pg]: cast(HCQAllocator, dev.allocator).map(alc)
|
||||
return self.signal_t(base_buf=HCQCompiled.signal_pool[pg].pop(), owner=self, **kwargs)
|
||||
|
||||
def device_info(self) -> str: return "" # to be overridden if needed
|
||||
def device_props(self) -> dict[str,Any]: return {} # to be overridden if needed. dict keys are backend dependent.
|
||||
|
||||
def _at_profile_finalize(self):
|
||||
self.synchronize() # Expect device to be synchronized
|
||||
@@ -424,7 +424,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
gpu2cpu_compute_time_diff = statistics.median([_sync(self, self.hw_compute_queue_t) for _ in range(40)])
|
||||
if self.hw_copy_queue_t is None: gpu2cpu_copy_time_diff = decimal.Decimal(0)
|
||||
else: gpu2cpu_copy_time_diff = statistics.median([_sync(self, self.hw_copy_queue_t) for _ in range(40)])
|
||||
Compiled.profile_events += [ProfileDeviceEvent(self.device, gpu2cpu_compute_time_diff, gpu2cpu_copy_time_diff, arch=self.device_info())]
|
||||
Compiled.profile_events += [ProfileDeviceEvent(self.device, gpu2cpu_compute_time_diff, gpu2cpu_copy_time_diff, props=self.device_props())]
|
||||
|
||||
def _wrap_timeline_signal(self):
|
||||
self.timeline_signal, self._shadow_timeline_signal, self.timeline_value = self._shadow_timeline_signal, self.timeline_signal, 1
|
||||
|
||||
Reference in New Issue
Block a user