amd: sqtt for all gfx11 (#12546)

* amd: general sqtt for gfx11

* target

* ops

* no gfx12 here
This commit is contained in:
nimlgen
2025-10-09 17:04:06 +08:00
committed by GitHub
parent a0cbbc35ad
commit a11b686c71
2 changed files with 17 additions and 15 deletions

View File

@@ -155,6 +155,7 @@ class RGP:
device_event = device_events[device]
sqtt_events = [x for x in profile if isinstance(x, ProfileSQTTEvent) and x.device == device_event.device]
if len(sqtt_events) == 0: raise RuntimeError(f"Device {device_event.device} doesn't contain SQTT data")
device_props = sqtt_events[0].props
sqtt_itrace_enabled = any([event.itrace for event in sqtt_events])
sqtt_itrace_masked = not all_same([event.itrace for event in sqtt_events])
sqtt_itrace_se_mask = functools.reduce(lambda a,b: a|b, [int(event.itrace) << event.se for event in sqtt_events], 0) if sqtt_itrace_masked else 0
@@ -192,14 +193,14 @@ class RGP:
flags=0,
trace_shader_core_clock=0x93f05080,
trace_memory_clock=0x4a723a40,
device_id=0x744c,
device_id={110000: 0x744c, 110003: 0x7480}[device_props['gfx_target_version']],
device_revision_id=0xc8,
vgprs_per_simd=1536,
sgprs_per_simd=128*16,
shader_engines=6,
compute_unit_per_shader_engine=16,
simd_per_compute_unit=2,
wavefronts_per_simd=16,
shader_engines=device_props['array_count'] // device_props['simd_arrays_per_engine'],
compute_unit_per_shader_engine=device_props['simd_count'] // device_props['simd_per_cu'] // (device_props['array_count'] // device_props['simd_arrays_per_engine']),
simd_per_compute_unit=device_props['simd_per_cu'],
wavefronts_per_simd=device_props['max_waves_per_simd'],
minimum_vgpr_alloc=4,
vgpr_alloc_granularity=8,
minimum_sgpr_alloc=128,
@@ -218,7 +219,7 @@ class RGP:
vram_bus_width=384, # 384-bit
l2_cache_size=6 * 1024 * 1024, # 6 MB
l1_cache_size=32 * 1024, # 32 KB per SIMD (?)
lds_size=65536, # 64 KB per CU
lds_size=device_props['lds_size_in_kb'] * 1024,
gpu_name=b'NAVI31',
alu_per_clock=0,
texture_per_clock=0,

View File

@@ -7,7 +7,7 @@ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, H
from tinygrad.runtime.support.hcq import MMIOInterface, BumpAllocator
from tinygrad.uop.ops import sint
from tinygrad.device import Compiled, DMAFdRef, BufferSpec, CompilerPairT
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, DEBUG, PROFILE, ProfileEvent, suppress_finalizing, lo32, hi32
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, DEBUG, PROFILE, ProfileEvent, suppress_finalizing, lo32, hi32, colored
from tinygrad.renderer.cstyle import AMDRenderer
from tinygrad.renderer.llvmir import AMDLLVMRenderer
from tinygrad.runtime.autogen import kfd, hsa, pci, sqtt
@@ -27,6 +27,9 @@ WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
AQL_HDR = (1 << hsa.HSA_PACKET_HEADER_BARRIER) | (hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) \
| (hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE)
# Profiling event carrying the raw SQTT trace captured from one SQTT buffer of an AMD device.
#   device: name of the device that produced the trace (consumers match events on this field)
#   se:     index of the SQTT buffer the blob was read from; logged as the shader engine ("SE") number
#   props:  device properties taken from the device interface (e.g. 'gfx_target_version', 'array_count')
#   blob:   trace bytes copied out of the buffer, truncated at the write pointer
#   itrace: whether this buffer's bit is set in the device's sqtt_itrace_se_mask
@dataclass(frozen=True)
class ProfileSQTTEvent(ProfileEvent): device:str; se:int; props:dict; blob:bytes; itrace:bool # noqa: E702
class AMDSignal(HCQSignal):
def __init__(self, *args, **kwargs): super().__init__(*args, **{**kwargs, 'timestamp_divider': 100})
@@ -497,9 +500,6 @@ class AMDAllocator(HCQAllocator['AMDDevice']):
def _map(self, buf:HCQBuffer): return self.dev.iface.map(buf._base if buf._base is not None else buf)
@dataclass(frozen=True)
class ProfileSQTTEvent(ProfileEvent): device:str; se:int; blob:bytes; itrace:bool # noqa: E702
@dataclass
class AMDQueueDesc:
ring: MMIOInterface
@@ -803,7 +803,7 @@ class AMDDevice(HCQCompiled):
# SQTT is disabled by default because of runtime overhead and big file sizes (~200mb to Tensor.full() two 4096x4096 tensors and matmul them)
self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0))
if self.sqtt_enabled:
if self.arch != 'gfx1100': raise RuntimeError('SQ Thread Tracing is only supported on 7900XTX')
if self.target[0] != 11: raise RuntimeError(f'SQ Thread Tracing is not supported on gc:{self.target}')
if not self.is_am() and (ppfeaturemask:=int(FileIOInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16))&0x8000:
raise RuntimeError("SQTT can't be enabled because of hardware bug, to workaround either use AMD_IFACE=PCI or add "
f"ppfeaturemask={(ppfeaturemask&~0x8000):#x} (current {ppfeaturemask=:#x} & ~PP_GFXOFF_MASK) to amdgpu module parameters\n"
@@ -871,13 +871,14 @@ class AMDDevice(HCQCompiled):
cast(AMDComputeQueue, self.hw_compute_queue_t()).sqtt_stop(len(self.sqtt_buffers), wptrs_buf) \
.signal(self.timeline_signal, self.next_timeline()).submit(self)
self.synchronize()
if DEBUG>=2: print('Saving SQTT in profile...')
if DEBUG >= 2: print(f'{self.device}: Saving SQTT in profile...')
for i,buf0 in enumerate(self.sqtt_buffers):
wptr = ((struct.unpack('<I', wptrs[i*4:i*4+4])[0] & 0x1FFFFFFF) - ((buf0.va_addr//32) & 0x1FFFFFFF)) * 32
if DEBUG>=2: print(f'Se {i} blob size {wptr:#x}')
if DEBUG >= 2: print(f'\t{self.device}: SE {i} blob size {wptr:#x}')
assert wptr >= 0 and wptr <= buf0.size, f"{wptr} > {buf0.size}, should never happen"
# When sqtt buffer overflows, wptr stops at the last dword
if wptr >= buf0.size-32: print(f"WARNING: SQTT BUFFER IS FULL (SE {i})! INCREASE SQTT BUFFER SIZE WITH SQTT_BUFFER_SIZE=X (in MB)")
if wptr >= buf0.size - 32:
print(colored(f"{self.device}: Warning: SQTT buffer is full (SE {i})! Increase SQTT buffer with SQTT_BUFFER_SIZE=X (in MB)", "yellow"))
self.allocator._copyout(sqtt_buf:=memoryview(bytearray(wptr)), buf0)
Compiled.profile_events += [ProfileSQTTEvent(self.device, i, bytes(sqtt_buf), bool((self.sqtt_itrace_se_mask >> i) & 0b1))]
Compiled.profile_events += [ProfileSQTTEvent(self.device, i, self.iface.props, bytes(sqtt_buf), bool((self.sqtt_itrace_se_mask >> i) & 0b1))]
super()._at_profile_finalize()