mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-08 22:48:25 -05:00
amd: sqtt for all gfx11 (#12546)
* amd: general sqtt for gfx11
* target
* ops
* no gfx12 here
This commit is contained in:
@@ -155,6 +155,7 @@ class RGP:
|
||||
device_event = device_events[device]
|
||||
sqtt_events = [x for x in profile if isinstance(x, ProfileSQTTEvent) and x.device == device_event.device]
|
||||
if len(sqtt_events) == 0: raise RuntimeError(f"Device {device_event.device} doesn't contain SQTT data")
|
||||
device_props = sqtt_events[0].props
|
||||
sqtt_itrace_enabled = any([event.itrace for event in sqtt_events])
|
||||
sqtt_itrace_masked = not all_same([event.itrace for event in sqtt_events])
|
||||
sqtt_itrace_se_mask = functools.reduce(lambda a,b: a|b, [int(event.itrace) << event.se for event in sqtt_events], 0) if sqtt_itrace_masked else 0
|
||||
@@ -192,14 +193,14 @@ class RGP:
|
||||
flags=0,
|
||||
trace_shader_core_clock=0x93f05080,
|
||||
trace_memory_clock=0x4a723a40,
|
||||
device_id=0x744c,
|
||||
device_id={110000: 0x744c, 110003: 0x7480}[device_props['gfx_target_version']],
|
||||
device_revision_id=0xc8,
|
||||
vgprs_per_simd=1536,
|
||||
sgprs_per_simd=128*16,
|
||||
shader_engines=6,
|
||||
compute_unit_per_shader_engine=16,
|
||||
simd_per_compute_unit=2,
|
||||
wavefronts_per_simd=16,
|
||||
shader_engines=device_props['array_count'] // device_props['simd_arrays_per_engine'],
|
||||
compute_unit_per_shader_engine=device_props['simd_count'] // device_props['simd_per_cu'] // (device_props['array_count'] // device_props['simd_arrays_per_engine']),
|
||||
simd_per_compute_unit=device_props['simd_per_cu'],
|
||||
wavefronts_per_simd=device_props['max_waves_per_simd'],
|
||||
minimum_vgpr_alloc=4,
|
||||
vgpr_alloc_granularity=8,
|
||||
minimum_sgpr_alloc=128,
|
||||
@@ -218,7 +219,7 @@ class RGP:
|
||||
vram_bus_width=384, # 384-bit
|
||||
l2_cache_size=6 * 1024 * 1024, # 6 MB
|
||||
l1_cache_size=32 * 1024, # 32 KB per SIMD (?)
|
||||
lds_size=65536, # 64 KB per CU
|
||||
lds_size=device_props['lds_size_in_kb'] * 1024,
|
||||
gpu_name=b'NAVI31',
|
||||
alu_per_clock=0,
|
||||
texture_per_clock=0,
|
||||
|
||||
@@ -7,7 +7,7 @@ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, H
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface, BumpAllocator
|
||||
from tinygrad.uop.ops import sint
|
||||
from tinygrad.device import Compiled, DMAFdRef, BufferSpec, CompilerPairT
|
||||
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, DEBUG, PROFILE, ProfileEvent, suppress_finalizing, lo32, hi32
|
||||
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, DEBUG, PROFILE, ProfileEvent, suppress_finalizing, lo32, hi32, colored
|
||||
from tinygrad.renderer.cstyle import AMDRenderer
|
||||
from tinygrad.renderer.llvmir import AMDLLVMRenderer
|
||||
from tinygrad.runtime.autogen import kfd, hsa, pci, sqtt
|
||||
@@ -27,6 +27,9 @@ WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
|
||||
AQL_HDR = (1 << hsa.HSA_PACKET_HEADER_BARRIER) | (hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) \
|
||||
| (hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE)
|
||||
|
||||
@dataclass(frozen=True)
|
||||
# Profile event carrying the SQTT trace data read back from one SQTT buffer of one device.
#   device: name of the device that produced the trace (consumers filter events by this field)
#   se:     index of the SQTT buffer the blob was copied from (reported as the "SE" number in logs)
#   props:  device properties stored with the event; the RGP writer reads keys such as
#           'gfx_target_version', 'array_count', 'simd_count' and 'lds_size_in_kb' from it
#   blob:   bytes copied out of the SQTT buffer
#   itrace: itrace flag for this SE (taken from bit `se` of the device's itrace SE mask at the emit site)
class ProfileSQTTEvent(ProfileEvent): device:str; se:int; props:dict; blob:bytes; itrace:bool # noqa: E702
|
||||
|
||||
class AMDSignal(HCQSignal):
|
||||
# Forward all arguments to the parent initializer, always passing timestamp_divider=100.
# The key is merged in last, so it replaces any timestamp_divider supplied by the caller.
def __init__(self, *args, **kwargs): super().__init__(*args, **{**kwargs, 'timestamp_divider': 100})
|
||||
|
||||
@@ -497,9 +500,6 @@ class AMDAllocator(HCQAllocator['AMDDevice']):
|
||||
|
||||
# Map `buf` through the device interface. When the buffer has a `_base` set, the base buffer is mapped
# instead of `buf` itself. Returns whatever `self.dev.iface.map` returns.
def _map(self, buf:HCQBuffer): return self.dev.iface.map(buf._base if buf._base is not None else buf)
|
||||
|
||||
@dataclass(frozen=True)
|
||||
# Profile event carrying the SQTT trace data read back from one SQTT buffer of one device.
#   device: name of the device that produced the trace
#   se:     index of the SQTT buffer the blob was copied from
#   blob:   bytes copied out of the SQTT buffer
#   itrace: itrace flag for this SE (taken from bit `se` of the device's itrace SE mask at the emit site)
class ProfileSQTTEvent(ProfileEvent): device:str; se:int; blob:bytes; itrace:bool # noqa: E702
|
||||
|
||||
@dataclass
|
||||
class AMDQueueDesc:
|
||||
ring: MMIOInterface
|
||||
@@ -803,7 +803,7 @@ class AMDDevice(HCQCompiled):
|
||||
# SQTT is disabled by default because of its runtime overhead and large output files (~200 MB for creating two 4096x4096 tensors with Tensor.full() and matmul-ing them)
|
||||
self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0))
|
||||
if self.sqtt_enabled:
|
||||
if self.arch != 'gfx1100': raise RuntimeError('SQ Thread Tracing is only supported on 7900XTX')
|
||||
if self.target[0] != 11: raise RuntimeError(f'SQ Thread Tracing is not supported on gc:{self.target}')
|
||||
if not self.is_am() and (ppfeaturemask:=int(FileIOInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16))&0x8000:
|
||||
raise RuntimeError("SQTT can't be enabled because of hardware bug, to workaround either use AMD_IFACE=PCI or add "
|
||||
f"ppfeaturemask={(ppfeaturemask&~0x8000):#x} (current {ppfeaturemask=:#x} & ~PP_GFXOFF_MASK) to amdgpu module parameters\n"
|
||||
@@ -871,13 +871,14 @@ class AMDDevice(HCQCompiled):
|
||||
cast(AMDComputeQueue, self.hw_compute_queue_t()).sqtt_stop(len(self.sqtt_buffers), wptrs_buf) \
|
||||
.signal(self.timeline_signal, self.next_timeline()).submit(self)
|
||||
self.synchronize()
|
||||
if DEBUG>=2: print('Saving SQTT in profile...')
|
||||
if DEBUG >= 2: print(f'{self.device}: Saving SQTT in profile...')
|
||||
for i,buf0 in enumerate(self.sqtt_buffers):
|
||||
wptr = ((struct.unpack('<I', wptrs[i*4:i*4+4])[0] & 0x1FFFFFFF) - ((buf0.va_addr//32) & 0x1FFFFFFF)) * 32
|
||||
if DEBUG>=2: print(f'Se {i} blob size {wptr:#x}')
|
||||
if DEBUG >= 2: print(f'\t{self.device}: SE {i} blob size {wptr:#x}')
|
||||
assert wptr >= 0 and wptr <= buf0.size, f"{wptr} > {buf0.size}, should never happen"
|
||||
# When sqtt buffer overflows, wptr stops at the last dword
|
||||
if wptr >= buf0.size-32: print(f"WARNING: SQTT BUFFER IS FULL (SE {i})! INCREASE SQTT BUFFER SIZE WITH SQTT_BUFFER_SIZE=X (in MB)")
|
||||
if wptr >= buf0.size - 32:
|
||||
print(colored(f"{self.device}: Warning: SQTT buffer is full (SE {i})! Increase SQTT buffer with SQTT_BUFFER_SIZE=X (in MB)", "yellow"))
|
||||
self.allocator._copyout(sqtt_buf:=memoryview(bytearray(wptr)), buf0)
|
||||
Compiled.profile_events += [ProfileSQTTEvent(self.device, i, bytes(sqtt_buf), bool((self.sqtt_itrace_se_mask >> i) & 0b1))]
|
||||
Compiled.profile_events += [ProfileSQTTEvent(self.device, i, self.iface.props, bytes(sqtt_buf), bool((self.sqtt_itrace_se_mask >> i) & 0b1))]
|
||||
super()._at_profile_finalize()
|
||||
|
||||
Reference in New Issue
Block a user