amd: sqtt on gfx12 (#12564)

* amd: sqtt on gfx12

* cleaner

* thi

* and this

* ops

* ugh

* back

* rm this

* rm
This commit is contained in:
nimlgen
2025-10-10 17:54:14 +08:00
committed by GitHub
parent 95ad047445
commit 89be3590aa
7 changed files with 67 additions and 36 deletions

View File

@@ -435,8 +435,8 @@ generate_sqtt() {
-o extra/sqtt/rocprof/rocprof.py
fixup extra/sqtt/rocprof/rocprof.py
sed -i '1s/^/# pylint: skip-file\n/' extra/sqtt/rocprof/rocprof.py
sed -i "s/import ctypes/import ctypes, tinygrad.helpers.fetch as tgfetch/g" extra/sqtt/rocprof/rocprof.py
sed -i "s|FunctionFactoryStub()|ctypes.CDLL(str(tgfetch('https://github.com/ROCm/rocprof-trace-decoder/raw/5420409ad0963b2d76450add067b9058493ccbd0/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so')))|g" extra/sqtt/rocprof/rocprof.py
sed -i "s/import ctypes/import ctypes\nfrom tinygrad.helpers import fetch/g" extra/sqtt/rocprof/rocprof.py
sed -i "s|FunctionFactoryStub()|ctypes.CDLL(str(fetch('https://github.com/ROCm/rocprof-trace-decoder/raw/5420409ad0963b2d76450add067b9058493ccbd0/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so')))|g" extra/sqtt/rocprof/rocprof.py
}
generate_webgpu() {

View File

@@ -156,6 +156,9 @@ class RGP:
sqtt_events = [x for x in profile if isinstance(x, ProfileSQTTEvent) and x.device == device_event.device]
if len(sqtt_events) == 0: raise RuntimeError(f"Device {device_event.device} doesn't contain SQTT data")
device_props = sqtt_events[0].props
gfx_ver = device_props['gfx_target_version'] // 10000
gfx_iplvl = getattr(sqtt, f"SQTT_GFXIP_LEVEL_GFXIP_{device_props['gfx_target_version']//10000}_{(device_props['gfx_target_version']//100)%100}",
getattr(sqtt, f"SQTT_GFXIP_LEVEL_GFXIP_{device_props['gfx_target_version']//10000}", None))
sqtt_itrace_enabled = any([event.itrace for event in sqtt_events])
sqtt_itrace_masked = not all_same([event.itrace for event in sqtt_events])
sqtt_itrace_se_mask = functools.reduce(lambda a,b: a|b, [int(event.itrace) << event.se for event in sqtt_events], 0) if sqtt_itrace_masked else 0
@@ -193,7 +196,7 @@ class RGP:
flags=0,
trace_shader_core_clock=0x93f05080,
trace_memory_clock=0x4a723a40,
device_id={110000: 0x744c, 110003: 0x7480}[device_props['gfx_target_version']],
device_id={110000: 0x744c, 110003: 0x7480, 120001: 0x7550}[device_props['gfx_target_version']],
device_revision_id=0xc8,
vgprs_per_simd=1536,
sgprs_per_simd=128*16,
@@ -207,7 +210,7 @@ class RGP:
sgpr_alloc_granularity=128,
hardware_contexts=8,
gpu_type=sqtt.SQTT_GPU_TYPE_DISCRETE,
gfxip_level=sqtt.SQTT_GFXIP_LEVEL_GFXIP_11_0,
gfxip_level=gfx_iplvl,
gpu_index=0,
gds_size=0,
gds_per_shader_engine=0,
@@ -258,7 +261,7 @@ class RGP:
major_version=0, minor_version=2,
),
shader_engine_index=sqtt_event.se,
sqtt_version=sqtt.SQTT_VERSION_3_2,
sqtt_version={11: sqtt.SQTT_VERSION_3_2, 12: sqtt.SQTT_VERSION_3_3}.get(gfx_ver),
_0=sqtt.union_sqtt_file_chunk_sqtt_desc_0(
v1=sqtt.struct_sqtt_file_chunk_sqtt_desc_0_v1(
instrumentation_spec_version=1,

View File

@@ -20,14 +20,15 @@ class InstInfo:
class _ROCParseCtx:
def __init__(self, sqtt_evs:list[ProfileSQTTEvent], prog_evs:list[ProfileProgramEvent]):
self.sqtt_evs, self.prog_evs = iter(sqtt_evs), prog_evs
self.wave_events = {}
self.wave_events, self.disasms, self.addr2prg = {}, {}, {}
for prog in prog_evs:
for addr, info in comgr_get_address_table(prog.lib).items():
self.disasms[prog.base + addr] = info
self.addr2prg[prog.base + addr] = prog
def next_sqtt(self): return next(self.sqtt_evs, None)
def find_program(self, idx): return self.prog_evs[idx]
def get_instr_info(self, idx, exec_addr): return self.disasm_program(idx)[exec_addr - self.find_program(idx).base]
@functools.lru_cache(None)
def disasm_program(self, idx): return comgr_get_address_table(self.find_program(idx).lib)
def find_program(self, addr): return self.addr2prg[addr]
def on_occupancy_ev(self, ev):
if DEBUG >= 4: print("OCC", ev.time, ev.cu, ev.simd, ev.wave_id, ev.start)
@@ -39,10 +40,10 @@ class _ROCParseCtx:
for j in range(ev.instructions_size):
inst_ev = ev.instructions_array[j]
inst_typ = rocprof.rocprofiler_thread_trace_decoder_inst_category_t__enumvalues[inst_ev.category]
asm.setdefault(inst_ev.pc.address, InstInfo(typ=inst_typ, inst=self.get_instr_info(inst_ev.pc.code_object_id, inst_ev.pc.address)[0]))
asm.setdefault(inst_ev.pc.address, InstInfo(typ=inst_typ, inst=self.disasms[inst_ev.pc.address][0]))
asm[inst_ev.pc.address].on_ev(inst_ev)
self.wave_events[(self.find_program(ev.instructions_array[0].pc.code_object_id).name, ev.wave_id, ev.cu, ev.simd)] = asm
self.wave_events[(self.find_program(ev.instructions_array[0].pc.address).name, ev.wave_id, ev.cu, ev.simd)] = asm
if __name__ == "__main__":
parser = argparse.ArgumentParser()
@@ -78,7 +79,7 @@ if __name__ == "__main__":
@rocprof.rocprof_trace_decoder_isa_callback_t
def isa_cb(instr_ptr, mem_size_ptr, size_ptr, pc, data_ptr):
instr, mem_size_ptr[0] = ROCParseCtx.get_instr_info(pc.code_object_id, pc.address)
instr, mem_size_ptr[0] = ROCParseCtx.disasms[pc.address]
# this is the number of bytes to next instruction, set to 0 for end_pgm
if instr == "s_endpgm": mem_size_ptr[0] = 0

View File

@@ -7,7 +7,8 @@
# POINTER_SIZE is: 8
# LONGDOUBLE_SIZE is: 16
#
import ctypes, tinygrad.helpers.fetch as tgfetch
import ctypes
from tinygrad.helpers import fetch
class AsDictMixin:
@@ -155,7 +156,7 @@ class FunctionFactoryStub:
# You can either re-run clang2py with -l /path/to/library.so
# Or manually fix this by commenting out the ctypes.CDLL loading
_libraries = {}
_libraries['FIXME_STUB'] = ctypes.CDLL(str(tgfetch('https://github.com/ROCm/rocprof-trace-decoder/raw/5420409ad0963b2d76450add067b9058493ccbd0/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so'))) # ctypes.CDLL('FIXME_STUB')
_libraries['FIXME_STUB'] = ctypes.CDLL(str(fetch('https://github.com/ROCm/rocprof-trace-decoder/raw/5420409ad0963b2d76450add067b9058493ccbd0/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so'))) # ctypes.CDLL('FIXME_STUB')

View File

@@ -43,6 +43,7 @@ enum sqtt_version
SQTT_VERSION_2_3 = 0x6, /* GFX9 */
SQTT_VERSION_2_4 = 0x7, /* GFX10+ */
SQTT_VERSION_3_2 = 0xb, /* GFX11+ */
SQTT_VERSION_3_3 = 0xc, /* GFX12+ */
};
enum sqtt_file_chunk_type
@@ -144,6 +145,8 @@ enum sqtt_gfxip_level
SQTT_GFXIP_LEVEL_GFXIP_10_1 = 0x7,
SQTT_GFXIP_LEVEL_GFXIP_10_3 = 0x9,
SQTT_GFXIP_LEVEL_GFXIP_11_0 = 0xc,
SQTT_GFXIP_LEVEL_GFXIP_11_5 = 0xd,
SQTT_GFXIP_LEVEL_GFXIP_12 = 0x10,
};
enum sqtt_memory_type
@@ -427,6 +430,8 @@ enum elf_gfxip_level
EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033,
EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
EF_AMDGPU_MACH_AMDGCN_GFX1150 = 0x043,
EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x04e,
};
struct sqtt_file_chunk_spm_db {

View File

@@ -174,12 +174,14 @@ sqtt_version__enumvalues = {
6: 'SQTT_VERSION_2_3',
7: 'SQTT_VERSION_2_4',
11: 'SQTT_VERSION_3_2',
12: 'SQTT_VERSION_3_3',
}
SQTT_VERSION_NONE = 0
SQTT_VERSION_2_2 = 5
SQTT_VERSION_2_3 = 6
SQTT_VERSION_2_4 = 7
SQTT_VERSION_3_2 = 11
SQTT_VERSION_3_3 = 12
sqtt_version = ctypes.c_uint32 # enum
# values for enumeration 'sqtt_file_chunk_type'
@@ -336,6 +338,8 @@ sqtt_gfxip_level__enumvalues = {
7: 'SQTT_GFXIP_LEVEL_GFXIP_10_1',
9: 'SQTT_GFXIP_LEVEL_GFXIP_10_3',
12: 'SQTT_GFXIP_LEVEL_GFXIP_11_0',
13: 'SQTT_GFXIP_LEVEL_GFXIP_11_5',
16: 'SQTT_GFXIP_LEVEL_GFXIP_12',
}
SQTT_GFXIP_LEVEL_NONE = 0
SQTT_GFXIP_LEVEL_GFXIP_6 = 1
@@ -346,6 +350,8 @@ SQTT_GFXIP_LEVEL_GFXIP_9 = 5
SQTT_GFXIP_LEVEL_GFXIP_10_1 = 7
SQTT_GFXIP_LEVEL_GFXIP_10_3 = 9
SQTT_GFXIP_LEVEL_GFXIP_11_0 = 12
SQTT_GFXIP_LEVEL_GFXIP_11_5 = 13
SQTT_GFXIP_LEVEL_GFXIP_12 = 16
sqtt_gfxip_level = ctypes.c_uint32 # enum
# values for enumeration 'sqtt_memory_type'
@@ -806,12 +812,16 @@ elf_gfxip_level__enumvalues = {
51: 'EF_AMDGPU_MACH_AMDGCN_GFX1010',
54: 'EF_AMDGPU_MACH_AMDGCN_GFX1030',
65: 'EF_AMDGPU_MACH_AMDGCN_GFX1100',
67: 'EF_AMDGPU_MACH_AMDGCN_GFX1150',
78: 'EF_AMDGPU_MACH_AMDGCN_GFX1200',
}
EF_AMDGPU_MACH_AMDGCN_GFX801 = 40
EF_AMDGPU_MACH_AMDGCN_GFX900 = 44
EF_AMDGPU_MACH_AMDGCN_GFX1010 = 51
EF_AMDGPU_MACH_AMDGCN_GFX1030 = 54
EF_AMDGPU_MACH_AMDGCN_GFX1100 = 65
EF_AMDGPU_MACH_AMDGCN_GFX1150 = 67
EF_AMDGPU_MACH_AMDGCN_GFX1200 = 78
elf_gfxip_level = ctypes.c_uint32 # enum
class struct_sqtt_file_chunk_spm_db(Structure):
pass
@@ -1607,7 +1617,8 @@ __all__ = \
'ApiCmdUpdateBuffer', 'ApiCmdWaitEvents', 'ApiCmdWriteTimestamp',
'ApiInvalid', 'ApiRayTracingSeparateCompiled',
'EF_AMDGPU_MACH_AMDGCN_GFX1010', 'EF_AMDGPU_MACH_AMDGCN_GFX1030',
'EF_AMDGPU_MACH_AMDGCN_GFX1100', 'EF_AMDGPU_MACH_AMDGCN_GFX801',
'EF_AMDGPU_MACH_AMDGCN_GFX1100', 'EF_AMDGPU_MACH_AMDGCN_GFX1150',
'EF_AMDGPU_MACH_AMDGCN_GFX1200', 'EF_AMDGPU_MACH_AMDGCN_GFX801',
'EF_AMDGPU_MACH_AMDGCN_GFX900', 'EventCmdBlitImage',
'EventCmdBuildAccelerationStructuresIndirectKHR',
'EventCmdBuildAccelerationStructuresKHR',
@@ -1671,7 +1682,8 @@ __all__ = \
'SQTT_FILE_CHUNK_TYPE_SQTT_DESC', 'SQTT_FILE_MAGIC_NUMBER',
'SQTT_FILE_VERSION_MAJOR', 'SQTT_FILE_VERSION_MINOR',
'SQTT_GFXIP_LEVEL_GFXIP_10_1', 'SQTT_GFXIP_LEVEL_GFXIP_10_3',
'SQTT_GFXIP_LEVEL_GFXIP_11_0', 'SQTT_GFXIP_LEVEL_GFXIP_6',
'SQTT_GFXIP_LEVEL_GFXIP_11_0', 'SQTT_GFXIP_LEVEL_GFXIP_11_5',
'SQTT_GFXIP_LEVEL_GFXIP_12', 'SQTT_GFXIP_LEVEL_GFXIP_6',
'SQTT_GFXIP_LEVEL_GFXIP_7', 'SQTT_GFXIP_LEVEL_GFXIP_8',
'SQTT_GFXIP_LEVEL_GFXIP_8_1', 'SQTT_GFXIP_LEVEL_GFXIP_9',
'SQTT_GFXIP_LEVEL_NONE', 'SQTT_GPU_NAME_MAX_SIZE',
@@ -1697,9 +1709,9 @@ __all__ = \
'SQTT_QUEUE_TYPE_COMPUTE', 'SQTT_QUEUE_TYPE_DMA',
'SQTT_QUEUE_TYPE_UNIVERSAL', 'SQTT_QUEUE_TYPE_UNKNOWN',
'SQTT_SA_PER_SE', 'SQTT_VERSION_2_2', 'SQTT_VERSION_2_3',
'SQTT_VERSION_2_4', 'SQTT_VERSION_3_2', 'SQTT_VERSION_NONE',
'UserEventObjectName', 'UserEventPop', 'UserEventPush',
'UserEventTrigger', 'elf_gfxip_level',
'SQTT_VERSION_2_4', 'SQTT_VERSION_3_2', 'SQTT_VERSION_3_3',
'SQTT_VERSION_NONE', 'UserEventObjectName', 'UserEventPop',
'UserEventPush', 'UserEventTrigger', 'elf_gfxip_level',
'rgp_sqtt_marker_event_type', 'rgp_sqtt_marker_general_api_type',
'rgp_sqtt_marker_identifier', 'rgp_sqtt_marker_user_event_type',
'sqtt_api_type', 'sqtt_engine_type',

View File

@@ -130,8 +130,9 @@ class AMDComputeQueue(HWQueue):
self.wreg(self.gc.regSQ_THREAD_TRACE_USERDATA_2, *data_ints[i:i+2])
def sqtt_config(self, tracing:bool):
self.wreg(self.gc.regSQ_THREAD_TRACE_CTRL, draw_event_en=1, spi_stall_en=1, sq_stall_en=1, reg_at_hwm=2, hiwater=1,
rt_freq=self.soc.SQ_TT_RT_FREQ_4096_CLK, util_timer=self.soc.SQ_TT_UTIL_TIMER_250_CLK, mode=int(tracing))
trace_ctrl = {'rt_freq': self.soc.SQ_TT_RT_FREQ_4096_CLK} if self.dev.target < (12,0,0) else {}
self.wreg(self.gc.regSQ_THREAD_TRACE_CTRL, draw_event_en=1, spi_stall_en=1, sq_stall_en=1, reg_at_hwm=2, hiwater=1, util_timer=1,
mode=int(tracing), **trace_ctrl)
# Magic values from mesa/src/amd/vulkan/radv_sqtt.c:radv_emit_spi_config_cntl and src/amd/common/ac_sqtt.c:ac_sqtt_emit_start
def sqtt_start(self, buf0s:list[HCQBuffer], se_mask:int):
@@ -140,24 +141,35 @@ class AMDComputeQueue(HWQueue):
# One buffer for one SE, mesa does it with a single buffer and ac_sqtt_get_data_offset, but this is simpler and should work just as well
for se in range(len(buf0s)):
self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, instance_broadcast_writes=1)
buf0_lo, buf0_hi = data64_le(buf0s[se].va_addr>>12)
self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, base_hi=buf0_hi, size=buf0s[se].size>>12)
self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE, base_lo=buf0_lo)
buf0_lo, buf0_hi = data64_le(buf0s[se].va_addr >> 12)
if self.dev.target >= (12,0,0):
self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, size=buf0s[se].size >> 12)
self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE_LO, base_lo=buf0_lo)
self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE_HI, base_hi=buf0_hi)
else:
self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, base_hi=buf0_hi, size=buf0s[se].size >> 12)
self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE, base_lo=buf0_lo)
# NOTE: SQTT can only trace instructions on one simd per se, this selects first simd in first wgp in first sa.
# For RGP to display instruction trace it has to see it on first SE. However ACE/MEC/whatever does the dispatching starting with second se,
# and on amdgpu/non-AM it also does weird things with dispatch order inside se: around 7 times out of 10 it starts from the last cu, but
# sometimes not, especially if the kernel has more than one wavefront which means that kernels with small global size might get unlucky and
# be dispatched on something else and not be seen in instruction tracing tab. You can force the wavefronts of a kernel to be dispatched on the
# CUs you want to by disabling other CUs via bits in regCOMPUTE_STATIC_THREAD_MGMT_SE<x> and trace even kernels that only have one wavefront.
self.wreg(self.gc.regSQ_THREAD_TRACE_MASK, wtype_include=self.soc.SQ_TT_WTYPE_INCLUDE_CS_BIT, simd_sel=0, wgp_sel=0, sa_sel=0)
cs_wtype = (1 << 6) if self.dev.target >= (12,0,0) else self.soc.SQ_TT_WTYPE_INCLUDE_CS_BIT
self.wreg(self.gc.regSQ_THREAD_TRACE_MASK, wtype_include=cs_wtype, simd_sel=0, wgp_sel=0, sa_sel=0)
reg_include = self.soc.SQ_TT_TOKEN_MASK_SQDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_SHDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_GFXUDEC_BIT | \
self.soc.SQ_TT_TOKEN_MASK_COMP_BIT | self.soc.SQ_TT_TOKEN_MASK_CONTEXT_BIT
token_exclude = 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT
token_exclude = (1 << self.soc.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT) if self.dev.target < (12,0,0) else 0
# disable tracing
if not (se_mask >> se) & 0b1:
token_exclude |= 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \
# gfx12 doesn't have enums with all fields, so it's hardcoded, but it's the same as gfx11.
token_exclude |= (1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \
1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VALUINST_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_IMMEDIATE_SHIFT | \
1 << self.soc.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT
self.wreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK, reg_include=reg_include, token_exclude=token_exclude, bop_events_token_include=1)
1 << self.soc.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT) if self.dev.target < (12,0,0) else 0x927
token_mask = {} if self.dev.target < (12,0,0) else {'exclude_barrier_wait': 1}
self.wreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK, reg_include=reg_include, token_exclude=token_exclude, bop_events_token_include=1, **token_mask)
# Enable SQTT
self.sqtt_config(tracing=True)
# Restore global broadcasting
@@ -178,9 +190,6 @@ class AMDComputeQueue(HWQueue):
# Wait for FINISH_PENDING==0
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ),
self.gc.regSQ_THREAD_TRACE_STATUS.addr[0], 0, 0, self.gc.regSQ_THREAD_TRACE_STATUS.fields_mask('finish_pending'), 4)
# Wait for FINISH_DONE!=0
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_NEQ),
self.gc.regSQ_THREAD_TRACE_STATUS.addr[0], 0, 0, self.gc.regSQ_THREAD_TRACE_STATUS.fields_mask('finish_done'), 4)
# Disable SQTT
self.sqtt_config(tracing=False)
# Wait for BUSY==0
@@ -804,7 +813,7 @@ class AMDDevice(HCQCompiled):
# SQTT is disabled by default because of runtime overhead and big file sizes (~200mb to Tensor.full() two 4096x4096 tensors and matmul them)
self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0))
if self.sqtt_enabled:
if self.target[0] != 11: raise RuntimeError(f'SQ Thread Tracing is not supported on gc:{self.target}')
if self.target[0] < 11: raise RuntimeError(f'SQ Thread Tracing is not supported on gc:{self.target}')
if not self.is_am() and (ppfeaturemask:=int(FileIOInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16))&0x8000:
raise RuntimeError("SQTT can't be enabled because of hardware bug, to workaround either use AMD_IFACE=PCI or add "
f"ppfeaturemask={(ppfeaturemask&~0x8000):#x} (current {ppfeaturemask=:#x} & ~PP_GFXOFF_MASK) to amdgpu module parameters\n"
@@ -872,7 +881,7 @@ class AMDDevice(HCQCompiled):
self.synchronize()
if DEBUG >= 2: print(f'{self.device}: Saving SQTT in profile...')
for i,buf0 in enumerate(self.sqtt_buffers):
wptr = ((struct.unpack('<I', wptrs[i*4:i*4+4])[0] & 0x1FFFFFFF) - ((buf0.va_addr//32) & 0x1FFFFFFF)) * 32
wptr = ((struct.unpack('<I', wptrs[i*4:i*4+4])[0] & 0x1FFFFFFF) - (((buf0.va_addr//32) & 0x1FFFFFFF) if self.target < (12,0,0) else 0)) * 32
if DEBUG >= 2: print(f'\t{self.device}: SE {i} blob size {wptr:#x}')
assert wptr >= 0 and wptr <= buf0.size, f"{wptr} > {buf0.size}, should never happen"
# When sqtt buffer overflows, wptr stops at the last dword