mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
amd: correct sqtt_read for several xccs (#13075)
* amd: correct sqtt_read for several xccs * default mask
This commit is contained in:
@@ -287,17 +287,18 @@ class AMDComputeQueue(HWQueue):
|
||||
|
||||
# For each SE wait for finish to complete and copy regSQ_THREAD_TRACE_WPTR to know where in the buffer trace data ends
|
||||
for se in range(ses):
|
||||
self.set_grbm(se=se, sh=0)
|
||||
with self.pred_exec(xcc_mask=1<<(se // (ses_per_xcc:=(self.dev.se_cnt // self.dev.xccs)))):
|
||||
self.set_grbm(se=se % ses_per_xcc, sh=0)
|
||||
|
||||
status_reg = self.gc.regSQ_THREAD_TRACE_STATUS.addr[0] - (self.pm4.PACKET3_SET_UCONFIG_REG_START if self.dev.target[0] == 9 else 0)
|
||||
if self.dev.target >= (10, 0, 0):
|
||||
self.wait_reg_mem(reg=status_reg, mask=self.gc.regSQ_THREAD_TRACE_STATUS.fields_mask('finish_pending'), op=WAIT_REG_MEM_FUNCTION_EQ, value=0)
|
||||
self.sqtt_config(tracing=False)
|
||||
self.wait_reg_mem(reg=status_reg, mask=self.gc.regSQ_THREAD_TRACE_STATUS.fields_mask('busy'), op=WAIT_REG_MEM_FUNCTION_EQ, value=0)
|
||||
self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.CS_PARTIAL_FLUSH) | self.pm4.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
|
||||
regstatus = self.gc.regSQ_THREAD_TRACE_STATUS.addr[0] - (self.pm4.PACKET3_SET_UCONFIG_REG_START if self.dev.target[0] == 9 else 0)
|
||||
if self.dev.target >= (10,0,0):
|
||||
self.wait_reg_mem(reg=regstatus, mask=self.gc.regSQ_THREAD_TRACE_STATUS.fields_mask('finish_pending'), op=WAIT_REG_MEM_FUNCTION_EQ, value=0)
|
||||
self.sqtt_config(tracing=False)
|
||||
self.wait_reg_mem(reg=regstatus, mask=self.gc.regSQ_THREAD_TRACE_STATUS.fields_mask('busy'), op=WAIT_REG_MEM_FUNCTION_EQ, value=0)
|
||||
self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.CS_PARTIAL_FLUSH) | self.pm4.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
|
||||
|
||||
# Copy WPTR to memory (src_sel = perf, dst_sel = tc_l2, wr_confirm = True)
|
||||
self.pkt3(self.pm4.PACKET3_COPY_DATA, 1 << 20 | 2 << 8 | 4, self.gc.regSQ_THREAD_TRACE_WPTR.addr[0], 0, *data64_le(wptrs.va_addr+(se*4)))
|
||||
# Copy WPTR to memory (src_sel = perf, dst_sel = tc_l2, wr_confirm = True)
|
||||
self.pkt3(self.pm4.PACKET3_COPY_DATA, 1 << 20 | 2 << 8 | 4, self.gc.regSQ_THREAD_TRACE_WPTR.addr[0], 0, *data64_le(wptrs.va_addr+(se*4)))
|
||||
|
||||
self.set_grbm()
|
||||
if self.dev.target[0] > 9: self.spi_config(tracing=False)
|
||||
@@ -928,7 +929,8 @@ class AMDDevice(HCQCompiled):
|
||||
|
||||
SQTT_BUFFER_SIZE = getenv("SQTT_BUFFER_SIZE", 256) # in mb, per shader engine
|
||||
self.sqtt_buffers = [self.allocator.alloc(SQTT_BUFFER_SIZE << 20, BufferSpec(nolru=True, uncached=True)) for _ in range(self.se_cnt)]
|
||||
self.sqtt_itrace_se_mask = getenv("SQTT_ITRACE_SE_MASK", -1 if SQTT >= 2 else (1 << 1)) # se bitmask: -1 enable all, 0 disable all
|
||||
default_mask = functools.reduce(int.__or__, (1<<i for i in range(self.se_cnt) if self.target[0] > 9 or i % 2 == 0)) if SQTT >= 2 else (1 << 1)
|
||||
self.sqtt_itrace_se_mask = getenv("SQTT_ITRACE_SE_MASK", default_mask)
|
||||
self.sqtt_next_cmd_id = itertools.count(0)
|
||||
cast(AMDComputeQueue, self.hw_compute_queue_t()).sqtt_start(self.sqtt_buffers, self.sqtt_itrace_se_mask).submit(self)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user