Mirror of https://github.com/tinygrad/tinygrad.git, synced 2026-02-09 22:26:26 -05:00
amd: retire pm4 xcc sync (#11835)

* amd: aql default when several xccs
* amd: retire pm4 xcc sync
* remove more
* more
* more
@@ -7,8 +7,7 @@ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, H
 from tinygrad.runtime.support.hcq import MMIOInterface, BumpAllocator
 from tinygrad.uop.ops import sint
 from tinygrad.device import Compiled, DMAFdRef, BufferSpec
-from tinygrad.helpers import getenv, to_mv, round_up, data64_le, all_same, flatten, DEBUG, AMD_LLVM, PROFILE, ProfileEvent, suppress_finalizing
-from tinygrad.helpers import lo32, hi32
+from tinygrad.helpers import getenv, to_mv, round_up, data64_le, DEBUG, AMD_LLVM, PROFILE, ProfileEvent, suppress_finalizing, lo32, hi32
 from tinygrad.renderer.cstyle import AMDRenderer
 from tinygrad.renderer.llvmir import AMDLLVMRenderer
 from tinygrad.runtime.autogen import kfd, hsa, pci, sqtt
@@ -109,17 +108,6 @@ class AMDComputeQueue(HWQueue):
     self.pkt3(self.pm4.PACKET3_RELEASE_MEM, event_dw | cache_flags_dw, memsel_dw, *data64_le(address), *data64_le(value), ctxid)
 
-  def xcc_barrier(self):
-    if self.dev.xcc_sync is None: return self
-    assert self.dev.xccs == 8, 'only 8 XCCs supported'
-    a, b = self.dev.xcc_sync
-    mem_eq = self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | self.pm4.WAIT_REG_MEM_MEM_SPACE(1)
-    self.pkt3(self.pm4.PACKET3_ATOMIC_MEM, self.soc.TC_OP_ATOMIC_ADD_RTN_32, *data64_le(a.value_addr), *data64_le(1), *data64_le(0), 0x10) # a += 1
-    self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, mem_eq, *data64_le(a.value_addr), 0, 0b111, 0x80) # a == 0 (mod 8) via bitmask
-    self.pkt3(self.pm4.PACKET3_ATOMIC_MEM, self.soc.TC_OP_ATOMIC_ADD_RTN_32, *data64_le(b.value_addr), *data64_le(1), *data64_le(0), 0x10) # b += 1
-    self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, mem_eq, *data64_le(b.value_addr), 0, 0b111, 0x80) # b == 0 (mod 8) via bitmask
-    return self
-
   def memory_barrier(self):
     pf = '' if self.nbio.version[0] == 2 else '0' if self.nbio.version[:2] != (7, 11) else '1'
     self.wait_reg_mem(reg_req=getattr(self.nbio, f'regBIF_BX_PF{pf}_GPU_HDP_FLUSH_REQ').addr[0],
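For context on what is being retired: xcc_barrier implemented a reusable rendezvous across the 8 XCCs using two counters in GPU memory. Each XCC atomically incremented counter a, then spun via WAIT_REG_MEM until the low three bits (mask 0b111) read zero, i.e. all eight arrivals had landed, and then repeated the same on counter b so the barrier could be safely re-entered. A minimal host-side model of that two-counter scheme, with threads standing in for XCCs (NUM_XCCS, atomic_add_rtn, and the busy-wait are illustrative, not driver code):

import threading

NUM_XCCS = 8
counters = [0, 0]        # the (a, b) pair from the removed code
lock = threading.Lock()

def atomic_add_rtn(idx: int) -> int:
  # models TC_OP_ATOMIC_ADD_RTN_32: add 1, return the new value
  with lock:
    counters[idx] += 1
    return counters[idx]

def xcc_barrier():
  for idx in range(2):  # phase a, then phase b
    atomic_add_rtn(idx)
    # models PACKET3_WAIT_REG_MEM with EQ function and mask 0b111:
    # all 8 increments have landed exactly when counter % 8 == 0
    while counters[idx] & 0b111 != 0: pass

threads = [threading.Thread(target=xcc_barrier) for _ in range(NUM_XCCS)]
for t in threads: t.start()
for t in threads: t.join()

The second counter is what makes the barrier reusable: a thread cannot re-increment a until it has passed b, which requires every thread to have already passed a.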
@@ -127,13 +115,6 @@ class AMDComputeQueue(HWQueue):
     self.acquire_mem()
     return self
 
-  def xcc_config(self):
-    self.wreg(self.gc.regCOMPUTE_TG_CHUNK_SIZE, 1)
-    for xcc_id in range(self.dev.xccs):
-      with self.pred_exec(xcc_mask=1 << xcc_id):
-        self.wreg(self.dev.regCOMPUTE_CURRENT_LOGIC_XCC_ID, xcc_id)
-    return self
-
   def spi_config(self, tracing:bool):
     self.wreg(self.gc.regSPI_CONFIG_CNTL, ps_pkr_priority_cntl=3, exp_priority_order=3, gpr_write_priority=0x2c688,
               enable_sqg_bop_events=int(tracing), enable_sqg_top_events=int(tracing))
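The removed xcc_config relied on predicated execution: packets queued inside the pred_exec context run only on XCCs whose bit is set in xcc_mask, which is how each XCC received its own logical id. A toy model of that gating (ToyQueue and the assumed mask semantics are illustrative):

from contextlib import contextmanager

class ToyQueue:
  def __init__(self): self.pkts, self.mask = [], 0xff  # default: all 8 XCCs
  @contextmanager
  def pred_exec(self, xcc_mask: int):
    # packets appended inside this context carry a restricted XCC mask
    prev, self.mask = self.mask, xcc_mask
    try: yield
    finally: self.mask = prev
  def wreg(self, reg: str, val: int): self.pkts.append((self.mask, reg, val))

q = ToyQueue()
for xcc_id in range(8):
  with q.pred_exec(xcc_mask=1 << xcc_id):
    q.wreg("regCOMPUTE_CURRENT_LOGIC_XCC_ID", xcc_id)
# each XCC sees exactly one write, selected by its own mask bit
assert all(mask == 1 << val for mask, _, val in q.pkts)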
@@ -278,16 +259,10 @@ class AMDComputeQueue(HWQueue):
 
     if prg.dev.sqtt_enabled: self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.THREAD_TRACE_MARKER) | self.pm4.EVENT_INDEX(0))
     self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.CS_PARTIAL_FLUSH) | self.pm4.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
-
-    if self.dev.xccs > 1:
-      self.release_mem(cache_flush=True)
-      self.acquire_mem(gli=0)
-      self.xcc_barrier()
     return self
 
   def wait(self, signal:AMDSignal, value:sint=0):
     self.wait_reg_mem(mem=signal.value_addr, value=value, mask=0xffffffff)
-    if self.dev.xccs > 1 and not self.dev.is_aql: self.xcc_barrier()
     return self
 
   def timestamp(self, signal:AMDSignal):
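After this change, wait() compiles down to a single PACKET3_WAIT_REG_MEM on the signal's value; what disappears is the extra release/acquire plus xcc_barrier that multi-XCC PM4 dispatch and waits used to need. As a rough software analogue of a memory-space wait (the >= comparison for signal waits is an assumption here; the diff itself only shows the EQ variant used by the retired barrier):

import ctypes, time

def wait_reg_mem(mem: ctypes.Array, value: int, mask: int = 0xffffffff):
  # poll a 32-bit location until (mem & mask) >= value, as the CP would
  while ctypes.c_uint32.from_buffer(mem).value & mask < value:
    time.sleep(0)  # hardware re-polls on a fixed interval instead

sig = (ctypes.c_uint8 * 4)()
ctypes.c_uint32.from_buffer(sig).value = 5
wait_reg_mem(sig, value=5)  # returns immediately: 5 >= 5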
@@ -538,12 +513,6 @@ class AMDQueueDesc:
   @property
   def read_ptr(self): return min(p[0] for p in self.read_ptrs)
 
-  @classmethod
-  def multi(cls, *queues: AMDQueueDesc):
-    assert all_same([(q.ring.addr, q.put_value) for q in queues]), f"All queues must have the same ring and put_value: {queues}"
-    return cls(ring=queues[0].ring, put_value=queues[0].put_value, doorbells=flatten(q.doorbells for q in queues),
-               read_ptrs=flatten(q.read_ptrs for q in queues), write_ptrs=flatten(q.write_ptrs for q in queues))
-
   def signal_doorbell(self, dev, doorbell_value:int|None=None):
     for write_ptr in self.write_ptrs: write_ptr[0] = self.put_value
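The retired multi descriptor mirrored one ring buffer across N per-XCC hardware queues: the assert guarantees they share the same ring and put_value, the doorbells and read/write pointers are concatenated, and read_ptr (kept above) reports the slowest reader so ring space is only reclaimed once every XCC has consumed it. A trimmed-down sketch of that shape (field types are simplified to plain lists, and the doorbell write in signal_doorbell is an assumption from the surrounding code):

from dataclasses import dataclass

@dataclass
class ToyQueueDesc:
  put_value: int
  doorbells: list    # one doorbell cell per XCC queue
  read_ptrs: list    # one [int] cell per XCC queue
  write_ptrs: list

  @property
  def read_ptr(self):  # progress = the slowest XCC reader
    return min(p[0] for p in self.read_ptrs)

  def signal_doorbell(self):
    for wp in self.write_ptrs: wp[0] = self.put_value  # publish new tail everywhere
    for db in self.doorbells: db[0] = self.put_value   # then ring each doorbell

q = ToyQueueDesc(put_value=3, doorbells=[[0],[0]], read_ptrs=[[3],[1]], write_ptrs=[[0],[0]])
q.signal_doorbell()
assert q.read_ptr == 1  # one XCC is still back at entry 1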
@@ -800,9 +769,6 @@ class AMDDevice(HCQCompiled):
     self.sdma = import_module('sdma', min(self.iface.ip_versions[am.SDMA0_HWIP], (6, 0, 0)))
     self.gc = AMDIP('gc', self.iface.ip_versions[am.GC_HWIP], self.iface.ip_offsets[am.GC_HWIP])
 
-    # Define the regCOMPUTE_CURRENT_LOGIC_XCC_ID register, which is missing from the asic_regs files.
-    if self.target[:2] in {(9,4),(9,5)}: self.regCOMPUTE_CURRENT_LOGIC_XCC_ID = AMDReg("regCOMPUTE_CURRENT_LOGIC_XCC_ID", 0xe25, 0, {}, self.gc.bases)
-
     nbio_name = 'nbio' if self.target[0] < 12 else 'nbif'
     nbio_pad = (0,) if self.target[0] == 9 else ()
     self.nbio = AMDIP(nbio_name, self.iface.ip_versions[am.NBIF_HWIP], {i:nbio_pad+x for i,x in self.iface.ip_offsets[am.NBIF_HWIP].items()})
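The register deleted here only existed because the generated asic_regs headers lack it: the removed line pinned it by hand at dword offset 0xe25, segment 0, with no bitfields, against the GC block's base addresses. A toy of how such a definition resolves to absolute addresses (ToyReg is illustrative; AMDReg's actual signature lives in tinygrad's AMD runtime support code):

class ToyReg:
  def __init__(self, name: str, offset: int, seg: int, fields: dict, bases: tuple):
    self.name, self.offset, self.seg, self.fields, self.bases = name, offset, seg, fields, bases
  @property
  def addr(self) -> list[int]:
    # absolute dword address per hardware instance: segment base + register offset
    return [inst[self.seg] + self.offset for inst in self.bases]

reg = ToyReg("regCOMPUTE_CURRENT_LOGIC_XCC_ID", 0xe25, 0, {}, ((0x2000,), (0x6000,)))
assert reg.addr == [0x2e25, 0x6e25]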
@@ -829,13 +795,6 @@ class AMDDevice(HCQCompiled):
     self.max_private_segment_size = 0
     self._ensure_has_local_memory(128) # set default scratch size to 128 bytes per thread
 
-    # XCC setup
-    self.xcc_sync: tuple[AMDSignal, AMDSignal]|None = None
-    if self.xccs > 1 and not self.is_aql:
-      self.xcc_sync_area = self.allocator.alloc(0x1000, BufferSpec(nolru=True, cpu_access=True))
-      self.xcc_sync = (AMDSignal(base_buf=self.xcc_sync_area), AMDSignal(base_buf=self.xcc_sync_area.offset(256)))
-      cast(AMDComputeQueue, self.hw_compute_queue_t()).xcc_config().submit(self)
-
     # SQTT is disabled by default because of runtime overhead and big file sizes (~200mb to Tensor.full() two 4096x4096 tensors and matmul them)
     self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0))
     if self.sqtt_enabled:
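Also gone: the dedicated sync page. One 0x1000-byte CPU-visible allocation backed both barrier signals, carved out 256 bytes apart (offsets 0 and 256), and xcc_config was submitted once at device init. The carve-up, modeled on the host (a ctypes buffer stands in for allocator.alloc):

import ctypes

page = (ctypes.c_uint8 * 0x1000)()            # stands in for the 0x1000 sync area

def counter_at(off: int) -> ctypes.c_uint32:  # 32-bit counter at a byte offset
  return ctypes.c_uint32.from_buffer(page, off)

a, b = counter_at(0), counter_at(256)  # the (a, b) pair that xcc_barrier bumped
a.value += 1
assert (a.value, b.value) == (1, 0)    # 256 bytes apart, so they never alias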
@@ -866,10 +825,9 @@ class AMDDevice(HCQCompiled):
     cwsr_buffer = self.iface.alloc(cwsr_buffer_size) if ctx_save_restore_size else None
     eop_buffer = self.iface.alloc(eop_buffer_size) if eop_buffer_size else None
 
-    return AMDQueueDesc.multi(*(self.iface.create_queue(queue_type, ring, gart, rptr=getattr(hsa.amd_queue_t, 'read_dispatch_id').offset,
-                                wptr=getattr(hsa.amd_queue_t, 'write_dispatch_id').offset, eop_buffer=eop_buffer, cwsr_buffer=cwsr_buffer,
-                                xcc_id=xcc_id, ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size)
-                              for xcc_id in range(self.xccs if queue_type == kfd.KFD_IOC_QUEUE_TYPE_COMPUTE else 1)))
+    return (self.iface.create_queue(queue_type, ring, gart, rptr=getattr(hsa.amd_queue_t, 'read_dispatch_id').offset,
+            wptr=getattr(hsa.amd_queue_t, 'write_dispatch_id').offset, eop_buffer=eop_buffer, cwsr_buffer=cwsr_buffer,
+            ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size))
 
   def _ensure_has_local_memory(self, required):
     if self.max_private_segment_size >= required: return
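The net effect on queue creation: compute queues used to be fanned out one per XCC and merged through AMDQueueDesc.multi; now a single hardware queue is created regardless of XCC count, since multi-XCC devices take the AQL path instead. Schematically (Dev and create_queue are stubs; only the fan-out structure mirrors the diff):

class Dev:
  xccs = 8
  def create_queue(self, xcc_id: int = 0) -> str: return f"queue(xcc_id={xcc_id})"

dev = Dev()
# before: one hardware queue per XCC, merged into a single descriptor
old = [dev.create_queue(xcc_id=i) for i in range(dev.xccs)]
# after: a single queue; cross-XCC coordination moves to the AQL path
new = dev.create_queue()
assert len(old) == 8 and new == "queue(xcc_id=0)"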