diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index f4a98b3194..b3a45dec96 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -446,8 +446,8 @@ class AMDComputeAQLQueue(AMDComputeQueue): dev.compute_queue.signal_doorbell(dev, doorbell_value=dev.compute_queue.put_value-1) class AMDCopyQueue(HWQueue): - def __init__(self, dev, max_copy_size=0x40000000): - self.dev, self.sdma, self.internal_cmd_sizes, self.max_copy_size = dev, dev.sdma, [], max_copy_size + def __init__(self, dev, max_copy_size=0x40000000, queue_idx=0): + self.dev, self.sdma, self.internal_cmd_sizes, self.max_copy_size, self.queue_idx = dev, dev.sdma, [], max_copy_size, queue_idx super().__init__() def q(self, *arr): @@ -501,41 +501,42 @@ class AMDCopyQueue(HWQueue): self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)] def _submit(self, dev:AMDDevice): + sdma_queue = dev.sdma_queue(self.queue_idx) if self.binded_device == dev: # An IB packet must end on a 8 DW boundary. - add = (8 - (((dev.sdma_queue.put_value % 32) // 4) + len(self.indirect_cmd) % 8)) % 8 + add = (8 - (((sdma_queue.put_value % 32) // 4) + len(self.indirect_cmd) % 8)) % 8 cmds, cmd_sizes = ([0] * add) + self.indirect_cmd, [len(self.indirect_cmd) + add] - if len(cmds) * 4 >= (dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes): + if len(cmds) * 4 >= (sdma_queue.ring.nbytes - sdma_queue.put_value % sdma_queue.ring.nbytes): cmds, cmd_sizes = [0, 0] + self.indirect_cmd, [8] else: cmds, cmd_sizes = self._q, self.internal_cmd_sizes tail_blit_dword = 0 for cmdsz in cmd_sizes: - if (tail_blit_dword + cmdsz) * 4 >= dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes: break + if (tail_blit_dword + cmdsz) * 4 >= sdma_queue.ring.nbytes - sdma_queue.put_value % sdma_queue.ring.nbytes: break tail_blit_dword += cmdsz # Force align of submits to hit our usb layer write cache. if (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0 and dev.is_usb(): tail_blit_dword = 0 # USB devices run in single-step mode, so they can't overrun the queue. - total_bytes = (tail_blit_dword * 4 if rem_packet_cnt == 0 else -dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) + rem_packet_cnt * 4 - assert total_bytes < dev.sdma_queue.ring.nbytes, "SDMA queue overrun" - while not dev.is_usb() and dev.sdma_queue.put_value + total_bytes - dev.sdma_queue.read_ptr > dev.sdma_queue.ring.nbytes: pass + total_bytes = (tail_blit_dword * 4 if rem_packet_cnt == 0 else -sdma_queue.put_value % sdma_queue.ring.nbytes) + rem_packet_cnt * 4 + assert total_bytes < sdma_queue.ring.nbytes, "SDMA queue overrun" + while not dev.is_usb() and sdma_queue.put_value + total_bytes - sdma_queue.read_ptr > sdma_queue.ring.nbytes: pass - start_idx = (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) // 4 - dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', cmds[:tail_blit_dword]) - dev.sdma_queue.put_value += tail_blit_dword * 4 + start_idx = (sdma_queue.put_value % sdma_queue.ring.nbytes) // 4 + sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', cmds[:tail_blit_dword]) + sdma_queue.put_value += tail_blit_dword * 4 if (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0: - zero_fill = dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes - dev.sdma_queue.ring.view(dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes, zero_fill, fmt='B')[:] = bytes(zero_fill) - dev.sdma_queue.put_value += zero_fill + zero_fill = sdma_queue.ring.nbytes - sdma_queue.put_value % sdma_queue.ring.nbytes + sdma_queue.ring.view(sdma_queue.put_value % sdma_queue.ring.nbytes, zero_fill, fmt='B')[:] = bytes(zero_fill) + sdma_queue.put_value += zero_fill - dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', cmds[tail_blit_dword:]) - dev.sdma_queue.put_value += rem_packet_cnt * 4 + sdma_queue.ring[0:rem_packet_cnt] = array.array('I', cmds[tail_blit_dword:]) + sdma_queue.put_value += rem_packet_cnt * 4 - dev.sdma_queue.signal_doorbell(dev) + sdma_queue.signal_doorbell(dev) class AMDProgram(HCQProgram): def __init__(self, dev:AMDDevice, name:str, lib:bytes): @@ -756,7 +757,8 @@ class KFDIface: stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=1) assert stm.n_success == 1 - def create_queue(self, queue_type, ring, gart, rptr, wptr, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0): + def create_queue(self, queue_type, ring, gart, rptr, wptr, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, + xcc_id=0, idx=0): queue = kfd.AMDKFD_IOC_CREATE_QUEUE(KFDIface.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id, queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE|(xcc_id<<8), queue_priority=getenv("AMD_KFD_QUEUE_PRIORITY", 7), eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0, ctl_stack_size=ctl_stack_size, @@ -826,12 +828,14 @@ class PCIIface(PCIIfaceBase): 'simd_arrays_per_engine': max_sh_per_se, 'lds_size_in_kb': self.dev_impl.gc_info.gc_lds_size, 'num_xcc': self.dev_impl.gfx.xccs, 'gfx_target_version': {90403: 90402}.get(gfxver, gfxver)} - def create_queue(self, queue_type, ring, gart, rptr, wptr, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0): + def create_queue(self, queue_type, ring, gart, rptr, wptr, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, + xcc_id=0, idx=0): assert cwsr_buffer is None, "no cwsr buffer for am" if queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA: + assert idx <= 3, "only 4 SDMA queues supported in am" pv = self.dev_impl.sdma.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr+rptr, wptr_addr=gart.va_addr+wptr, - doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0), pipe=0, queue=0) + doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0 + idx * 0xA * 4), pipe=0, queue=idx) else: pv = self.dev_impl.gfx.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr+rptr, wptr_addr=gart.va_addr+wptr, eop_addr=eop_buffer.va_addr, eop_size=eop_buffer.size, doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_MEC_RING0), pipe=0, @@ -875,9 +879,10 @@ class USBIface(PCIIface): barview = self.pci_dev.map_bar(bar=0, off=mapping.paddrs[0][0], size=mapping.size) if cpu_access else None return HCQBuffer(mapping.va_addr, size, meta=PCIAllocationMeta(mapping, has_cpu_mapping=False), view=barview, owner=self.dev) - def create_queue(self, queue_type, ring, gart, rptr, wptr, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0): + def create_queue(self, queue_type, ring, gart, rptr, wptr, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, + xcc_id=0, idx=0): if queue_type == kfd.KFD_IOC_QUEUE_TYPE_COMPUTE: self.pci_dev.usb._pci_cacheable += [(ring.cpu_view().addr, ring.size)] - return super().create_queue(queue_type, ring, gart, rptr, wptr, eop_buffer, cwsr_buffer, ctl_stack_size, ctx_save_restore_size, xcc_id) + return super().create_queue(queue_type, ring, gart, rptr, wptr, eop_buffer, cwsr_buffer, ctl_stack_size, ctx_save_restore_size, xcc_id, idx) def sleep(self, timeout): pass @@ -931,8 +936,7 @@ class AMDDevice(HCQCompiled): 0x2000 if self.is_usb() else (16 << 20), eop_buffer_size=0x1000, ctx_save_restore_size=0 if self.is_am() else wg_data_size + ctl_stack_size, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size) - max_copy_size = 0x40000000 if self.iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000 - self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20)) + self.max_copy_size = 0x40000000 if self.iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000 compilers = CompilerSet([CompilerPair(functools.partial(AMDHIPRenderer, self.arch), None), CompilerPair(functools.partial(AMDLLVMRenderer, self.arch), None, AMD_LLVM), @@ -940,7 +944,7 @@ class AMDDevice(HCQCompiled): super().__init__(device, AMDAllocator(self), compilers, functools.partial(AMDProgram, self), AMDSignal, functools.partial(AMDComputeAQLQueue if self.is_aql else AMDComputeQueue, self), - functools.partial(AMDCopyQueue, self, max_copy_size=max_copy_size), + functools.partial(AMDCopyQueue, self, max_copy_size=self.max_copy_size), kernargs_size=(8 << 10) if self.is_usb() else (16 << 20), sigalloc_size=0x100 if self.is_usb() else 0x1000) # Scratch setup @@ -976,7 +980,7 @@ class AMDDevice(HCQCompiled): self.sqtt_wptrs = self.allocator.alloc(round_up(self.se_cnt * 4, 0x1000), BufferSpec(cpu_access=True, nolru=True)) self.sqtt_next_cmd_id = itertools.count(0) - def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0): + def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0, idx=0): ring = self.iface.alloc(ring_size, uncached=True, cpu_access=True) gart = self.iface.alloc(0x100, uncached=True, cpu_access=True) @@ -993,7 +997,10 @@ class AMDDevice(HCQCompiled): return (self.iface.create_queue(queue_type, ring, gart, rptr=getattr(hsa.amd_queue_t, 'read_dispatch_id').offset, wptr=getattr(hsa.amd_queue_t, 'write_dispatch_id').offset, eop_buffer=eop_buffer, cwsr_buffer=cwsr_buffer, - ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size)) + ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size, idx=idx)) + + @functools.lru_cache(None) + def sdma_queue(self, idx:int=0): return self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20), idx=idx) def _ensure_has_local_memory(self, private_segment_size): if self.max_private_segment_size >= private_segment_size: return diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index e2eb712151..f7f6069ef5 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -165,6 +165,10 @@ class NVComputeQueue(NVCommandQueue): def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.compute_gpfifo) class NVCopyQueue(NVCommandQueue): + def __init__(self, queue_idx=0): + self.queue_idx = queue_idx + super().__init__() + def copy(self, dest:sint, src:sint, copy_size:int): for off in range(0, copy_size, step:=(1 << 31)): self.nvm(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, *data64(src+off), *data64(dest+off))