diff --git a/test/external/external_test_hcq.py b/test/external/external_test_hcq.py index 24f57bf0b1..752678f247 100644 --- a/test/external/external_test_hcq.py +++ b/test/external/external_test_hcq.py @@ -8,7 +8,7 @@ def _time_queue(q, d): st = time.perf_counter() q.signal(d.completion_signal) q.submit(d) - d._wait_on(d.completion_signal.event_id) + d._wait_signal(d.completion_signal) return time.perf_counter() - st class TestHCQ(unittest.TestCase): @@ -38,7 +38,7 @@ class TestHCQ(unittest.TestCase): q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr+len(TestHCQ.addr), TestHCQ.runner.global_size, TestHCQ.runner.local_size) q.signal(TestHCQ.d0.completion_signal) q.submit(TestHCQ.d0) - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal) assert (val:=TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0]) == 2000.0, f"got val {val}" def test_run_1000_times(self): @@ -48,10 +48,10 @@ class TestHCQ(unittest.TestCase): TestHCQ.runner.local_size, TestHCQ.d0.completion_signal) for _ in range(1000): q.submit(TestHCQ.d0) - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal) # confirm signal was reset with self.assertRaises(RuntimeError): - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id, timeout=50) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=50) assert (val:=TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0]) == 2000.0, f"got val {val}" def test_run_to_3(self): @@ -60,32 +60,32 @@ class TestHCQ(unittest.TestCase): q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr+len(TestHCQ.addr), TestHCQ.runner.global_size, TestHCQ.runner.local_size) q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size, TestHCQ.d0.completion_signal) q.submit(TestHCQ.d0) - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal) assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0]) == 3.0, f"got val {val}" def test_wait_signal(self): TestHCQ.d0.completion_signal.value = 1 HWComputeQueue().wait(TestHCQ.d0.completion_signal).signal(TestHCQ.d0.completion_signal).submit(TestHCQ.d0) with self.assertRaises(RuntimeError): - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id, timeout=50) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=50) # clean up TestHCQ.d0.completion_signal.value = 0 - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id, timeout=1000) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=1000) def test_wait_copy_signal(self): TestHCQ.d0.completion_signal.value = 1 HWCopyQueue().wait(TestHCQ.d0.completion_signal).signal(TestHCQ.d0.completion_signal).submit(TestHCQ.d0) with self.assertRaises(RuntimeError): - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id, timeout=50) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=50) # clean up TestHCQ.d0.completion_signal.value = 0 - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id, timeout=1000) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=1000) def test_run_normal(self): q = HWComputeQueue() q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size, TestHCQ.d0.completion_signal) q.submit(TestHCQ.d0) - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal) assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0]) == 1.0, f"got val {val}" def test_submit_empty_queues(self): @@ -94,22 +94,22 @@ class TestHCQ(unittest.TestCase): def test_signal_timeout(self): with self.assertRaises(RuntimeError): - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id, timeout=50) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=50) def test_signal(self): HWComputeQueue().signal(TestHCQ.d0.completion_signal).submit(TestHCQ.d0) - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal) def test_copy_signal(self): HWCopyQueue().signal(TestHCQ.d0.completion_signal).submit(TestHCQ.d0) - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal) def test_run_signal(self): q = HWComputeQueue() q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size) q.signal(TestHCQ.d0.completion_signal) q.submit(TestHCQ.d0) - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal) assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0]) == 1.0, f"got val {val}" def test_copy_1000_times(self): @@ -119,10 +119,10 @@ class TestHCQ(unittest.TestCase): q.signal(TestHCQ.d0.completion_signal) for _ in range(1000): q.submit(TestHCQ.d0) - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal) # confirm signal was reset with self.assertRaises(RuntimeError): - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id, timeout=50) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=50) assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1]) == 0.0, f"got val {val}" def test_copy(self): @@ -130,7 +130,7 @@ class TestHCQ(unittest.TestCase): q.copy(TestHCQ.b.lazydata.buffer._buf.va_addr, TestHCQ.a.lazydata.buffer._buf.va_addr, 8) q.signal(TestHCQ.d0.completion_signal) q.submit(TestHCQ.d0) - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal) assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1]) == 1.0, f"got val {val}" def test_copy_bandwidth(self): @@ -169,7 +169,7 @@ class TestHCQ(unittest.TestCase): qc.submit(TestHCQ.d0) time.sleep(0.02) # give it time for the wait to fail q.submit(TestHCQ.d0) - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal) assert (val:=TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0]) == 1.0, f"got val {val}" def test_cross_device_signal(self): @@ -179,7 +179,7 @@ class TestHCQ(unittest.TestCase): q2.wait(TestHCQ.d0.completion_signal) q2.submit(TestHCQ.d0) q1.submit(TestHCQ.d1) - TestHCQ.d0._wait_on(TestHCQ.d0.completion_signal.event_id) + TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal) if __name__ == "__main__": unittest.main() diff --git a/tinygrad/runtime/ops_kfd.py b/tinygrad/runtime/ops_kfd.py index c1b7311fef..e85681a167 100644 --- a/tinygrad/runtime/ops_kfd.py +++ b/tinygrad/runtime/ops_kfd.py @@ -54,10 +54,9 @@ def create_sdma_packets(): fname = union_fields[0] if fname in names: fname = pkt_fields[0]+fname names.add(fname) - if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"): - fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64]) # merge together 64-bit fields - else: - fields.append(tuple([fname, *union_fields[1:]])) + # merge together 64-bit fields, otherwise just append them + if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"): fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64]) + else: fields.append(tuple([fname, *union_fields[1:]])) new_name = name[16:-4].lower() structs[new_name] = init_c_struct_t(tuple(fields)) assert ctypes.sizeof(structs[new_name]) == ctypes.sizeof(pkt), f"{ctypes.sizeof(structs[new_name])} != {ctypes.sizeof(pkt)}" @@ -95,41 +94,44 @@ SHT_PROGBITS = 0x1 SHF_ALLOC = 0x2 EMPTY_SIGNAL = hsa.hsa_signal_t() +SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset class HWComputeQueue: - def __init__(self): - self.q = [] + def __init__(self): self.q = [] def exec(self, prg:KFDProgram, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), completion_signal=None): + if completion_signal is not None: completion_signal.value = 1 self.q.append(hsa.hsa_kernel_dispatch_packet_t( - setup=DISPATCH_KERNEL_SETUP, header=DISPATCH_KERNEL_HEADER, + header=DISPATCH_KERNEL_HEADER, setup=DISPATCH_KERNEL_SETUP, workgroup_size_x=local_size[0], workgroup_size_y=local_size[1], workgroup_size_z=local_size[2], grid_size_x=global_size[0]*local_size[0], grid_size_y=global_size[1]*local_size[1], grid_size_z=global_size[2]*local_size[2], - kernel_object=prg.handle, group_segment_size=prg.group_segment_size, private_segment_size=prg.private_segment_size, - kernarg_address=kernargs, + kernel_object=prg.handle, group_segment_size=prg.group_segment_size, private_segment_size=prg.private_segment_size, kernarg_address=kernargs, completion_signal=hsa.hsa_signal_t(ctypes.addressof(completion_signal)) if completion_signal is not None else EMPTY_SIGNAL)) return self - def signal(self, signal): + def signal(self, signal:hsa.amd_signal_t): + signal.value = 1 self.q.append(hsa.hsa_barrier_and_packet_t(header=BARRIER_HEADER, completion_signal=hsa.hsa_signal_t(ctypes.addressof(signal)))) return self - def wait(self, signal): + def wait(self, signal:hsa.amd_signal_t): sig = hsa.hsa_barrier_and_packet_t(header=BARRIER_HEADER) sig.dep_signal[0] = hsa.hsa_signal_t(ctypes.addressof(signal)) self.q.append(sig) return self def submit(self, device:KFDDevice): - read_ptr = device.amd_aql_queue.read_dispatch_id + if not len(self.q): return + write_ptr, read_ptr = device.amd_aql_queue.write_dispatch_id, device.amd_aql_queue.read_dispatch_id + if (len(self.q)+write_ptr-read_ptr)*AQL_PACKET_SIZE > device.aql_ring.size: raise RuntimeError("AQL queue overrun") for cmd in self.q: - ring_addr = device.aql_ring.va_addr + (device.amd_aql_queue.write_dispatch_id*AQL_PACKET_SIZE) % device.aql_ring.size + ring_addr = device.aql_ring.va_addr + (write_ptr*AQL_PACKET_SIZE) % device.aql_ring.size ctypes.memmove(ring_addr, ctypes.addressof(cmd), AQL_PACKET_SIZE) - device.amd_aql_queue.write_dispatch_id += 1 - if (device.amd_aql_queue.write_dispatch_id-read_ptr)*AQL_PACKET_SIZE > device.aql_ring.size: raise RuntimeError("AQL queue overrun") - if len(self.q): - device.aql_doorbell[0] = device.aql_doorbell_value + len(self.q) - 1 - device.aql_doorbell_value += len(self.q) + write_ptr += 1 + # TODO: add CPU memory barrier here + device.amd_aql_queue.write_dispatch_id = write_ptr + device.aql_doorbell[0] = device.aql_doorbell_value + len(self.q) - 1 + device.aql_doorbell_value += len(self.q) return self # prebuilt sdma packets @@ -141,19 +143,17 @@ sdma_cache_wb = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_ GCR_CONTROL_GL2_RANGE=0) class HWCopyQueue: - def __init__(self): - self.q = [] + def __init__(self): self.q = [] def submit(self, device:KFDDevice): - def blit_sdma_command(cmd): + read_ptr = device.sdma_read_pointer[0] + if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun") + for cmd in self.q: if (cmdsz:=ctypes.sizeof(cmd)) > (fill:=device.sdma_ring.size - device.sdma_doorbell_value % device.sdma_ring.size): ctypes.memset(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), 0, fill) device.sdma_doorbell_value += fill ctypes.memmove(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), ctypes.addressof(cmd), cmdsz) device.sdma_doorbell_value += cmdsz - read_ptr = device.sdma_read_pointer[0] - for cmd in self.q: blit_sdma_command(cmd) - if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun") device.sdma_write_pointer[0] = device.sdma_doorbell_value device.sdma_doorbell[0] = device.sdma_doorbell_value return self @@ -175,17 +175,18 @@ class HWCopyQueue: self.q.append(sdma_cache_wb) return self - def signal(self, completion_signal): + def signal(self, signal:hsa.amd_signal_t): + signal.value = 1 self.q.append(sdma_pkts.atomic(op=amd_gpu.SDMA_OP_ATOMIC, operation=amd_gpu.SDMA_ATOMIC_ADD64, - addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'value').offset, src_data=(1<<64)-1)) - if completion_signal.event_mailbox_ptr != 0: - self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=completion_signal.event_mailbox_ptr, data=completion_signal.event_id)) - self.q.append(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=completion_signal.event_id)) + addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET, src_data=(1<<64)-1)) + if signal.event_mailbox_ptr != 0: + self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=signal.event_mailbox_ptr, data=signal.event_id)) + self.q.append(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=signal.event_id)) return self - def wait(self, completion_signal): + def wait(self, signal:hsa.amd_signal_t): self.q.append(sdma_pkts.poll_regmem(op=amd_gpu.SDMA_OP_POLL_REGMEM, mem_poll=1, func=0x3, - addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'value').offset, + addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET, value=0, mask=0xffffffff, interval=0x04, retry_count=0xfff)) return self @@ -224,26 +225,26 @@ class KFDProgram: assert self.device.kernargs_ptr < (self.device.kernargs.va_addr + self.device.kernargs.size + self.kernargs_segment_size), "kernargs overrun" if not hasattr(self, "args_struct_t"): self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] + - [(f'v{i}', ctypes.c_int) for i in range(len(vals))])) + [(f'v{i}', ctypes.c_int) for i in range(len(vals))])) if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size: raise RuntimeError(f"HSAProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}") args_st = self.args_struct_t.from_address(self.device.kernargs_ptr) for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr) for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i]) - self.device.completion_signal.value = 1 # reset the signal before call HWComputeQueue().exec(self, self.device.kernargs_ptr, global_size, local_size, self.device.completion_signal if wait else None).submit(self.device) self.device.kernargs_ptr += self.kernargs_segment_size if wait: - self.device._wait_on(self.device.completion_signal.event_id) + self.device._wait_signal(self.device.completion_signal) assert (wp:=self.device.amd_aql_queue.write_dispatch_id) == (rp:=self.device.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}" return (self.device.completion_signal.end_ts-self.device.completion_signal.start_ts)/1e8 class KFDAllocator(LRUAllocator): def __init__(self, device:KFDDevice): self.device = device + # NOTE: KFD_IOC_ALLOC_MEM_FLAGS_GTT doesn't work here for readinto self.b = [self.device._gpu_alloc(SDMA_MAX_COPY_SIZE*4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) for _ in range(2)] super().__init__() @@ -260,6 +261,13 @@ class KFDAllocator(LRUAllocator): # self.device.synchronize() # return to_mv(src.va_addr, src.size) + def transfer(self, dest, src, sz:int, src_dev:KFDDevice, dest_dev:KFDDevice): + dest_dev._gpu_map(src) + q = HWComputeQueue().signal(sig := KFDDevice._get_signal()) + HWCopyQueue().wait(sig).copy(dest.va_addr, src.va_addr, sz).signal(sigc := KFDDevice._get_signal()).submit(dest_dev) + HWComputeQueue().wait(sigc).submit(dest_dev) + q.wait(sigc).submit(src_dev) + def copy_from_fd(self, dest, fd, offset, size): fo = io.FileIO(fd, "a+b", closefd=False) fo.seek(offset - (minor_offset:=offset % PAGE_SIZE)) @@ -270,38 +278,27 @@ class KFDAllocator(LRUAllocator): if copy_size == 0: break fo.readinto(to_mv(self.b[1].va_addr, local_size)) - if i != 0: self.device._wait_on(self.device.completion_signal.event_id) + if i != 0: self.device._wait_signal(self.device.signal_sdma) self.b = self.b[::-1] - self.device.completion_signal.value = 1 # TODO: when do we have to reset it? - self.device._submit_sdma(dest.va_addr+copied_in, self.b[0].va_addr+minor_offset, copy_size, completion_signal=self.device.completion_signal) + self.device._submit_sdma(dest.va_addr+copied_in, self.b[0].va_addr+minor_offset, copy_size, completion_signal=self.device.signal_sdma) copied_in += copy_size minor_offset = 0 # only on the first - self.device._wait_on(self.device.completion_signal.event_id) - - def transfer(self, dest, src, sz:int, src_dev=None, dest_dev=None): - dest_dev._gpu_map(src) - q = HWComputeQueue().signal(sig := KFDDevice._get_signal()) - HWCopyQueue().wait(sig).copy(dest.va_addr, src.va_addr, sz).signal(sigc := KFDDevice._get_signal()).submit(dest_dev) - HWComputeQueue().wait(sigc).submit(dest_dev) - q.wait(sigc).submit(src_dev) + self.device._wait_signal(self.device.signal_sdma) def copyin(self, dest, src: memoryview): for i in range(0, src.nbytes, self.b[0].size): ctypes.memmove(self.b[1].va_addr, from_mv(src[i:]), lsize:=min(self.b[0].size, src.nbytes-i)) - if i != 0: self.device._wait_on(self.device.completion_signal.event_id) + if i != 0: self.device._wait_signal(self.device.signal_sdma) self.b = self.b[::-1] - self.device.completion_signal.value = 1 # TODO: when do we have to reset it? - self.device._submit_sdma(dest.va_addr+i, self.b[0].va_addr, lsize, completion_signal=self.device.completion_signal) - self.device._wait_on(self.device.completion_signal.event_id) + self.device._submit_sdma(dest.va_addr+i, self.b[0].va_addr, lsize, completion_signal=self.device.signal_sdma) + self.device._wait_signal(self.device.signal_sdma) def copyout(self, dest:memoryview, src): self.device.synchronize() for i in range(0, dest.nbytes, self.b[0].size): - self.device.completion_signal.value = 1 - self.device._submit_sdma(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i), - completion_signal=self.device.completion_signal) - self.device._wait_on(self.device.completion_signal.event_id) + self.device._submit_sdma(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i), completion_signal=self.device.signal_sdma) + self.device._wait_signal(self.device.signal_sdma) ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize) MAP_FIXED, MAP_NORESERVE = 0x10, 0x400 @@ -309,7 +306,7 @@ class KFDDevice(Compiled): kfd:int = -1 event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args] signals_page:Any = None - signal_number:int = 10 + signal_number:int = 16 def _gpu_map(self, mem): if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return @@ -318,13 +315,6 @@ class KFDDevice(Compiled): stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(mem.mapped_gpu_ids)) assert stm.n_success == len(mem.mapped_gpu_ids) - @classmethod - def _wait_on(self, event_id, timeout=10000): - evt_arr = (kfd.struct_kfd_event_data * 1)() - evt_arr[0].event_id = event_id - ret = kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=timeout) - if ret.wait_result != 0: raise RuntimeError(f"wait_result: {ret.wait_result}, {timeout} ms TIMEOUT!") - def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True): flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED @@ -337,7 +327,6 @@ class KFDDevice(Compiled): mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf) if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR): buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset) - assert buf != 0xffffffffffffffff assert addr == buf == mem.va_addr if map_to_gpu: self._gpu_map(mem) return mem @@ -351,15 +340,26 @@ class KFDDevice(Compiled): kio.free_memory_of_gpu(self.kfd, handle=mem.handle) @classmethod - def _get_signal(self, num=None): + def _get_signal(self, num=None, sync_event=None) -> hsa.amd_signal_t: if num is None: num = KFDDevice.signal_number KFDDevice.signal_number += 1 - if KFDDevice.signal_number == SIGNAL_COUNT: KFDDevice.signal_number = 10 + if KFDDevice.signal_number == SIGNAL_COUNT: KFDDevice.signal_number = 16 ret = hsa.amd_signal_t.from_address(KFDDevice.signals_page.va_addr + SIGNAL_SIZE*num) - ret.value = 1 + ret.kind = hsa.AMD_SIGNAL_KIND_USER + if sync_event is not None: + ret.event_mailbox_ptr = KFDDevice.event_page.va_addr + sync_event.event_slot_index*8 + ret.event_id = sync_event.event_id return ret + @classmethod + def _wait_signal(self, signal:hsa.amd_signal_t, timeout=10000): + assert signal.event_id != 0, "can't wait on this signal" + evt_arr = (kfd.struct_kfd_event_data * 1)() + evt_arr[0].event_id = signal.event_id + ret = kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=timeout) + if ret.wait_result != 0: raise RuntimeError(f"wait_result: {ret.wait_result}, {timeout} ms TIMEOUT!") + def __init__(self, device:str=""): if KFDDevice.kfd == -1: KFDDevice.kfd = os.open("/dev/kfd", os.O_RDWR) self.device_id = int(device.split(":")[1]) if ":" in device else 0 @@ -370,31 +370,27 @@ class KFDDevice(Compiled): kio.acquire_vm(KFDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id) if KFDDevice.event_page is None: - KFDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True) - for i in range(SIGNAL_COUNT): - sig = KFDDevice._get_signal(i) - sig.value = 1 - sig.kind = hsa.AMD_SIGNAL_KIND_USER + KFDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) KFDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) - self.sync_event = kio.create_event(KFDDevice.kfd, event_page_offset=KFDDevice.event_page.handle, auto_reset=1) + sync_event = kio.create_event(KFDDevice.kfd, event_page_offset=KFDDevice.event_page.handle, auto_reset=1) else: self._gpu_map(KFDDevice.signals_page) self._gpu_map(KFDDevice.event_page) - self.sync_event = kio.create_event(KFDDevice.kfd, auto_reset=1) + sync_event = kio.create_event(KFDDevice.kfd, auto_reset=1) - self.gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) - self.aql_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True) + self.completion_signal = KFDDevice._get_signal(self.device_id*2, sync_event=sync_event) + self.signal_sdma = KFDDevice._get_signal(self.device_id*2+1, sync_event=kio.create_event(KFDDevice.kfd, auto_reset=1)) + + self.gart_aql = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) + self.gart_sdma = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) + self.aql_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) self.eop_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) self.kernargs_ptr = self.kernargs.va_addr self.ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) - self.completion_signal = KFDDevice._get_signal(self.device_id) - self.completion_signal.event_mailbox_ptr = KFDDevice.event_page.va_addr + self.sync_event.event_slot_index*8 - self.completion_signal.event_id = self.sync_event.event_id - # AQL Queue - self.amd_aql_queue = hsa.amd_queue_t.from_address(self.gart.va_addr) + self.amd_aql_queue = hsa.amd_queue_t.from_address(self.gart_aql.va_addr) self.amd_aql_queue.write_dispatch_id = 0 self.amd_aql_queue.read_dispatch_id = 0 self.amd_aql_queue.read_dispatch_id_field_base_byte_offset = getattr(hsa.amd_queue_t, 'read_dispatch_id').offset @@ -423,19 +419,19 @@ class KFDDevice(Compiled): eop_buffer_address=self.eop_buffer.va_addr, eop_buffer_size=self.eop_buffer.size, ctx_save_restore_address=self.ctx_save_restore_address.va_addr, ctx_save_restore_size=self.ctx_save_restore_address.size, ctl_stack_size = 0xa000, - write_pointer_address=self.gart.va_addr + getattr(hsa.amd_queue_t, 'write_dispatch_id').offset, - read_pointer_address=self.gart.va_addr + getattr(hsa.amd_queue_t, 'read_dispatch_id').offset) + write_pointer_address=self.gart_aql.va_addr + getattr(hsa.amd_queue_t, 'write_dispatch_id').offset, + read_pointer_address=self.gart_aql.va_addr + getattr(hsa.amd_queue_t, 'read_dispatch_id').offset) - self.doorbells_base = self.aql_queue.doorbell_offset & (~0xfff) - self.doorbells = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base) + self.doorbells_base = self.aql_queue.doorbell_offset & (~0x1fff) # doorbell is two pages + self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base) self.aql_doorbell = to_mv(self.doorbells + self.aql_queue.doorbell_offset - self.doorbells_base, 4).cast("I") self.aql_doorbell_value = 0 # SDMA Queue - self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True) + self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) self.sdma_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id, queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY, - write_pointer_address=self.gart.va_addr + 0x100, read_pointer_address=self.gart.va_addr + 0x108) + write_pointer_address=self.gart_sdma.va_addr, read_pointer_address=self.gart_sdma.va_addr) self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q") self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q") @@ -443,7 +439,7 @@ class KFDDevice(Compiled): self.sdma_doorbell_value = 0 # PM4 stuff - self.pm4_indirect_buf = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True) + self.pm4_indirect_buf = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) pm4_indirect_cmd = (ctypes.c_uint32*13)(amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), self.pm4_indirect_buf.va_addr & 0xffffffff, (self.pm4_indirect_buf.va_addr>>32) & 0xffffffff, 8 | amd_gpu.INDIRECT_BUFFER_VALID, 0xa) ctypes.memmove(ctypes.addressof(pm4_cmds:=(ctypes.c_uint16*27)(1))+2, ctypes.addressof(pm4_indirect_cmd), ctypes.sizeof(pm4_indirect_cmd)) @@ -474,12 +470,12 @@ class KFDDevice(Compiled): q = HWComputeQueue() q.q.append(self.pm4_packet) q.submit(self) - self._wait_on(self.completion_signal.event_id) + self._wait_signal(self.completion_signal) assert (wp:=self.amd_aql_queue.write_dispatch_id) == (rp:=self.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}" def synchronize(self): HWComputeQueue().signal(self.completion_signal).submit(self) - self._wait_on(self.completion_signal.event_id) + self._wait_signal(self.completion_signal) assert (wp:=self.amd_aql_queue.write_dispatch_id) == (rp:=self.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}" # reset kernargs