address kfd feedback (#4087)

* address kfd feedback

* signals cleanup

* signals cleanup

* handle 2 doorbell pages correctly

* signal reset cleanup

* signals cleanup

* more GTT

* cleanups

* minor cleanups
This commit is contained in:
George Hotz
2024-04-05 15:24:41 -07:00
committed by GitHub
parent dafa42e864
commit 164329a8ea
2 changed files with 103 additions and 107 deletions

View File

@@ -54,10 +54,9 @@ def create_sdma_packets():
fname = union_fields[0]
if fname in names: fname = pkt_fields[0]+fname
names.add(fname)
if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"):
fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64]) # merge together 64-bit fields
else:
fields.append(tuple([fname, *union_fields[1:]]))
# merge together 64-bit fields, otherwise just append them
if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"): fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64])
else: fields.append(tuple([fname, *union_fields[1:]]))
new_name = name[16:-4].lower()
structs[new_name] = init_c_struct_t(tuple(fields))
assert ctypes.sizeof(structs[new_name]) == ctypes.sizeof(pkt), f"{ctypes.sizeof(structs[new_name])} != {ctypes.sizeof(pkt)}"
@@ -95,41 +94,44 @@ SHT_PROGBITS = 0x1
SHF_ALLOC = 0x2
EMPTY_SIGNAL = hsa.hsa_signal_t()
SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset
class HWComputeQueue:
def __init__(self):
self.q = []
def __init__(self): self.q = []
def exec(self, prg:KFDProgram, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), completion_signal=None):
if completion_signal is not None: completion_signal.value = 1
self.q.append(hsa.hsa_kernel_dispatch_packet_t(
setup=DISPATCH_KERNEL_SETUP, header=DISPATCH_KERNEL_HEADER,
header=DISPATCH_KERNEL_HEADER, setup=DISPATCH_KERNEL_SETUP,
workgroup_size_x=local_size[0], workgroup_size_y=local_size[1], workgroup_size_z=local_size[2],
grid_size_x=global_size[0]*local_size[0], grid_size_y=global_size[1]*local_size[1], grid_size_z=global_size[2]*local_size[2],
kernel_object=prg.handle, group_segment_size=prg.group_segment_size, private_segment_size=prg.private_segment_size,
kernarg_address=kernargs,
kernel_object=prg.handle, group_segment_size=prg.group_segment_size, private_segment_size=prg.private_segment_size, kernarg_address=kernargs,
completion_signal=hsa.hsa_signal_t(ctypes.addressof(completion_signal)) if completion_signal is not None else EMPTY_SIGNAL))
return self
def signal(self, signal):
def signal(self, signal:hsa.amd_signal_t):
signal.value = 1
self.q.append(hsa.hsa_barrier_and_packet_t(header=BARRIER_HEADER, completion_signal=hsa.hsa_signal_t(ctypes.addressof(signal))))
return self
def wait(self, signal):
def wait(self, signal:hsa.amd_signal_t):
sig = hsa.hsa_barrier_and_packet_t(header=BARRIER_HEADER)
sig.dep_signal[0] = hsa.hsa_signal_t(ctypes.addressof(signal))
self.q.append(sig)
return self
def submit(self, device:KFDDevice):
read_ptr = device.amd_aql_queue.read_dispatch_id
if not len(self.q): return
write_ptr, read_ptr = device.amd_aql_queue.write_dispatch_id, device.amd_aql_queue.read_dispatch_id
if (len(self.q)+write_ptr-read_ptr)*AQL_PACKET_SIZE > device.aql_ring.size: raise RuntimeError("AQL queue overrun")
for cmd in self.q:
ring_addr = device.aql_ring.va_addr + (device.amd_aql_queue.write_dispatch_id*AQL_PACKET_SIZE) % device.aql_ring.size
ring_addr = device.aql_ring.va_addr + (write_ptr*AQL_PACKET_SIZE) % device.aql_ring.size
ctypes.memmove(ring_addr, ctypes.addressof(cmd), AQL_PACKET_SIZE)
device.amd_aql_queue.write_dispatch_id += 1
if (device.amd_aql_queue.write_dispatch_id-read_ptr)*AQL_PACKET_SIZE > device.aql_ring.size: raise RuntimeError("AQL queue overrun")
if len(self.q):
device.aql_doorbell[0] = device.aql_doorbell_value + len(self.q) - 1
device.aql_doorbell_value += len(self.q)
write_ptr += 1
# TODO: add CPU memory barrier here
device.amd_aql_queue.write_dispatch_id = write_ptr
device.aql_doorbell[0] = device.aql_doorbell_value + len(self.q) - 1
device.aql_doorbell_value += len(self.q)
return self
# prebuilt sdma packets
@@ -141,19 +143,17 @@ sdma_cache_wb = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_
GCR_CONTROL_GL2_RANGE=0)
class HWCopyQueue:
def __init__(self):
self.q = []
def __init__(self): self.q = []
def submit(self, device:KFDDevice):
def blit_sdma_command(cmd):
read_ptr = device.sdma_read_pointer[0]
if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun")
for cmd in self.q:
if (cmdsz:=ctypes.sizeof(cmd)) > (fill:=device.sdma_ring.size - device.sdma_doorbell_value % device.sdma_ring.size):
ctypes.memset(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), 0, fill)
device.sdma_doorbell_value += fill
ctypes.memmove(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), ctypes.addressof(cmd), cmdsz)
device.sdma_doorbell_value += cmdsz
read_ptr = device.sdma_read_pointer[0]
for cmd in self.q: blit_sdma_command(cmd)
if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun")
device.sdma_write_pointer[0] = device.sdma_doorbell_value
device.sdma_doorbell[0] = device.sdma_doorbell_value
return self
@@ -175,17 +175,18 @@ class HWCopyQueue:
self.q.append(sdma_cache_wb)
return self
def signal(self, completion_signal):
def signal(self, signal:hsa.amd_signal_t):
signal.value = 1
self.q.append(sdma_pkts.atomic(op=amd_gpu.SDMA_OP_ATOMIC, operation=amd_gpu.SDMA_ATOMIC_ADD64,
addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'value').offset, src_data=(1<<64)-1))
if completion_signal.event_mailbox_ptr != 0:
self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=completion_signal.event_mailbox_ptr, data=completion_signal.event_id))
self.q.append(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=completion_signal.event_id))
addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET, src_data=(1<<64)-1))
if signal.event_mailbox_ptr != 0:
self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=signal.event_mailbox_ptr, data=signal.event_id))
self.q.append(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=signal.event_id))
return self
def wait(self, completion_signal):
def wait(self, signal:hsa.amd_signal_t):
self.q.append(sdma_pkts.poll_regmem(op=amd_gpu.SDMA_OP_POLL_REGMEM, mem_poll=1, func=0x3,
addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'value').offset,
addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET,
value=0, mask=0xffffffff, interval=0x04, retry_count=0xfff))
return self
@@ -224,26 +225,26 @@ class KFDProgram:
assert self.device.kernargs_ptr < (self.device.kernargs.va_addr + self.device.kernargs.size + self.kernargs_segment_size), "kernargs overrun"
if not hasattr(self, "args_struct_t"):
self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
[(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
[(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
raise RuntimeError(f"HSAProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
args_st = self.args_struct_t.from_address(self.device.kernargs_ptr)
for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
self.device.completion_signal.value = 1 # reset the signal before call
HWComputeQueue().exec(self, self.device.kernargs_ptr, global_size, local_size,
self.device.completion_signal if wait else None).submit(self.device)
self.device.kernargs_ptr += self.kernargs_segment_size
if wait:
self.device._wait_on(self.device.completion_signal.event_id)
self.device._wait_signal(self.device.completion_signal)
assert (wp:=self.device.amd_aql_queue.write_dispatch_id) == (rp:=self.device.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
return (self.device.completion_signal.end_ts-self.device.completion_signal.start_ts)/1e8
class KFDAllocator(LRUAllocator):
def __init__(self, device:KFDDevice):
self.device = device
# NOTE: KFD_IOC_ALLOC_MEM_FLAGS_GTT doesn't work here for readinto
self.b = [self.device._gpu_alloc(SDMA_MAX_COPY_SIZE*4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) for _ in range(2)]
super().__init__()
@@ -260,6 +261,13 @@ class KFDAllocator(LRUAllocator):
# self.device.synchronize()
# return to_mv(src.va_addr, src.size)
def transfer(self, dest, src, sz:int, src_dev:KFDDevice, dest_dev:KFDDevice):
dest_dev._gpu_map(src)
q = HWComputeQueue().signal(sig := KFDDevice._get_signal())
HWCopyQueue().wait(sig).copy(dest.va_addr, src.va_addr, sz).signal(sigc := KFDDevice._get_signal()).submit(dest_dev)
HWComputeQueue().wait(sigc).submit(dest_dev)
q.wait(sigc).submit(src_dev)
def copy_from_fd(self, dest, fd, offset, size):
fo = io.FileIO(fd, "a+b", closefd=False)
fo.seek(offset - (minor_offset:=offset % PAGE_SIZE))
@@ -270,38 +278,27 @@ class KFDAllocator(LRUAllocator):
if copy_size == 0: break
fo.readinto(to_mv(self.b[1].va_addr, local_size))
if i != 0: self.device._wait_on(self.device.completion_signal.event_id)
if i != 0: self.device._wait_signal(self.device.signal_sdma)
self.b = self.b[::-1]
self.device.completion_signal.value = 1 # TODO: when do we have to reset it?
self.device._submit_sdma(dest.va_addr+copied_in, self.b[0].va_addr+minor_offset, copy_size, completion_signal=self.device.completion_signal)
self.device._submit_sdma(dest.va_addr+copied_in, self.b[0].va_addr+minor_offset, copy_size, completion_signal=self.device.signal_sdma)
copied_in += copy_size
minor_offset = 0 # only on the first
self.device._wait_on(self.device.completion_signal.event_id)
def transfer(self, dest, src, sz:int, src_dev=None, dest_dev=None):
dest_dev._gpu_map(src)
q = HWComputeQueue().signal(sig := KFDDevice._get_signal())
HWCopyQueue().wait(sig).copy(dest.va_addr, src.va_addr, sz).signal(sigc := KFDDevice._get_signal()).submit(dest_dev)
HWComputeQueue().wait(sigc).submit(dest_dev)
q.wait(sigc).submit(src_dev)
self.device._wait_signal(self.device.signal_sdma)
def copyin(self, dest, src: memoryview):
for i in range(0, src.nbytes, self.b[0].size):
ctypes.memmove(self.b[1].va_addr, from_mv(src[i:]), lsize:=min(self.b[0].size, src.nbytes-i))
if i != 0: self.device._wait_on(self.device.completion_signal.event_id)
if i != 0: self.device._wait_signal(self.device.signal_sdma)
self.b = self.b[::-1]
self.device.completion_signal.value = 1 # TODO: when do we have to reset it?
self.device._submit_sdma(dest.va_addr+i, self.b[0].va_addr, lsize, completion_signal=self.device.completion_signal)
self.device._wait_on(self.device.completion_signal.event_id)
self.device._submit_sdma(dest.va_addr+i, self.b[0].va_addr, lsize, completion_signal=self.device.signal_sdma)
self.device._wait_signal(self.device.signal_sdma)
def copyout(self, dest:memoryview, src):
self.device.synchronize()
for i in range(0, dest.nbytes, self.b[0].size):
self.device.completion_signal.value = 1
self.device._submit_sdma(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i),
completion_signal=self.device.completion_signal)
self.device._wait_on(self.device.completion_signal.event_id)
self.device._submit_sdma(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i), completion_signal=self.device.signal_sdma)
self.device._wait_signal(self.device.signal_sdma)
ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
@@ -309,7 +306,7 @@ class KFDDevice(Compiled):
kfd:int = -1
event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
signals_page:Any = None
signal_number:int = 10
signal_number:int = 16
def _gpu_map(self, mem):
if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return
@@ -318,13 +315,6 @@ class KFDDevice(Compiled):
stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(mem.mapped_gpu_ids))
assert stm.n_success == len(mem.mapped_gpu_ids)
@classmethod
def _wait_on(self, event_id, timeout=10000):
evt_arr = (kfd.struct_kfd_event_data * 1)()
evt_arr[0].event_id = event_id
ret = kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=timeout)
if ret.wait_result != 0: raise RuntimeError(f"wait_result: {ret.wait_result}, {timeout} ms TIMEOUT!")
def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED
@@ -337,7 +327,6 @@ class KFDDevice(Compiled):
mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
assert buf != 0xffffffffffffffff
assert addr == buf == mem.va_addr
if map_to_gpu: self._gpu_map(mem)
return mem
@@ -351,15 +340,26 @@ class KFDDevice(Compiled):
kio.free_memory_of_gpu(self.kfd, handle=mem.handle)
@classmethod
def _get_signal(self, num=None):
def _get_signal(self, num=None, sync_event=None) -> hsa.amd_signal_t:
if num is None:
num = KFDDevice.signal_number
KFDDevice.signal_number += 1
if KFDDevice.signal_number == SIGNAL_COUNT: KFDDevice.signal_number = 10
if KFDDevice.signal_number == SIGNAL_COUNT: KFDDevice.signal_number = 16
ret = hsa.amd_signal_t.from_address(KFDDevice.signals_page.va_addr + SIGNAL_SIZE*num)
ret.value = 1
ret.kind = hsa.AMD_SIGNAL_KIND_USER
if sync_event is not None:
ret.event_mailbox_ptr = KFDDevice.event_page.va_addr + sync_event.event_slot_index*8
ret.event_id = sync_event.event_id
return ret
@classmethod
def _wait_signal(self, signal:hsa.amd_signal_t, timeout=10000):
assert signal.event_id != 0, "can't wait on this signal"
evt_arr = (kfd.struct_kfd_event_data * 1)()
evt_arr[0].event_id = signal.event_id
ret = kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=timeout)
if ret.wait_result != 0: raise RuntimeError(f"wait_result: {ret.wait_result}, {timeout} ms TIMEOUT!")
def __init__(self, device:str=""):
if KFDDevice.kfd == -1: KFDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
self.device_id = int(device.split(":")[1]) if ":" in device else 0
@@ -370,31 +370,27 @@ class KFDDevice(Compiled):
kio.acquire_vm(KFDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
if KFDDevice.event_page is None:
KFDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
for i in range(SIGNAL_COUNT):
sig = KFDDevice._get_signal(i)
sig.value = 1
sig.kind = hsa.AMD_SIGNAL_KIND_USER
KFDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
KFDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.sync_event = kio.create_event(KFDDevice.kfd, event_page_offset=KFDDevice.event_page.handle, auto_reset=1)
sync_event = kio.create_event(KFDDevice.kfd, event_page_offset=KFDDevice.event_page.handle, auto_reset=1)
else:
self._gpu_map(KFDDevice.signals_page)
self._gpu_map(KFDDevice.event_page)
self.sync_event = kio.create_event(KFDDevice.kfd, auto_reset=1)
sync_event = kio.create_event(KFDDevice.kfd, auto_reset=1)
self.gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.aql_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
self.completion_signal = KFDDevice._get_signal(self.device_id*2, sync_event=sync_event)
self.signal_sdma = KFDDevice._get_signal(self.device_id*2+1, sync_event=kio.create_event(KFDDevice.kfd, auto_reset=1))
self.gart_aql = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.gart_sdma = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.aql_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.eop_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.kernargs_ptr = self.kernargs.va_addr
self.ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.completion_signal = KFDDevice._get_signal(self.device_id)
self.completion_signal.event_mailbox_ptr = KFDDevice.event_page.va_addr + self.sync_event.event_slot_index*8
self.completion_signal.event_id = self.sync_event.event_id
# AQL Queue
self.amd_aql_queue = hsa.amd_queue_t.from_address(self.gart.va_addr)
self.amd_aql_queue = hsa.amd_queue_t.from_address(self.gart_aql.va_addr)
self.amd_aql_queue.write_dispatch_id = 0
self.amd_aql_queue.read_dispatch_id = 0
self.amd_aql_queue.read_dispatch_id_field_base_byte_offset = getattr(hsa.amd_queue_t, 'read_dispatch_id').offset
@@ -423,19 +419,19 @@ class KFDDevice(Compiled):
eop_buffer_address=self.eop_buffer.va_addr, eop_buffer_size=self.eop_buffer.size,
ctx_save_restore_address=self.ctx_save_restore_address.va_addr, ctx_save_restore_size=self.ctx_save_restore_address.size,
ctl_stack_size = 0xa000,
write_pointer_address=self.gart.va_addr + getattr(hsa.amd_queue_t, 'write_dispatch_id').offset,
read_pointer_address=self.gart.va_addr + getattr(hsa.amd_queue_t, 'read_dispatch_id').offset)
write_pointer_address=self.gart_aql.va_addr + getattr(hsa.amd_queue_t, 'write_dispatch_id').offset,
read_pointer_address=self.gart_aql.va_addr + getattr(hsa.amd_queue_t, 'read_dispatch_id').offset)
self.doorbells_base = self.aql_queue.doorbell_offset & (~0xfff)
self.doorbells = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base)
self.doorbells_base = self.aql_queue.doorbell_offset & (~0x1fff) # doorbell is two pages
self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base)
self.aql_doorbell = to_mv(self.doorbells + self.aql_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
self.aql_doorbell_value = 0
# SDMA Queue
self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.sdma_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
write_pointer_address=self.gart.va_addr + 0x100, read_pointer_address=self.gart.va_addr + 0x108)
write_pointer_address=self.gart_sdma.va_addr, read_pointer_address=self.gart_sdma.va_addr)
self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
@@ -443,7 +439,7 @@ class KFDDevice(Compiled):
self.sdma_doorbell_value = 0
# PM4 stuff
self.pm4_indirect_buf = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
self.pm4_indirect_buf = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
pm4_indirect_cmd = (ctypes.c_uint32*13)(amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), self.pm4_indirect_buf.va_addr & 0xffffffff,
(self.pm4_indirect_buf.va_addr>>32) & 0xffffffff, 8 | amd_gpu.INDIRECT_BUFFER_VALID, 0xa)
ctypes.memmove(ctypes.addressof(pm4_cmds:=(ctypes.c_uint16*27)(1))+2, ctypes.addressof(pm4_indirect_cmd), ctypes.sizeof(pm4_indirect_cmd))
@@ -474,12 +470,12 @@ class KFDDevice(Compiled):
q = HWComputeQueue()
q.q.append(self.pm4_packet)
q.submit(self)
self._wait_on(self.completion_signal.event_id)
self._wait_signal(self.completion_signal)
assert (wp:=self.amd_aql_queue.write_dispatch_id) == (rp:=self.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
def synchronize(self):
HWComputeQueue().signal(self.completion_signal).submit(self)
self._wait_on(self.completion_signal.event_id)
self._wait_signal(self.completion_signal)
assert (wp:=self.amd_aql_queue.write_dispatch_id) == (rp:=self.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
# reset kernargs