mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-02-13 08:05:10 -05:00
address kfd feedback (#4087)
* address kfd feedback * signals cleanup * signals cleanup * handle 2 doorbell pages correctly * signal reset cleanup * signals cleanup * more GTT * cleanups * minor cleanups
This commit is contained in:
@@ -54,10 +54,9 @@ def create_sdma_packets():
|
||||
fname = union_fields[0]
|
||||
if fname in names: fname = pkt_fields[0]+fname
|
||||
names.add(fname)
|
||||
if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"):
|
||||
fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64]) # merge together 64-bit fields
|
||||
else:
|
||||
fields.append(tuple([fname, *union_fields[1:]]))
|
||||
# merge together 64-bit fields, otherwise just append them
|
||||
if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"): fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64])
|
||||
else: fields.append(tuple([fname, *union_fields[1:]]))
|
||||
new_name = name[16:-4].lower()
|
||||
structs[new_name] = init_c_struct_t(tuple(fields))
|
||||
assert ctypes.sizeof(structs[new_name]) == ctypes.sizeof(pkt), f"{ctypes.sizeof(structs[new_name])} != {ctypes.sizeof(pkt)}"
|
||||
@@ -95,41 +94,44 @@ SHT_PROGBITS = 0x1
|
||||
SHF_ALLOC = 0x2
|
||||
|
||||
EMPTY_SIGNAL = hsa.hsa_signal_t()
|
||||
SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset
|
||||
|
||||
class HWComputeQueue:
|
||||
def __init__(self):
|
||||
self.q = []
|
||||
def __init__(self): self.q = []
|
||||
|
||||
def exec(self, prg:KFDProgram, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), completion_signal=None):
|
||||
if completion_signal is not None: completion_signal.value = 1
|
||||
self.q.append(hsa.hsa_kernel_dispatch_packet_t(
|
||||
setup=DISPATCH_KERNEL_SETUP, header=DISPATCH_KERNEL_HEADER,
|
||||
header=DISPATCH_KERNEL_HEADER, setup=DISPATCH_KERNEL_SETUP,
|
||||
workgroup_size_x=local_size[0], workgroup_size_y=local_size[1], workgroup_size_z=local_size[2],
|
||||
grid_size_x=global_size[0]*local_size[0], grid_size_y=global_size[1]*local_size[1], grid_size_z=global_size[2]*local_size[2],
|
||||
kernel_object=prg.handle, group_segment_size=prg.group_segment_size, private_segment_size=prg.private_segment_size,
|
||||
kernarg_address=kernargs,
|
||||
kernel_object=prg.handle, group_segment_size=prg.group_segment_size, private_segment_size=prg.private_segment_size, kernarg_address=kernargs,
|
||||
completion_signal=hsa.hsa_signal_t(ctypes.addressof(completion_signal)) if completion_signal is not None else EMPTY_SIGNAL))
|
||||
return self
|
||||
|
||||
def signal(self, signal):
|
||||
def signal(self, signal:hsa.amd_signal_t):
|
||||
signal.value = 1
|
||||
self.q.append(hsa.hsa_barrier_and_packet_t(header=BARRIER_HEADER, completion_signal=hsa.hsa_signal_t(ctypes.addressof(signal))))
|
||||
return self
|
||||
|
||||
def wait(self, signal):
|
||||
def wait(self, signal:hsa.amd_signal_t):
|
||||
sig = hsa.hsa_barrier_and_packet_t(header=BARRIER_HEADER)
|
||||
sig.dep_signal[0] = hsa.hsa_signal_t(ctypes.addressof(signal))
|
||||
self.q.append(sig)
|
||||
return self
|
||||
|
||||
def submit(self, device:KFDDevice):
|
||||
read_ptr = device.amd_aql_queue.read_dispatch_id
|
||||
if not len(self.q): return
|
||||
write_ptr, read_ptr = device.amd_aql_queue.write_dispatch_id, device.amd_aql_queue.read_dispatch_id
|
||||
if (len(self.q)+write_ptr-read_ptr)*AQL_PACKET_SIZE > device.aql_ring.size: raise RuntimeError("AQL queue overrun")
|
||||
for cmd in self.q:
|
||||
ring_addr = device.aql_ring.va_addr + (device.amd_aql_queue.write_dispatch_id*AQL_PACKET_SIZE) % device.aql_ring.size
|
||||
ring_addr = device.aql_ring.va_addr + (write_ptr*AQL_PACKET_SIZE) % device.aql_ring.size
|
||||
ctypes.memmove(ring_addr, ctypes.addressof(cmd), AQL_PACKET_SIZE)
|
||||
device.amd_aql_queue.write_dispatch_id += 1
|
||||
if (device.amd_aql_queue.write_dispatch_id-read_ptr)*AQL_PACKET_SIZE > device.aql_ring.size: raise RuntimeError("AQL queue overrun")
|
||||
if len(self.q):
|
||||
device.aql_doorbell[0] = device.aql_doorbell_value + len(self.q) - 1
|
||||
device.aql_doorbell_value += len(self.q)
|
||||
write_ptr += 1
|
||||
# TODO: add CPU memory barrier here
|
||||
device.amd_aql_queue.write_dispatch_id = write_ptr
|
||||
device.aql_doorbell[0] = device.aql_doorbell_value + len(self.q) - 1
|
||||
device.aql_doorbell_value += len(self.q)
|
||||
return self
|
||||
|
||||
# prebuilt sdma packets
|
||||
@@ -141,19 +143,17 @@ sdma_cache_wb = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_
|
||||
GCR_CONTROL_GL2_RANGE=0)
|
||||
|
||||
class HWCopyQueue:
|
||||
def __init__(self):
|
||||
self.q = []
|
||||
def __init__(self): self.q = []
|
||||
|
||||
def submit(self, device:KFDDevice):
|
||||
def blit_sdma_command(cmd):
|
||||
read_ptr = device.sdma_read_pointer[0]
|
||||
if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun")
|
||||
for cmd in self.q:
|
||||
if (cmdsz:=ctypes.sizeof(cmd)) > (fill:=device.sdma_ring.size - device.sdma_doorbell_value % device.sdma_ring.size):
|
||||
ctypes.memset(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), 0, fill)
|
||||
device.sdma_doorbell_value += fill
|
||||
ctypes.memmove(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), ctypes.addressof(cmd), cmdsz)
|
||||
device.sdma_doorbell_value += cmdsz
|
||||
read_ptr = device.sdma_read_pointer[0]
|
||||
for cmd in self.q: blit_sdma_command(cmd)
|
||||
if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun")
|
||||
device.sdma_write_pointer[0] = device.sdma_doorbell_value
|
||||
device.sdma_doorbell[0] = device.sdma_doorbell_value
|
||||
return self
|
||||
@@ -175,17 +175,18 @@ class HWCopyQueue:
|
||||
self.q.append(sdma_cache_wb)
|
||||
return self
|
||||
|
||||
def signal(self, completion_signal):
|
||||
def signal(self, signal:hsa.amd_signal_t):
|
||||
signal.value = 1
|
||||
self.q.append(sdma_pkts.atomic(op=amd_gpu.SDMA_OP_ATOMIC, operation=amd_gpu.SDMA_ATOMIC_ADD64,
|
||||
addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'value').offset, src_data=(1<<64)-1))
|
||||
if completion_signal.event_mailbox_ptr != 0:
|
||||
self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=completion_signal.event_mailbox_ptr, data=completion_signal.event_id))
|
||||
self.q.append(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=completion_signal.event_id))
|
||||
addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET, src_data=(1<<64)-1))
|
||||
if signal.event_mailbox_ptr != 0:
|
||||
self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=signal.event_mailbox_ptr, data=signal.event_id))
|
||||
self.q.append(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=signal.event_id))
|
||||
return self
|
||||
|
||||
def wait(self, completion_signal):
|
||||
def wait(self, signal:hsa.amd_signal_t):
|
||||
self.q.append(sdma_pkts.poll_regmem(op=amd_gpu.SDMA_OP_POLL_REGMEM, mem_poll=1, func=0x3,
|
||||
addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'value').offset,
|
||||
addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET,
|
||||
value=0, mask=0xffffffff, interval=0x04, retry_count=0xfff))
|
||||
return self
|
||||
|
||||
@@ -224,26 +225,26 @@ class KFDProgram:
|
||||
assert self.device.kernargs_ptr < (self.device.kernargs.va_addr + self.device.kernargs.size + self.kernargs_segment_size), "kernargs overrun"
|
||||
if not hasattr(self, "args_struct_t"):
|
||||
self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
|
||||
[(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
|
||||
[(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
|
||||
if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
|
||||
raise RuntimeError(f"HSAProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
|
||||
args_st = self.args_struct_t.from_address(self.device.kernargs_ptr)
|
||||
for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
|
||||
for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
|
||||
|
||||
self.device.completion_signal.value = 1 # reset the signal before call
|
||||
HWComputeQueue().exec(self, self.device.kernargs_ptr, global_size, local_size,
|
||||
self.device.completion_signal if wait else None).submit(self.device)
|
||||
self.device.kernargs_ptr += self.kernargs_segment_size
|
||||
|
||||
if wait:
|
||||
self.device._wait_on(self.device.completion_signal.event_id)
|
||||
self.device._wait_signal(self.device.completion_signal)
|
||||
assert (wp:=self.device.amd_aql_queue.write_dispatch_id) == (rp:=self.device.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
|
||||
return (self.device.completion_signal.end_ts-self.device.completion_signal.start_ts)/1e8
|
||||
|
||||
class KFDAllocator(LRUAllocator):
|
||||
def __init__(self, device:KFDDevice):
|
||||
self.device = device
|
||||
# NOTE: KFD_IOC_ALLOC_MEM_FLAGS_GTT doesn't work here for readinto
|
||||
self.b = [self.device._gpu_alloc(SDMA_MAX_COPY_SIZE*4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) for _ in range(2)]
|
||||
super().__init__()
|
||||
|
||||
@@ -260,6 +261,13 @@ class KFDAllocator(LRUAllocator):
|
||||
# self.device.synchronize()
|
||||
# return to_mv(src.va_addr, src.size)
|
||||
|
||||
def transfer(self, dest, src, sz:int, src_dev:KFDDevice, dest_dev:KFDDevice):
|
||||
dest_dev._gpu_map(src)
|
||||
q = HWComputeQueue().signal(sig := KFDDevice._get_signal())
|
||||
HWCopyQueue().wait(sig).copy(dest.va_addr, src.va_addr, sz).signal(sigc := KFDDevice._get_signal()).submit(dest_dev)
|
||||
HWComputeQueue().wait(sigc).submit(dest_dev)
|
||||
q.wait(sigc).submit(src_dev)
|
||||
|
||||
def copy_from_fd(self, dest, fd, offset, size):
|
||||
fo = io.FileIO(fd, "a+b", closefd=False)
|
||||
fo.seek(offset - (minor_offset:=offset % PAGE_SIZE))
|
||||
@@ -270,38 +278,27 @@ class KFDAllocator(LRUAllocator):
|
||||
if copy_size == 0: break
|
||||
|
||||
fo.readinto(to_mv(self.b[1].va_addr, local_size))
|
||||
if i != 0: self.device._wait_on(self.device.completion_signal.event_id)
|
||||
if i != 0: self.device._wait_signal(self.device.signal_sdma)
|
||||
self.b = self.b[::-1]
|
||||
self.device.completion_signal.value = 1 # TODO: when do we have to reset it?
|
||||
self.device._submit_sdma(dest.va_addr+copied_in, self.b[0].va_addr+minor_offset, copy_size, completion_signal=self.device.completion_signal)
|
||||
self.device._submit_sdma(dest.va_addr+copied_in, self.b[0].va_addr+minor_offset, copy_size, completion_signal=self.device.signal_sdma)
|
||||
|
||||
copied_in += copy_size
|
||||
minor_offset = 0 # only on the first
|
||||
self.device._wait_on(self.device.completion_signal.event_id)
|
||||
|
||||
def transfer(self, dest, src, sz:int, src_dev=None, dest_dev=None):
|
||||
dest_dev._gpu_map(src)
|
||||
q = HWComputeQueue().signal(sig := KFDDevice._get_signal())
|
||||
HWCopyQueue().wait(sig).copy(dest.va_addr, src.va_addr, sz).signal(sigc := KFDDevice._get_signal()).submit(dest_dev)
|
||||
HWComputeQueue().wait(sigc).submit(dest_dev)
|
||||
q.wait(sigc).submit(src_dev)
|
||||
self.device._wait_signal(self.device.signal_sdma)
|
||||
|
||||
def copyin(self, dest, src: memoryview):
|
||||
for i in range(0, src.nbytes, self.b[0].size):
|
||||
ctypes.memmove(self.b[1].va_addr, from_mv(src[i:]), lsize:=min(self.b[0].size, src.nbytes-i))
|
||||
if i != 0: self.device._wait_on(self.device.completion_signal.event_id)
|
||||
if i != 0: self.device._wait_signal(self.device.signal_sdma)
|
||||
self.b = self.b[::-1]
|
||||
self.device.completion_signal.value = 1 # TODO: when do we have to reset it?
|
||||
self.device._submit_sdma(dest.va_addr+i, self.b[0].va_addr, lsize, completion_signal=self.device.completion_signal)
|
||||
self.device._wait_on(self.device.completion_signal.event_id)
|
||||
self.device._submit_sdma(dest.va_addr+i, self.b[0].va_addr, lsize, completion_signal=self.device.signal_sdma)
|
||||
self.device._wait_signal(self.device.signal_sdma)
|
||||
|
||||
def copyout(self, dest:memoryview, src):
|
||||
self.device.synchronize()
|
||||
for i in range(0, dest.nbytes, self.b[0].size):
|
||||
self.device.completion_signal.value = 1
|
||||
self.device._submit_sdma(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i),
|
||||
completion_signal=self.device.completion_signal)
|
||||
self.device._wait_on(self.device.completion_signal.event_id)
|
||||
self.device._submit_sdma(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i), completion_signal=self.device.signal_sdma)
|
||||
self.device._wait_signal(self.device.signal_sdma)
|
||||
ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
|
||||
|
||||
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
|
||||
@@ -309,7 +306,7 @@ class KFDDevice(Compiled):
|
||||
kfd:int = -1
|
||||
event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
|
||||
signals_page:Any = None
|
||||
signal_number:int = 10
|
||||
signal_number:int = 16
|
||||
|
||||
def _gpu_map(self, mem):
|
||||
if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return
|
||||
@@ -318,13 +315,6 @@ class KFDDevice(Compiled):
|
||||
stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(mem.mapped_gpu_ids))
|
||||
assert stm.n_success == len(mem.mapped_gpu_ids)
|
||||
|
||||
@classmethod
|
||||
def _wait_on(self, event_id, timeout=10000):
|
||||
evt_arr = (kfd.struct_kfd_event_data * 1)()
|
||||
evt_arr[0].event_id = event_id
|
||||
ret = kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=timeout)
|
||||
if ret.wait_result != 0: raise RuntimeError(f"wait_result: {ret.wait_result}, {timeout} ms TIMEOUT!")
|
||||
|
||||
def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
|
||||
flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
|
||||
if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED
|
||||
@@ -337,7 +327,6 @@ class KFDDevice(Compiled):
|
||||
mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
|
||||
if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
|
||||
buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
|
||||
assert buf != 0xffffffffffffffff
|
||||
assert addr == buf == mem.va_addr
|
||||
if map_to_gpu: self._gpu_map(mem)
|
||||
return mem
|
||||
@@ -351,15 +340,26 @@ class KFDDevice(Compiled):
|
||||
kio.free_memory_of_gpu(self.kfd, handle=mem.handle)
|
||||
|
||||
@classmethod
|
||||
def _get_signal(self, num=None):
|
||||
def _get_signal(self, num=None, sync_event=None) -> hsa.amd_signal_t:
|
||||
if num is None:
|
||||
num = KFDDevice.signal_number
|
||||
KFDDevice.signal_number += 1
|
||||
if KFDDevice.signal_number == SIGNAL_COUNT: KFDDevice.signal_number = 10
|
||||
if KFDDevice.signal_number == SIGNAL_COUNT: KFDDevice.signal_number = 16
|
||||
ret = hsa.amd_signal_t.from_address(KFDDevice.signals_page.va_addr + SIGNAL_SIZE*num)
|
||||
ret.value = 1
|
||||
ret.kind = hsa.AMD_SIGNAL_KIND_USER
|
||||
if sync_event is not None:
|
||||
ret.event_mailbox_ptr = KFDDevice.event_page.va_addr + sync_event.event_slot_index*8
|
||||
ret.event_id = sync_event.event_id
|
||||
return ret
|
||||
|
||||
@classmethod
|
||||
def _wait_signal(self, signal:hsa.amd_signal_t, timeout=10000):
|
||||
assert signal.event_id != 0, "can't wait on this signal"
|
||||
evt_arr = (kfd.struct_kfd_event_data * 1)()
|
||||
evt_arr[0].event_id = signal.event_id
|
||||
ret = kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=timeout)
|
||||
if ret.wait_result != 0: raise RuntimeError(f"wait_result: {ret.wait_result}, {timeout} ms TIMEOUT!")
|
||||
|
||||
def __init__(self, device:str=""):
|
||||
if KFDDevice.kfd == -1: KFDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
|
||||
self.device_id = int(device.split(":")[1]) if ":" in device else 0
|
||||
@@ -370,31 +370,27 @@ class KFDDevice(Compiled):
|
||||
kio.acquire_vm(KFDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
|
||||
|
||||
if KFDDevice.event_page is None:
|
||||
KFDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
|
||||
for i in range(SIGNAL_COUNT):
|
||||
sig = KFDDevice._get_signal(i)
|
||||
sig.value = 1
|
||||
sig.kind = hsa.AMD_SIGNAL_KIND_USER
|
||||
KFDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
KFDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.sync_event = kio.create_event(KFDDevice.kfd, event_page_offset=KFDDevice.event_page.handle, auto_reset=1)
|
||||
sync_event = kio.create_event(KFDDevice.kfd, event_page_offset=KFDDevice.event_page.handle, auto_reset=1)
|
||||
else:
|
||||
self._gpu_map(KFDDevice.signals_page)
|
||||
self._gpu_map(KFDDevice.event_page)
|
||||
self.sync_event = kio.create_event(KFDDevice.kfd, auto_reset=1)
|
||||
sync_event = kio.create_event(KFDDevice.kfd, auto_reset=1)
|
||||
|
||||
self.gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.aql_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
|
||||
self.completion_signal = KFDDevice._get_signal(self.device_id*2, sync_event=sync_event)
|
||||
self.signal_sdma = KFDDevice._get_signal(self.device_id*2+1, sync_event=kio.create_event(KFDDevice.kfd, auto_reset=1))
|
||||
|
||||
self.gart_aql = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.gart_sdma = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.aql_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.eop_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
self.kernargs_ptr = self.kernargs.va_addr
|
||||
self.ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
|
||||
self.completion_signal = KFDDevice._get_signal(self.device_id)
|
||||
self.completion_signal.event_mailbox_ptr = KFDDevice.event_page.va_addr + self.sync_event.event_slot_index*8
|
||||
self.completion_signal.event_id = self.sync_event.event_id
|
||||
|
||||
# AQL Queue
|
||||
self.amd_aql_queue = hsa.amd_queue_t.from_address(self.gart.va_addr)
|
||||
self.amd_aql_queue = hsa.amd_queue_t.from_address(self.gart_aql.va_addr)
|
||||
self.amd_aql_queue.write_dispatch_id = 0
|
||||
self.amd_aql_queue.read_dispatch_id = 0
|
||||
self.amd_aql_queue.read_dispatch_id_field_base_byte_offset = getattr(hsa.amd_queue_t, 'read_dispatch_id').offset
|
||||
@@ -423,19 +419,19 @@ class KFDDevice(Compiled):
|
||||
eop_buffer_address=self.eop_buffer.va_addr, eop_buffer_size=self.eop_buffer.size,
|
||||
ctx_save_restore_address=self.ctx_save_restore_address.va_addr, ctx_save_restore_size=self.ctx_save_restore_address.size,
|
||||
ctl_stack_size = 0xa000,
|
||||
write_pointer_address=self.gart.va_addr + getattr(hsa.amd_queue_t, 'write_dispatch_id').offset,
|
||||
read_pointer_address=self.gart.va_addr + getattr(hsa.amd_queue_t, 'read_dispatch_id').offset)
|
||||
write_pointer_address=self.gart_aql.va_addr + getattr(hsa.amd_queue_t, 'write_dispatch_id').offset,
|
||||
read_pointer_address=self.gart_aql.va_addr + getattr(hsa.amd_queue_t, 'read_dispatch_id').offset)
|
||||
|
||||
self.doorbells_base = self.aql_queue.doorbell_offset & (~0xfff)
|
||||
self.doorbells = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base)
|
||||
self.doorbells_base = self.aql_queue.doorbell_offset & (~0x1fff) # doorbell is two pages
|
||||
self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base)
|
||||
self.aql_doorbell = to_mv(self.doorbells + self.aql_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
|
||||
self.aql_doorbell_value = 0
|
||||
|
||||
# SDMA Queue
|
||||
self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
|
||||
self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.sdma_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
|
||||
queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
||||
write_pointer_address=self.gart.va_addr + 0x100, read_pointer_address=self.gart.va_addr + 0x108)
|
||||
write_pointer_address=self.gart_sdma.va_addr, read_pointer_address=self.gart_sdma.va_addr)
|
||||
|
||||
self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
|
||||
self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
|
||||
@@ -443,7 +439,7 @@ class KFDDevice(Compiled):
|
||||
self.sdma_doorbell_value = 0
|
||||
|
||||
# PM4 stuff
|
||||
self.pm4_indirect_buf = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
|
||||
self.pm4_indirect_buf = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
pm4_indirect_cmd = (ctypes.c_uint32*13)(amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), self.pm4_indirect_buf.va_addr & 0xffffffff,
|
||||
(self.pm4_indirect_buf.va_addr>>32) & 0xffffffff, 8 | amd_gpu.INDIRECT_BUFFER_VALID, 0xa)
|
||||
ctypes.memmove(ctypes.addressof(pm4_cmds:=(ctypes.c_uint16*27)(1))+2, ctypes.addressof(pm4_indirect_cmd), ctypes.sizeof(pm4_indirect_cmd))
|
||||
@@ -474,12 +470,12 @@ class KFDDevice(Compiled):
|
||||
q = HWComputeQueue()
|
||||
q.q.append(self.pm4_packet)
|
||||
q.submit(self)
|
||||
self._wait_on(self.completion_signal.event_id)
|
||||
self._wait_signal(self.completion_signal)
|
||||
assert (wp:=self.amd_aql_queue.write_dispatch_id) == (rp:=self.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
|
||||
|
||||
def synchronize(self):
|
||||
HWComputeQueue().signal(self.completion_signal).submit(self)
|
||||
self._wait_on(self.completion_signal.event_id)
|
||||
self._wait_signal(self.completion_signal)
|
||||
assert (wp:=self.amd_aql_queue.write_dispatch_id) == (rp:=self.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
|
||||
|
||||
# reset kernargs
|
||||
|
||||
Reference in New Issue
Block a user