commit 9a95781d51 (parent 2ae4f45272)
George Hotz, 2024-04-23 09:00:28 +04:00, committed by GitHub
6 changed files with 51 additions and 51 deletions

View File

@@ -5,7 +5,7 @@ import tinygrad.runtime.autogen.amd_gpu as amd_gpu
 import tinygrad.runtime.autogen.kfd as kfd
 import tinygrad.runtime.autogen.hsa as hsa
 from tinygrad.engine.schedule import create_schedule
-from tinygrad.runtime.ops_kfd import kio, KFDProgram
+from tinygrad.runtime.ops_amd import kio, KFDProgram
 from tinygrad.helpers import to_mv
 
 DISPATCH_INIT_VALUE = 0x21 | 0x8000

View File

@@ -1,21 +1,21 @@
 import ctypes, mmap, time
-from tinygrad.runtime.ops_kfd import KFDDevice, kio, sdma_pkts, libc
+from tinygrad.runtime.ops_amd import AMDDevice, kio, sdma_pkts, libc
 import tinygrad.runtime.autogen.amd_sdma as amd_sdma
 import tinygrad.runtime.autogen.kfd as kfd
 from tinygrad.helpers import to_mv
 
 if __name__ == "__main__":
-  dev = KFDDevice()
+  dev = AMDDevice()
   sdma_ring = dev._gpu_alloc(1 << 22, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
   gart = dev._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
-  sdma_queue = kio.create_queue(KFDDevice.kfd,
+  sdma_queue = kio.create_queue(AMDDevice.kfd,
     ring_base_address=sdma_ring.va_addr, ring_size=sdma_ring.size, gpu_id=dev.gpu_id,
     queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
     write_pointer_address=gart.va_addr + 0x100, read_pointer_address=gart.va_addr + 0x108)
   doorbells_base = sdma_queue.doorbell_offset & (~0xfff)
-  doorbells = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, doorbells_base)
+  doorbells = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, AMDDevice.kfd, doorbells_base)
   sdma_read_pointer = to_mv(sdma_queue.read_pointer_address, 8).cast("Q")
   sdma_write_pointer = to_mv(sdma_queue.write_pointer_address, 8).cast("Q")
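
The mmap above first masks the queue's doorbell_offset down to a 4 KiB page boundary, since mmap needs a page-aligned file offset; the queue's own doorbell then sits at the difference between the two values. A hardware-free sketch of that arithmetic and of the to_mv(...).cast("Q") trick (the offset constant is hypothetical):

import ctypes

doorbell_offset = 0x12345008               # hypothetical value returned by create_queue
doorbells_base = doorbell_offset & ~0xfff  # round down to the 4KiB page to mmap
assert doorbells_base == 0x12345000 and doorbell_offset - doorbells_base == 0x8

buf = ctypes.create_string_buffer(16)      # stands in for the GTT-backed gart page
wptr = memoryview(buf).cast("Q")           # like to_mv(addr, 8).cast("Q"): raw bytes viewed as uint64
wptr[0] = 42
assert memoryview(buf).cast("Q")[0] == 42  # the write landed in the underlying buffer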

View File

@@ -2,7 +2,7 @@ import unittest, ctypes, struct, time
 from tinygrad import Device, Tensor, dtypes
 from tinygrad.buffer import Buffer, BufferOptions
 from tinygrad.engine.schedule import create_schedule
-from tinygrad.runtime.ops_kfd import KFDDevice, HWCopyQueue, HWPM4Queue
+from tinygrad.runtime.ops_amd import AMDDevice, HWCopyQueue, HWPM4Queue
 
 def _time_queue(q, d):
   st = time.perf_counter()
@@ -14,8 +14,8 @@ def _time_queue(q, d):
 class TestHCQ(unittest.TestCase):
   @classmethod
   def setUpClass(self):
-    TestHCQ.d0: KFDDevice = Device["KFD"]
-    #TestHCQ.d1: KFDDevice = Device["KFD:1"]
+    TestHCQ.d0: AMDDevice = Device["KFD"]
+    #TestHCQ.d1: AMDDevice = Device["KFD:1"]
     TestHCQ.a = Tensor([0.,1.], device="KFD").realize()
     TestHCQ.b = self.a + 1
     si = create_schedule([self.b.lazydata])[-1]
@@ -165,7 +165,7 @@ class TestHCQ(unittest.TestCase):
     q = TestHCQ.compute_queue()
     qc = HWCopyQueue()
     q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size) # b = [1, 2]
-    q.signal(sig:=KFDDevice._get_signal(10))
+    q.signal(sig:=AMDDevice._get_signal(10))
     qc.wait(sig)
     qc.copy(TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr, 8)
     qc.signal(TestHCQ.d0.completion_signal)
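
The point of the signal in this test is cross-queue ordering: the copy queue must not read b until the compute queue has produced it. A rough analogy in plain Python, with threading.Event standing in for the HSA signal (this models only the ordering, not the hardware):

import threading

sig = threading.Event()                # stands in for AMDDevice._get_signal(10)
data = {"b": None, "a": None}

def compute():                         # models q: exec, then signal
  data["b"] = [1.0, 2.0]               # the kernel writes b = a + 1
  sig.set()                            # q.signal(sig)

def copy():                            # models qc: wait, then copy
  sig.wait()                           # qc.wait(sig) blocks until compute has signaled
  data["a"] = list(data["b"])          # qc.copy(a, b, 8)

threads = [threading.Thread(target=copy), threading.Thread(target=compute)]
for t in threads: t.start()
for t in threads: t.join()
assert data["a"] == [1.0, 2.0]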

View File

@@ -3,10 +3,10 @@ import random
 from tqdm import trange
 from typing import List
 from tinygrad import Device
-from tinygrad.runtime.ops_kfd import KFDDevice, HWCopyQueue, HWComputeQueue
+from tinygrad.runtime.ops_amd import AMDDevice, HWCopyQueue, HWComputeQueue
 
 if __name__ == "__main__":
-  dev: List[KFDDevice] = [Device[f"KFD:{i}"] for i in range(6)]
+  dev: List[AMDDevice] = [Device[f"KFD:{i}"] for i in range(6)]
   print(f"got {len(dev)} devices")
   buffers = [(rd:=random.choice(dev), rd.allocator.alloc(random.randint(1, 10000))) for i in range(100)]
@@ -16,7 +16,7 @@ if __name__ == "__main__":
     d2, b2 = random.choice(buffers)
     d1._gpu_map(b2)
     q = HWComputeQueue()
-    q.signal(sig:=KFDDevice._get_signal(10))
+    q.signal(sig:=AMDDevice._get_signal(10))
     qc = HWCopyQueue()
     qc.wait(sig)
     qc.copy(b1.va_addr, b2.va_addr, min(b1.size, b2.size))
@@ -25,4 +25,4 @@ if __name__ == "__main__":
     qc.submit(d1)
     q.wait(d1.completion_signal)
     q.submit(d1)
-    KFDDevice._wait_on(d1.completion_signal.event_id)
+    AMDDevice._wait_on(d1.completion_signal.event_id)

View File

@@ -53,7 +53,7 @@ tensor_cores: Dict[str, List[TensorCore]] = {
   "HSA": [TensorCore(dims=(16,16,16), threads=[(0,8),(0,2),(1,2)], thread_local_sizes=[[16],[16],[4,2]], thread_local_aliases=[ [[2],[0],[0],[-1],[1]], [[0],[2],[1],[-1],[0]], [[-2],[2],[1],[0],[3,-1]] ], dtype_in=di, dtype_out=do) for (di, do) in [(dtypes.half, dtypes.float), (dtypes.half, dtypes.half)]], # noqa: E501
   "CUDA": [TensorCore(dims=(8,16,16), threads=[(0,2),(0,2),(1,2),(1,2),(0,2)], thread_local_sizes=[[2,2,2],[2,2],[2,2]], thread_local_aliases=[ [[0],[-2],[5],[0],[0],[-1,1,2,-3],[3,4]], [[5],[0],[0],[4],[3],[-1,1,2,-2],[0]], [[2],[-2],[5],[1],[-1],[0],[3,4]] ], dtype_in=di, dtype_out=do) for (di, do) in ([(dtypes.half, dtypes.float)] if getenv("PTX") else [(dtypes.half, dtypes.float), (dtypes.bfloat16, dtypes.float)])], # noqa: E501
 }
-tensor_cores["KFD"] = tensor_cores["HSA"]
+tensor_cores["AMD"] = tensor_cores["HSA"]
 
 class LocalBuffer(NamedTuple):
   name: str

View File

@@ -68,8 +68,8 @@ def create_sdma_packets():
   return type("SDMA_PKTS", (object, ), structs)
 sdma_pkts = create_sdma_packets()
 
-class KFDCompiler(Compiler):
-  compiler_opts = CompilerOptions("KFD", has_tensor_cores=True, shared_max=65536)
+class AMDCompiler(Compiler):
+  compiler_opts = CompilerOptions("AMD", has_tensor_cores=True, shared_max=65536)
   def __init__(self, arch:str):
     self.arch = arch
     super().__init__(f"compile_hip_{self.arch}")
@@ -238,7 +238,7 @@ class HWPM4Queue:
       global_size[0],global_size[1],global_size[2], CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
     # have to self wait since flush doesn't work
-    self.signal(sig:=KFDDevice._get_signal())
+    self.signal(sig:=AMDDevice._get_signal())
     self.wait(sig)
     if completion_signal: self.signal(completion_signal)
@@ -280,7 +280,7 @@ class HWPM4Queue:
       signal.event_id]
     return self
-  def submit(self, device:KFDDevice):
+  def submit(self, device:AMDDevice):
     wptr = device.pm4_write_pointer[0]
     pm4_buffer_view = to_mv(device.pm4_ring.va_addr, device.pm4_ring.size).cast("I")
     for i, value in enumerate(self.q): pm4_buffer_view[(wptr+i)%(device.pm4_ring.size//4)] = value
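
submit copies the queued dwords into the ring starting at the write pointer, wrapping with a modulo over the ring length in 32-bit words. A self-contained sketch of that indexing (the 16-dword ring and packet contents are arbitrary):

ring = [0] * 16                          # pm4_ring viewed as uint32s, 16 dwords for illustration
wptr = 14                                # pretend the write pointer is near the end of the ring
packet = [0xC0032200, 1, 2, 3]           # arbitrary dwords standing in for a PM4 packet
for i, value in enumerate(packet):
  ring[(wptr + i) % len(ring)] = value   # same wrap as (wptr+i) % (pm4_ring.size//4)
assert ring[14:] == [0xC0032200, 1] and ring[:2] == [2, 3]
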
@@ -299,7 +299,7 @@ sdma_cache_wb = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_
 class HWCopyQueue:
   def __init__(self): self.q = []
-  def submit(self, device:KFDDevice):
+  def submit(self, device:AMDDevice):
     read_ptr = device.sdma_read_pointer[0]
     if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun")
     for cmd in self.q:
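
The overrun check works because sdma_doorbell_value and the read pointer are monotonically increasing byte counters, never reduced modulo the ring size; their difference is exactly the bytes submitted but not yet consumed, which must fit in the ring. A sketch of that invariant:

ring_size = 0x100000                        # matches the 1MiB sdma_ring allocated below
read_ptr, doorbell_value = 0x2000, 0x3000   # hypothetical monotonic byte counters
in_flight = doorbell_value - read_ptr       # bytes written to the ring but not yet consumed
if in_flight > ring_size:                   # the same test as in HWCopyQueue.submit
  raise RuntimeError("SDMA queue overrun")
assert ring_size - in_flight == 0xff000     # room left before the queue would overrun
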
@@ -345,7 +345,7 @@ class HWCopyQueue:
     return self
 
 class KFDProgram:
-  def __init__(self, device:KFDDevice, name:str, lib:bytes):
+  def __init__(self, device:AMDDevice, name:str, lib:bytes):
     # TODO; this API needs the type signature of the function and global_size/local_size
     self.device, self.name, self.lib = device, name, lib
@@ -399,8 +399,8 @@ class KFDProgram:
     #assert (wp:=self.device.amd_aql_queue.write_dispatch_id) == (rp:=self.device.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
     return (self.device.completion_signal.end_ts-self.device.completion_signal.start_ts)/1e8
 
-class KFDAllocator(LRUAllocator):
-  def __init__(self, device:KFDDevice):
+class AMDAllocator(LRUAllocator):
+  def __init__(self, device:AMDDevice):
     self.device = device
     # NOTE: KFD_IOC_ALLOC_MEM_FLAGS_GTT doesn't work here for readinto
     self.b = [self.device._gpu_alloc(SDMA_MAX_COPY_SIZE*4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) for _ in range(2)]
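
The divisor in the profiling return value above implies the signal's start_ts/end_ts tick at 100 MHz (1e8 ticks per second), so the quotient is seconds. A one-line sanity check of that conversion (tick values are made up):

start_ts, end_ts = 1_000_000, 6_000_000   # hypothetical 100MHz tick counts
assert (end_ts - start_ts) / 1e8 == 0.05  # 5e6 ticks at 100MHz = 50ms
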
@@ -452,15 +452,15 @@ class KFDAllocator(LRUAllocator):
       self.device._wait_signal(self.device.signal_sdma)
       ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
 
-  def transfer(self, dest, src, sz:int, src_dev:KFDDevice, dest_dev:KFDDevice):
+  def transfer(self, dest, src, sz:int, src_dev:AMDDevice, dest_dev:AMDDevice):
     dest_dev._gpu_map(src)
-    q = HWPM4Queue().signal(sig := KFDDevice._get_signal())
-    HWCopyQueue().wait(sig).copy(dest.va_addr, src.va_addr, sz).signal(sigc := KFDDevice._get_signal()).submit(dest_dev)
+    q = HWPM4Queue().signal(sig := AMDDevice._get_signal())
+    HWCopyQueue().wait(sig).copy(dest.va_addr, src.va_addr, sz).signal(sigc := AMDDevice._get_signal()).submit(dest_dev)
     HWPM4Queue().wait(sigc).submit(dest_dev)
     q.wait(sigc).submit(src_dev)
 
 MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
-class KFDDevice(Compiled):
+class AMDDevice(Compiled):
   kfd:int = -1
   event_page:Any = None  # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
   signals_page:Any = None
@@ -501,15 +501,15 @@ class KFDDevice(Compiled):
   @classmethod
   def _get_signal(self, num=None, sync_event=None) -> hsa.amd_signal_t:
     if num is None:
-      num = KFDDevice.signal_number
-      KFDDevice.signal_number += 1
-      if KFDDevice.signal_number == SIGNAL_COUNT: KFDDevice.signal_number = 16
+      num = AMDDevice.signal_number
+      AMDDevice.signal_number += 1
+      if AMDDevice.signal_number == SIGNAL_COUNT: AMDDevice.signal_number = 16
     #print("signal", num)
-    ret = hsa.amd_signal_t.from_address(KFDDevice.signals_page.va_addr + SIGNAL_SIZE*num)
+    ret = hsa.amd_signal_t.from_address(AMDDevice.signals_page.va_addr + SIGNAL_SIZE*num)
     ret.value = 0
     ret.kind = hsa.AMD_SIGNAL_KIND_USER
     if sync_event is not None:
-      ret.event_mailbox_ptr = KFDDevice.event_page.va_addr + sync_event.event_slot_index*8
+      ret.event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
       ret.event_id = sync_event.event_id
     return ret
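
_get_signal is effectively a bump allocator over one shared signals page: a class-level counter picks the slot, wraps back to 16 at SIGNAL_COUNT (the low slots appear reserved for the fixed per-device completion/SDMA signals assigned in __init__ below), and the signal struct is overlaid at signals_page + SIGNAL_SIZE*num. A hardware-free sketch of that slot arithmetic (constants and base address are illustrative, not the real ops_amd values):

SIGNAL_SIZE, SIGNAL_COUNT = 0x40, 256  # illustrative; the real constants live in ops_amd
class SignalPool:
  signal_number = 16                   # dynamic allocation starts above the reserved slots
  base = 0x700000000000                # hypothetical signals_page.va_addr
  @classmethod
  def slot_addr(cls):
    num = cls.signal_number
    cls.signal_number += 1
    if cls.signal_number == SIGNAL_COUNT: cls.signal_number = 16  # wrap, skipping slots 0..15
    return cls.base + SIGNAL_SIZE * num  # where amd_signal_t.from_address would overlay

a, b = SignalPool.slot_addr(), SignalPool.slot_addr()
assert b - a == SIGNAL_SIZE            # consecutive calls hand out adjacent slots
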
@@ -518,7 +518,7 @@ class KFDDevice(Compiled):
     assert signal.event_id != 0, "can't wait on this signal"
     evt_arr = (kfd.struct_kfd_event_data * 1)()
     evt_arr[0].event_id = signal.event_id
-    ret = kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=timeout)
+    ret = kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=timeout)
     if ret.wait_result != 0: raise RuntimeError(f"wait_result: {ret.wait_result}, {timeout} ms TIMEOUT!")
     #val = signal.value
@@ -526,28 +526,28 @@ class KFDDevice(Compiled):
     assert skip_check or signal.value == 0, f"not set to 0, but {signal.value}"
 
   def __init__(self, device:str=""):
-    if KFDDevice.kfd == -1:
-      KFDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
-      KFDDevice.gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
+    if AMDDevice.kfd == -1:
+      AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
+      AMDDevice.gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
     self.device_id = int(device.split(":")[1]) if ":" in device else 0
-    with open(f"{KFDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
-    with open(f"{KFDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
+    with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
+    with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
     self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
     target = int(self.properties['gfx_target_version'])
     self.arch = "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)
-    kio.acquire_vm(KFDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
+    kio.acquire_vm(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
 
-    if KFDDevice.event_page is None:
-      KFDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
-      KFDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
-      sync_event = kio.create_event(KFDDevice.kfd, event_page_offset=KFDDevice.event_page.handle, auto_reset=1)
+    if AMDDevice.event_page is None:
+      AMDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+      AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+      sync_event = kio.create_event(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle, auto_reset=1)
     else:
-      self._gpu_map(KFDDevice.signals_page)
-      self._gpu_map(KFDDevice.event_page)
-      sync_event = kio.create_event(KFDDevice.kfd, auto_reset=1)
-    self.completion_signal = KFDDevice._get_signal(self.device_id*2, sync_event=sync_event)
-    self.signal_sdma = KFDDevice._get_signal(self.device_id*2+1, sync_event=kio.create_event(KFDDevice.kfd, auto_reset=1))
+      self._gpu_map(AMDDevice.signals_page)
+      self._gpu_map(AMDDevice.event_page)
+      sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)
+    self.completion_signal = AMDDevice._get_signal(self.device_id*2, sync_event=sync_event)
+    self.signal_sdma = AMDDevice._get_signal(self.device_id*2+1, sync_event=kio.create_event(AMDDevice.kfd, auto_reset=1))
     self.gart_aql = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
     self.aql_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
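
The arch string is the one non-mechanical line in this hunk: kfd reports gfx_target_version as major*10000 + minor*100 + stepping, and the format string prints the major in decimal and the minor/stepping as single hex digits. A quick check of that arithmetic:

def arch_from_target(target: int) -> str:
  # identical formula to the one in __init__ above
  return "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)

assert arch_from_target(110000) == "gfx1100"  # RDNA3-class target
assert arch_from_target(90010) == "gfx90a"    # stepping 10 prints as hex 'a'
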
@@ -567,13 +567,13 @@ class KFDDevice(Compiled):
     # SDMA Queue
     self.gart_sdma = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
     self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
-    self.sdma_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
+    self.sdma_queue = kio.create_queue(AMDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
       queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
       write_pointer_address=self.gart_sdma.va_addr, read_pointer_address=self.gart_sdma.va_addr+8)
 
     # doorbell page
     self.doorbells_base = self.sdma_queue.doorbell_offset & (~0x1fff)  # doorbell is two pages
-    self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base)
+    self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, AMDDevice.kfd, self.doorbells_base)
     self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
     self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
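
Unlike the standalone SDMA demo earlier (which masks with ~0xfff and maps 8192 bytes), the device treats the doorbell aperture as two pages: the base is masked with ~0x1fff, 0x2000 bytes are mapped, and each queue's doorbell view (as with pm4_doorbell below) is found by subtracting that base from the queue's doorbell_offset. A sketch of the offset math (the offset values are hypothetical):

sdma_doorbell_offset = 0xabc2008                      # hypothetical, from create_queue
pm4_doorbell_offset  = 0xabc2010
doorbells_base = sdma_doorbell_offset & ~0x1fff       # two-page-aligned mmap base, 0x2000 bytes long
assert doorbells_base == 0xabc2000
assert sdma_doorbell_offset - doorbells_base == 0x08  # byte offset of the SDMA doorbell in the map
assert pm4_doorbell_offset - doorbells_base == 0x10   # byte offset of the PM4 doorbell in the map
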
@@ -585,7 +585,7 @@ class KFDDevice(Compiled):
     self.eop_pm4_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
     self.gart_pm4 = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
     self.pm4_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
-    self.pm4_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.pm4_ring.va_addr, ring_size=self.pm4_ring.size, gpu_id=self.gpu_id,
+    self.pm4_queue = kio.create_queue(AMDDevice.kfd, ring_base_address=self.pm4_ring.va_addr, ring_size=self.pm4_ring.size, gpu_id=self.gpu_id,
       queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
       eop_buffer_address=self.eop_pm4_buffer.va_addr, eop_buffer_size=self.eop_pm4_buffer.size,
       # TODO: are these needed? (i know eop is)
@@ -597,7 +597,7 @@ class KFDDevice(Compiled):
     self.pm4_write_pointer = to_mv(self.pm4_queue.write_pointer_address, 8).cast("Q")
     self.pm4_doorbell = to_mv(self.doorbells + self.pm4_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
-    super().__init__(device, KFDAllocator(self), KFDCompiler(self.arch), functools.partial(KFDProgram, self))
+    super().__init__(device, AMDAllocator(self), AMDCompiler(self.arch), functools.partial(KFDProgram, self))
 
   def _submit_sdma(self, dest, src, copy_size, wait_signals=None, completion_signal=None):
     q = HWCopyQueue()
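
The super().__init__ call is where the rename becomes user-visible: the device hands tinygrad its allocator, compiler, and a program constructor with the device argument already bound via functools.partial (note KFDProgram itself keeps its name in this commit). A generic sketch of that pattern (Prog/Dev are stand-ins, not the tinygrad classes):

import functools

class Prog:
  def __init__(self, device, name: str, lib: bytes):
    self.device, self.name, self.lib = device, name, lib

class Dev:
  def __init__(self, ident: str):
    self.ident = ident
    self.runtime = functools.partial(Prog, self)  # device pre-bound, like functools.partial(KFDProgram, self)

d = Dev("AMD:0")
p = d.runtime("add_kernel", b"\x00")  # callers now supply only (name, lib)
assert p.device is d and p.name == "add_kernel"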