mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-09 23:18:04 -05:00
renamed (#4260)
This commit is contained in:
@@ -5,7 +5,7 @@ import tinygrad.runtime.autogen.amd_gpu as amd_gpu
|
|||||||
import tinygrad.runtime.autogen.kfd as kfd
|
import tinygrad.runtime.autogen.kfd as kfd
|
||||||
import tinygrad.runtime.autogen.hsa as hsa
|
import tinygrad.runtime.autogen.hsa as hsa
|
||||||
from tinygrad.engine.schedule import create_schedule
|
from tinygrad.engine.schedule import create_schedule
|
||||||
from tinygrad.runtime.ops_kfd import kio, KFDProgram
|
from tinygrad.runtime.ops_amd import kio, KFDProgram
|
||||||
from tinygrad.helpers import to_mv
|
from tinygrad.helpers import to_mv
|
||||||
|
|
||||||
DISPATCH_INIT_VALUE = 0x21 | 0x8000
|
DISPATCH_INIT_VALUE = 0x21 | 0x8000
|
||||||
|
|||||||
@@ -1,21 +1,21 @@
|
|||||||
import ctypes, mmap, time
|
import ctypes, mmap, time
|
||||||
from tinygrad.runtime.ops_kfd import KFDDevice, kio, sdma_pkts, libc
|
from tinygrad.runtime.ops_amd import AMDDevice, kio, sdma_pkts, libc
|
||||||
import tinygrad.runtime.autogen.amd_sdma as amd_sdma
|
import tinygrad.runtime.autogen.amd_sdma as amd_sdma
|
||||||
import tinygrad.runtime.autogen.kfd as kfd
|
import tinygrad.runtime.autogen.kfd as kfd
|
||||||
from tinygrad.helpers import to_mv
|
from tinygrad.helpers import to_mv
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
dev = KFDDevice()
|
dev = AMDDevice()
|
||||||
|
|
||||||
sdma_ring = dev._gpu_alloc(1 << 22, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
|
sdma_ring = dev._gpu_alloc(1 << 22, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
|
||||||
gart = dev._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
gart = dev._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||||
sdma_queue = kio.create_queue(KFDDevice.kfd,
|
sdma_queue = kio.create_queue(AMDDevice.kfd,
|
||||||
ring_base_address=sdma_ring.va_addr, ring_size=sdma_ring.size, gpu_id=dev.gpu_id,
|
ring_base_address=sdma_ring.va_addr, ring_size=sdma_ring.size, gpu_id=dev.gpu_id,
|
||||||
queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
||||||
write_pointer_address=gart.va_addr + 0x100, read_pointer_address=gart.va_addr + 0x108)
|
write_pointer_address=gart.va_addr + 0x100, read_pointer_address=gart.va_addr + 0x108)
|
||||||
|
|
||||||
doorbells_base = sdma_queue.doorbell_offset & (~0xfff)
|
doorbells_base = sdma_queue.doorbell_offset & (~0xfff)
|
||||||
doorbells = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, doorbells_base)
|
doorbells = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, AMDDevice.kfd, doorbells_base)
|
||||||
|
|
||||||
sdma_read_pointer = to_mv(sdma_queue.read_pointer_address, 8).cast("Q")
|
sdma_read_pointer = to_mv(sdma_queue.read_pointer_address, 8).cast("Q")
|
||||||
sdma_write_pointer = to_mv(sdma_queue.write_pointer_address, 8).cast("Q")
|
sdma_write_pointer = to_mv(sdma_queue.write_pointer_address, 8).cast("Q")
|
||||||
|
|||||||
8
test/external/external_test_hcq.py
vendored
8
test/external/external_test_hcq.py
vendored
@@ -2,7 +2,7 @@ import unittest, ctypes, struct, time
|
|||||||
from tinygrad import Device, Tensor, dtypes
|
from tinygrad import Device, Tensor, dtypes
|
||||||
from tinygrad.buffer import Buffer, BufferOptions
|
from tinygrad.buffer import Buffer, BufferOptions
|
||||||
from tinygrad.engine.schedule import create_schedule
|
from tinygrad.engine.schedule import create_schedule
|
||||||
from tinygrad.runtime.ops_kfd import KFDDevice, HWCopyQueue, HWPM4Queue
|
from tinygrad.runtime.ops_amd import AMDDevice, HWCopyQueue, HWPM4Queue
|
||||||
|
|
||||||
def _time_queue(q, d):
|
def _time_queue(q, d):
|
||||||
st = time.perf_counter()
|
st = time.perf_counter()
|
||||||
@@ -14,8 +14,8 @@ def _time_queue(q, d):
|
|||||||
class TestHCQ(unittest.TestCase):
|
class TestHCQ(unittest.TestCase):
|
||||||
@classmethod
|
@classmethod
|
||||||
def setUpClass(self):
|
def setUpClass(self):
|
||||||
TestHCQ.d0: KFDDevice = Device["KFD"]
|
TestHCQ.d0: AMDDevice = Device["KFD"]
|
||||||
#TestHCQ.d1: KFDDevice = Device["KFD:1"]
|
#TestHCQ.d1: AMDDevice = Device["KFD:1"]
|
||||||
TestHCQ.a = Tensor([0.,1.], device="KFD").realize()
|
TestHCQ.a = Tensor([0.,1.], device="KFD").realize()
|
||||||
TestHCQ.b = self.a + 1
|
TestHCQ.b = self.a + 1
|
||||||
si = create_schedule([self.b.lazydata])[-1]
|
si = create_schedule([self.b.lazydata])[-1]
|
||||||
@@ -165,7 +165,7 @@ class TestHCQ(unittest.TestCase):
|
|||||||
q = TestHCQ.compute_queue()
|
q = TestHCQ.compute_queue()
|
||||||
qc = HWCopyQueue()
|
qc = HWCopyQueue()
|
||||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size) # b = [1, 2]
|
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size) # b = [1, 2]
|
||||||
q.signal(sig:=KFDDevice._get_signal(10))
|
q.signal(sig:=AMDDevice._get_signal(10))
|
||||||
qc.wait(sig)
|
qc.wait(sig)
|
||||||
qc.copy(TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr, 8)
|
qc.copy(TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr, 8)
|
||||||
qc.signal(TestHCQ.d0.completion_signal)
|
qc.signal(TestHCQ.d0.completion_signal)
|
||||||
|
|||||||
8
test/external/fuzz_kfd.py
vendored
8
test/external/fuzz_kfd.py
vendored
@@ -3,10 +3,10 @@ import random
|
|||||||
from tqdm import trange
|
from tqdm import trange
|
||||||
from typing import List
|
from typing import List
|
||||||
from tinygrad import Device
|
from tinygrad import Device
|
||||||
from tinygrad.runtime.ops_kfd import KFDDevice, HWCopyQueue, HWComputeQueue
|
from tinygrad.runtime.ops_amd import AMDDevice, HWCopyQueue, HWComputeQueue
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
dev: List[KFDDevice] = [Device[f"KFD:{i}"] for i in range(6)]
|
dev: List[AMDDevice] = [Device[f"KFD:{i}"] for i in range(6)]
|
||||||
print(f"got {len(dev)} devices")
|
print(f"got {len(dev)} devices")
|
||||||
|
|
||||||
buffers = [(rd:=random.choice(dev), rd.allocator.alloc(random.randint(1, 10000))) for i in range(100)]
|
buffers = [(rd:=random.choice(dev), rd.allocator.alloc(random.randint(1, 10000))) for i in range(100)]
|
||||||
@@ -16,7 +16,7 @@ if __name__ == "__main__":
|
|||||||
d2, b2 = random.choice(buffers)
|
d2, b2 = random.choice(buffers)
|
||||||
d1._gpu_map(b2)
|
d1._gpu_map(b2)
|
||||||
q = HWComputeQueue()
|
q = HWComputeQueue()
|
||||||
q.signal(sig:=KFDDevice._get_signal(10))
|
q.signal(sig:=AMDDevice._get_signal(10))
|
||||||
qc = HWCopyQueue()
|
qc = HWCopyQueue()
|
||||||
qc.wait(sig)
|
qc.wait(sig)
|
||||||
qc.copy(b1.va_addr, b2.va_addr, min(b1.size, b2.size))
|
qc.copy(b1.va_addr, b2.va_addr, min(b1.size, b2.size))
|
||||||
@@ -25,4 +25,4 @@ if __name__ == "__main__":
|
|||||||
qc.submit(d1)
|
qc.submit(d1)
|
||||||
q.wait(d1.completion_signal)
|
q.wait(d1.completion_signal)
|
||||||
q.submit(d1)
|
q.submit(d1)
|
||||||
KFDDevice._wait_on(d1.completion_signal.event_id)
|
AMDDevice._wait_on(d1.completion_signal.event_id)
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ tensor_cores: Dict[str, List[TensorCore]] = {
|
|||||||
"HSA": [TensorCore(dims=(16,16,16), threads=[(0,8),(0,2),(1,2)], thread_local_sizes=[[16],[16],[4,2]], thread_local_aliases=[ [[2],[0],[0],[-1],[1]], [[0],[2],[1],[-1],[0]], [[-2],[2],[1],[0],[3,-1]] ], dtype_in=di, dtype_out=do) for (di, do) in [(dtypes.half, dtypes.float), (dtypes.half, dtypes.half)]], # noqa: E501
|
"HSA": [TensorCore(dims=(16,16,16), threads=[(0,8),(0,2),(1,2)], thread_local_sizes=[[16],[16],[4,2]], thread_local_aliases=[ [[2],[0],[0],[-1],[1]], [[0],[2],[1],[-1],[0]], [[-2],[2],[1],[0],[3,-1]] ], dtype_in=di, dtype_out=do) for (di, do) in [(dtypes.half, dtypes.float), (dtypes.half, dtypes.half)]], # noqa: E501
|
||||||
"CUDA": [TensorCore(dims=(8,16,16), threads=[(0,2),(0,2),(1,2),(1,2),(0,2)], thread_local_sizes=[[2,2,2],[2,2],[2,2]], thread_local_aliases=[ [[0],[-2],[5],[0],[0],[-1,1,2,-3],[3,4]], [[5],[0],[0],[4],[3],[-1,1,2,-2],[0]], [[2],[-2],[5],[1],[-1],[0],[3,4]] ], dtype_in=di, dtype_out=do) for (di, do) in ([(dtypes.half, dtypes.float)] if getenv("PTX") else [(dtypes.half, dtypes.float), (dtypes.bfloat16, dtypes.float)])], # noqa: E501
|
"CUDA": [TensorCore(dims=(8,16,16), threads=[(0,2),(0,2),(1,2),(1,2),(0,2)], thread_local_sizes=[[2,2,2],[2,2],[2,2]], thread_local_aliases=[ [[0],[-2],[5],[0],[0],[-1,1,2,-3],[3,4]], [[5],[0],[0],[4],[3],[-1,1,2,-2],[0]], [[2],[-2],[5],[1],[-1],[0],[3,4]] ], dtype_in=di, dtype_out=do) for (di, do) in ([(dtypes.half, dtypes.float)] if getenv("PTX") else [(dtypes.half, dtypes.float), (dtypes.bfloat16, dtypes.float)])], # noqa: E501
|
||||||
}
|
}
|
||||||
tensor_cores["KFD"] = tensor_cores["HSA"]
|
tensor_cores["AMD"] = tensor_cores["HSA"]
|
||||||
|
|
||||||
class LocalBuffer(NamedTuple):
|
class LocalBuffer(NamedTuple):
|
||||||
name: str
|
name: str
|
||||||
|
|||||||
@@ -68,8 +68,8 @@ def create_sdma_packets():
|
|||||||
return type("SDMA_PKTS", (object, ), structs)
|
return type("SDMA_PKTS", (object, ), structs)
|
||||||
sdma_pkts = create_sdma_packets()
|
sdma_pkts = create_sdma_packets()
|
||||||
|
|
||||||
class KFDCompiler(Compiler):
|
class AMDCompiler(Compiler):
|
||||||
compiler_opts = CompilerOptions("KFD", has_tensor_cores=True, shared_max=65536)
|
compiler_opts = CompilerOptions("AMD", has_tensor_cores=True, shared_max=65536)
|
||||||
def __init__(self, arch:str):
|
def __init__(self, arch:str):
|
||||||
self.arch = arch
|
self.arch = arch
|
||||||
super().__init__(f"compile_hip_{self.arch}")
|
super().__init__(f"compile_hip_{self.arch}")
|
||||||
@@ -238,7 +238,7 @@ class HWPM4Queue:
|
|||||||
global_size[0],global_size[1],global_size[2], CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
|
global_size[0],global_size[1],global_size[2], CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
|
||||||
|
|
||||||
# have to self wait since flush doesn't work
|
# have to self wait since flush doesn't work
|
||||||
self.signal(sig:=KFDDevice._get_signal())
|
self.signal(sig:=AMDDevice._get_signal())
|
||||||
self.wait(sig)
|
self.wait(sig)
|
||||||
|
|
||||||
if completion_signal: self.signal(completion_signal)
|
if completion_signal: self.signal(completion_signal)
|
||||||
@@ -280,7 +280,7 @@ class HWPM4Queue:
|
|||||||
signal.event_id]
|
signal.event_id]
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def submit(self, device:KFDDevice):
|
def submit(self, device:AMDDevice):
|
||||||
wptr = device.pm4_write_pointer[0]
|
wptr = device.pm4_write_pointer[0]
|
||||||
pm4_buffer_view = to_mv(device.pm4_ring.va_addr, device.pm4_ring.size).cast("I")
|
pm4_buffer_view = to_mv(device.pm4_ring.va_addr, device.pm4_ring.size).cast("I")
|
||||||
for i, value in enumerate(self.q): pm4_buffer_view[(wptr+i)%(device.pm4_ring.size//4)] = value
|
for i, value in enumerate(self.q): pm4_buffer_view[(wptr+i)%(device.pm4_ring.size//4)] = value
|
||||||
@@ -299,7 +299,7 @@ sdma_cache_wb = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_
|
|||||||
class HWCopyQueue:
|
class HWCopyQueue:
|
||||||
def __init__(self): self.q = []
|
def __init__(self): self.q = []
|
||||||
|
|
||||||
def submit(self, device:KFDDevice):
|
def submit(self, device:AMDDevice):
|
||||||
read_ptr = device.sdma_read_pointer[0]
|
read_ptr = device.sdma_read_pointer[0]
|
||||||
if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun")
|
if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun")
|
||||||
for cmd in self.q:
|
for cmd in self.q:
|
||||||
@@ -345,7 +345,7 @@ class HWCopyQueue:
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
class KFDProgram:
|
class KFDProgram:
|
||||||
def __init__(self, device:KFDDevice, name:str, lib:bytes):
|
def __init__(self, device:AMDDevice, name:str, lib:bytes):
|
||||||
# TODO; this API needs the type signature of the function and global_size/local_size
|
# TODO; this API needs the type signature of the function and global_size/local_size
|
||||||
self.device, self.name, self.lib = device, name, lib
|
self.device, self.name, self.lib = device, name, lib
|
||||||
|
|
||||||
@@ -399,8 +399,8 @@ class KFDProgram:
|
|||||||
#assert (wp:=self.device.amd_aql_queue.write_dispatch_id) == (rp:=self.device.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
|
#assert (wp:=self.device.amd_aql_queue.write_dispatch_id) == (rp:=self.device.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
|
||||||
return (self.device.completion_signal.end_ts-self.device.completion_signal.start_ts)/1e8
|
return (self.device.completion_signal.end_ts-self.device.completion_signal.start_ts)/1e8
|
||||||
|
|
||||||
class KFDAllocator(LRUAllocator):
|
class AMDAllocator(LRUAllocator):
|
||||||
def __init__(self, device:KFDDevice):
|
def __init__(self, device:AMDDevice):
|
||||||
self.device = device
|
self.device = device
|
||||||
# NOTE: KFD_IOC_ALLOC_MEM_FLAGS_GTT doesn't work here for readinto
|
# NOTE: KFD_IOC_ALLOC_MEM_FLAGS_GTT doesn't work here for readinto
|
||||||
self.b = [self.device._gpu_alloc(SDMA_MAX_COPY_SIZE*4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) for _ in range(2)]
|
self.b = [self.device._gpu_alloc(SDMA_MAX_COPY_SIZE*4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) for _ in range(2)]
|
||||||
@@ -452,15 +452,15 @@ class KFDAllocator(LRUAllocator):
|
|||||||
self.device._wait_signal(self.device.signal_sdma)
|
self.device._wait_signal(self.device.signal_sdma)
|
||||||
ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
|
ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
|
||||||
|
|
||||||
def transfer(self, dest, src, sz:int, src_dev:KFDDevice, dest_dev:KFDDevice):
|
def transfer(self, dest, src, sz:int, src_dev:AMDDevice, dest_dev:AMDDevice):
|
||||||
dest_dev._gpu_map(src)
|
dest_dev._gpu_map(src)
|
||||||
q = HWPM4Queue().signal(sig := KFDDevice._get_signal())
|
q = HWPM4Queue().signal(sig := AMDDevice._get_signal())
|
||||||
HWCopyQueue().wait(sig).copy(dest.va_addr, src.va_addr, sz).signal(sigc := KFDDevice._get_signal()).submit(dest_dev)
|
HWCopyQueue().wait(sig).copy(dest.va_addr, src.va_addr, sz).signal(sigc := AMDDevice._get_signal()).submit(dest_dev)
|
||||||
HWPM4Queue().wait(sigc).submit(dest_dev)
|
HWPM4Queue().wait(sigc).submit(dest_dev)
|
||||||
q.wait(sigc).submit(src_dev)
|
q.wait(sigc).submit(src_dev)
|
||||||
|
|
||||||
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
|
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
|
||||||
class KFDDevice(Compiled):
|
class AMDDevice(Compiled):
|
||||||
kfd:int = -1
|
kfd:int = -1
|
||||||
event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
|
event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
|
||||||
signals_page:Any = None
|
signals_page:Any = None
|
||||||
@@ -501,15 +501,15 @@ class KFDDevice(Compiled):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def _get_signal(self, num=None, sync_event=None) -> hsa.amd_signal_t:
|
def _get_signal(self, num=None, sync_event=None) -> hsa.amd_signal_t:
|
||||||
if num is None:
|
if num is None:
|
||||||
num = KFDDevice.signal_number
|
num = AMDDevice.signal_number
|
||||||
KFDDevice.signal_number += 1
|
AMDDevice.signal_number += 1
|
||||||
if KFDDevice.signal_number == SIGNAL_COUNT: KFDDevice.signal_number = 16
|
if AMDDevice.signal_number == SIGNAL_COUNT: AMDDevice.signal_number = 16
|
||||||
#print("signal", num)
|
#print("signal", num)
|
||||||
ret = hsa.amd_signal_t.from_address(KFDDevice.signals_page.va_addr + SIGNAL_SIZE*num)
|
ret = hsa.amd_signal_t.from_address(AMDDevice.signals_page.va_addr + SIGNAL_SIZE*num)
|
||||||
ret.value = 0
|
ret.value = 0
|
||||||
ret.kind = hsa.AMD_SIGNAL_KIND_USER
|
ret.kind = hsa.AMD_SIGNAL_KIND_USER
|
||||||
if sync_event is not None:
|
if sync_event is not None:
|
||||||
ret.event_mailbox_ptr = KFDDevice.event_page.va_addr + sync_event.event_slot_index*8
|
ret.event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
|
||||||
ret.event_id = sync_event.event_id
|
ret.event_id = sync_event.event_id
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
@@ -518,7 +518,7 @@ class KFDDevice(Compiled):
|
|||||||
assert signal.event_id != 0, "can't wait on this signal"
|
assert signal.event_id != 0, "can't wait on this signal"
|
||||||
evt_arr = (kfd.struct_kfd_event_data * 1)()
|
evt_arr = (kfd.struct_kfd_event_data * 1)()
|
||||||
evt_arr[0].event_id = signal.event_id
|
evt_arr[0].event_id = signal.event_id
|
||||||
ret = kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=timeout)
|
ret = kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=timeout)
|
||||||
if ret.wait_result != 0: raise RuntimeError(f"wait_result: {ret.wait_result}, {timeout} ms TIMEOUT!")
|
if ret.wait_result != 0: raise RuntimeError(f"wait_result: {ret.wait_result}, {timeout} ms TIMEOUT!")
|
||||||
|
|
||||||
#val = signal.value
|
#val = signal.value
|
||||||
@@ -526,28 +526,28 @@ class KFDDevice(Compiled):
|
|||||||
assert skip_check or signal.value == 0, f"not set to 0, but {signal.value}"
|
assert skip_check or signal.value == 0, f"not set to 0, but {signal.value}"
|
||||||
|
|
||||||
def __init__(self, device:str=""):
|
def __init__(self, device:str=""):
|
||||||
if KFDDevice.kfd == -1:
|
if AMDDevice.kfd == -1:
|
||||||
KFDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
|
AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
|
||||||
KFDDevice.gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
|
AMDDevice.gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
|
||||||
self.device_id = int(device.split(":")[1]) if ":" in device else 0
|
self.device_id = int(device.split(":")[1]) if ":" in device else 0
|
||||||
with open(f"{KFDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
|
with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
|
||||||
with open(f"{KFDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
|
with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
|
||||||
self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
|
self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
|
||||||
target = int(self.properties['gfx_target_version'])
|
target = int(self.properties['gfx_target_version'])
|
||||||
self.arch = "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)
|
self.arch = "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)
|
||||||
kio.acquire_vm(KFDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
|
kio.acquire_vm(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
|
||||||
|
|
||||||
if KFDDevice.event_page is None:
|
if AMDDevice.event_page is None:
|
||||||
KFDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
AMDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||||
KFDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||||
sync_event = kio.create_event(KFDDevice.kfd, event_page_offset=KFDDevice.event_page.handle, auto_reset=1)
|
sync_event = kio.create_event(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle, auto_reset=1)
|
||||||
else:
|
else:
|
||||||
self._gpu_map(KFDDevice.signals_page)
|
self._gpu_map(AMDDevice.signals_page)
|
||||||
self._gpu_map(KFDDevice.event_page)
|
self._gpu_map(AMDDevice.event_page)
|
||||||
sync_event = kio.create_event(KFDDevice.kfd, auto_reset=1)
|
sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)
|
||||||
|
|
||||||
self.completion_signal = KFDDevice._get_signal(self.device_id*2, sync_event=sync_event)
|
self.completion_signal = AMDDevice._get_signal(self.device_id*2, sync_event=sync_event)
|
||||||
self.signal_sdma = KFDDevice._get_signal(self.device_id*2+1, sync_event=kio.create_event(KFDDevice.kfd, auto_reset=1))
|
self.signal_sdma = AMDDevice._get_signal(self.device_id*2+1, sync_event=kio.create_event(AMDDevice.kfd, auto_reset=1))
|
||||||
|
|
||||||
self.gart_aql = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
self.gart_aql = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||||
self.aql_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
self.aql_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||||
@@ -567,13 +567,13 @@ class KFDDevice(Compiled):
|
|||||||
# SDMA Queue
|
# SDMA Queue
|
||||||
self.gart_sdma = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
self.gart_sdma = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||||
self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||||
self.sdma_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
|
self.sdma_queue = kio.create_queue(AMDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
|
||||||
queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
||||||
write_pointer_address=self.gart_sdma.va_addr, read_pointer_address=self.gart_sdma.va_addr+8)
|
write_pointer_address=self.gart_sdma.va_addr, read_pointer_address=self.gart_sdma.va_addr+8)
|
||||||
|
|
||||||
# doorbell page
|
# doorbell page
|
||||||
self.doorbells_base = self.sdma_queue.doorbell_offset & (~0x1fff) # doorbell is two pages
|
self.doorbells_base = self.sdma_queue.doorbell_offset & (~0x1fff) # doorbell is two pages
|
||||||
self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base)
|
self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, AMDDevice.kfd, self.doorbells_base)
|
||||||
|
|
||||||
self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
|
self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
|
||||||
self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
|
self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
|
||||||
@@ -585,7 +585,7 @@ class KFDDevice(Compiled):
|
|||||||
self.eop_pm4_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
self.eop_pm4_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||||
self.gart_pm4 = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
self.gart_pm4 = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||||
self.pm4_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
self.pm4_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||||
self.pm4_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.pm4_ring.va_addr, ring_size=self.pm4_ring.size, gpu_id=self.gpu_id,
|
self.pm4_queue = kio.create_queue(AMDDevice.kfd, ring_base_address=self.pm4_ring.va_addr, ring_size=self.pm4_ring.size, gpu_id=self.gpu_id,
|
||||||
queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
||||||
eop_buffer_address=self.eop_pm4_buffer.va_addr, eop_buffer_size=self.eop_pm4_buffer.size,
|
eop_buffer_address=self.eop_pm4_buffer.va_addr, eop_buffer_size=self.eop_pm4_buffer.size,
|
||||||
# TODO: are these needed? (i know eop is)
|
# TODO: are these needed? (i know eop is)
|
||||||
@@ -597,7 +597,7 @@ class KFDDevice(Compiled):
|
|||||||
self.pm4_write_pointer = to_mv(self.pm4_queue.write_pointer_address, 8).cast("Q")
|
self.pm4_write_pointer = to_mv(self.pm4_queue.write_pointer_address, 8).cast("Q")
|
||||||
self.pm4_doorbell = to_mv(self.doorbells + self.pm4_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
|
self.pm4_doorbell = to_mv(self.doorbells + self.pm4_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
|
||||||
|
|
||||||
super().__init__(device, KFDAllocator(self), KFDCompiler(self.arch), functools.partial(KFDProgram, self))
|
super().__init__(device, AMDAllocator(self), AMDCompiler(self.arch), functools.partial(KFDProgram, self))
|
||||||
|
|
||||||
def _submit_sdma(self, dest, src, copy_size, wait_signals=None, completion_signal=None):
|
def _submit_sdma(self, dest, src, copy_size, wait_signals=None, completion_signal=None):
|
||||||
q = HWCopyQueue()
|
q = HWCopyQueue()
|
||||||
Reference in New Issue
Block a user