mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-08 22:48:25 -05:00
renamed (#4260)
This commit is contained in:
@@ -5,7 +5,7 @@ import tinygrad.runtime.autogen.amd_gpu as amd_gpu
|
||||
import tinygrad.runtime.autogen.kfd as kfd
|
||||
import tinygrad.runtime.autogen.hsa as hsa
|
||||
from tinygrad.engine.schedule import create_schedule
|
||||
from tinygrad.runtime.ops_kfd import kio, KFDProgram
|
||||
from tinygrad.runtime.ops_amd import kio, KFDProgram
|
||||
from tinygrad.helpers import to_mv
|
||||
|
||||
DISPATCH_INIT_VALUE = 0x21 | 0x8000
|
||||
|
||||
@@ -1,21 +1,21 @@
|
||||
import ctypes, mmap, time
|
||||
from tinygrad.runtime.ops_kfd import KFDDevice, kio, sdma_pkts, libc
|
||||
from tinygrad.runtime.ops_amd import AMDDevice, kio, sdma_pkts, libc
|
||||
import tinygrad.runtime.autogen.amd_sdma as amd_sdma
|
||||
import tinygrad.runtime.autogen.kfd as kfd
|
||||
from tinygrad.helpers import to_mv
|
||||
|
||||
if __name__ == "__main__":
|
||||
dev = KFDDevice()
|
||||
dev = AMDDevice()
|
||||
|
||||
sdma_ring = dev._gpu_alloc(1 << 22, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
|
||||
gart = dev._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
sdma_queue = kio.create_queue(KFDDevice.kfd,
|
||||
sdma_queue = kio.create_queue(AMDDevice.kfd,
|
||||
ring_base_address=sdma_ring.va_addr, ring_size=sdma_ring.size, gpu_id=dev.gpu_id,
|
||||
queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
||||
write_pointer_address=gart.va_addr + 0x100, read_pointer_address=gart.va_addr + 0x108)
|
||||
|
||||
doorbells_base = sdma_queue.doorbell_offset & (~0xfff)
|
||||
doorbells = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, doorbells_base)
|
||||
doorbells = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, AMDDevice.kfd, doorbells_base)
|
||||
|
||||
sdma_read_pointer = to_mv(sdma_queue.read_pointer_address, 8).cast("Q")
|
||||
sdma_write_pointer = to_mv(sdma_queue.write_pointer_address, 8).cast("Q")
|
||||
|
||||
8
test/external/external_test_hcq.py
vendored
8
test/external/external_test_hcq.py
vendored
@@ -2,7 +2,7 @@ import unittest, ctypes, struct, time
|
||||
from tinygrad import Device, Tensor, dtypes
|
||||
from tinygrad.buffer import Buffer, BufferOptions
|
||||
from tinygrad.engine.schedule import create_schedule
|
||||
from tinygrad.runtime.ops_kfd import KFDDevice, HWCopyQueue, HWPM4Queue
|
||||
from tinygrad.runtime.ops_amd import AMDDevice, HWCopyQueue, HWPM4Queue
|
||||
|
||||
def _time_queue(q, d):
|
||||
st = time.perf_counter()
|
||||
@@ -14,8 +14,8 @@ def _time_queue(q, d):
|
||||
class TestHCQ(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(self):
|
||||
TestHCQ.d0: KFDDevice = Device["KFD"]
|
||||
#TestHCQ.d1: KFDDevice = Device["KFD:1"]
|
||||
TestHCQ.d0: AMDDevice = Device["KFD"]
|
||||
#TestHCQ.d1: AMDDevice = Device["KFD:1"]
|
||||
TestHCQ.a = Tensor([0.,1.], device="KFD").realize()
|
||||
TestHCQ.b = self.a + 1
|
||||
si = create_schedule([self.b.lazydata])[-1]
|
||||
@@ -165,7 +165,7 @@ class TestHCQ(unittest.TestCase):
|
||||
q = TestHCQ.compute_queue()
|
||||
qc = HWCopyQueue()
|
||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size) # b = [1, 2]
|
||||
q.signal(sig:=KFDDevice._get_signal(10))
|
||||
q.signal(sig:=AMDDevice._get_signal(10))
|
||||
qc.wait(sig)
|
||||
qc.copy(TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr, 8)
|
||||
qc.signal(TestHCQ.d0.completion_signal)
|
||||
|
||||
8
test/external/fuzz_kfd.py
vendored
8
test/external/fuzz_kfd.py
vendored
@@ -3,10 +3,10 @@ import random
|
||||
from tqdm import trange
|
||||
from typing import List
|
||||
from tinygrad import Device
|
||||
from tinygrad.runtime.ops_kfd import KFDDevice, HWCopyQueue, HWComputeQueue
|
||||
from tinygrad.runtime.ops_amd import AMDDevice, HWCopyQueue, HWComputeQueue
|
||||
|
||||
if __name__ == "__main__":
|
||||
dev: List[KFDDevice] = [Device[f"KFD:{i}"] for i in range(6)]
|
||||
dev: List[AMDDevice] = [Device[f"KFD:{i}"] for i in range(6)]
|
||||
print(f"got {len(dev)} devices")
|
||||
|
||||
buffers = [(rd:=random.choice(dev), rd.allocator.alloc(random.randint(1, 10000))) for i in range(100)]
|
||||
@@ -16,7 +16,7 @@ if __name__ == "__main__":
|
||||
d2, b2 = random.choice(buffers)
|
||||
d1._gpu_map(b2)
|
||||
q = HWComputeQueue()
|
||||
q.signal(sig:=KFDDevice._get_signal(10))
|
||||
q.signal(sig:=AMDDevice._get_signal(10))
|
||||
qc = HWCopyQueue()
|
||||
qc.wait(sig)
|
||||
qc.copy(b1.va_addr, b2.va_addr, min(b1.size, b2.size))
|
||||
@@ -25,4 +25,4 @@ if __name__ == "__main__":
|
||||
qc.submit(d1)
|
||||
q.wait(d1.completion_signal)
|
||||
q.submit(d1)
|
||||
KFDDevice._wait_on(d1.completion_signal.event_id)
|
||||
AMDDevice._wait_on(d1.completion_signal.event_id)
|
||||
|
||||
@@ -53,7 +53,7 @@ tensor_cores: Dict[str, List[TensorCore]] = {
|
||||
"HSA": [TensorCore(dims=(16,16,16), threads=[(0,8),(0,2),(1,2)], thread_local_sizes=[[16],[16],[4,2]], thread_local_aliases=[ [[2],[0],[0],[-1],[1]], [[0],[2],[1],[-1],[0]], [[-2],[2],[1],[0],[3,-1]] ], dtype_in=di, dtype_out=do) for (di, do) in [(dtypes.half, dtypes.float), (dtypes.half, dtypes.half)]], # noqa: E501
|
||||
"CUDA": [TensorCore(dims=(8,16,16), threads=[(0,2),(0,2),(1,2),(1,2),(0,2)], thread_local_sizes=[[2,2,2],[2,2],[2,2]], thread_local_aliases=[ [[0],[-2],[5],[0],[0],[-1,1,2,-3],[3,4]], [[5],[0],[0],[4],[3],[-1,1,2,-2],[0]], [[2],[-2],[5],[1],[-1],[0],[3,4]] ], dtype_in=di, dtype_out=do) for (di, do) in ([(dtypes.half, dtypes.float)] if getenv("PTX") else [(dtypes.half, dtypes.float), (dtypes.bfloat16, dtypes.float)])], # noqa: E501
|
||||
}
|
||||
tensor_cores["KFD"] = tensor_cores["HSA"]
|
||||
tensor_cores["AMD"] = tensor_cores["HSA"]
|
||||
|
||||
class LocalBuffer(NamedTuple):
|
||||
name: str
|
||||
|
||||
@@ -68,8 +68,8 @@ def create_sdma_packets():
|
||||
return type("SDMA_PKTS", (object, ), structs)
|
||||
sdma_pkts = create_sdma_packets()
|
||||
|
||||
class KFDCompiler(Compiler):
|
||||
compiler_opts = CompilerOptions("KFD", has_tensor_cores=True, shared_max=65536)
|
||||
class AMDCompiler(Compiler):
|
||||
compiler_opts = CompilerOptions("AMD", has_tensor_cores=True, shared_max=65536)
|
||||
def __init__(self, arch:str):
|
||||
self.arch = arch
|
||||
super().__init__(f"compile_hip_{self.arch}")
|
||||
@@ -238,7 +238,7 @@ class HWPM4Queue:
|
||||
global_size[0],global_size[1],global_size[2], CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
|
||||
|
||||
# have to self wait since flush doesn't work
|
||||
self.signal(sig:=KFDDevice._get_signal())
|
||||
self.signal(sig:=AMDDevice._get_signal())
|
||||
self.wait(sig)
|
||||
|
||||
if completion_signal: self.signal(completion_signal)
|
||||
@@ -280,7 +280,7 @@ class HWPM4Queue:
|
||||
signal.event_id]
|
||||
return self
|
||||
|
||||
def submit(self, device:KFDDevice):
|
||||
def submit(self, device:AMDDevice):
|
||||
wptr = device.pm4_write_pointer[0]
|
||||
pm4_buffer_view = to_mv(device.pm4_ring.va_addr, device.pm4_ring.size).cast("I")
|
||||
for i, value in enumerate(self.q): pm4_buffer_view[(wptr+i)%(device.pm4_ring.size//4)] = value
|
||||
@@ -299,7 +299,7 @@ sdma_cache_wb = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_
|
||||
class HWCopyQueue:
|
||||
def __init__(self): self.q = []
|
||||
|
||||
def submit(self, device:KFDDevice):
|
||||
def submit(self, device:AMDDevice):
|
||||
read_ptr = device.sdma_read_pointer[0]
|
||||
if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun")
|
||||
for cmd in self.q:
|
||||
@@ -345,7 +345,7 @@ class HWCopyQueue:
|
||||
return self
|
||||
|
||||
class KFDProgram:
|
||||
def __init__(self, device:KFDDevice, name:str, lib:bytes):
|
||||
def __init__(self, device:AMDDevice, name:str, lib:bytes):
|
||||
# TODO; this API needs the type signature of the function and global_size/local_size
|
||||
self.device, self.name, self.lib = device, name, lib
|
||||
|
||||
@@ -399,8 +399,8 @@ class KFDProgram:
|
||||
#assert (wp:=self.device.amd_aql_queue.write_dispatch_id) == (rp:=self.device.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
|
||||
return (self.device.completion_signal.end_ts-self.device.completion_signal.start_ts)/1e8
|
||||
|
||||
class KFDAllocator(LRUAllocator):
|
||||
def __init__(self, device:KFDDevice):
|
||||
class AMDAllocator(LRUAllocator):
|
||||
def __init__(self, device:AMDDevice):
|
||||
self.device = device
|
||||
# NOTE: KFD_IOC_ALLOC_MEM_FLAGS_GTT doesn't work here for readinto
|
||||
self.b = [self.device._gpu_alloc(SDMA_MAX_COPY_SIZE*4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) for _ in range(2)]
|
||||
@@ -452,15 +452,15 @@ class KFDAllocator(LRUAllocator):
|
||||
self.device._wait_signal(self.device.signal_sdma)
|
||||
ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
|
||||
|
||||
def transfer(self, dest, src, sz:int, src_dev:KFDDevice, dest_dev:KFDDevice):
|
||||
def transfer(self, dest, src, sz:int, src_dev:AMDDevice, dest_dev:AMDDevice):
|
||||
dest_dev._gpu_map(src)
|
||||
q = HWPM4Queue().signal(sig := KFDDevice._get_signal())
|
||||
HWCopyQueue().wait(sig).copy(dest.va_addr, src.va_addr, sz).signal(sigc := KFDDevice._get_signal()).submit(dest_dev)
|
||||
q = HWPM4Queue().signal(sig := AMDDevice._get_signal())
|
||||
HWCopyQueue().wait(sig).copy(dest.va_addr, src.va_addr, sz).signal(sigc := AMDDevice._get_signal()).submit(dest_dev)
|
||||
HWPM4Queue().wait(sigc).submit(dest_dev)
|
||||
q.wait(sigc).submit(src_dev)
|
||||
|
||||
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
|
||||
class KFDDevice(Compiled):
|
||||
class AMDDevice(Compiled):
|
||||
kfd:int = -1
|
||||
event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
|
||||
signals_page:Any = None
|
||||
@@ -501,15 +501,15 @@ class KFDDevice(Compiled):
|
||||
@classmethod
|
||||
def _get_signal(self, num=None, sync_event=None) -> hsa.amd_signal_t:
|
||||
if num is None:
|
||||
num = KFDDevice.signal_number
|
||||
KFDDevice.signal_number += 1
|
||||
if KFDDevice.signal_number == SIGNAL_COUNT: KFDDevice.signal_number = 16
|
||||
num = AMDDevice.signal_number
|
||||
AMDDevice.signal_number += 1
|
||||
if AMDDevice.signal_number == SIGNAL_COUNT: AMDDevice.signal_number = 16
|
||||
#print("signal", num)
|
||||
ret = hsa.amd_signal_t.from_address(KFDDevice.signals_page.va_addr + SIGNAL_SIZE*num)
|
||||
ret = hsa.amd_signal_t.from_address(AMDDevice.signals_page.va_addr + SIGNAL_SIZE*num)
|
||||
ret.value = 0
|
||||
ret.kind = hsa.AMD_SIGNAL_KIND_USER
|
||||
if sync_event is not None:
|
||||
ret.event_mailbox_ptr = KFDDevice.event_page.va_addr + sync_event.event_slot_index*8
|
||||
ret.event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
|
||||
ret.event_id = sync_event.event_id
|
||||
return ret
|
||||
|
||||
@@ -518,7 +518,7 @@ class KFDDevice(Compiled):
|
||||
assert signal.event_id != 0, "can't wait on this signal"
|
||||
evt_arr = (kfd.struct_kfd_event_data * 1)()
|
||||
evt_arr[0].event_id = signal.event_id
|
||||
ret = kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=timeout)
|
||||
ret = kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=timeout)
|
||||
if ret.wait_result != 0: raise RuntimeError(f"wait_result: {ret.wait_result}, {timeout} ms TIMEOUT!")
|
||||
|
||||
#val = signal.value
|
||||
@@ -526,28 +526,28 @@ class KFDDevice(Compiled):
|
||||
assert skip_check or signal.value == 0, f"not set to 0, but {signal.value}"
|
||||
|
||||
def __init__(self, device:str=""):
|
||||
if KFDDevice.kfd == -1:
|
||||
KFDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
|
||||
KFDDevice.gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
|
||||
if AMDDevice.kfd == -1:
|
||||
AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
|
||||
AMDDevice.gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
|
||||
self.device_id = int(device.split(":")[1]) if ":" in device else 0
|
||||
with open(f"{KFDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
|
||||
with open(f"{KFDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
|
||||
with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
|
||||
with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
|
||||
self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
|
||||
target = int(self.properties['gfx_target_version'])
|
||||
self.arch = "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)
|
||||
kio.acquire_vm(KFDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
|
||||
kio.acquire_vm(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
|
||||
|
||||
if KFDDevice.event_page is None:
|
||||
KFDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
KFDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
sync_event = kio.create_event(KFDDevice.kfd, event_page_offset=KFDDevice.event_page.handle, auto_reset=1)
|
||||
if AMDDevice.event_page is None:
|
||||
AMDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
sync_event = kio.create_event(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle, auto_reset=1)
|
||||
else:
|
||||
self._gpu_map(KFDDevice.signals_page)
|
||||
self._gpu_map(KFDDevice.event_page)
|
||||
sync_event = kio.create_event(KFDDevice.kfd, auto_reset=1)
|
||||
self._gpu_map(AMDDevice.signals_page)
|
||||
self._gpu_map(AMDDevice.event_page)
|
||||
sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)
|
||||
|
||||
self.completion_signal = KFDDevice._get_signal(self.device_id*2, sync_event=sync_event)
|
||||
self.signal_sdma = KFDDevice._get_signal(self.device_id*2+1, sync_event=kio.create_event(KFDDevice.kfd, auto_reset=1))
|
||||
self.completion_signal = AMDDevice._get_signal(self.device_id*2, sync_event=sync_event)
|
||||
self.signal_sdma = AMDDevice._get_signal(self.device_id*2+1, sync_event=kio.create_event(AMDDevice.kfd, auto_reset=1))
|
||||
|
||||
self.gart_aql = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.aql_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
@@ -567,13 +567,13 @@ class KFDDevice(Compiled):
|
||||
# SDMA Queue
|
||||
self.gart_sdma = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.sdma_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
|
||||
self.sdma_queue = kio.create_queue(AMDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
|
||||
queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
||||
write_pointer_address=self.gart_sdma.va_addr, read_pointer_address=self.gart_sdma.va_addr+8)
|
||||
|
||||
# doorbell page
|
||||
self.doorbells_base = self.sdma_queue.doorbell_offset & (~0x1fff) # doorbell is two pages
|
||||
self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base)
|
||||
self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, AMDDevice.kfd, self.doorbells_base)
|
||||
|
||||
self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
|
||||
self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
|
||||
@@ -585,7 +585,7 @@ class KFDDevice(Compiled):
|
||||
self.eop_pm4_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
self.gart_pm4 = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.pm4_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.pm4_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.pm4_ring.va_addr, ring_size=self.pm4_ring.size, gpu_id=self.gpu_id,
|
||||
self.pm4_queue = kio.create_queue(AMDDevice.kfd, ring_base_address=self.pm4_ring.va_addr, ring_size=self.pm4_ring.size, gpu_id=self.gpu_id,
|
||||
queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
||||
eop_buffer_address=self.eop_pm4_buffer.va_addr, eop_buffer_size=self.eop_pm4_buffer.size,
|
||||
# TODO: are these needed? (i know eop is)
|
||||
@@ -597,7 +597,7 @@ class KFDDevice(Compiled):
|
||||
self.pm4_write_pointer = to_mv(self.pm4_queue.write_pointer_address, 8).cast("Q")
|
||||
self.pm4_doorbell = to_mv(self.doorbells + self.pm4_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
|
||||
|
||||
super().__init__(device, KFDAllocator(self), KFDCompiler(self.arch), functools.partial(KFDProgram, self))
|
||||
super().__init__(device, AMDAllocator(self), AMDCompiler(self.arch), functools.partial(KFDProgram, self))
|
||||
|
||||
def _submit_sdma(self, dest, src, copy_size, wait_signals=None, completion_signal=None):
|
||||
q = HWCopyQueue()
|
||||
Reference in New Issue
Block a user