Mirror of https://github.com/tinygrad/tinygrad.git (synced 2026-01-09 23:18:04 -05:00)
hcq update waits and signals in place (#4984)
* hcq update waits and signals in place
* start amd
* amd works
* prettier
* test
* normal messages
* linter
* linter 2
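The core change across all backends: command handles move from raw queue pointers (q.ptr(), a dword offset or running counter) to stable command indices (len(q)). Each queue keeps a cmd_offsets table, every command-emitting method ends with _mark_command_end(), and new update_wait/update_signal/update_exec methods patch the recorded packets in place instead of rebuilding queues. A minimal standalone sketch of the scheme (not tinygrad code; the opcodes and 3-dword packet layout below are invented for illustration):

import array

WAIT_OP, SIGNAL_OP = 0x11, 0x22  # invented opcodes; the real queues use PM4/SDMA/NV method headers

class SketchQueue:
  def __init__(self): self.q, self.cmd_offsets = array.array('I'), [0]
  def _mark_command_end(self):
    self.cmd_offsets.append(len(self.q))  # record where the next command starts
    return self
  def _patch(self, off, data): self.q[off:off+len(data)] = array.array('I', data)
  def __len__(self): return len(self.cmd_offsets) - 1  # number of commands, not dwords

  def wait(self, addr, value):    # pretend a wait is 3 dwords: [op, addr, value]
    self.q.extend([WAIT_OP, addr, value])
    return self._mark_command_end()
  def signal(self, addr, value):  # same invented 3-dword layout
    self.q.extend([SIGNAL_OP, addr, value])
    return self._mark_command_end()

  def update_wait(self, cmd_idx, addr=None, value=None):
    off = self.cmd_offsets[cmd_idx]
    assert self.q[off] == WAIT_OP, f"Command at index {cmd_idx} is not wait"
    if addr is not None: self._patch(off + 1, [addr])
    if value is not None: self.q[off + 2] = value
    return self

q = SketchQueue().wait(0x1000, 1).signal(0x2000, 1)
q.update_wait(0, value=42)  # patch the recorded wait in place, no queue rebuild
assert len(q) == 2 and q.q[2] == 42

The test diff below shows exactly this handle change: exec_cmd_idx = len(q) replaces exec_ptr = q.ptr().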
test/external/external_test_hcq.py
@@ -98,9 +98,9 @@ class TestHCQ(unittest.TestCase):
   def test_update_exec(self):
     q = TestHCQ.compute_queue()
-    exec_ptr = q.ptr()
+    exec_cmd_idx = len(q)
     q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size)
-    q.update_exec(exec_ptr, (1,1,1), (1,1,1))
+    q.update_exec(exec_cmd_idx, (1,1,1), (1,1,1))
     q.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
     TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
     TestHCQ.d0.timeline_value += 1
@@ -239,6 +239,7 @@ def cpu_objdump(lib):
 def from_mv(mv:memoryview, to_type=ctypes.c_char):
   return ctypes.cast(ctypes.addressof(to_type.from_buffer(mv)), ctypes.POINTER(to_type * len(mv))).contents
 def to_mv(ptr, sz) -> memoryview: return memoryview(ctypes.cast(ptr, ctypes.POINTER(ctypes.c_uint8 * sz)).contents).cast("B")
+def mv_address(mv:memoryview): return ctypes.addressof(ctypes.c_char.from_buffer(mv))
 def to_char_p_p(options: List[bytes], to_type=ctypes.c_char): return (ctypes.POINTER(to_type) * len(options))(*[ctypes.cast(ctypes.create_string_buffer(o), ctypes.POINTER(to_type)) for o in options])  # noqa: E501
 @functools.lru_cache(maxsize=None)
 def init_c_struct_t(fields: Tuple[Tuple[str, ctypes._SimpleCData], ...]):
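The new mv_address helper is the inverse direction of the existing to_mv: it recovers the base address of a writable memoryview via ctypes, so the NV update paths can re-point a recorded semaphore packet at a signal buffer. A quick stdlib-only check of the round trip (buffer contents here are illustrative):

import ctypes

def mv_address(mv: memoryview): return ctypes.addressof(ctypes.c_char.from_buffer(mv))
def to_mv(ptr, sz) -> memoryview:
  return memoryview(ctypes.cast(ptr, ctypes.POINTER(ctypes.c_uint8 * sz)).contents).cast("B")

buf = bytearray(b"hello")                 # must be writable for from_buffer
addr = mv_address(memoryview(buf))
assert to_mv(addr, 5).tobytes() == b"hello"  # pointer round-trips back to the same bytes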
@@ -55,6 +55,10 @@ class HCQGraph(MultiGraphRunner):
     self.exec_ptrs: Dict[int, Tuple[Any, int]] = {}
     self.copy_to_devs: Dict[Compiled, Set[Compiled]] = {dev: set() for dev in self.devices}

+    for dev in self.devices:
+      self.comp_queues[dev].memory_barrier().wait(dev.timeline_signal, dev.timeline_value - 1).wait(self.kickoff_signal, self.kickoff_value)
+      self.copy_queues[dev].wait(dev.timeline_signal, dev.timeline_value - 1).wait(self.kickoff_signal, self.kickoff_value)
+
     for j,ji in enumerate(self.jit_cache):
       if isinstance(ji.prg, CompiledRunner):
         exec_params = {}
@@ -70,7 +74,7 @@
         for sig, val in deps: self.comp_queues[ji.prg.device].wait(sig, val)

-        self.exec_ptrs[j] = (self.comp_queues[ji.prg.device], self.comp_queues[ji.prg.device].ptr())
+        self.exec_ptrs[j] = (self.comp_queues[ji.prg.device], len(self.comp_queues[ji.prg.device]))
         self.comp_queues[ji.prg.device].exec(ji.prg.clprg, self.kargs_addrs[j], *ji.prg.p.launch_dims(var_vals),
                                              signal=self.comp_signal[ji.prg.device], signal_value=sig_val, **exec_params)
         self.comp_signal_val[ji.prg.device] = sig_val
@@ -91,6 +95,7 @@ class HCQGraph(MultiGraphRunner):
       if self.copy_signal_val[dev] > 0: self.comp_queues[dev].wait(self.copy_signal[dev], self.copy_signal_val[dev])
       for dep_dev in self.copy_to_devs[dev]: self.comp_queues[dev].wait(self.copy_signal[dep_dev], self.copy_signal_val[dep_dev])

       self.comp_queues[dev].signal(dev.timeline_signal, dev.timeline_value)
       if hasattr(self.comp_queues[dev], 'bind'): self.comp_queues[dev].bind(dev)
+      if hasattr(self.copy_queues[dev], 'bind') and self.copy_signal_val[dev] > 0: self.copy_queues[dev].bind(dev)
@@ -115,18 +120,12 @@
       queue.update_exec(cmd_ptr, *cast(CompiledRunner, self.jit_cache[j].prg).p.launch_dims(var_vals))

     for dev in self.devices:
-      # Submit sync with world and queues.
-      self.comp_hcq_t().memory_barrier().wait(dev.timeline_signal, dev.timeline_value - 1) \
-                       .wait(self.kickoff_signal, self.kickoff_value).submit(dev)
-      self.comp_queues[dev].submit(dev)
+      self.comp_queues[dev].update_wait(1, dev.timeline_signal, dev.timeline_value - 1).update_wait(2, value=self.kickoff_value) \
+                           .update_signal(len(self.comp_queues[dev]) - 1, dev.timeline_signal, dev.timeline_value).submit(dev)

       if self.copy_signal_val[dev] > 0:
-        self.copy_hcq_t().wait(dev.timeline_signal, dev.timeline_value - 1) \
-                         .wait(self.kickoff_signal, self.kickoff_value).submit(dev)
-        self.copy_queues[dev].submit(dev)
+        self.copy_queues[dev].update_wait(0, dev.timeline_signal, dev.timeline_value - 1).update_wait(1, value=self.kickoff_value).submit(dev)

-      # Signal the final value
-      self.comp_hcq_t().signal(dev.timeline_signal, dev.timeline_value).submit(dev)
       self.graph_timeline[dev] = dev.timeline_value
       dev.timeline_value += 1
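This hunk is the payoff: replay no longer builds transient sync queues (comp_hcq_t/copy_hcq_t) around each submit. The sync waits and the final signal were recorded into the per-device queue once at capture time (the __init__ hunk above), so each run only patches their values and resubmits the same buffer. A sketch of the replay loop with trivial stand-ins (FakeQueue/FakeDev are invented; wait slots 1 and 2 follow the recorded order memory_barrier, wait, wait, ..., signal):

class FakeQueue:
  def __init__(self, n_cmds): self.n_cmds, self.patches, self.submits = n_cmds, [], 0
  def __len__(self): return self.n_cmds
  def update_wait(self, i, signal=None, value=None): self.patches.append(('wait', i, value)); return self
  def update_signal(self, i, signal=None, value=None): self.patches.append(('signal', i, value)); return self
  def submit(self, dev): self.submits += 1; return self

class FakeDev: timeline_value = 7

dev, kickoff_value = FakeDev(), 3
q = FakeQueue(n_cmds=5)  # recorded once: memory_barrier, wait, wait, exec, signal
q.update_wait(1, value=dev.timeline_value - 1).update_wait(2, value=kickoff_value) \
 .update_signal(len(q) - 1, value=dev.timeline_value).submit(dev)
dev.timeline_value += 1
assert q.submits == 1 and q.patches[0] == ('wait', 1, 6)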
@@ -75,22 +75,25 @@ class AMDCompiler(Compiler):
     try: return compile_hip(src, self.arch)
     except RuntimeError as e: raise CompileError(e)

-class HWPM4Queue:
-  def __init__(self): self.q, self.binded_device, self.ptr_to_dispatch_packet = [], None, {}
+class HWQueue:
+  def __init__(self): self.q, self.cmd_offsets = [], [0]
+  def _mark_command_end(self):
+    self.cmd_offsets.append(len(self.q))
+    return self
+  def _patch(self, off, data): self.q[off:off+len(data)] = array.array('I', data)
+  def __len__(self): return len(self.cmd_offsets) - 1
+
+class HWPM4Queue(HWQueue):
+  def __init__(self):
+    self.binded_device, self.ptr_to_dispatch_packet = None, {}
+    super().__init__()

   def __del__(self):
     if self.binded_device is not None:
       self.binded_device.synchronize()
       self.binded_device._gpu_free(self.hw_page)

-  def ptr(self) -> int: return len(self.q)
-
-  def memory_barrier(self):
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
-               amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
-               nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
-    return self.invalidate_cache()
-
-  def invalidate_cache(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
+  def _invalidate_cache(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, #0x80000000,
                sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
                amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | \
@@ -98,10 +101,16 @@ class HWPM4Queue:
                amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) | \
                amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
                amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)]
     return self

+  def memory_barrier(self):
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
+               amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
+               nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
+    self._invalidate_cache()
+    return self._mark_command_end()
+
   def exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), signal=None, signal_value=0):
-    self.invalidate_cache()
+    self._invalidate_cache()

     user_data = [*data64_le(kernargs)]
     if hasattr(prg, 'dispatch_packet_offset'):
@@ -110,7 +119,7 @@ class HWPM4Queue:
       dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
       dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, kernargs
       user_data = [*data64_le(dp_addr)] + user_data
-      self.ptr_to_dispatch_packet[self.ptr()] = dp
+      self.ptr_to_dispatch_packet[len(self)] = dp

     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8),
                *data64_le(0), *data64_le(prg.device.scratch.va_addr >> 8)]
@@ -127,15 +136,15 @@ class HWPM4Queue:
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]

     if signal is not None: self.signal(signal, signal_value)
-    return self
+    return self._mark_command_end()

-  def update_exec(self, cmd_ptr, global_size, local_size):
+  def update_exec(self, cmd_idx, global_size, local_size):
     # Patch the exec cmd with new launch dims
-    assert self.q[cmd_ptr + 60] == amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), "The pointer does not point to a packet of this type"
-    self.q[cmd_ptr + 52 : cmd_ptr + 55] = array.array('I', local_size)
-    self.q[cmd_ptr + 61 : cmd_ptr + 64] = array.array('I', global_size)
+    assert self.q[self.cmd_offsets[cmd_idx] + 60] == amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), f"Command at index {cmd_idx} is not exec"
+    self.q[self.cmd_offsets[cmd_idx] + 52 : self.cmd_offsets[cmd_idx] + 55] = array.array('I', local_size)
+    self.q[self.cmd_offsets[cmd_idx] + 61 : self.cmd_offsets[cmd_idx] + 64] = array.array('I', global_size)

-    if (dp:=self.ptr_to_dispatch_packet.get(cmd_ptr)) is not None:
+    if (dp:=self.ptr_to_dispatch_packet.get(cmd_idx)) is not None:
       dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
       dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
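The AMD update_exec patch works on fixed dword offsets inside the recorded exec command: per the asserts and slices above, dwords 52..54 from the command start hold the local size, dword 60 the PACKET3_DISPATCH_DIRECT header, and 61..63 the global size. The mechanism is just slice assignment into the dword array; a self-contained demonstration (the 65-dword command and new dims are placeholders):

import array

cmd = array.array('I', range(65))   # stand-in for one recorded exec command
off = 0                             # would be self.cmd_offsets[cmd_idx]
cmd[off + 52 : off + 55] = array.array('I', (8, 8, 1))    # new local size
cmd[off + 61 : off + 64] = array.array('I', (32, 1, 1))   # new global size
assert list(cmd[52:55]) == [8, 8, 1] and list(cmd[61:64]) == [32, 1, 1]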
@@ -144,7 +153,7 @@ class HWPM4Queue:
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
               amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
               amd_gpu.WAIT_REG_MEM_ENGINE(0), addr&0xFFFFFFFF, addr>>32, value, 0xffffffff, 4]
-    return self
+    return self._mark_command_end()

   def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
     cache_flush_flags = 0
@@ -163,7 +172,7 @@ class HWPM4Queue:

   def timestamp(self, addr):
     self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=addr)
-    return self
+    return self._mark_command_end()

   def signal(self, signal:hsa.amd_signal_t, value=0):
     # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
@@ -172,6 +181,21 @@ class HWPM4Queue:
     if signal.event_mailbox_ptr != 0:
       self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal.event_mailbox_ptr,
                         value=signal.event_id, cst=signal.event_id, cache_flush=True)
-    return self
+    return self._mark_command_end()
+
+  def update_wait(self, cmd_idx, signal=None, value=None):
+    assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), f"Command at index {cmd_idx} is not wait"
+    if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 2, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
+    if value is not None: self.q[self.cmd_offsets[cmd_idx] + 4] = value
+    return self
+
+  def update_signal(self, cmd_idx, signal=None, value=None):
+    assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6), f"Command at index {cmd_idx} is not signal"
+    if signal is not None:
+      self._patch(self.cmd_offsets[cmd_idx] + 3, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
+      if signal.event_mailbox_ptr != 0:
+        self._patch(self.cmd_offsets[cmd_idx] + 8 + 3, [*data64_le(signal.event_mailbox_ptr), *data64_le(signal.event_id), signal.event_id])
+    if value is not None: self._patch(self.cmd_offsets[cmd_idx] + 5, [*data64_le(value)])
+    return self

   def bind(self, device: AMDDevice):
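Both update methods lean on data64_le to split a 64-bit signal address or payload into the two 32-bit dwords the packets store. Assuming the usual low-dword-first split (which matches how the patched slices are consumed above), it re-derives as:

def data64_le(data): return (data & 0xFFFFFFFF, data >> 32)  # assumed shape of the helper

addr = 0x1122334455667788
lo, hi = data64_le(addr)
assert (hi << 32) | lo == addr  # the two dwords round-trip to the original value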
@@ -185,8 +209,7 @@ class HWPM4Queue:
     self.q = hw_view # type: ignore

   def submit(self, device: AMDDevice):
-    if device == self.binded_device: cmds = self.indirect_cmd
-    else: cmds = self.q
+    cmds = self.indirect_cmd if device == self.binded_device else self.q

     wptr = device.pm4_write_pointer[0]
     pm4_buffer_view = to_mv(device.pm4_ring.va_addr, device.pm4_ring.size).cast("I")
@@ -196,12 +219,14 @@ class HWPM4Queue:
     return self

 SDMA_MAX_COPY_SIZE = 0x400000
-class HWCopyQueue:
-  def __init__(self): self.q, self.cmd_sizes = [], []
+class HWCopyQueue(HWQueue):
+  def __init__(self):
+    self.internal_cmd_sizes = []
+    super().__init__()

   def _q(self, arr):
     self.q += arr
-    self.cmd_sizes.append(len(arr))
+    self.internal_cmd_sizes.append(len(arr))

   def copy(self, dest, src, copy_size):
     # Invalidate cache inv
@@ -221,7 +246,7 @@ class HWCopyQueue:
     # Invalidate cache wb
     self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GL2_WB, 0, 0])

-    return self
+    return self._mark_command_end()

   def signal(self, signal: hsa.amd_signal_t, value=0):
     self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value])
@@ -230,13 +255,19 @@ class HWCopyQueue:
       self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.event_mailbox_ptr), signal.event_id])
      self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal.event_id)])

-    return self
+    return self._mark_command_end()

   def wait(self, signal: hsa.amd_signal_t, value=0):
     self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
             amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value, 0xffffffff,
             amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)])

-    return self
+    return self._mark_command_end()
+
+  def update_wait(self, cmd_idx, signal=None, value=None):
+    assert self.q[self.cmd_offsets[cmd_idx]] & 0xf == amd_gpu.SDMA_OP_POLL_REGMEM, f"Command at index {cmd_idx} is not wait"
+    if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 1, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
+    if value is not None: self.q[self.cmd_offsets[cmd_idx] + 3] = value
+    return self

   def submit(self, device:AMDDevice):
@@ -246,7 +277,7 @@ class HWCopyQueue:
     sdma_buffer_view = to_mv(device.sdma_ring.va_addr, device.sdma_ring.size).cast("I")

     tail_blit_dword = 0
-    for cmdsz in self.cmd_sizes:
+    for cmdsz in self.internal_cmd_sizes:
       if (tail_blit_dword + cmdsz) * 4 >= device.sdma_ring.size - device.sdma_doorbell_value % device.sdma_ring.size: break
       tail_blit_dword += cmdsz
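Note the rename from cmd_sizes to internal_cmd_sizes: the copy queue now tracks two granularities. cmd_offsets (from HWQueue) marks logical command boundaries for the update_* methods, while internal_cmd_sizes records the dword length of every individual SDMA packet, since one logical copy() emits several _q packets, and submit needs packet granularity to decide where the ring buffer wraps. A sketch of that tail-blit check with invented sizes:

ring_size = 1024                    # SDMA ring size in bytes (placeholder)
doorbell_value = 800                # total bytes ever written to the ring (placeholder)
internal_cmd_sizes = [5, 5, 7, 4]   # dword length of each queued SDMA packet

tail_blit_dword = 0
for cmdsz in internal_cmd_sizes:
  # stop once the next packet would run past the end of the ring buffer
  if (tail_blit_dword + cmdsz) * 4 >= ring_size - doorbell_value % ring_size: break
  tail_blit_dword += cmdsz
# packets beyond tail_blit_dword get written after wrapping to the ring start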
@@ -3,7 +3,7 @@ import os, ctypes, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashli
 from typing import Tuple, List, Any, cast
 from dataclasses import dataclass
 from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, BufferOptions
-from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod
+from tinygrad.helpers import getenv, from_mv, mv_address, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod
 from tinygrad.renderer.cstyle import NVRenderer
 from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler
 import tinygrad.runtime.autogen.cuda as cuda
@@ -85,27 +85,34 @@ class NVCompiler(Compiler):
     return _get_bytes(prog, cuda.nvrtcGetCUBIN, cuda.nvrtcGetCUBINSize, cuda_check)

 class HWQueue:
-  def __init__(self): self.q, self.binded_device, self.next_cmd_index = [], None, 0
+  def __init__(self): self.q, self.binded_device, self.cmd_offsets = [], None, [0]
   def __del__(self):
     if self.binded_device is not None:
       self.binded_device.synchronize() # Synchronize to ensure the buffer is no longer in use.
       self.binded_device._gpu_free(self.hw_page)

-  def ptr(self) -> int: return self.next_cmd_index
+  def _mark_command_end(self):
+    self.cmd_offsets.append(len(self.q))
+    return self
+  def __len__(self): return len(self.cmd_offsets) - 1

-  def memory_barrier(self): return self
+  def memory_barrier(self): return self._mark_command_end()

   def wait(self, signal, value=0):
     self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
                (3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT
-    self.next_cmd_index += 1
-    return self
+    return self._mark_command_end()

   def signal(self, signal, value=0, timestamp=False):
     self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
                (1 << 0) | (1 << 20) | (1 << 24) | ((1 << 25) if timestamp else 0)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
     self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
-    self.next_cmd_index += 1
-    return self
+    return self._mark_command_end()
+
+  def update_signal(self, cmd_idx, signal=None, value=None): return self.update_wait(cmd_idx, signal, value) # the same offsets and commands
+  def update_wait(self, cmd_idx, signal=None, value=None):
+    if signal is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64_le(mv_address(signal))])
+    if value is not None: self.q[(valoff:=self.cmd_offsets[cmd_idx]+3):valoff+2] = array.array('I', [*nvdata64_le(value)])
+    return self

   def bind(self, device: NVDevice):
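The NV update_wait rewrites two dword pairs inside the recorded command: the semaphore address at cmd_offsets[i]+1 and the 64-bit payload at cmd_offsets[i]+3, a layout read straight off the wait() method above (method header, then SEM_ADDR lo/hi, payload lo/hi, flags). A stdlib-only demonstration of that slice patching (the header value and address are placeholders, and nvdata64_le is assumed to be the usual low/high split):

import array

def nvdata64_le(data): return (data & 0xFFFFFFFF, data >> 32)  # assumed helper shape

q = array.array('I', [0xDEAD, 0, 0, 0, 0, (3 << 0) | (1 << 24)])  # header, addr lo/hi, value lo/hi, flags
off = 0                                                           # would be self.cmd_offsets[cmd_idx]
q[(sigoff:=off+1):sigoff+2] = array.array('I', [*nvdata64_le(0x7f0012340000)])  # re-point the semaphore
q[(valoff:=off+3):valoff+2] = array.array('I', [*nvdata64_le(41)])              # new payload
assert q[3] == 41 and q[2] == 0x7f00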
@@ -139,21 +146,20 @@ class HWQueue:
 class HWComputeQueue(HWQueue):
   def __init__(self):
     super().__init__()
-    self.ptr_to_qmd, self.ptr_to_global_dims, self.ptr_to_local_dims = {}, {}, {}
+    self.cmd_idx_to_qmd, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}

   def copy_from_cpu(self, gpuaddr, data):
     self.q += [nvmethod(1, nv_gpu.NVC6C0_OFFSET_OUT_UPPER, 2), *nvdata64(gpuaddr)]
     self.q += [nvmethod(1, nv_gpu.NVC6C0_LINE_LENGTH_IN, 2), len(data)*4, 0x1]
     self.q += [nvmethod(1, nv_gpu.NVC6C0_LAUNCH_DMA, 1), 0x41]
     self.q += [nvmethod(1, nv_gpu.NVC6C0_LOAD_INLINE_DATA, len(data), typ=6)] + [x for x in data]
-    self.next_cmd_index += 1
-    return self
+    return self._mark_command_end()

   def exec(self, prg, kernargs, global_size=(1,1,1), local_size=(1,1,1), signal=None, signal_value=0, chain_exec_ptr=None):
     ctypes.memmove(qmd_addr:=(kernargs + round_up(prg.constbuf_0_size, 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
-    self.ptr_to_qmd[self.ptr()] = qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
-    self.ptr_to_global_dims[self.ptr()] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
-    self.ptr_to_local_dims[self.ptr()] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')
+    self.cmd_idx_to_qmd[len(self)] = qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
+    self.cmd_idx_to_global_dims[len(self)] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
+    self.cmd_idx_to_local_dims[len(self)] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')

     qmd.cta_raster_width, qmd.cta_raster_height, qmd.cta_raster_depth = global_size
     qmd.cta_thread_dimension0, qmd.cta_thread_dimension1, qmd.cta_thread_dimension2 = local_size
@@ -171,17 +177,16 @@ class HWComputeQueue(HWQueue):
       self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_PCAS_A, 0x1), qmd_addr >> 8]
       self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 0x1), 9]
     else:
-      self.ptr_to_qmd[chain_exec_ptr].dependent_qmd0_pointer = qmd_addr >> 8
-      self.ptr_to_qmd[chain_exec_ptr].dependent_qmd0_action = 1
-      self.ptr_to_qmd[chain_exec_ptr].dependent_qmd0_prefetch = 1
-      self.ptr_to_qmd[chain_exec_ptr].dependent_qmd0_enable = 1
-    self.next_cmd_index += 1
-    return self
+      self.cmd_idx_to_qmd[chain_exec_ptr].dependent_qmd0_pointer = qmd_addr >> 8
+      self.cmd_idx_to_qmd[chain_exec_ptr].dependent_qmd0_action = 1
+      self.cmd_idx_to_qmd[chain_exec_ptr].dependent_qmd0_prefetch = 1
+      self.cmd_idx_to_qmd[chain_exec_ptr].dependent_qmd0_enable = 1
+    return self._mark_command_end()

-  def update_exec(self, cmd_ptr, global_size, local_size):
+  def update_exec(self, cmd_idx, global_size, local_size):
     # Patch the exec cmd with new launch dims
-    self.ptr_to_global_dims[cmd_ptr][:] = array.array('I', global_size)
-    self.ptr_to_local_dims[cmd_ptr][:] = array.array('H', local_size)
+    self.cmd_idx_to_global_dims[cmd_idx][:] = array.array('I', global_size)
+    self.cmd_idx_to_local_dims[cmd_idx][:] = array.array('H', local_size)

   def submit(self, dev:NVDevice): self._submit(dev, dev.compute_gpfifo)
@@ -190,13 +195,16 @@ class HWCopyQueue(HWQueue):
     self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *nvdata64(src), *nvdata64(dest)]
     self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
     self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
-    self.next_cmd_index += 1
-    return self
+    return self._mark_command_end()

   def signal(self, signal, value=0):
     self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 4), *nvdata64(ctypes.addressof(from_mv(signal))), value, 4]
     self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14]
-    self.next_cmd_index += 1
-    return self
+    return self._mark_command_end()
+
+  def update_signal(self, cmd_idx, signal=None, value=None):
+    if signal is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64(mv_address(signal))])
+    if value is not None: self.q[self.cmd_offsets[cmd_idx]+3] = value
+    return self

   def submit(self, dev:NVDevice): self._submit(dev, dev.dma_gpfifo)