hcq update waits and signals in place (#4984)

* hcq update waits and signals in place

* start amd

* amd works

* prettier

* test

* normal messages

* linter

* linter 2
This commit is contained in:
nimlgen
2024-06-17 17:19:07 +03:00
committed by GitHub
parent 603a4a0ce1
commit 794acefbf3
5 changed files with 107 additions and 68 deletions

View File

@@ -98,9 +98,9 @@ class TestHCQ(unittest.TestCase):
def test_update_exec(self):
q = TestHCQ.compute_queue()
exec_ptr = q.ptr()
exec_cmd_idx = len(q)
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size)
q.update_exec(exec_ptr, (1,1,1), (1,1,1))
q.update_exec(exec_cmd_idx, (1,1,1), (1,1,1))
q.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
TestHCQ.d0.timeline_value += 1

View File

@@ -239,6 +239,7 @@ def cpu_objdump(lib):
def from_mv(mv:memoryview, to_type=ctypes.c_char):
return ctypes.cast(ctypes.addressof(to_type.from_buffer(mv)), ctypes.POINTER(to_type * len(mv))).contents
def to_mv(ptr, sz) -> memoryview: return memoryview(ctypes.cast(ptr, ctypes.POINTER(ctypes.c_uint8 * sz)).contents).cast("B")
def mv_address(mv:memoryview): return ctypes.addressof(ctypes.c_char.from_buffer(mv))
def to_char_p_p(options: List[bytes], to_type=ctypes.c_char): return (ctypes.POINTER(to_type) * len(options))(*[ctypes.cast(ctypes.create_string_buffer(o), ctypes.POINTER(to_type)) for o in options]) # noqa: E501
@functools.lru_cache(maxsize=None)
def init_c_struct_t(fields: Tuple[Tuple[str, ctypes._SimpleCData], ...]):

View File

@@ -55,6 +55,10 @@ class HCQGraph(MultiGraphRunner):
self.exec_ptrs: Dict[int, Tuple[Any, int]] = {}
self.copy_to_devs: Dict[Compiled, Set[Compiled]] = {dev: set() for dev in self.devices}
for dev in self.devices:
self.comp_queues[dev].memory_barrier().wait(dev.timeline_signal, dev.timeline_value - 1).wait(self.kickoff_signal, self.kickoff_value)
self.copy_queues[dev].wait(dev.timeline_signal, dev.timeline_value - 1).wait(self.kickoff_signal, self.kickoff_value)
for j,ji in enumerate(self.jit_cache):
if isinstance(ji.prg, CompiledRunner):
exec_params = {}
@@ -70,7 +74,7 @@ class HCQGraph(MultiGraphRunner):
for sig, val in deps: self.comp_queues[ji.prg.device].wait(sig, val)
self.exec_ptrs[j] = (self.comp_queues[ji.prg.device], self.comp_queues[ji.prg.device].ptr())
self.exec_ptrs[j] = (self.comp_queues[ji.prg.device], len(self.comp_queues[ji.prg.device]))
self.comp_queues[ji.prg.device].exec(ji.prg.clprg, self.kargs_addrs[j], *ji.prg.p.launch_dims(var_vals),
signal=self.comp_signal[ji.prg.device], signal_value=sig_val, **exec_params)
self.comp_signal_val[ji.prg.device] = sig_val
@@ -91,6 +95,7 @@ class HCQGraph(MultiGraphRunner):
if self.copy_signal_val[dev] > 0: self.comp_queues[dev].wait(self.copy_signal[dev], self.copy_signal_val[dev])
for dep_dev in self.copy_to_devs[dev]: self.comp_queues[dev].wait(self.copy_signal[dep_dev], self.copy_signal_val[dep_dev])
self.comp_queues[dev].signal(dev.timeline_signal, dev.timeline_value)
if hasattr(self.comp_queues[dev], 'bind'): self.comp_queues[dev].bind(dev)
if hasattr(self.copy_queues[dev], 'bind') and self.copy_signal_val[dev] > 0: self.copy_queues[dev].bind(dev)
@@ -115,18 +120,12 @@ class HCQGraph(MultiGraphRunner):
queue.update_exec(cmd_ptr, *cast(CompiledRunner, self.jit_cache[j].prg).p.launch_dims(var_vals))
for dev in self.devices:
# Submit sync with world and queues.
self.comp_hcq_t().memory_barrier().wait(dev.timeline_signal, dev.timeline_value - 1) \
.wait(self.kickoff_signal, self.kickoff_value).submit(dev)
self.comp_queues[dev].submit(dev)
self.comp_queues[dev].update_wait(1, dev.timeline_signal, dev.timeline_value - 1).update_wait(2, value=self.kickoff_value) \
.update_signal(len(self.comp_queues[dev]) - 1, dev.timeline_signal, dev.timeline_value).submit(dev)
if self.copy_signal_val[dev] > 0:
self.copy_hcq_t().wait(dev.timeline_signal, dev.timeline_value - 1) \
.wait(self.kickoff_signal, self.kickoff_value).submit(dev)
self.copy_queues[dev].submit(dev)
self.copy_queues[dev].update_wait(0, dev.timeline_signal, dev.timeline_value - 1).update_wait(1, value=self.kickoff_value).submit(dev)
# Signal the final value
self.comp_hcq_t().signal(dev.timeline_signal, dev.timeline_value).submit(dev)
self.graph_timeline[dev] = dev.timeline_value
dev.timeline_value += 1

View File

@@ -75,22 +75,25 @@ class AMDCompiler(Compiler):
try: return compile_hip(src, self.arch)
except RuntimeError as e: raise CompileError(e)
class HWPM4Queue:
def __init__(self): self.q, self.binded_device, self.ptr_to_dispatch_packet = [], None, {}
class HWQueue:
def __init__(self): self.q, self.cmd_offsets = [], [0]
def _mark_command_end(self):
self.cmd_offsets.append(len(self.q))
return self
def _patch(self, off, data): self.q[off:off+len(data)] = array.array('I', data)
def __len__(self): return len(self.cmd_offsets) - 1
class HWPM4Queue(HWQueue):
def __init__(self):
self.binded_device, self.ptr_to_dispatch_packet = None, {}
super().__init__()
def __del__(self):
if self.binded_device is not None:
self.binded_device.synchronize()
self.binded_device._gpu_free(self.hw_page)
def ptr(self) -> int: return len(self.q)
def memory_barrier(self):
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
return self.invalidate_cache()
def invalidate_cache(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
def _invalidate_cache(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, #0x80000000,
sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | \
@@ -98,10 +101,16 @@ class HWPM4Queue:
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) | \
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)]
return self
def memory_barrier(self):
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
self._invalidate_cache()
return self._mark_command_end()
def exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), signal=None, signal_value=0):
self.invalidate_cache()
self._invalidate_cache()
user_data = [*data64_le(kernargs)]
if hasattr(prg, 'dispatch_packet_offset'):
@@ -110,7 +119,7 @@ class HWPM4Queue:
dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, kernargs
user_data = [*data64_le(dp_addr)] + user_data
self.ptr_to_dispatch_packet[self.ptr()] = dp
self.ptr_to_dispatch_packet[len(self)] = dp
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8),
*data64_le(0), *data64_le(prg.device.scratch.va_addr >> 8)]
@@ -127,15 +136,15 @@ class HWPM4Queue:
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]
if signal is not None: self.signal(signal, signal_value)
return self
return self._mark_command_end()
def update_exec(self, cmd_ptr, global_size, local_size):
def update_exec(self, cmd_idx, global_size, local_size):
# Patch the exec cmd with new launch dims
assert self.q[cmd_ptr + 60] == amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3),"The pointer does not point to a packet of this type"
self.q[cmd_ptr + 52 : cmd_ptr + 55] = array.array('I', local_size)
self.q[cmd_ptr + 61 : cmd_ptr + 64] = array.array('I', global_size)
assert self.q[self.cmd_offsets[cmd_idx] + 60] == amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), f"Command at index {cmd_idx} is not exec"
self.q[self.cmd_offsets[cmd_idx] + 52 : self.cmd_offsets[cmd_idx] + 55] = array.array('I', local_size)
self.q[self.cmd_offsets[cmd_idx] + 61 : self.cmd_offsets[cmd_idx] + 64] = array.array('I', global_size)
if (dp:=self.ptr_to_dispatch_packet.get(cmd_ptr)) is not None:
if (dp:=self.ptr_to_dispatch_packet.get(cmd_idx)) is not None:
dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
@@ -144,7 +153,7 @@ class HWPM4Queue:
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
amd_gpu.WAIT_REG_MEM_ENGINE(0), addr&0xFFFFFFFF, addr>>32, value, 0xffffffff, 4]
return self
return self._mark_command_end()
def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
cache_flush_flags = 0
@@ -163,7 +172,7 @@ class HWPM4Queue:
def timestamp(self, addr):
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=addr)
return self
return self._mark_command_end()
def signal(self, signal:hsa.amd_signal_t, value=0):
# NOTE: this needs an EOP buffer on the queue or it will NULL pointer
@@ -172,6 +181,21 @@ class HWPM4Queue:
if signal.event_mailbox_ptr != 0:
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal.event_mailbox_ptr,
value=signal.event_id, cst=signal.event_id, cache_flush=True)
return self._mark_command_end()
def update_wait(self, cmd_idx, signal=None, value=None):
assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), f"Command at index {cmd_idx} is not wait"
if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 2, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
if value is not None: self.q[self.cmd_offsets[cmd_idx] + 4] = value
return self
def update_signal(self, cmd_idx, signal=None, value=None):
assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6), f"Command at index {cmd_idx} is not signal"
if signal is not None:
self._patch(self.cmd_offsets[cmd_idx] + 3, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
if signal.event_mailbox_ptr != 0:
self._patch(self.cmd_offsets[cmd_idx] + 8 + 3, [*data64_le(signal.event_mailbox_ptr), *data64_le(signal.event_id), signal.event_id])
if value is not None: self._patch(self.cmd_offsets[cmd_idx] + 5, [*data64_le(value)])
return self
def bind(self, device: AMDDevice):
@@ -185,8 +209,7 @@ class HWPM4Queue:
self.q = hw_view # type: ignore
def submit(self, device: AMDDevice):
if device == self.binded_device: cmds = self.indirect_cmd
else: cmds = self.q
cmds = self.indirect_cmd if device == self.binded_device else self.q
wptr = device.pm4_write_pointer[0]
pm4_buffer_view = to_mv(device.pm4_ring.va_addr, device.pm4_ring.size).cast("I")
@@ -196,12 +219,14 @@ class HWPM4Queue:
return self
SDMA_MAX_COPY_SIZE = 0x400000
class HWCopyQueue:
def __init__(self): self.q, self.cmd_sizes = [], []
class HWCopyQueue(HWQueue):
def __init__(self):
self.internal_cmd_sizes = []
super().__init__()
def _q(self, arr):
self.q += arr
self.cmd_sizes.append(len(arr))
self.internal_cmd_sizes.append(len(arr))
def copy(self, dest, src, copy_size):
# Invalidate cache inv
@@ -221,7 +246,7 @@ class HWCopyQueue:
# Invalidate cache wb
self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GL2_WB, 0, 0])
return self
return self._mark_command_end()
def signal(self, signal: hsa.amd_signal_t, value=0):
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value])
@@ -230,13 +255,19 @@ class HWCopyQueue:
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.event_mailbox_ptr), signal.event_id])
self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal.event_id)])
return self
return self._mark_command_end()
def wait(self, signal: hsa.amd_signal_t, value=0):
self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value, 0xffffffff,
amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)])
return self._mark_command_end()
def update_wait(self, cmd_idx, signal=None, value=None):
assert self.q[self.cmd_offsets[cmd_idx]] & 0xf == amd_gpu.SDMA_OP_POLL_REGMEM, f"Command at index {cmd_idx} is not wait"
if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 1, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
if value is not None: self.q[self.cmd_offsets[cmd_idx] + 3] = value
return self
def submit(self, device:AMDDevice):
@@ -246,7 +277,7 @@ class HWCopyQueue:
sdma_buffer_view = to_mv(device.sdma_ring.va_addr, device.sdma_ring.size).cast("I")
tail_blit_dword = 0
for cmdsz in self.cmd_sizes:
for cmdsz in self.internal_cmd_sizes:
if (tail_blit_dword + cmdsz) * 4 >= device.sdma_ring.size - device.sdma_doorbell_value % device.sdma_ring.size: break
tail_blit_dword += cmdsz

View File

@@ -3,7 +3,7 @@ import os, ctypes, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashli
from typing import Tuple, List, Any, cast
from dataclasses import dataclass
from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, BufferOptions
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod
from tinygrad.helpers import getenv, from_mv, mv_address, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod
from tinygrad.renderer.cstyle import NVRenderer
from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler
import tinygrad.runtime.autogen.cuda as cuda
@@ -85,27 +85,34 @@ class NVCompiler(Compiler):
return _get_bytes(prog, cuda.nvrtcGetCUBIN, cuda.nvrtcGetCUBINSize, cuda_check)
class HWQueue:
def __init__(self): self.q, self.binded_device, self.next_cmd_index = [], None, 0
def __init__(self): self.q, self.binded_device, self.cmd_offsets = [], None, [0]
def __del__(self):
if self.binded_device is not None:
self.binded_device.synchronize() # Synchronize to ensure the buffer is no longer in use.
self.binded_device._gpu_free(self.hw_page)
def ptr(self) -> int: return self.next_cmd_index
def _mark_command_end(self):
self.cmd_offsets.append(len(self.q))
return self
def __len__(self): return len(self.cmd_offsets) - 1
def memory_barrier(self): return self
def memory_barrier(self): return self._mark_command_end()
def wait(self, signal, value=0):
self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
(3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT
self.next_cmd_index += 1
return self
return self._mark_command_end()
def signal(self, signal, value=0, timestamp=False):
self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
(1 << 0) | (1 << 20) | (1 << 24) | ((1 << 25) if timestamp else 0)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
(1 << 0) | (1 << 20) | (1 << 24) | ((1 << 25) if timestamp else 0)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
self.next_cmd_index += 1
return self._mark_command_end()
def update_signal(self, cmd_idx, signal=None, value=None): return self.update_wait(cmd_idx, signal, value) # the same offsets and commands
def update_wait(self, cmd_idx, signal=None, value=None):
if signal is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64_le(mv_address(signal))])
if value is not None: self.q[(valoff:=self.cmd_offsets[cmd_idx]+3):valoff+2] = array.array('I', [*nvdata64_le(value)])
return self
def bind(self, device: NVDevice):
@@ -139,21 +146,20 @@ class HWQueue:
class HWComputeQueue(HWQueue):
def __init__(self):
super().__init__()
self.ptr_to_qmd, self.ptr_to_global_dims, self.ptr_to_local_dims = {}, {}, {}
self.cmd_idx_to_qmd, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}
def copy_from_cpu(self, gpuaddr, data):
self.q += [nvmethod(1, nv_gpu.NVC6C0_OFFSET_OUT_UPPER, 2), *nvdata64(gpuaddr)]
self.q += [nvmethod(1, nv_gpu.NVC6C0_LINE_LENGTH_IN, 2), len(data)*4, 0x1]
self.q += [nvmethod(1, nv_gpu.NVC6C0_LAUNCH_DMA, 1), 0x41]
self.q += [nvmethod(1, nv_gpu.NVC6C0_LOAD_INLINE_DATA, len(data), typ=6)] + [x for x in data]
self.next_cmd_index += 1
return self
return self._mark_command_end()
def exec(self, prg, kernargs, global_size=(1,1,1), local_size=(1,1,1), signal=None, signal_value=0, chain_exec_ptr=None):
ctypes.memmove(qmd_addr:=(kernargs + round_up(prg.constbuf_0_size, 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
self.ptr_to_qmd[self.ptr()] = qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
self.ptr_to_global_dims[self.ptr()] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
self.ptr_to_local_dims[self.ptr()] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')
self.cmd_idx_to_qmd[len(self)] = qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
self.cmd_idx_to_global_dims[len(self)] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
self.cmd_idx_to_local_dims[len(self)] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')
qmd.cta_raster_width, qmd.cta_raster_height, qmd.cta_raster_depth = global_size
qmd.cta_thread_dimension0, qmd.cta_thread_dimension1, qmd.cta_thread_dimension2 = local_size
@@ -171,17 +177,16 @@ class HWComputeQueue(HWQueue):
self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_PCAS_A, 0x1), qmd_addr >> 8]
self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 0x1), 9]
else:
self.ptr_to_qmd[chain_exec_ptr].dependent_qmd0_pointer = qmd_addr >> 8
self.ptr_to_qmd[chain_exec_ptr].dependent_qmd0_action = 1
self.ptr_to_qmd[chain_exec_ptr].dependent_qmd0_prefetch = 1
self.ptr_to_qmd[chain_exec_ptr].dependent_qmd0_enable = 1
self.next_cmd_index += 1
return self
self.cmd_idx_to_qmd[chain_exec_ptr].dependent_qmd0_pointer = qmd_addr >> 8
self.cmd_idx_to_qmd[chain_exec_ptr].dependent_qmd0_action = 1
self.cmd_idx_to_qmd[chain_exec_ptr].dependent_qmd0_prefetch = 1
self.cmd_idx_to_qmd[chain_exec_ptr].dependent_qmd0_enable = 1
return self._mark_command_end()
def update_exec(self, cmd_ptr, global_size, local_size):
def update_exec(self, cmd_idx, global_size, local_size):
# Patch the exec cmd with new launch dims
self.ptr_to_global_dims[cmd_ptr][:] = array.array('I', global_size)
self.ptr_to_local_dims[cmd_ptr][:] = array.array('H', local_size)
self.cmd_idx_to_global_dims[cmd_idx][:] = array.array('I', global_size)
self.cmd_idx_to_local_dims[cmd_idx][:] = array.array('H', local_size)
def submit(self, dev:NVDevice): self._submit(dev, dev.compute_gpfifo)
@@ -190,13 +195,16 @@ class HWCopyQueue(HWQueue):
self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *nvdata64(src), *nvdata64(dest)]
self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
self.next_cmd_index += 1
return self
return self._mark_command_end()
def signal(self, signal, value=0):
self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 4), *nvdata64(ctypes.addressof(from_mv(signal))), value, 4]
self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14]
self.next_cmd_index += 1
return self._mark_command_end()
def update_signal(self, cmd_idx, signal=None, value=None):
if signal is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64(mv_address(signal))])
if value is not None: self.q[self.cmd_offsets[cmd_idx]+3] = value
return self
def submit(self, dev:NVDevice): self._submit(dev, dev.dma_gpfifo)