mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-27 07:48:07 -05:00
@@ -4,7 +4,7 @@ from typing import Tuple, List, Any, cast
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, HWCommandQueue, HWComputeQueue, HWCopyQueue, hcq_command, \
|
||||
hcq_profile, Compiler, CompileError, BufferOptions
|
||||
from tinygrad.helpers import getenv, from_mv, mv_address, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod, PROFILE
|
||||
from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod, PROFILE
|
||||
from tinygrad.renderer.cstyle import NVRenderer
|
||||
from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler, PTXCompiler, PTX
|
||||
import tinygrad.runtime.autogen.nv_gpu as nv_gpu
|
||||
@@ -98,11 +98,11 @@ class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method
|
||||
self.binded_device._gpu_free(self.hw_page)
|
||||
|
||||
def _wait(self, signal, value=0):
|
||||
self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
|
||||
self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(mv_address(signal)), *nvdata64_le(value),
|
||||
(3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT
|
||||
|
||||
def _signal(self, signal, value=0, timestamp=False):
|
||||
self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
|
||||
self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(mv_address(signal)), *nvdata64_le(value),
|
||||
(1 << 0) | (1 << 20) | (1 << 24) | ((1 << 25) if timestamp else 0)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
|
||||
self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
|
||||
def _timestamp(self, signal): return NVCommandQueue._signal(self, signal, timestamp=True)
|
||||
@@ -181,14 +181,14 @@ class NVComputeQueue(NVCommandQueue, HWComputeQueue):
|
||||
|
||||
def _signal(self, signal, value=0):
|
||||
if (prev_qmd:=self.cmd_idx_to_qmd.get(len(self) - 2)) is None or prev_qmd.release0_enable == 1: return super()._signal(signal, value)
|
||||
prev_qmd.release0_address_upper, prev_qmd.release0_address_lower = nvdata64(ctypes.addressof(from_mv(signal)))
|
||||
prev_qmd.release0_address_upper, prev_qmd.release0_address_lower = nvdata64(mv_address(signal))
|
||||
prev_qmd.release0_payload_upper, prev_qmd.release0_payload_lower = nvdata64(value)
|
||||
prev_qmd.release0_enable = 1
|
||||
self.cmd_idx_to_qmd[len(self) - 1] = prev_qmd # this command is embedded into qmd.
|
||||
|
||||
def _update_signal(self, cmd_idx, signal=None, value=None):
|
||||
if (qmd:=self.cmd_idx_to_qmd.get(cmd_idx)) is None: return super()._update_signal(cmd_idx, signal, value)
|
||||
if signal is not None: qmd.release0_address_upper, qmd.release0_address_lower = nvdata64(ctypes.addressof(from_mv(signal)))
|
||||
if signal is not None: qmd.release0_address_upper, qmd.release0_address_lower = nvdata64(mv_address(signal))
|
||||
if value is not None: qmd.release0_payload_upper, qmd.release0_payload_lower = nvdata64(value)
|
||||
|
||||
def _submit(self, device): self._submit_to_gpfifo(device, cast(NVDevice, device).compute_gpfifo)
|
||||
@@ -204,7 +204,7 @@ class NVCopyQueue(NVCommandQueue, HWCopyQueue):
|
||||
if src is not None: self._patch(cmd_idx, offset=1, data=nvdata64(src))
|
||||
|
||||
def _signal(self, signal, value=0):
|
||||
self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 4), *nvdata64(ctypes.addressof(from_mv(signal))), value, 4]
|
||||
self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 4), *nvdata64(mv_address(signal)), value, 4]
|
||||
self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14]
|
||||
|
||||
def _update_signal(self, cmd_idx, signal=None, value=None):
|
||||
|
||||
Reference in New Issue
Block a user