diff --git a/tinygrad/device.py b/tinygrad/device.py index 02ac91fcf4..725cceaf38 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -411,8 +411,8 @@ def hcq_profile(dev, enabled, desc, queue_type=None, queue=None): if enabled and PROFILE: dev.sig_prof_records.append((st, en, desc, queue_type is dev.hw_copy_queue_t)) class HCQProgram: - def __init__(self, device:HCQCompiled, kernargs_alloc_size:int, kernargs_args_offset:int=0): - self.device, self.kernargs_alloc_size, self.kernargs_args_offset = device, kernargs_alloc_size, kernargs_args_offset + def __init__(self, device:HCQCompiled, name:str, kernargs_alloc_size:int, kernargs_args_offset:int=0): + self.device, self.name, self.kernargs_alloc_size, self.kernargs_args_offset = device, name, kernargs_alloc_size, kernargs_args_offset def fill_kernargs(self, bufs:Tuple[Any, ...], vals:Tuple[int, ...]=(), kernargs_ptr:Optional[int]=None): """ @@ -420,9 +420,23 @@ class HCQProgram: """ self._fill_kernargs(ptr:=(kernargs_ptr or self.device._alloc_kernargs(self.kernargs_alloc_size)), bufs, vals) return ptr - def _fill_kernargs(self, kernargs_ptr:int, bufs:Tuple[Any, ...], vals:Tuple[int, ...]=()): raise NotImplementedError("need fill_kernargs") + def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False): + kernargs_ptr = self.fill_kernargs(args, vals) + + q = self.device.hw_compute_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier() + + with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en): + q.exec(self, kernargs_ptr, global_size, local_size) + + q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device) + self.device.timeline_value += 1 + + if wait: + self.device.timeline_signal.wait(self.device.timeline_value - 1) + return (sig_en.timestamp - sig_st.timestamp) / 1e6 + class HCQCompiled(Compiled): """ A base class for devices compatible with the HCQ (Hardware Command Queue) API. diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index cc6037bc43..78fb731185 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -2,9 +2,9 @@ from __future__ import annotations from typing import Tuple, List, Any, cast import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, errno, subprocess, time, array from dataclasses import dataclass -from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, hcq_profile, \ +from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, \ HCQSignal, HCQProgram, Compiler, CompileError, BufferOptions -from tinygrad.helpers import getenv, to_mv, round_up, data64_le, DEBUG, PROFILE, mv_address +from tinygrad.helpers import getenv, to_mv, round_up, data64_le, DEBUG, mv_address from tinygrad.renderer.cstyle import AMDRenderer from tinygrad.runtime.support.hip_comgr import compile_hip import tinygrad.runtime.autogen.kfd as kfd @@ -311,7 +311,7 @@ class AMDProgram(HCQProgram): # If required, allocate space for the dispatch packet in the kernargs to pass it to the GPU. args_alloc_sz = self.kernargs_segment_size + (ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.kernel_code_properties & 0x2 else 0) - super().__init__(self.device, kernargs_alloc_size=args_alloc_sz) + super().__init__(self.device, self.name, kernargs_alloc_size=args_alloc_sz) def __del__(self): if hasattr(self, 'lib_gpu'): cast(AMDDevice, self.device)._gpu_free(self.lib_gpu) @@ -321,21 +321,6 @@ class AMDProgram(HCQProgram): if len(bufs): to_mv(kernargs_ptr, len(bufs) * 8).cast('Q')[:] = array.array('Q', [b.va_addr for b in bufs]) if len(vals): to_mv(kernargs_ptr + len(bufs) * 8, len(vals) * 4).cast('I')[:] = array.array('I', vals) - def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False): - kernargs_ptr = self.fill_kernargs(args, vals) - - q = AMDComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier() - - with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en): - q.exec(self, kernargs_ptr, global_size, local_size) - - q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device) - self.device.timeline_value += 1 - - if wait: - self.device.timeline_signal.wait(self.device.timeline_value - 1) - return (sig_en.timestamp - sig_st.timestamp) / 1e6 - class AMDAllocator(HCQAllocator): def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE) diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index d7e9dd2783..3e13676f57 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -3,8 +3,8 @@ import os, ctypes, contextlib, pathlib, re, fcntl, functools, mmap, struct, temp from typing import Tuple, List, Any, cast, Union, Dict from dataclasses import dataclass from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, hcq_command, \ - HCQProgram, HCQSignal, hcq_profile, Compiler, CompileError, BufferOptions -from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, to_char_p_p, DEBUG, prod, PROFILE + HCQProgram, HCQSignal, Compiler, CompileError, BufferOptions +from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, to_char_p_p, DEBUG, prod from tinygrad.renderer.cstyle import NVRenderer from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler, PTXCompiler, PTX import tinygrad.runtime.autogen.nv_gpu as nv_gpu @@ -290,7 +290,7 @@ class NVProgram(HCQProgram): self.max_threads = ((65536 // round_up(max(1, self.registers_usage) * 32, 256)) // 4) * 4 * 32 # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel. - super().__init__(self.device, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8), kernargs_args_offset=0x160) + super().__init__(self.device, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8), kernargs_args_offset=0x160) def __del__(self): if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferOptions(cpu_access=True)) @@ -305,20 +305,7 @@ class NVProgram(HCQProgram): if prod(local_size) > 1024 or self.max_threads < prod(local_size): raise RuntimeError("Too many resources requsted for launch") if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])): raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}") - - kernargs_ptr = self.fill_kernargs(args, vals) - - q = NVComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier() - - with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en): - q.exec(self, kernargs_ptr, global_size, local_size) - - q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device) - self.device.timeline_value += 1 - - if wait: - self.device.timeline_signal.wait(self.device.timeline_value - 1) - return (sig_en.timestamp - sig_st.timestamp) / 1e6 + return super().__call__(*args, global_size=global_size, local_size=local_size, vals=vals, wait=wait) class NVAllocator(HCQAllocator): def __init__(self, device:NVDevice): super().__init__(device)