Mirror of https://github.com/tinygrad/tinygrad.git (synced 2026-01-23 13:58:00 -05:00)
hcq move out program call to base class (#5638)
* hcq move out program call to base class
* fix
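Before this change, each backend program class (AMDProgram, NVProgram) carried its own copy of the launch logic in __call__; this commit hoists that logic into HCQProgram, which now also takes the kernel name so the base class can label profiling records. A backend only has to report its kernarg sizing and implement _fill_kernargs. A minimal sketch of the resulting pattern, with a hypothetical backend class and made-up sizes (not part of this commit):

from typing import Any, Tuple
from tinygrad.device import HCQCompiled, HCQProgram  # HCQProgram lives in tinygrad.device at this commit

class MyHCQProgram(HCQProgram):  # hypothetical backend program, illustration only
  def __init__(self, device:HCQCompiled, name:str):
    # kernargs_alloc_size/kernargs_args_offset are made up; real backends derive them from the loaded binary
    super().__init__(device, name, kernargs_alloc_size=0x200, kernargs_args_offset=0)

  # the single per-backend hook the inherited __call__ relies on: pack buffer
  # addresses and int vals into the kernarg region allocated for this launch
  def _fill_kernargs(self, kernargs_ptr:int, bufs:Tuple[Any, ...], vals:Tuple[int, ...]=()):
    pass  # backend-specific packing goes here

Launching is then uniform across backends: prog(*buffers, global_size=(gx,gy,gz), local_size=(lx,ly,lz), wait=True) builds the compute queue, runs the kernel under hcq_profile, and returns the measured execution time when wait=True.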
@@ -411,8 +411,8 @@ def hcq_profile(dev, enabled, desc, queue_type=None, queue=None):
   if enabled and PROFILE: dev.sig_prof_records.append((st, en, desc, queue_type is dev.hw_copy_queue_t))
 
 class HCQProgram:
-  def __init__(self, device:HCQCompiled, kernargs_alloc_size:int, kernargs_args_offset:int=0):
-    self.device, self.kernargs_alloc_size, self.kernargs_args_offset = device, kernargs_alloc_size, kernargs_args_offset
+  def __init__(self, device:HCQCompiled, name:str, kernargs_alloc_size:int, kernargs_args_offset:int=0):
+    self.device, self.name, self.kernargs_alloc_size, self.kernargs_args_offset = device, name, kernargs_alloc_size, kernargs_args_offset
 
   def fill_kernargs(self, bufs:Tuple[Any, ...], vals:Tuple[int, ...]=(), kernargs_ptr:Optional[int]=None):
     """
@@ -420,9 +420,23 @@ class HCQProgram:
     """
     self._fill_kernargs(ptr:=(kernargs_ptr or self.device._alloc_kernargs(self.kernargs_alloc_size)), bufs, vals)
     return ptr
 
   def _fill_kernargs(self, kernargs_ptr:int, bufs:Tuple[Any, ...], vals:Tuple[int, ...]=()): raise NotImplementedError("need fill_kernargs")
 
+  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+    kernargs_ptr = self.fill_kernargs(args, vals)
+
+    q = self.device.hw_compute_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
+
+    with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
+      q.exec(self, kernargs_ptr, global_size, local_size)
+
+    q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+    self.device.timeline_value += 1
+
+    if wait:
+      self.device.timeline_signal.wait(self.device.timeline_value - 1)
+      return (sig_en.timestamp - sig_st.timestamp) / 1e6
+
 class HCQCompiled(Compiled):
   """
   A base class for devices compatible with the HCQ (Hardware Command Queue) API.
@@ -2,9 +2,9 @@ from __future__ import annotations
 from typing import Tuple, List, Any, cast
 import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, errno, subprocess, time, array
 from dataclasses import dataclass
-from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, hcq_profile, \
+from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, \
                             HCQSignal, HCQProgram, Compiler, CompileError, BufferOptions
-from tinygrad.helpers import getenv, to_mv, round_up, data64_le, DEBUG, PROFILE, mv_address
+from tinygrad.helpers import getenv, to_mv, round_up, data64_le, DEBUG, mv_address
 from tinygrad.renderer.cstyle import AMDRenderer
 from tinygrad.runtime.support.hip_comgr import compile_hip
 import tinygrad.runtime.autogen.kfd as kfd
@@ -311,7 +311,7 @@ class AMDProgram(HCQProgram):
 
     # If required, allocate space for the dispatch packet in the kernargs to pass it to the GPU.
     args_alloc_sz = self.kernargs_segment_size + (ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.kernel_code_properties & 0x2 else 0)
-    super().__init__(self.device, kernargs_alloc_size=args_alloc_sz)
+    super().__init__(self.device, self.name, kernargs_alloc_size=args_alloc_sz)
 
   def __del__(self):
     if hasattr(self, 'lib_gpu'): cast(AMDDevice, self.device)._gpu_free(self.lib_gpu)
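A worked instance of the args_alloc_sz computation quoted above, with made-up sizes (the 64-byte packet size is the usual sizeof of an HSA AQL dispatch packet, not a value taken from this commit):

# sketch of the sizing logic above; the concrete numbers are hypothetical
def args_alloc_sz(kernargs_segment_size:int, kernel_code_properties:int, dispatch_packet_sz:int=64) -> int:
  return kernargs_segment_size + (dispatch_packet_sz if kernel_code_properties & 0x2 else 0)

assert args_alloc_sz(0x50, 0x2) == 0x50 + 64  # dispatch packet space appended
assert args_alloc_sz(0x50, 0x0) == 0x50       # no extra space reserved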
@@ -321,21 +321,6 @@ class AMDProgram(HCQProgram):
     if len(bufs): to_mv(kernargs_ptr, len(bufs) * 8).cast('Q')[:] = array.array('Q', [b.va_addr for b in bufs])
     if len(vals): to_mv(kernargs_ptr + len(bufs) * 8, len(vals) * 4).cast('I')[:] = array.array('I', vals)
 
-  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
-    kernargs_ptr = self.fill_kernargs(args, vals)
-
-    q = AMDComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
-
-    with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
-      q.exec(self, kernargs_ptr, global_size, local_size)
-
-    q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-    self.device.timeline_value += 1
-
-    if wait:
-      self.device.timeline_signal.wait(self.device.timeline_value - 1)
-      return (sig_en.timestamp - sig_st.timestamp) / 1e6
-
 class AMDAllocator(HCQAllocator):
   def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE)
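The _fill_kernargs context lines above spell out AMD's kernarg layout: one 8-byte GPU virtual address per buffer, followed by one 4-byte value per int argument. A self-contained illustration of the same packing, with made-up addresses and values (assuming the little-endian layout the surrounding data64_le helpers imply):

import struct

buf_addrs = (0x7f0000001000, 0x7f0000002000)  # hypothetical b.va_addr values
vals = (16, 3)                                # hypothetical int arguments

# 'Q' = 8-byte unsigned, 'I' = 4-byte unsigned; mirrors the array('Q')/array('I') writes above
blob = struct.pack(f"<{len(buf_addrs)}Q{len(vals)}I", *buf_addrs, *vals)
assert len(blob) == len(buf_addrs)*8 + len(vals)*4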
@@ -3,8 +3,8 @@ import os, ctypes, contextlib, pathlib, re, fcntl, functools, mmap, struct, temp
 from typing import Tuple, List, Any, cast, Union, Dict
 from dataclasses import dataclass
 from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, hcq_command, \
-                            HCQProgram, HCQSignal, hcq_profile, Compiler, CompileError, BufferOptions
-from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, to_char_p_p, DEBUG, prod, PROFILE
+                            HCQProgram, HCQSignal, Compiler, CompileError, BufferOptions
+from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, to_char_p_p, DEBUG, prod
 from tinygrad.renderer.cstyle import NVRenderer
 from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler, PTXCompiler, PTX
 import tinygrad.runtime.autogen.nv_gpu as nv_gpu
@@ -290,7 +290,7 @@ class NVProgram(HCQProgram):
     self.max_threads = ((65536 // round_up(max(1, self.registers_usage) * 32, 256)) // 4) * 4 * 32
 
     # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
-    super().__init__(self.device, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8), kernargs_args_offset=0x160)
+    super().__init__(self.device, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8), kernargs_args_offset=0x160)
 
   def __del__(self):
     if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferOptions(cpu_access=True))
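The comment above gives NV's kernarg layout: a 0x160-byte constbuffer first (hence kernargs_args_offset=0x160), kernel arguments after it, and a QMD appended at the end, which is presumably what the extra (8 << 8) bytes cover. A worked instance of the size computation with an assumed constbuffer size (0x188 is illustrative):

def nv_kernargs_alloc_size(constbuf0_size:int) -> int:
  round_up = lambda x, a: (x + a - 1) // a * a  # same math as tinygrad.helpers.round_up
  return round_up(constbuf0_size, 1 << 8) + (8 << 8)

# a 0x188-byte constbuffer rounds up to 0x200, plus 0x800 reserved at the end
assert nv_kernargs_alloc_size(0x188) == 0x200 + 0x800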
@@ -305,20 +305,7 @@ class NVProgram(HCQProgram):
     if prod(local_size) > 1024 or self.max_threads < prod(local_size): raise RuntimeError("Too many resources requsted for launch")
     if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
       raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
-
-    kernargs_ptr = self.fill_kernargs(args, vals)
-
-    q = NVComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
-
-    with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
-      q.exec(self, kernargs_ptr, global_size, local_size)
-
-    q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-    self.device.timeline_value += 1
-
-    if wait:
-      self.device.timeline_signal.wait(self.device.timeline_value - 1)
-      return (sig_en.timestamp - sig_st.timestamp) / 1e6
+    return super().__call__(*args, global_size=global_size, local_size=local_size, vals=vals, wait=wait)
 
 class NVAllocator(HCQAllocator):
   def __init__(self, device:NVDevice): super().__init__(device)