hcq move out program call to base class (#5638)

* hcq move out program call to base class

* fix
nimlgen authored on 2024-07-23 14:25:38 +03:00, committed by GitHub
parent 7cb67e6fb2 · commit a93982ef42
3 changed files with 24 additions and 38 deletions

--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -411,8 +411,8 @@ def hcq_profile(dev, enabled, desc, queue_type=None, queue=None):
   if enabled and PROFILE: dev.sig_prof_records.append((st, en, desc, queue_type is dev.hw_copy_queue_t))
 
 class HCQProgram:
-  def __init__(self, device:HCQCompiled, kernargs_alloc_size:int, kernargs_args_offset:int=0):
-    self.device, self.kernargs_alloc_size, self.kernargs_args_offset = device, kernargs_alloc_size, kernargs_args_offset
+  def __init__(self, device:HCQCompiled, name:str, kernargs_alloc_size:int, kernargs_args_offset:int=0):
+    self.device, self.name, self.kernargs_alloc_size, self.kernargs_args_offset = device, name, kernargs_alloc_size, kernargs_args_offset
 
   def fill_kernargs(self, bufs:Tuple[Any, ...], vals:Tuple[int, ...]=(), kernargs_ptr:Optional[int]=None):
     """
@@ -420,9 +420,23 @@ class HCQProgram:
     """
     self._fill_kernargs(ptr:=(kernargs_ptr or self.device._alloc_kernargs(self.kernargs_alloc_size)), bufs, vals)
     return ptr
 
   def _fill_kernargs(self, kernargs_ptr:int, bufs:Tuple[Any, ...], vals:Tuple[int, ...]=()): raise NotImplementedError("need fill_kernargs")
 
+  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+    kernargs_ptr = self.fill_kernargs(args, vals)
+    q = self.device.hw_compute_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
+
+    with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
+      q.exec(self, kernargs_ptr, global_size, local_size)
+
+    q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+    self.device.timeline_value += 1
+
+    if wait:
+      self.device.timeline_signal.wait(self.device.timeline_value - 1)
+      return (sig_en.timestamp - sig_st.timestamp) / 1e6
+
 class HCQCompiled(Compiled):
   """
   A base class for devices compatible with the HCQ (Hardware Command Queue) API.

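Stripped of the diff noise, the new base-class __call__ is a classic template method: the wait, barrier, exec, signal, submit sequence is identical for every backend, and the only per-backend choice, which hardware queue class to build, is read off the device as hw_compute_queue_t. A minimal runnable sketch of that shape (FakeQueue, FakeDevice, and ProgramBase are illustrative stand-ins, not tinygrad classes):

class FakeQueue:
  """Stands in for a hardware command queue (AMDComputeQueue, NVComputeQueue, ...)."""
  def wait(self, sig, value): return self          # chainable, like the real queues
  def memory_barrier(self): return self
  def exec(self, prog, kernargs_ptr, global_size, local_size): return self
  def signal(self, sig, value): return self
  def submit(self, dev): return self

class FakeDevice:
  hw_compute_queue_t = FakeQueue                   # the one per-backend choice
  def __init__(self): self.timeline_signal, self.timeline_value = object(), 1

class ProgramBase:
  def __init__(self, device, name): self.device, self.name = device, name
  def __call__(self, global_size=(1,1,1), local_size=(1,1,1)):
    # same wait -> barrier -> exec -> signal -> submit sequence as the diff above
    q = self.device.hw_compute_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
    q.exec(self, 0, global_size, local_size)
    q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
    self.device.timeline_value += 1

ProgramBase(FakeDevice(), "my_kernel")()           # runs with no real hardware

Subclasses keep only _fill_kernargs plus whatever validation they need, as the AMD and NV hunks below show.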
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -2,9 +2,9 @@ from __future__ import annotations
 from typing import Tuple, List, Any, cast
 import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, errno, subprocess, time, array
 from dataclasses import dataclass
-from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, hcq_profile, \
+from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, \
                             HCQSignal, HCQProgram, Compiler, CompileError, BufferOptions
-from tinygrad.helpers import getenv, to_mv, round_up, data64_le, DEBUG, PROFILE, mv_address
+from tinygrad.helpers import getenv, to_mv, round_up, data64_le, DEBUG, mv_address
 from tinygrad.renderer.cstyle import AMDRenderer
 from tinygrad.runtime.support.hip_comgr import compile_hip
 import tinygrad.runtime.autogen.kfd as kfd
@@ -311,7 +311,7 @@ class AMDProgram(HCQProgram):
     # If required, allocate space for the dispatch packet in the kernargs to pass it to the GPU.
     args_alloc_sz = self.kernargs_segment_size + (ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.kernel_code_properties & 0x2 else 0)
-    super().__init__(self.device, kernargs_alloc_size=args_alloc_sz)
+    super().__init__(self.device, self.name, kernargs_alloc_size=args_alloc_sz)
 
   def __del__(self):
     if hasattr(self, 'lib_gpu'): cast(AMDDevice, self.device)._gpu_free(self.lib_gpu)
@@ -321,21 +321,6 @@ class AMDProgram(HCQProgram):
     if len(bufs): to_mv(kernargs_ptr, len(bufs) * 8).cast('Q')[:] = array.array('Q', [b.va_addr for b in bufs])
     if len(vals): to_mv(kernargs_ptr + len(bufs) * 8, len(vals) * 4).cast('I')[:] = array.array('I', vals)
 
-  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
-    kernargs_ptr = self.fill_kernargs(args, vals)
-    q = AMDComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
-
-    with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
-      q.exec(self, kernargs_ptr, global_size, local_size)
-
-    q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-    self.device.timeline_value += 1
-
-    if wait:
-      self.device.timeline_signal.wait(self.device.timeline_value - 1)
-      return (sig_en.timestamp - sig_st.timestamp) / 1e6
-
 class AMDAllocator(HCQAllocator):
   def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE)

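The surviving _fill_kernargs (the context lines above the removed __call__) is the only backend-specific piece left in AMDProgram: it packs 64-bit buffer addresses first, then 32-bit integer values. A self-contained rehearsal of that packing against a plain bytearray, standing in for the GPU-visible kernargs region that to_mv() wraps in the real code (pack_kernargs and the addresses below are made up for illustration):

import array

def pack_kernargs(mem:bytearray, buf_addrs:list, vals:tuple=()):
  # mirrors AMDProgram._fill_kernargs: u64 addresses, then u32 values
  mv = memoryview(mem)
  if len(buf_addrs): mv[:len(buf_addrs) * 8].cast('Q')[:] = array.array('Q', buf_addrs)
  if len(vals): mv[len(buf_addrs) * 8:len(buf_addrs) * 8 + len(vals) * 4].cast('I')[:] = array.array('I', vals)

mem = bytearray(8 * 2 + 4 * 2)
pack_kernargs(mem, [0xdeadbeef000, 0xcafef00d000], vals=(16, 3))
print(mem.hex())  # two little-endian u64 addresses, then two u32 values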
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -3,8 +3,8 @@ import os, ctypes, contextlib, pathlib, re, fcntl, functools, mmap, struct, temp
 from typing import Tuple, List, Any, cast, Union, Dict
 from dataclasses import dataclass
 from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, hcq_command, \
-                            HCQProgram, HCQSignal, hcq_profile, Compiler, CompileError, BufferOptions
-from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, to_char_p_p, DEBUG, prod, PROFILE
+                            HCQProgram, HCQSignal, Compiler, CompileError, BufferOptions
+from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, to_char_p_p, DEBUG, prod
 from tinygrad.renderer.cstyle import NVRenderer
 from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler, PTXCompiler, PTX
 import tinygrad.runtime.autogen.nv_gpu as nv_gpu
@@ -290,7 +290,7 @@ class NVProgram(HCQProgram):
     self.max_threads = ((65536 // round_up(max(1, self.registers_usage) * 32, 256)) // 4) * 4 * 32
 
     # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
-    super().__init__(self.device, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8), kernargs_args_offset=0x160)
+    super().__init__(self.device, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8), kernargs_args_offset=0x160)
 
   def __del__(self):
     if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferOptions(cpu_access=True))
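The sizing arithmetic in that super().__init__ call unpacks as: round the constant-buffer size up to a 256-byte boundary (1 << 8 = 0x100), then reserve 8 << 8 = 2048 extra bytes on top, which is where the QMD mentioned in the comment gets appended. A worked check (the 0x190 input is a hypothetical constbufs[0][1] value, not taken from the diff):

# worked example of the NV kernargs sizing; 0x190 is a made-up constbuf size
def round_up(x:int, align:int) -> int: return (x + align - 1) // align * align

alloc = round_up(0x190, 1 << 8) + (8 << 8)  # round to 256B, plus 2048B for the QMD
print(hex(alloc))                           # 0x200 + 0x800 = 0xa00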
@@ -305,20 +305,7 @@ class NVProgram(HCQProgram):
     if prod(local_size) > 1024 or self.max_threads < prod(local_size): raise RuntimeError("Too many resources requsted for launch")
     if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
       raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
-
-    kernargs_ptr = self.fill_kernargs(args, vals)
-    q = NVComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
-
-    with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
-      q.exec(self, kernargs_ptr, global_size, local_size)
-
-    q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-    self.device.timeline_value += 1
-
-    if wait:
-      self.device.timeline_signal.wait(self.device.timeline_value - 1)
-      return (sig_en.timestamp - sig_st.timestamp) / 1e6
+    return super().__call__(*args, global_size=global_size, local_size=local_size, vals=vals, wait=wait)
 
 class NVAllocator(HCQAllocator):
   def __init__(self, device:NVDevice): super().__init__(device)
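NVProgram thus keeps only a thin validate-then-delegate override: check the launch dimensions against the hardware limits, then hand off to the shared HCQProgram.__call__. A runnable miniature of that shape (Base and NVLike are illustrative stand-ins, not tinygrad classes):

class Base:
  def __call__(self, *args, global_size=(1,1,1), local_size=(1,1,1), vals=(), wait=False):
    return f"dispatched {global_size=} {local_size=}"  # stands in for the shared HCQ dispatch

class NVLike(Base):
  def __call__(self, *args, global_size=(1,1,1), local_size=(1,1,1), vals=(), wait=False):
    # limits from the context lines above: grid <= (2147483647, 65535, 65535), block <= (1024, 1024, 64)
    if any(cur > mx for cur, mx in zip(global_size, (2147483647, 65535, 65535))) or \
       any(cur > mx for cur, mx in zip(local_size, (1024, 1024, 64))):
      raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
    return super().__call__(*args, global_size=global_size, local_size=local_size, vals=vals, wait=wait)

print(NVLike()(global_size=(256, 1, 1), local_size=(64, 1, 1)))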