hcq c-like args state (#8020)

* hcq c-like args state

* ugh

* Dfix

* rename

* i
This commit is contained in:
nimlgen
2024-12-03 23:53:35 +03:00
committed by GitHub
parent 099364ed32
commit 7fda464b08
3 changed files with 22 additions and 28 deletions

View File

@@ -3,7 +3,7 @@ from typing import Tuple, List, Any, Optional
import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, array, contextlib, sys
assert sys.platform != 'win32'
from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram
from tinygrad.ops import sint
from tinygrad.device import BufferSpec
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address
@@ -79,7 +79,7 @@ class AMDComputeQueue(HWQueue):
self.acquire_mem()
return self
def exec(self, prg:AMDProgram, args_state:AMDArgsState, global_size:Tuple[sint, ...], local_size:Tuple[sint, ...]):
def exec(self, prg:AMDProgram, args_state:CLikeArgsState, global_size:Tuple[sint, ...], local_size:Tuple[sint, ...]):
self.acquire_mem(gli=0, gl2=0)
if prg.enable_private_segment_sgpr:
@@ -221,19 +221,6 @@ class AMDCopyQueue(HWQueue):
dev.sdma_queue.write_ptr[0] = dev.sdma_queue.put_value
dev.sdma_queue.doorbell[0] = dev.sdma_queue.put_value
class AMDArgsState(HCQArgsState):
  """Kernel-argument block: 64-bit buffer addresses followed by 32-bit scalar values, starting at `ptr`."""
  def __init__(self, ptr:int, prg:AMDProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
    super().__init__(ptr, prg, bufs, vals=vals)

    # Size in bytes of the address region; the scalar region begins right after it.
    addrs_nbytes = len(bufs) * 8
    self.bufs = to_mv(self.ptr, addrs_nbytes).cast('Q')
    self.vals = to_mv(self.ptr + addrs_nbytes, len(vals) * 4).cast('I')

    # Populate both regions with the initial arguments.
    self.bufs[:] = array.array('Q', [buf.va_addr for buf in bufs])
    self.vals[:] = array.array('I', vals)

  def update_buffer(self, index:int, buf:HCQBuffer):
    """Store `buf.va_addr` into buffer slot `index`."""
    self.bufs[index] = buf.va_addr

  def update_var(self, index:int, val:int):
    """Store `val` into scalar slot `index`."""
    self.vals[index] = val
class AMDProgram(HCQProgram):
def __init__(self, dev:AMDDevice, name:str, lib:bytes):
# TODO: this API needs the type signature of the function and global_size/local_size
@@ -266,7 +253,7 @@ class AMDProgram(HCQProgram):
self.enable_private_segment_sgpr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0
super().__init__(AMDArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)
super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)
def __del__(self):
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))

View File

@@ -3,7 +3,7 @@ import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, sys
assert sys.platform != 'win32'
from typing import Tuple, List, Any, cast, Union, Dict, Type, Optional
from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQProgram, HCQSignal
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal
from tinygrad.ops import sint
from tinygrad.device import BufferSpec
from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
@@ -190,18 +190,10 @@ class NVCopyQueue(NVCommandQueue):
def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.dma_gpfifo)
# NOTE(review): this span looks like a diff hunk rendered without +/- markers, so two versions of
# NVArgsState appear interleaved (two `class` headers, two `super().__init__` calls) — confirm against the raw diff.
class NVArgsState(HCQArgsState):
class NVArgsState(CLikeArgsState):
def __init__(self, ptr:int, prg:NVProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
# presumably the pre-change base-class call (no `prefix` argument) — TODO confirm
super().__init__(ptr, prg, bufs, vals=vals)
# Under MOCKGPU, store the buffer count and value count in the first two entries of constbuffer_0.
if MOCKGPU: prg.constbuffer_0[0:2] = [len(bufs), len(vals)]
# Each buffer address is split into halves by data64_le; the scalar values are appended after them.
kernargs = [arg_half for arg in bufs for arg_half in data64_le(arg.va_addr)] + list(vals)
# Write constbuffer_0 followed by kernargs as 'I' (unsigned int) items starting at self.ptr.
to_mv(self.ptr, (len(prg.constbuffer_0) + len(kernargs)) * 4).cast('I')[:] = array.array('I', prg.constbuffer_0 + kernargs)
# 'Q' view over the buffer-address slots, located right after the constbuffer_0 words.
self.bufs = to_mv(self.ptr + len(prg.constbuffer_0) * 4, len(bufs) * 8).cast('Q')
# 'I' view over the scalar values, located right after the buffer addresses.
self.vals = to_mv(self.ptr + len(prg.constbuffer_0) * 4 + len(bufs) * 8, len(vals) * 4).cast('I')
# In-place updates of a single buffer-address slot / scalar slot through the views above.
def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
def update_var(self, index:int, val:int): self.vals[index] = val
# presumably the post-change call: passes constbuffer_0 to the base class as `prefix` — TODO confirm
super().__init__(ptr, prg, bufs, vals=vals, prefix=prg.constbuffer_0)
class NVProgram(HCQProgram):
def __init__(self, dev:NVDevice, name:str, lib:bytes):

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
from typing import List, Optional, Dict, Tuple, cast, Protocol, Type, Union, TypeVar, Generic, Any
import contextlib, decimal, statistics, random, json, atexit, time, ctypes
import contextlib, decimal, statistics, random, json, atexit, time, ctypes, array
from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv, to_mv
from tinygrad.renderer import Renderer
from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator
@@ -228,6 +228,21 @@ class HCQArgsState(Generic[ProgramType]):
def update_buffer(self, index:int, buf:HCQBuffer): raise NotImplementedError("need update_buffer")
def update_var(self, index:int, val:int): raise NotImplementedError("need update_var")
class CLikeArgsState(HCQArgsState[ProgramType]):
  """Kernel-argument block laid out as: optional 32-bit `prefix` words, 64-bit buffer addresses, 32-bit scalar values."""
  def __init__(self, ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=(), prefix:Optional[List[int]]=None):
    super().__init__(ptr, prg, bufs, vals=vals)

    # The prefix, when given, is copied verbatim to the very start of the block.
    if prefix is not None:
      to_mv(self.ptr, len(prefix) * 4).cast('I')[:] = array.array('I', prefix)

    # Byte offsets of the two argument regions that follow the (possibly absent) prefix.
    bufs_start = self.ptr + (len(prefix) * 4 if prefix is not None else 0)
    vals_start = bufs_start + len(bufs) * 8

    self.bufs = to_mv(bufs_start, len(bufs) * 8).cast('Q')
    self.vals = to_mv(vals_start, len(vals) * 4).cast('I')

    # Populate both regions with the initial arguments.
    self.bufs[:] = array.array('Q', [buf.va_addr for buf in bufs])
    self.vals[:] = array.array('I', vals)

  def update_buffer(self, index:int, buf:HCQBuffer):
    """Store `buf.va_addr` into buffer slot `index`."""
    self.bufs[index] = buf.va_addr

  def update_var(self, index:int, val:int):
    """Store `val` into scalar slot `index`."""
    self.vals[index] = val
class HCQProgram(Generic[DeviceType]):
def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int):
self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size