mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-02-16 01:26:29 -05:00
hcq c-like args state (#8020)
* hcq c-like args state * ugh * Dfix * rename * i
This commit is contained in:
@@ -3,7 +3,7 @@ from typing import Tuple, List, Any, Optional
|
||||
import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, array, contextlib, sys
|
||||
assert sys.platform != 'win32'
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram
|
||||
from tinygrad.ops import sint
|
||||
from tinygrad.device import BufferSpec
|
||||
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address
|
||||
@@ -79,7 +79,7 @@ class AMDComputeQueue(HWQueue):
|
||||
self.acquire_mem()
|
||||
return self
|
||||
|
||||
def exec(self, prg:AMDProgram, args_state:AMDArgsState, global_size:Tuple[sint, ...], local_size:Tuple[sint, ...]):
|
||||
def exec(self, prg:AMDProgram, args_state:CLikeArgsState, global_size:Tuple[sint, ...], local_size:Tuple[sint, ...]):
|
||||
self.acquire_mem(gli=0, gl2=0)
|
||||
|
||||
if prg.enable_private_segment_sgpr:
|
||||
@@ -221,19 +221,6 @@ class AMDCopyQueue(HWQueue):
|
||||
dev.sdma_queue.write_ptr[0] = dev.sdma_queue.put_value
|
||||
dev.sdma_queue.doorbell[0] = dev.sdma_queue.put_value
|
||||
|
||||
class AMDArgsState(HCQArgsState):
|
||||
def __init__(self, ptr:int, prg:AMDProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
|
||||
super().__init__(ptr, prg, bufs, vals=vals)
|
||||
|
||||
self.bufs = to_mv(self.ptr, len(bufs) * 8).cast('Q')
|
||||
self.vals = to_mv(self.ptr + len(bufs) * 8, len(vals) * 4).cast('I')
|
||||
|
||||
self.bufs[:] = array.array('Q', [b.va_addr for b in bufs])
|
||||
self.vals[:] = array.array('I', vals)
|
||||
|
||||
def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
|
||||
def update_var(self, index:int, val:int): self.vals[index] = val
|
||||
|
||||
class AMDProgram(HCQProgram):
|
||||
def __init__(self, dev:AMDDevice, name:str, lib:bytes):
|
||||
# TODO; this API needs the type signature of the function and global_size/local_size
|
||||
@@ -266,7 +253,7 @@ class AMDProgram(HCQProgram):
|
||||
self.enable_private_segment_sgpr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
|
||||
additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0
|
||||
|
||||
super().__init__(AMDArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)
|
||||
super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)
|
||||
|
||||
def __del__(self):
|
||||
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))
|
||||
|
||||
@@ -3,7 +3,7 @@ import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, sys
|
||||
assert sys.platform != 'win32'
|
||||
from typing import Tuple, List, Any, cast, Union, Dict, Type, Optional
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQProgram, HCQSignal
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal
|
||||
from tinygrad.ops import sint
|
||||
from tinygrad.device import BufferSpec
|
||||
from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
|
||||
@@ -190,18 +190,10 @@ class NVCopyQueue(NVCommandQueue):
|
||||
|
||||
def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.dma_gpfifo)
|
||||
|
||||
class NVArgsState(HCQArgsState):
|
||||
class NVArgsState(CLikeArgsState):
|
||||
def __init__(self, ptr:int, prg:NVProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
|
||||
super().__init__(ptr, prg, bufs, vals=vals)
|
||||
|
||||
if MOCKGPU: prg.constbuffer_0[0:2] = [len(bufs), len(vals)]
|
||||
kernargs = [arg_half for arg in bufs for arg_half in data64_le(arg.va_addr)] + list(vals)
|
||||
to_mv(self.ptr, (len(prg.constbuffer_0) + len(kernargs)) * 4).cast('I')[:] = array.array('I', prg.constbuffer_0 + kernargs)
|
||||
self.bufs = to_mv(self.ptr + len(prg.constbuffer_0) * 4, len(bufs) * 8).cast('Q')
|
||||
self.vals = to_mv(self.ptr + len(prg.constbuffer_0) * 4 + len(bufs) * 8, len(vals) * 4).cast('I')
|
||||
|
||||
def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
|
||||
def update_var(self, index:int, val:int): self.vals[index] = val
|
||||
super().__init__(ptr, prg, bufs, vals=vals, prefix=prg.constbuffer_0)
|
||||
|
||||
class NVProgram(HCQProgram):
|
||||
def __init__(self, dev:NVDevice, name:str, lib:bytes):
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
from typing import List, Optional, Dict, Tuple, cast, Protocol, Type, Union, TypeVar, Generic, Any
|
||||
import contextlib, decimal, statistics, random, json, atexit, time, ctypes
|
||||
import contextlib, decimal, statistics, random, json, atexit, time, ctypes, array
|
||||
from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv, to_mv
|
||||
from tinygrad.renderer import Renderer
|
||||
from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator
|
||||
@@ -228,6 +228,21 @@ class HCQArgsState(Generic[ProgramType]):
|
||||
def update_buffer(self, index:int, buf:HCQBuffer): raise NotImplementedError("need update_buffer")
|
||||
def update_var(self, index:int, val:int): raise NotImplementedError("need update_var")
|
||||
|
||||
class CLikeArgsState(HCQArgsState[ProgramType]):
|
||||
def __init__(self, ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=(), prefix:Optional[List[int]]=None):
|
||||
super().__init__(ptr, prg, bufs, vals=vals)
|
||||
|
||||
if prefix is not None: to_mv(self.ptr, len(prefix) * 4).cast('I')[:] = array.array('I', prefix)
|
||||
|
||||
self.bufs = to_mv(self.ptr + len(prefix or []) * 4, len(bufs) * 8).cast('Q')
|
||||
self.vals = to_mv(self.ptr + len(prefix or []) * 4 + len(bufs) * 8, len(vals) * 4).cast('I')
|
||||
|
||||
self.bufs[:] = array.array('Q', [b.va_addr for b in bufs])
|
||||
self.vals[:] = array.array('I', vals)
|
||||
|
||||
def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
|
||||
def update_var(self, index:int, val:int): self.vals[index] = val
|
||||
|
||||
class HCQProgram(Generic[DeviceType]):
|
||||
def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int):
|
||||
self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size
|
||||
|
||||
Reference in New Issue
Block a user