From 7fda464b08837c88487e2814950a28f93fa1495c Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Tue, 3 Dec 2024 23:53:35 +0300 Subject: [PATCH] hcq c-like args state (#8020) * hcq c-like args state * ugh * Dfix * rename * i --- tinygrad/runtime/ops_amd.py | 19 +++---------------- tinygrad/runtime/ops_nv.py | 14 +++----------- tinygrad/runtime/support/hcq.py | 17 ++++++++++++++++- 3 files changed, 22 insertions(+), 28 deletions(-) diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index 62c53c506b..ed86173ef4 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -3,7 +3,7 @@ from typing import Tuple, List, Any, Optional import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, array, contextlib, sys assert sys.platform != 'win32' from dataclasses import dataclass -from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram +from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram from tinygrad.ops import sint from tinygrad.device import BufferSpec from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address @@ -79,7 +79,7 @@ class AMDComputeQueue(HWQueue): self.acquire_mem() return self - def exec(self, prg:AMDProgram, args_state:AMDArgsState, global_size:Tuple[sint, ...], local_size:Tuple[sint, ...]): + def exec(self, prg:AMDProgram, args_state:CLikeArgsState, global_size:Tuple[sint, ...], local_size:Tuple[sint, ...]): self.acquire_mem(gli=0, gl2=0) if prg.enable_private_segment_sgpr: @@ -221,19 +221,6 @@ class AMDCopyQueue(HWQueue): dev.sdma_queue.write_ptr[0] = dev.sdma_queue.put_value dev.sdma_queue.doorbell[0] = dev.sdma_queue.put_value -class AMDArgsState(HCQArgsState): - def __init__(self, ptr:int, prg:AMDProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()): - super().__init__(ptr, prg, bufs, vals=vals) - - self.bufs = to_mv(self.ptr, len(bufs) * 8).cast('Q') - self.vals = to_mv(self.ptr + len(bufs) * 8, len(vals) * 4).cast('I') - - self.bufs[:] = array.array('Q', [b.va_addr for b in bufs]) - self.vals[:] = array.array('I', vals) - - def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr - def update_var(self, index:int, val:int): self.vals[index] = val - class AMDProgram(HCQProgram): def __init__(self, dev:AMDDevice, name:str, lib:bytes): # TODO; this API needs the type signature of the function and global_size/local_size @@ -266,7 +253,7 @@ class AMDProgram(HCQProgram): self.enable_private_segment_sgpr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0 - super().__init__(AMDArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz) + super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz) def __del__(self): if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True)) diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index 9f4113f4e7..d0bd13b26b 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -3,7 +3,7 @@ import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, sys assert sys.platform != 'win32' from typing import Tuple, List, Any, cast, Union, Dict, Type, Optional from dataclasses import dataclass -from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQProgram, HCQSignal +from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal from tinygrad.ops import sint from tinygrad.device import BufferSpec from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod @@ -190,18 +190,10 @@ class NVCopyQueue(NVCommandQueue): def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.dma_gpfifo) -class NVArgsState(HCQArgsState): +class NVArgsState(CLikeArgsState): def __init__(self, ptr:int, prg:NVProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()): - super().__init__(ptr, prg, bufs, vals=vals) - if MOCKGPU: prg.constbuffer_0[0:2] = [len(bufs), len(vals)] - kernargs = [arg_half for arg in bufs for arg_half in data64_le(arg.va_addr)] + list(vals) - to_mv(self.ptr, (len(prg.constbuffer_0) + len(kernargs)) * 4).cast('I')[:] = array.array('I', prg.constbuffer_0 + kernargs) - self.bufs = to_mv(self.ptr + len(prg.constbuffer_0) * 4, len(bufs) * 8).cast('Q') - self.vals = to_mv(self.ptr + len(prg.constbuffer_0) * 4 + len(bufs) * 8, len(vals) * 4).cast('I') - - def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr - def update_var(self, index:int, val:int): self.vals[index] = val + super().__init__(ptr, prg, bufs, vals=vals, prefix=prg.constbuffer_0) class NVProgram(HCQProgram): def __init__(self, dev:NVDevice, name:str, lib:bytes): diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index 16250d415f..e713453ad0 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -1,6 +1,6 @@ from __future__ import annotations from typing import List, Optional, Dict, Tuple, cast, Protocol, Type, Union, TypeVar, Generic, Any -import contextlib, decimal, statistics, random, json, atexit, time, ctypes +import contextlib, decimal, statistics, random, json, atexit, time, ctypes, array from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv, to_mv from tinygrad.renderer import Renderer from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator @@ -228,6 +228,21 @@ class HCQArgsState(Generic[ProgramType]): def update_buffer(self, index:int, buf:HCQBuffer): raise NotImplementedError("need update_buffer") def update_var(self, index:int, val:int): raise NotImplementedError("need update_var") +class CLikeArgsState(HCQArgsState[ProgramType]): + def __init__(self, ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=(), prefix:Optional[List[int]]=None): + super().__init__(ptr, prg, bufs, vals=vals) + + if prefix is not None: to_mv(self.ptr, len(prefix) * 4).cast('I')[:] = array.array('I', prefix) + + self.bufs = to_mv(self.ptr + len(prefix or []) * 4, len(bufs) * 8).cast('Q') + self.vals = to_mv(self.ptr + len(prefix or []) * 4 + len(bufs) * 8, len(vals) * 4).cast('I') + + self.bufs[:] = array.array('Q', [b.va_addr for b in bufs]) + self.vals[:] = array.array('I', vals) + + def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr + def update_var(self, index:int, val:int): self.vals[index] = val + class HCQProgram(Generic[DeviceType]): def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int): self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size