init hcq args state (#6046)

* init hcq args state

* cleaner

* amd

* fillargs

* fixes

* myoy

* docs

* fix

* not needed

* spacing
This commit is contained in:
nimlgen
2024-08-13 17:11:58 +03:00
committed by GitHub
parent 9d2ea94fe9
commit fa84e6ec48
5 changed files with 75 additions and 53 deletions

View File

@@ -2,7 +2,7 @@ from __future__ import annotations
from typing import Tuple, List, Any, cast
import os, fcntl, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal
from dataclasses import dataclass
from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, \
from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, HCQArgsState, \
HCQSignal, HCQProgram, BufferOptions
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, DEBUG, mv_address
from tinygrad.renderer.cstyle import AMDRenderer
@@ -101,18 +101,18 @@ class AMDComputeQueue(HWComputeQueue):
nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
self._acquire_mem()
def _exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1)):
def _exec(self, prg, args_state, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1)):
self._acquire_mem(gli=0, gl2=0)
user_regs, cmd_idx = [], len(self) - 1
if prg.enable_dispatch_ptr:
dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=kernargs + prg.kernargs_segment_size)
dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=args_state.ptr + prg.kernargs_segment_size)
dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, kernargs
dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
user_regs += [*data64_le(dp_addr)]
self.cmd_idx_to_dispatch_packet[cmd_idx] = dp
user_regs += [*data64_le(kernargs)]
user_regs += [*data64_le(args_state.ptr)]
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8),
*data64_le(0), *data64_le(prg.device.scratch.va_addr >> 8)]
@@ -256,6 +256,19 @@ class AMDCopyQueue(HWCopyQueue):
device.sdma_queue.write_ptr[0] = device.sdma_queue.put_value
device.sdma_queue.doorbell[0] = device.sdma_queue.put_value
class AMDArgsState(HCQArgsState):
def __init__(self, ptr:int, prg:AMDProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
super().__init__(ptr, prg, bufs, vals=vals)
self.bufs = to_mv(self.ptr, len(bufs) * 8).cast('Q')
self.vals = to_mv(self.ptr + len(bufs) * 8, len(vals) * 4).cast('I')
self.bufs[:] = array.array('Q', [b.va_addr for b in bufs])
self.vals[:] = array.array('I', vals)
def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
def update_var(self, index:int, val:int): self.vals[index] = val
class AMDProgram(HCQProgram):
def __init__(self, device:AMDDevice, name:str, lib:bytes):
# TODO; this API needs the type signature of the function and global_size/local_size
@@ -288,16 +301,11 @@ class AMDProgram(HCQProgram):
self.enable_dispatch_ptr = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0
super().__init__(self.device, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)
super().__init__(AMDArgsState, self.device, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)
def __del__(self):
if hasattr(self, 'lib_gpu'): cast(AMDDevice, self.device)._gpu_free(self.lib_gpu)
def _fill_kernargs(self, kernargs_ptr:int, bufs:Tuple[Any, ...], vals:Tuple[int, ...]=()):
if (given:=len(bufs)*8 + len(vals)*4) != (want:=self.kernargs_segment_size): raise RuntimeError(f'incorrect args size {given=} != {want=}')
if len(bufs): to_mv(kernargs_ptr, len(bufs) * 8).cast('Q')[:] = array.array('Q', [b.va_addr for b in bufs])
if len(vals): to_mv(kernargs_ptr + len(bufs) * 8, len(vals) * 4).cast('I')[:] = array.array('I', vals)
class AMDAllocator(HCQAllocator):
def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE)