mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-02-15 17:15:48 -05:00
init hcq args state (#6046)
* init hcq args state * cleaner * amd * fillargs * fixes * myoy * docs * fix * not needed * spacing
This commit is contained in:
@@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
from typing import Tuple, List, Any, cast
|
||||
import os, fcntl, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, \
|
||||
from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, HCQArgsState, \
|
||||
HCQSignal, HCQProgram, BufferOptions
|
||||
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, DEBUG, mv_address
|
||||
from tinygrad.renderer.cstyle import AMDRenderer
|
||||
@@ -101,18 +101,18 @@ class AMDComputeQueue(HWComputeQueue):
|
||||
nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
|
||||
self._acquire_mem()
|
||||
|
||||
def _exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1)):
|
||||
def _exec(self, prg, args_state, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1)):
|
||||
self._acquire_mem(gli=0, gl2=0)
|
||||
|
||||
user_regs, cmd_idx = [], len(self) - 1
|
||||
if prg.enable_dispatch_ptr:
|
||||
dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=kernargs + prg.kernargs_segment_size)
|
||||
dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=args_state.ptr + prg.kernargs_segment_size)
|
||||
dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
|
||||
dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
|
||||
dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, kernargs
|
||||
dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
|
||||
user_regs += [*data64_le(dp_addr)]
|
||||
self.cmd_idx_to_dispatch_packet[cmd_idx] = dp
|
||||
user_regs += [*data64_le(kernargs)]
|
||||
user_regs += [*data64_le(args_state.ptr)]
|
||||
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8),
|
||||
*data64_le(0), *data64_le(prg.device.scratch.va_addr >> 8)]
|
||||
@@ -256,6 +256,19 @@ class AMDCopyQueue(HWCopyQueue):
|
||||
device.sdma_queue.write_ptr[0] = device.sdma_queue.put_value
|
||||
device.sdma_queue.doorbell[0] = device.sdma_queue.put_value
|
||||
|
||||
class AMDArgsState(HCQArgsState):
|
||||
def __init__(self, ptr:int, prg:AMDProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
|
||||
super().__init__(ptr, prg, bufs, vals=vals)
|
||||
|
||||
self.bufs = to_mv(self.ptr, len(bufs) * 8).cast('Q')
|
||||
self.vals = to_mv(self.ptr + len(bufs) * 8, len(vals) * 4).cast('I')
|
||||
|
||||
self.bufs[:] = array.array('Q', [b.va_addr for b in bufs])
|
||||
self.vals[:] = array.array('I', vals)
|
||||
|
||||
def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
|
||||
def update_var(self, index:int, val:int): self.vals[index] = val
|
||||
|
||||
class AMDProgram(HCQProgram):
|
||||
def __init__(self, device:AMDDevice, name:str, lib:bytes):
|
||||
# TODO; this API needs the type signature of the function and global_size/local_size
|
||||
@@ -288,16 +301,11 @@ class AMDProgram(HCQProgram):
|
||||
self.enable_dispatch_ptr = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
|
||||
additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0
|
||||
|
||||
super().__init__(self.device, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)
|
||||
super().__init__(AMDArgsState, self.device, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)
|
||||
|
||||
def __del__(self):
|
||||
if hasattr(self, 'lib_gpu'): cast(AMDDevice, self.device)._gpu_free(self.lib_gpu)
|
||||
|
||||
def _fill_kernargs(self, kernargs_ptr:int, bufs:Tuple[Any, ...], vals:Tuple[int, ...]=()):
|
||||
if (given:=len(bufs)*8 + len(vals)*4) != (want:=self.kernargs_segment_size): raise RuntimeError(f'incorrect args size {given=} != {want=}')
|
||||
if len(bufs): to_mv(kernargs_ptr, len(bufs) * 8).cast('Q')[:] = array.array('Q', [b.va_addr for b in bufs])
|
||||
if len(vals): to_mv(kernargs_ptr + len(bufs) * 8, len(vals) * 4).cast('I')[:] = array.array('I', vals)
|
||||
|
||||
class AMDAllocator(HCQAllocator):
|
||||
def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user