hcq bump alloc (#8078)

* hcq bump alloc

* hm

* nv

* typo
Author: nimlgen
Date: 2024-12-06 19:19:04 +03:00
Committed by: GitHub
parent df84dc6444
commit d1282da7e8
3 changed files with 46 additions and 55 deletions

tinygrad/runtime/ops_nv.py

@@ -3,7 +3,7 @@ import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, sys
assert sys.platform != 'win32'
from typing import Tuple, List, Any, cast, Union, Dict, Type, Optional
from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
from tinygrad.ops import sint
from tinygrad.device import BufferSpec
from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
@@ -117,14 +117,9 @@ class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
def _submit_to_gpfifo(self, dev:NVDevice, gpfifo:GPFifo):
if dev == self.binded_device: cmdq_addr = self.hw_page.va_addr
else:
if dev.cmdq_wptr + len(self._q) * 4 > dev.cmdq_page.size:
assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.va_addr + len(self._q) * 4 or \
gpfifo.controls.GPGet == gpfifo.controls.GPPut, "cmdq overrun"
dev.cmdq_wptr = 0
dev.cmdq[dev.cmdq_wptr//4:dev.cmdq_wptr//4+len(self._q)] = array.array('I', self._q)
cmdq_addr = dev.cmdq_page.va_addr+dev.cmdq_wptr
dev.cmdq_wptr += len(self._q) * 4
cmdq_addr = dev.cmdq_allocator.alloc(len(self._q) * 4)
cmdq_wptr = (cmdq_addr - dev.cmdq_page.va_addr) // 4
dev.cmdq[cmdq_wptr : cmdq_wptr + len(self._q)] = array.array('I', self._q)
gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self._q) << 42) | (1 << 41)
gpfifo.controls.GPPut = (gpfifo.put_value + 1) % gpfifo.entries_count
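
The manual cmdq_wptr bookkeeping and the overrun assert are replaced by a wrapping BumpAllocator over the 2 MiB cmdq page, with the write offset derived from the returned address. A standalone sketch of that pattern, assuming a tinygrad checkout that includes this change (the page size matches the diff; the base address and command words are made up):

import array
from tinygrad.runtime.support.hcq import BumpAllocator

PAGE_SIZE, PAGE_VA = 0x200000, 0x70000000          # illustrative size/base, not real GPU addresses
cmdq = memoryview(bytearray(PAGE_SIZE)).cast('I')  # stands in for the mapped dev.cmdq page
cmdq_allocator = BumpAllocator(size=PAGE_SIZE, start=PAGE_VA, wrap=True)

q = [0xdeadbeef, 0xcafebabe]                       # pretend 32-bit command words
cmdq_addr = cmdq_allocator.alloc(len(q) * 4)       # wraps to the page start when the page fills up
cmdq_wptr = (cmdq_addr - PAGE_VA) // 4             # word offset inside the page, as in the diff
cmdq[cmdq_wptr : cmdq_wptr + len(q)] = array.array('I', q)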
@@ -292,8 +287,12 @@ class NVDevice(HCQCompiled[NVSignal]):
gpus_info: Union[List, ctypes.Array] = []
signals_page: Any = None
signals_pool: List[int] = []
low_uvm_vaddr: int = 0x1000000000 # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
uvm_vaddr: int = 0x2000000000 # 0x2000000000+
# TODO: Need a proper allocator for va addresses
# 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
# VA space is 48bits.
low_uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=0x1000000000, start=0x1000000000, wrap=False)
uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=(1 << 48) - 1, start=0x2000000000, wrap=False)
host_object_enumerator: int = 0x1000
def _new_gpu_fd(self):
@@ -374,11 +373,7 @@ class NVDevice(HCQCompiled[NVSignal]):
self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False, tag="p2p mem")
def _alloc_gpu_vaddr(self, size, alignment=(4 << 10), force_low=False):
if force_low:
NVDevice.low_uvm_vaddr = (res_va:=round_up(NVDevice.low_uvm_vaddr, alignment)) + size
assert NVDevice.low_uvm_vaddr < 0x2000000000, "Exceed low vm addresses"
else: NVDevice.uvm_vaddr = (res_va:=round_up(NVDevice.uvm_vaddr, alignment)) + size
return res_va
return NVDevice.low_uvm_vaddr_allocator.alloc(size, alignment) if force_low else NVDevice.uvm_vaddr_allocator.alloc(size, alignment)
def _setup_nvclasses(self):
classlist = memoryview(bytearray(100 * 4)).cast('I')
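
The two class-level counters low_uvm_vaddr/uvm_vaddr become non-wrapping BumpAllocators, so exhausting either virtual-address window now raises RuntimeError instead of silently handing out overlapping addresses. A minimal standalone sketch of the selection logic (the window sizes and bases mirror the class attributes above; everything else is illustrative):

from tinygrad.runtime.support.hcq import BumpAllocator

# low window reserved for system/cpu mappings, high window for everything else
low_uvm_vaddr_allocator = BumpAllocator(size=0x1000000000, start=0x1000000000, wrap=False)
uvm_vaddr_allocator = BumpAllocator(size=(1 << 48) - 1, start=0x2000000000, wrap=False)

def alloc_gpu_vaddr(size:int, alignment:int=(4 << 10), force_low:bool=False) -> int:
  # wrap=False: running out of a window raises RuntimeError("Out of memory") instead of reusing addresses
  return low_uvm_vaddr_allocator.alloc(size, alignment) if force_low else uvm_vaddr_allocator.alloc(size, alignment)

va = alloc_gpu_vaddr(2 << 20)                      # first allocation lands at the window base
assert va == 0x2000000000 and alloc_gpu_vaddr(0x1000, force_low=True) == 0x1000000000

Alignment is applied to the offset inside the window; since both window bases are themselves well aligned, the returned addresses keep the requested alignment.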
@@ -454,8 +449,8 @@ class NVDevice(HCQCompiled[NVSignal]):
rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)
self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, cpu_access=True, tag="cmdq")
self.cmdq_allocator = BumpAllocator(size=self.cmdq_page.size, start=self.cmdq_page.va_addr, wrap=True)
self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
self.cmdq_wptr: int = 0 # in bytes
self.num_gpcs, self.num_tpc_per_gpc, self.num_sm_per_tpc, self.max_warps_per_sm, self.sm_version = self._query_gpu_info('num_gpcs',
'num_tpc_per_gpc', 'num_sm_per_tpc', 'max_warps_per_sm', 'sm_version')

tinygrad/runtime/ops_qcom.py

@@ -4,7 +4,7 @@ assert sys.platform != 'win32'
from types import SimpleNamespace
from typing import Tuple, List, Any, cast, Optional
from tinygrad.device import BufferSpec
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
from tinygrad.runtime.autogen import kgsl, adreno, libc
from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
from tinygrad.renderer.cstyle import QCOMRenderer
@@ -86,7 +86,7 @@ class QCOMComputeQueue(HWQueue):
return self
def _build_gpu_command(self, dev:QCOMDevice, hw_addr=None):
to_mv((hw_page_addr:=hw_addr or dev._alloc_cmd_buf(len(self._q) * 4)), len(self._q) * 4).cast('I')[:] = array.array('I', self._q)
to_mv((hw_page_addr:=hw_addr or dev.cmd_buf_allocator.alloc(len(self._q) * 4)), len(self._q) * 4).cast('I')[:] = array.array('I', self._q)
obj = kgsl.struct_kgsl_command_object(gpuaddr=hw_page_addr, size=len(self._q) * 4, flags=kgsl.KGSL_CMDLIST_IB)
submit_req = kgsl.struct_kgsl_gpu_command(cmdlist=ctypes.addressof(obj), numcmds=1, context_id=dev.ctx,
cmdsize=ctypes.sizeof(kgsl.struct_kgsl_command_object))
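
On the QCOM side the same wrapping allocator replaces the per-device cmd_buf_ptr bookkeeping (the _alloc_cmd_buf helper removed further down), and the returned address is written through to_mv. A host-only sketch of that write path, with a plain bytearray standing in for the GPU-visible cmd_buf so to_mv points at real memory (sizes and packet words are made up):

import array
from tinygrad.helpers import to_mv, mv_address
from tinygrad.runtime.support.hcq import BumpAllocator

backing = memoryview(bytearray(0x10000))           # stand-in for the 16 MiB cmd_buf allocation
cmd_buf_allocator = BumpAllocator(size=len(backing), start=mv_address(backing), wrap=True)

q = [0x1, 0x2, 0x3, 0x4]                           # illustrative packet words
hw_page_addr = cmd_buf_allocator.alloc(len(q) * 4)
to_mv(hw_page_addr, len(q) * 4).cast('I')[:] = array.array('I', q)
assert backing.cast('I')[0] == 0x1                 # the first word landed at the start of the buffer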
@@ -147,7 +147,7 @@ class QCOMComputeQueue(HWQueue):
state_block=adreno.SB6_CS_TEX, num_unit=args_state.prg.samp_cnt),
*data64_le(args_state.ptr + args_state.prg.samp_off))
self.reg(adreno.REG_A6XX_SP_CS_TEX_SAMP, *data64_le(args_state.ptr + args_state.prg.samp_off))
self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.dev._border_color_base()))
self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.dev.border_color_buf.va_addr))
if args_state.prg.tex_cnt > 0:
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
@@ -336,28 +336,31 @@ class QCOMDevice(HCQCompiled):
QCOMDevice.dummy_addr = self._gpu_alloc(0x1000).va_addr
QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True)
QCOMDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, self.signals_page.size, 16)]
info, self.ctx, self.cmd_buf, self.cmd_buf_ptr, self.last_cmd = self._info(), self._ctx_create(), self._gpu_alloc(16 << 20), 0,0
flags = kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC \
| kgsl.KGSL_CONTEXT_PRIORITY(8) | kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN)
self.ctx = kgsl.IOCTL_KGSL_DRAWCTXT_CREATE(self.fd, flags=flags).drawctxt_id
self.cmd_buf = self._gpu_alloc(16 << 20)
self.cmd_buf_allocator = BumpAllocator(size=self.cmd_buf.size, start=self.cmd_buf.va_addr, wrap=True)
self.border_color_buf = self._gpu_alloc(0x1000, fill_zeroes=True)
self.last_cmd:int = 0
# Set max power
struct.pack_into('IIQQ', pwr:=memoryview(bytearray(0x18)), 0, 1, self.ctx, mv_address(_:=memoryview(array.array('I', [1]))), 4)
kgsl.IOCTL_KGSL_SETPROPERTY(self.fd, type=kgsl.KGSL_PROP_PWR_CONSTRAINT, value=mv_address(pwr), sizebytes=pwr.nbytes)
# Load info about qcom device
info = kgsl.struct_kgsl_devinfo()
kgsl.IOCTL_KGSL_DEVICE_GETPROPERTY(self.fd, type=kgsl.KGSL_PROP_DEVICE_INFO, value=ctypes.addressof(info), sizebytes=ctypes.sizeof(info))
QCOMDevice.gpu_id = ((info.chip_id >> 24) & 0xFF) * 100 + ((info.chip_id >> 16) & 0xFF) * 10 + ((info.chip_id >> 8) & 0xFF)
if QCOMDevice.gpu_id >= 700: raise RuntimeError(f"Unsupported GPU: {QCOMDevice.gpu_id}")
super().__init__(device, QCOMAllocator(self), QCOMRenderer(), QCOMCompiler(device), functools.partial(QCOMProgram, self),
QCOMSignal, QCOMComputeQueue, None)
def _ctx_create(self):
cr = kgsl.IOCTL_KGSL_DRAWCTXT_CREATE(self.fd, flags=(kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT |
kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC | kgsl.KGSL_CONTEXT_PRIORITY(8) |
kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN)))
# Set power to maximum.
struct.pack_into('IIQQ', pwr:=memoryview(bytearray(0x18)), 0, 1, cr.drawctxt_id, mv_address(_:=memoryview(array.array('I', [1]))), 4)
kgsl.IOCTL_KGSL_SETPROPERTY(self.fd, type=kgsl.KGSL_PROP_PWR_CONSTRAINT, value=mv_address(pwr), sizebytes=pwr.nbytes)
return cr.drawctxt_id
def _info(self):
info = kgsl.struct_kgsl_devinfo()
kgsl.IOCTL_KGSL_DEVICE_GETPROPERTY(self.fd, type=kgsl.KGSL_PROP_DEVICE_INFO, value=ctypes.addressof(info), sizebytes=ctypes.sizeof(info))
return info
def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False):
flags |= kgsl.KGSL_MEMALIGN(alignment_hint:=12) | kgsl.KGSL_MEMFLAGS_USE_CPU_MAP
if uncached: flags |= kgsl.KGSL_CACHEMODE(kgsl.KGSL_CACHEMODE_UNCACHED)
@@ -372,14 +375,6 @@ class QCOMDevice(HCQCompiled):
kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.info.id)
libc.munmap(mem.va_addr, mem.info.mmapsize)
def _alloc_cmd_buf(self, sz: int):
self.cmd_buf_ptr = (cur_ptr:=self.cmd_buf_ptr if self.cmd_buf_ptr + sz < self.cmd_buf.size else 0) + sz
return self.cmd_buf.va_addr + cur_ptr
def _border_color_base(self):
if not hasattr(self, '_border_color_gpu'): self._border_color_gpu = self._gpu_alloc(0x1000, fill_zeroes=True)
return self._border_color_gpu.va_addr
def _ensure_stack_size(self, sz):
if not hasattr(self, '_stack'): self._stack = self._gpu_alloc(sz)
elif self._stack.size < sz:
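
The removed _alloc_cmd_buf above is the hand-rolled version of the same idea: keep a byte offset and restart from zero when a request would run past the end of the buffer. A standalone equivalence sketch with made-up sizes; the only behavioural difference is on an exact fit, where the old strict-less-than check resets the pointer while BumpAllocator accepts the allocation:

from tinygrad.runtime.support.hcq import BumpAllocator

buf_size, buf_va = 64, 0x1000                      # illustrative buffer size and base address

ptr = 0
def old_alloc_cmd_buf(sz:int) -> int:              # the logic removed in this diff
  global ptr
  ptr = (cur := ptr if ptr + sz < buf_size else 0) + sz
  return buf_va + cur

new_alloc = BumpAllocator(size=buf_size, start=buf_va, wrap=True).alloc
for sz in (24, 24, 24):                            # the third request wraps back to the start in both versions
  assert old_alloc_cmd_buf(sz) == new_alloc(sz)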

tinygrad/runtime/support/hcq.py

@@ -1,7 +1,7 @@
from __future__ import annotations
from typing import List, Optional, Dict, Tuple, cast, Protocol, Type, Union, TypeVar, Generic, Any
import contextlib, decimal, statistics, random, json, atexit, time, ctypes, array
from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv, to_mv
from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv, to_mv, round_up
from tinygrad.renderer import Renderer
from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator
from tinygrad.ops import sym_infer, sint, Variable
@@ -14,6 +14,15 @@ ProgramType = TypeVar('ProgramType', bound='HCQProgram')
ArgsStateType = TypeVar('ArgsStateType', bound='HCQArgsState')
QueueType = TypeVar('QueueType', bound='HWQueue')
class BumpAllocator:
def __init__(self, size:int, start:int=0, wrap:bool=True): self.size, self.ptr, self.start_off, self.wrap = size, 0, start, wrap
def alloc(self, size:int, alignment:int=1) -> int:
if round_up(self.ptr, alignment) + size > self.size:
if not self.wrap: raise RuntimeError("Out of memory")
self.ptr = 0
self.ptr = (res:=round_up(self.ptr, alignment)) + size
return res + self.start_off
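
A short standalone behaviour sketch for the class above (base addresses and sizes are made up): ptr is an offset from zero, start only shifts the returned addresses, alignment rounds the offset up, and wrap chooses between cycling back to zero and raising:

from tinygrad.runtime.support.hcq import BumpAllocator

a = BumpAllocator(size=0x100, start=0x8000, wrap=True)
assert a.alloc(0x10) == 0x8000                     # first allocation lands at start
assert a.alloc(0x10, alignment=0x40) == 0x8040     # offset 0x10 rounded up to 0x40
assert a.alloc(0xC0) == 0x8000                     # 0x50 + 0xC0 > 0x100, so it wraps to the start

b = BumpAllocator(size=0x20, start=0, wrap=False)
b.alloc(0x18)
try: b.alloc(0x10)                                 # 0x18 + 0x10 > 0x20 with wrap=False
except RuntimeError: pass                          # raises "Out of memory"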
class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
"""
A base class for hardware command queues in the HCQ (Hardware Command Queue) API.
@@ -257,7 +266,7 @@ class HCQProgram(Generic[DeviceType]):
Returns:
Arguments state with the given buffers and values set for the program.
"""
return self.args_state_t(kernargs_ptr or self.dev._alloc_kernargs(self.kernargs_alloc_size), self, bufs, vals=vals)
return self.args_state_t(kernargs_ptr or self.dev.kernargs_alloctor.alloc(self.kernargs_alloc_size), self, bufs, vals=vals)
def __call__(self, *bufs:HCQBuffer, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1),
vals:Tuple[int, ...]=(), wait:bool=False) -> Optional[float]:
@@ -349,7 +358,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferSpec(cpu_access=True))
self.kernargs_ptr:int = self.kernargs_page.va_addr
self.kernargs_alloctor = BumpAllocator(self.kernargs_page.size, start=self.kernargs_page.va_addr, wrap=True)
self.devices.append(self)
def synchronize(self):
@@ -363,14 +372,6 @@ class HCQCompiled(Compiled, Generic[SignalType]):
self.raw_prof_records += [(st.timestamp, en.timestamp, name, is_cp, None) for st, en, name, is_cp in self.sig_prof_records]
self.sig_prof_records = []
def _alloc_kernargs(self, alloc_size:int) -> int:
"""
Allocates space for arguments passed to the kernel.
"""
if self.kernargs_ptr >= (self.kernargs_page.va_addr + self.kernargs_page.size - alloc_size): self.kernargs_ptr = self.kernargs_page.va_addr
self.kernargs_ptr = (res:=self.kernargs_ptr) + alloc_size
return res
def _ensure_shared_time_base(self):
if not self.gpu2cpu_compute_time_diff.is_nan(): return
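
As with the command queues, the per-device kernargs_ptr bookkeeping is replaced by a wrapping allocator over the 16 MiB kernargs page, so fill_kernargs now asks kernargs_alloctor for space; both the old and the new version wrap around the same page. A standalone sketch of that allocation (the page address is made up, the page size matches the diff):

from tinygrad.runtime.support.hcq import BumpAllocator

kernargs_page_va, kernargs_page_size = 0x30000000, 16 << 20   # made-up address, size from the diff
kernargs_alloctor = BumpAllocator(kernargs_page_size, start=kernargs_page_va, wrap=True)

kernargs_alloc_size = 0x160                        # illustrative per-launch argument size
ptr = kernargs_alloctor.alloc(kernargs_alloc_size) # what fill_kernargs now does instead of dev._alloc_kernargs(...)
assert kernargs_page_va <= ptr < kernargs_page_va + kernargs_page_size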