From d1282da7e8cc1e8c2d39bad7541e2159ce1d0945 Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Fri, 6 Dec 2024 19:19:04 +0300 Subject: [PATCH] hcq bump alloc (#8078) * hcq bump alloc * hm * nv * typo --- tinygrad/runtime/ops_nv.py | 29 ++++++++----------- tinygrad/runtime/ops_qcom.py | 49 +++++++++++++++------------------ tinygrad/runtime/support/hcq.py | 23 ++++++++-------- 3 files changed, 46 insertions(+), 55 deletions(-) diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index 946d263148..4836aece38 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -3,7 +3,7 @@ import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, sys assert sys.platform != 'win32' from typing import Tuple, List, Any, cast, Union, Dict, Type, Optional from dataclasses import dataclass -from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal +from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator from tinygrad.ops import sint from tinygrad.device import BufferSpec from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod @@ -117,14 +117,9 @@ class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']): def _submit_to_gpfifo(self, dev:NVDevice, gpfifo:GPFifo): if dev == self.binded_device: cmdq_addr = self.hw_page.va_addr else: - if dev.cmdq_wptr + len(self._q) * 4 > dev.cmdq_page.size: - assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.va_addr + len(self._q) * 4 or \ - gpfifo.controls.GPGet == gpfifo.controls.GPPut, "cmdq overrun" - dev.cmdq_wptr = 0 - - dev.cmdq[dev.cmdq_wptr//4:dev.cmdq_wptr//4+len(self._q)] = array.array('I', self._q) - cmdq_addr = dev.cmdq_page.va_addr+dev.cmdq_wptr - dev.cmdq_wptr += len(self._q) * 4 + cmdq_addr = dev.cmdq_allocator.alloc(len(self._q) * 4) + cmdq_wptr = (cmdq_addr - dev.cmdq_page.va_addr) // 4 + dev.cmdq[cmdq_wptr : cmdq_wptr + len(self._q)] = array.array('I', self._q) gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self._q) << 42) | (1 << 41) gpfifo.controls.GPPut = (gpfifo.put_value + 1) % gpfifo.entries_count @@ -292,8 +287,12 @@ class NVDevice(HCQCompiled[NVSignal]): gpus_info: Union[List, ctypes.Array] = [] signals_page: Any = None signals_pool: List[int] = [] - low_uvm_vaddr: int = 0x1000000000 # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings - uvm_vaddr: int = 0x2000000000 # 0x2000000000+ + + # TODO: Need a proper allocator for va addresses + # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings + # VA space is 48bits. + low_uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=0x1000000000, start=0x1000000000, wrap=False) + uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=(1 << 48) - 1, start=0x2000000000, wrap=False) host_object_enumerator: int = 0x1000 def _new_gpu_fd(self): @@ -374,11 +373,7 @@ class NVDevice(HCQCompiled[NVSignal]): self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False, tag="p2p mem") def _alloc_gpu_vaddr(self, size, alignment=(4 << 10), force_low=False): - if force_low: - NVDevice.low_uvm_vaddr = (res_va:=round_up(NVDevice.low_uvm_vaddr, alignment)) + size - assert NVDevice.low_uvm_vaddr < 0x2000000000, "Exceed low vm addresses" - else: NVDevice.uvm_vaddr = (res_va:=round_up(NVDevice.uvm_vaddr, alignment)) + size - return res_va + return NVDevice.low_uvm_vaddr_allocator.alloc(size, alignment) if force_low else NVDevice.uvm_vaddr_allocator.alloc(size, alignment) def _setup_nvclasses(self): classlist = memoryview(bytearray(100 * 4)).cast('I') @@ -454,8 +449,8 @@ class NVDevice(HCQCompiled[NVSignal]): rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1) self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, cpu_access=True, tag="cmdq") + self.cmdq_allocator = BumpAllocator(size=self.cmdq_page.size, start=self.cmdq_page.va_addr, wrap=True) self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I") - self.cmdq_wptr: int = 0 # in bytes self.num_gpcs, self.num_tpc_per_gpc, self.num_sm_per_tpc, self.max_warps_per_sm, self.sm_version = self._query_gpu_info('num_gpcs', 'num_tpc_per_gpc', 'num_sm_per_tpc', 'max_warps_per_sm', 'sm_version') diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index ef9b2357c1..68d7a257d1 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -4,7 +4,7 @@ assert sys.platform != 'win32' from types import SimpleNamespace from typing import Tuple, List, Any, cast, Optional from tinygrad.device import BufferSpec -from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState +from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator from tinygrad.runtime.autogen import kgsl, adreno, libc from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice from tinygrad.renderer.cstyle import QCOMRenderer @@ -86,7 +86,7 @@ class QCOMComputeQueue(HWQueue): return self def _build_gpu_command(self, dev:QCOMDevice, hw_addr=None): - to_mv((hw_page_addr:=hw_addr or dev._alloc_cmd_buf(len(self._q) * 4)), len(self._q) * 4).cast('I')[:] = array.array('I', self._q) + to_mv((hw_page_addr:=hw_addr or dev.cmd_buf_allocator.alloc(len(self._q) * 4)), len(self._q) * 4).cast('I')[:] = array.array('I', self._q) obj = kgsl.struct_kgsl_command_object(gpuaddr=hw_page_addr, size=len(self._q) * 4, flags=kgsl.KGSL_CMDLIST_IB) submit_req = kgsl.struct_kgsl_gpu_command(cmdlist=ctypes.addressof(obj), numcmds=1, context_id=dev.ctx, cmdsize=ctypes.sizeof(kgsl.struct_kgsl_command_object)) @@ -147,7 +147,7 @@ class QCOMComputeQueue(HWQueue): state_block=adreno.SB6_CS_TEX, num_unit=args_state.prg.samp_cnt), *data64_le(args_state.ptr + args_state.prg.samp_off)) self.reg(adreno.REG_A6XX_SP_CS_TEX_SAMP, *data64_le(args_state.ptr + args_state.prg.samp_off)) - self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.dev._border_color_base())) + self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.dev.border_color_buf.va_addr)) if args_state.prg.tex_cnt > 0: self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT, @@ -336,28 +336,31 @@ class QCOMDevice(HCQCompiled): QCOMDevice.dummy_addr = self._gpu_alloc(0x1000).va_addr QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True) QCOMDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, self.signals_page.size, 16)] - info, self.ctx, self.cmd_buf, self.cmd_buf_ptr, self.last_cmd = self._info(), self._ctx_create(), self._gpu_alloc(16 << 20), 0,0 + + flags = kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC \ + | kgsl.KGSL_CONTEXT_PRIORITY(8) | kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN) + self.ctx = kgsl.IOCTL_KGSL_DRAWCTXT_CREATE(self.fd, flags=flags).drawctxt_id + + self.cmd_buf = self._gpu_alloc(16 << 20) + self.cmd_buf_allocator = BumpAllocator(size=self.cmd_buf.size, start=self.cmd_buf.va_addr, wrap=True) + + self.border_color_buf = self._gpu_alloc(0x1000, fill_zeroes=True) + + self.last_cmd:int = 0 + + # Set max power + struct.pack_into('IIQQ', pwr:=memoryview(bytearray(0x18)), 0, 1, self.ctx, mv_address(_:=memoryview(array.array('I', [1]))), 4) + kgsl.IOCTL_KGSL_SETPROPERTY(self.fd, type=kgsl.KGSL_PROP_PWR_CONSTRAINT, value=mv_address(pwr), sizebytes=pwr.nbytes) + + # Load info about qcom device + info = kgsl.struct_kgsl_devinfo() + kgsl.IOCTL_KGSL_DEVICE_GETPROPERTY(self.fd, type=kgsl.KGSL_PROP_DEVICE_INFO, value=ctypes.addressof(info), sizebytes=ctypes.sizeof(info)) QCOMDevice.gpu_id = ((info.chip_id >> 24) & 0xFF) * 100 + ((info.chip_id >> 16) & 0xFF) * 10 + ((info.chip_id >> 8) & 0xFF) if QCOMDevice.gpu_id >= 700: raise RuntimeError(f"Unsupported GPU: {QCOMDevice.gpu_id}") super().__init__(device, QCOMAllocator(self), QCOMRenderer(), QCOMCompiler(device), functools.partial(QCOMProgram, self), QCOMSignal, QCOMComputeQueue, None) - def _ctx_create(self): - cr = kgsl.IOCTL_KGSL_DRAWCTXT_CREATE(self.fd, flags=(kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | - kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC | kgsl.KGSL_CONTEXT_PRIORITY(8) | - kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN))) - - # Set power to maximum. - struct.pack_into('IIQQ', pwr:=memoryview(bytearray(0x18)), 0, 1, cr.drawctxt_id, mv_address(_:=memoryview(array.array('I', [1]))), 4) - kgsl.IOCTL_KGSL_SETPROPERTY(self.fd, type=kgsl.KGSL_PROP_PWR_CONSTRAINT, value=mv_address(pwr), sizebytes=pwr.nbytes) - return cr.drawctxt_id - - def _info(self): - info = kgsl.struct_kgsl_devinfo() - kgsl.IOCTL_KGSL_DEVICE_GETPROPERTY(self.fd, type=kgsl.KGSL_PROP_DEVICE_INFO, value=ctypes.addressof(info), sizebytes=ctypes.sizeof(info)) - return info - def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False): flags |= kgsl.KGSL_MEMALIGN(alignment_hint:=12) | kgsl.KGSL_MEMFLAGS_USE_CPU_MAP if uncached: flags |= kgsl.KGSL_CACHEMODE(kgsl.KGSL_CACHEMODE_UNCACHED) @@ -372,14 +375,6 @@ class QCOMDevice(HCQCompiled): kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.info.id) libc.munmap(mem.va_addr, mem.info.mmapsize) - def _alloc_cmd_buf(self, sz: int): - self.cmd_buf_ptr = (cur_ptr:=self.cmd_buf_ptr if self.cmd_buf_ptr + sz < self.cmd_buf.size else 0) + sz - return self.cmd_buf.va_addr + cur_ptr - - def _border_color_base(self): - if not hasattr(self, '_border_color_gpu'): self._border_color_gpu = self._gpu_alloc(0x1000, fill_zeroes=True) - return self._border_color_gpu.va_addr - def _ensure_stack_size(self, sz): if not hasattr(self, '_stack'): self._stack = self._gpu_alloc(sz) elif self._stack.size < sz: diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index b1bb59118d..c35fc22315 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -1,7 +1,7 @@ from __future__ import annotations from typing import List, Optional, Dict, Tuple, cast, Protocol, Type, Union, TypeVar, Generic, Any import contextlib, decimal, statistics, random, json, atexit, time, ctypes, array -from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv, to_mv +from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv, to_mv, round_up from tinygrad.renderer import Renderer from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator from tinygrad.ops import sym_infer, sint, Variable @@ -14,6 +14,15 @@ ProgramType = TypeVar('ProgramType', bound='HCQProgram') ArgsStateType = TypeVar('ArgsStateType', bound='HCQArgsState') QueueType = TypeVar('QueueType', bound='HWQueue') +class BumpAllocator: + def __init__(self, size:int, start:int=0, wrap:bool=True): self.size, self.ptr, self.start_off, self.wrap = size, 0, start, wrap + def alloc(self, size:int, alignment:int=1) -> int: + if round_up(self.ptr, alignment) + size > self.size: + if not self.wrap: raise RuntimeError("Out of memory") + self.ptr = 0 + self.ptr = (res:=round_up(self.ptr, alignment)) + size + return res + self.start_off + class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]): """ A base class for hardware command queues in the HCQ (Hardware Command Queue) API. @@ -257,7 +266,7 @@ class HCQProgram(Generic[DeviceType]): Returns: Arguments state with the given buffers and values set for the program. """ - return self.args_state_t(kernargs_ptr or self.dev._alloc_kernargs(self.kernargs_alloc_size), self, bufs, vals=vals) + return self.args_state_t(kernargs_ptr or self.dev.kernargs_alloctor.alloc(self.kernargs_alloc_size), self, bufs, vals=vals) def __call__(self, *bufs:HCQBuffer, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait:bool=False) -> Optional[float]: @@ -349,7 +358,7 @@ class HCQCompiled(Compiled, Generic[SignalType]): super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph) self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferSpec(cpu_access=True)) - self.kernargs_ptr:int = self.kernargs_page.va_addr + self.kernargs_alloctor = BumpAllocator(self.kernargs_page.size, start=self.kernargs_page.va_addr, wrap=True) self.devices.append(self) def synchronize(self): @@ -363,14 +372,6 @@ class HCQCompiled(Compiled, Generic[SignalType]): self.raw_prof_records += [(st.timestamp, en.timestamp, name, is_cp, None) for st, en, name, is_cp in self.sig_prof_records] self.sig_prof_records = [] - def _alloc_kernargs(self, alloc_size:int) -> int: - """ - Allocates space for arguments passed to the kernel. - """ - if self.kernargs_ptr >= (self.kernargs_page.va_addr + self.kernargs_page.size - alloc_size): self.kernargs_ptr = self.kernargs_page.va_addr - self.kernargs_ptr = (res:=self.kernargs_ptr) + alloc_size - return res - def _ensure_shared_time_base(self): if not self.gpu2cpu_compute_time_diff.is_nan(): return