Mirror of https://github.com/tinygrad/tinygrad.git, synced 2026-02-17 01:51:40 -05:00.
@@ -3,7 +3,7 @@ import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, sys
|
||||
assert sys.platform != 'win32'
|
||||
from typing import Tuple, List, Any, cast, Union, Dict, Type, Optional
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
|
||||
from tinygrad.ops import sint
|
||||
from tinygrad.device import BufferSpec
|
||||
from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
|
||||
@@ -117,14 +117,9 @@ class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
|
||||
def _submit_to_gpfifo(self, dev:NVDevice, gpfifo:GPFifo):
  """Publish this queue's command words on the given GPFIFO ring of `dev`.

  If the queue was pre-bound to `dev`, the prebuilt hw page is submitted directly; otherwise the
  words are copied into the device's shared cmdq page at an address handed out by its bump
  allocator (which wraps within the page).
  """
  # NOTE(review): this span previously contained both the old manual cmdq_wptr bookkeeping and the
  # new cmdq_allocator path; only the allocator-based version is kept (dev.cmdq_allocator exists).
  if dev == self.binded_device: cmdq_addr = self.hw_page.va_addr
  else:
    cmdq_addr = dev.cmdq_allocator.alloc(len(self._q) * 4)
    # Convert the returned virtual address into a 32-bit-word offset into the CPU mapping.
    cmdq_wptr = (cmdq_addr - dev.cmdq_page.va_addr) // 4
    dev.cmdq[cmdq_wptr : cmdq_wptr + len(self._q)] = array.array('I', self._q)

  # GPFIFO entry encoding: address in the low field (addr//4 << 2), length in words at bit 42,
  # flag at bit 41 — NOTE(review): per NV GPFIFO entry format, confirm against open-gpu-doc.
  gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self._q) << 42) | (1 << 41)
  gpfifo.controls.GPPut = (gpfifo.put_value + 1) % gpfifo.entries_count
|
||||
@@ -292,8 +287,12 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
gpus_info: Union[List, ctypes.Array] = []
|
||||
signals_page: Any = None
|
||||
signals_pool: List[int] = []
|
||||
low_uvm_vaddr: int = 0x1000000000 # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
|
||||
uvm_vaddr: int = 0x2000000000 # 0x2000000000+
|
||||
|
||||
# TODO: Need a proper allocator for va addresses
|
||||
# 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
|
||||
# VA space is 48bits.
|
||||
low_uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=0x1000000000, start=0x1000000000, wrap=False)
|
||||
uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=(1 << 48) - 1, start=0x2000000000, wrap=False)
|
||||
host_object_enumerator: int = 0x1000
|
||||
|
||||
def _new_gpu_fd(self):
|
||||
@@ -374,11 +373,7 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False, tag="p2p mem")
|
||||
|
||||
def _alloc_gpu_vaddr(self, size, alignment=(4 << 10), force_low=False):
  """Reserve a GPU virtual-address range of `size` bytes, aligned to `alignment`.

  With force_low=True the range comes from the low allocator (0x1000000000-0x2000000000, reserved
  for system/cpu mappings); otherwise from the high allocator starting at 0x2000000000. Both are
  class-level BumpAllocators created with wrap=False, so address-space exhaustion raises instead of
  silently reusing addresses.
  """
  # NOTE(review): this span held both the old counter-based body (whose early return made the new
  # line dead) and the new allocator call; only the allocator-based version is kept.
  return NVDevice.low_uvm_vaddr_allocator.alloc(size, alignment) if force_low else NVDevice.uvm_vaddr_allocator.alloc(size, alignment)
|
||||
|
||||
def _setup_nvclasses(self):
|
||||
classlist = memoryview(bytearray(100 * 4)).cast('I')
|
||||
@@ -454,8 +449,8 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)
|
||||
|
||||
self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, cpu_access=True, tag="cmdq")
|
||||
self.cmdq_allocator = BumpAllocator(size=self.cmdq_page.size, start=self.cmdq_page.va_addr, wrap=True)
|
||||
self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
|
||||
self.cmdq_wptr: int = 0 # in bytes
|
||||
|
||||
self.num_gpcs, self.num_tpc_per_gpc, self.num_sm_per_tpc, self.max_warps_per_sm, self.sm_version = self._query_gpu_info('num_gpcs',
|
||||
'num_tpc_per_gpc', 'num_sm_per_tpc', 'max_warps_per_sm', 'sm_version')
|
||||
|
||||
@@ -4,7 +4,7 @@ assert sys.platform != 'win32'
|
||||
from types import SimpleNamespace
|
||||
from typing import Tuple, List, Any, cast, Optional
|
||||
from tinygrad.device import BufferSpec
|
||||
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState
|
||||
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
|
||||
from tinygrad.runtime.autogen import kgsl, adreno, libc
|
||||
from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
|
||||
from tinygrad.renderer.cstyle import QCOMRenderer
|
||||
@@ -86,7 +86,7 @@ class QCOMComputeQueue(HWQueue):
|
||||
return self
|
||||
|
||||
def _build_gpu_command(self, dev:QCOMDevice, hw_addr=None):
|
||||
to_mv((hw_page_addr:=hw_addr or dev._alloc_cmd_buf(len(self._q) * 4)), len(self._q) * 4).cast('I')[:] = array.array('I', self._q)
|
||||
to_mv((hw_page_addr:=hw_addr or dev.cmd_buf_allocator.alloc(len(self._q) * 4)), len(self._q) * 4).cast('I')[:] = array.array('I', self._q)
|
||||
obj = kgsl.struct_kgsl_command_object(gpuaddr=hw_page_addr, size=len(self._q) * 4, flags=kgsl.KGSL_CMDLIST_IB)
|
||||
submit_req = kgsl.struct_kgsl_gpu_command(cmdlist=ctypes.addressof(obj), numcmds=1, context_id=dev.ctx,
|
||||
cmdsize=ctypes.sizeof(kgsl.struct_kgsl_command_object))
|
||||
@@ -147,7 +147,7 @@ class QCOMComputeQueue(HWQueue):
|
||||
state_block=adreno.SB6_CS_TEX, num_unit=args_state.prg.samp_cnt),
|
||||
*data64_le(args_state.ptr + args_state.prg.samp_off))
|
||||
self.reg(adreno.REG_A6XX_SP_CS_TEX_SAMP, *data64_le(args_state.ptr + args_state.prg.samp_off))
|
||||
self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.dev._border_color_base()))
|
||||
self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.dev.border_color_buf.va_addr))
|
||||
|
||||
if args_state.prg.tex_cnt > 0:
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
|
||||
@@ -336,28 +336,31 @@ class QCOMDevice(HCQCompiled):
|
||||
QCOMDevice.dummy_addr = self._gpu_alloc(0x1000).va_addr
|
||||
QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True)
|
||||
QCOMDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, self.signals_page.size, 16)]
|
||||
info, self.ctx, self.cmd_buf, self.cmd_buf_ptr, self.last_cmd = self._info(), self._ctx_create(), self._gpu_alloc(16 << 20), 0,0
|
||||
|
||||
flags = kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC \
|
||||
| kgsl.KGSL_CONTEXT_PRIORITY(8) | kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN)
|
||||
self.ctx = kgsl.IOCTL_KGSL_DRAWCTXT_CREATE(self.fd, flags=flags).drawctxt_id
|
||||
|
||||
self.cmd_buf = self._gpu_alloc(16 << 20)
|
||||
self.cmd_buf_allocator = BumpAllocator(size=self.cmd_buf.size, start=self.cmd_buf.va_addr, wrap=True)
|
||||
|
||||
self.border_color_buf = self._gpu_alloc(0x1000, fill_zeroes=True)
|
||||
|
||||
self.last_cmd:int = 0
|
||||
|
||||
# Set max power
|
||||
struct.pack_into('IIQQ', pwr:=memoryview(bytearray(0x18)), 0, 1, self.ctx, mv_address(_:=memoryview(array.array('I', [1]))), 4)
|
||||
kgsl.IOCTL_KGSL_SETPROPERTY(self.fd, type=kgsl.KGSL_PROP_PWR_CONSTRAINT, value=mv_address(pwr), sizebytes=pwr.nbytes)
|
||||
|
||||
# Load info about qcom device
|
||||
info = kgsl.struct_kgsl_devinfo()
|
||||
kgsl.IOCTL_KGSL_DEVICE_GETPROPERTY(self.fd, type=kgsl.KGSL_PROP_DEVICE_INFO, value=ctypes.addressof(info), sizebytes=ctypes.sizeof(info))
|
||||
QCOMDevice.gpu_id = ((info.chip_id >> 24) & 0xFF) * 100 + ((info.chip_id >> 16) & 0xFF) * 10 + ((info.chip_id >> 8) & 0xFF)
|
||||
if QCOMDevice.gpu_id >= 700: raise RuntimeError(f"Unsupported GPU: {QCOMDevice.gpu_id}")
|
||||
|
||||
super().__init__(device, QCOMAllocator(self), QCOMRenderer(), QCOMCompiler(device), functools.partial(QCOMProgram, self),
|
||||
QCOMSignal, QCOMComputeQueue, None)
|
||||
|
||||
def _ctx_create(self):
  """Create a KGSL draw context on this device fd and return its drawctxt id.

  Requested flags: KGSL_CONTEXT_PREAMBLE, PWR_CONSTRAINT, NO_FAULT_TOLERANCE, NO_GMEM_ALLOC,
  PRIORITY(8) and the fine-grained preempt style (see the kgsl autogen constants).
  """
  cr = kgsl.IOCTL_KGSL_DRAWCTXT_CREATE(self.fd, flags=(kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT |
                 kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC | kgsl.KGSL_CONTEXT_PRIORITY(8) |
                 kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN)))

  # Set power to maximum.
  # NOTE(review): 'IIQQ' presumably packs a kgsl power-constraint struct (type=1, context id,
  # pointer to a u32 level, size=4) — confirm against the kgsl UAPI headers.
  struct.pack_into('IIQQ', pwr:=memoryview(bytearray(0x18)), 0, 1, cr.drawctxt_id, mv_address(_:=memoryview(array.array('I', [1]))), 4)
  kgsl.IOCTL_KGSL_SETPROPERTY(self.fd, type=kgsl.KGSL_PROP_PWR_CONSTRAINT, value=mv_address(pwr), sizebytes=pwr.nbytes)
  return cr.drawctxt_id
|
||||
|
||||
def _info(self):
  """Query and return this device's kgsl_devinfo struct via the GETPROPERTY ioctl."""
  info = kgsl.struct_kgsl_devinfo()
  # The ioctl fills `info` in place through its ctypes address.
  kgsl.IOCTL_KGSL_DEVICE_GETPROPERTY(self.fd, type=kgsl.KGSL_PROP_DEVICE_INFO, value=ctypes.addressof(info), sizebytes=ctypes.sizeof(info))
  return info
|
||||
|
||||
def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False):
|
||||
flags |= kgsl.KGSL_MEMALIGN(alignment_hint:=12) | kgsl.KGSL_MEMFLAGS_USE_CPU_MAP
|
||||
if uncached: flags |= kgsl.KGSL_CACHEMODE(kgsl.KGSL_CACHEMODE_UNCACHED)
|
||||
@@ -372,14 +375,6 @@ class QCOMDevice(HCQCompiled):
|
||||
kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.info.id)
|
||||
libc.munmap(mem.va_addr, mem.info.mmapsize)
|
||||
|
||||
def _alloc_cmd_buf(self, sz: int):
|
||||
self.cmd_buf_ptr = (cur_ptr:=self.cmd_buf_ptr if self.cmd_buf_ptr + sz < self.cmd_buf.size else 0) + sz
|
||||
return self.cmd_buf.va_addr + cur_ptr
|
||||
|
||||
def _border_color_base(self):
|
||||
if not hasattr(self, '_border_color_gpu'): self._border_color_gpu = self._gpu_alloc(0x1000, fill_zeroes=True)
|
||||
return self._border_color_gpu.va_addr
|
||||
|
||||
def _ensure_stack_size(self, sz):
|
||||
if not hasattr(self, '_stack'): self._stack = self._gpu_alloc(sz)
|
||||
elif self._stack.size < sz:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
from typing import List, Optional, Dict, Tuple, cast, Protocol, Type, Union, TypeVar, Generic, Any
|
||||
import contextlib, decimal, statistics, random, json, atexit, time, ctypes, array
|
||||
from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv, to_mv
|
||||
from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv, to_mv, round_up
|
||||
from tinygrad.renderer import Renderer
|
||||
from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator
|
||||
from tinygrad.ops import sym_infer, sint, Variable
|
||||
@@ -14,6 +14,15 @@ ProgramType = TypeVar('ProgramType', bound='HCQProgram')
|
||||
ArgsStateType = TypeVar('ArgsStateType', bound='HCQArgsState')
|
||||
QueueType = TypeVar('QueueType', bound='HWQueue')
|
||||
|
||||
class BumpAllocator:
  """Linear ("bump-pointer") allocator over a region of `size` bytes; never frees.

  Returned values are offsets biased by `start`. With wrap=True the pointer resets to 0 once the
  region is exhausted (callers must guarantee earlier allocations are done being used by then);
  with wrap=False exhaustion raises RuntimeError.
  """
  def __init__(self, size:int, start:int=0, wrap:bool=True):
    self.size, self.ptr, self.start_off, self.wrap = size, 0, start, wrap
  def alloc(self, size:int, alignment:int=1) -> int:
    """Reserve `size` bytes aligned to `alignment`; returns the start-biased address."""
    if round_up(self.ptr, alignment) + size > self.size:
      # A request larger than the whole region can never fit, even after wrapping to 0: fail
      # loudly instead of returning a range that overruns the region.
      if not self.wrap or size > self.size: raise RuntimeError("Out of memory")
      self.ptr = 0
    self.ptr = (res:=round_up(self.ptr, alignment)) + size
    return res + self.start_off
|
||||
|
||||
class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
|
||||
"""
|
||||
A base class for hardware command queues in the HCQ (Hardware Command Queue) API.
|
||||
@@ -257,7 +266,7 @@ class HCQProgram(Generic[DeviceType]):
|
||||
Returns:
|
||||
Arguments state with the given buffers and values set for the program.
|
||||
"""
|
||||
return self.args_state_t(kernargs_ptr or self.dev._alloc_kernargs(self.kernargs_alloc_size), self, bufs, vals=vals)
|
||||
return self.args_state_t(kernargs_ptr or self.dev.kernargs_alloctor.alloc(self.kernargs_alloc_size), self, bufs, vals=vals)
|
||||
|
||||
def __call__(self, *bufs:HCQBuffer, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1),
|
||||
vals:Tuple[int, ...]=(), wait:bool=False) -> Optional[float]:
|
||||
@@ -349,7 +358,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
|
||||
|
||||
self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferSpec(cpu_access=True))
|
||||
self.kernargs_ptr:int = self.kernargs_page.va_addr
|
||||
self.kernargs_alloctor = BumpAllocator(self.kernargs_page.size, start=self.kernargs_page.va_addr, wrap=True)
|
||||
self.devices.append(self)
|
||||
|
||||
def synchronize(self):
|
||||
@@ -363,14 +372,6 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
self.raw_prof_records += [(st.timestamp, en.timestamp, name, is_cp, None) for st, en, name, is_cp in self.sig_prof_records]
|
||||
self.sig_prof_records = []
|
||||
|
||||
def _alloc_kernargs(self, alloc_size:int) -> int:
|
||||
"""
|
||||
Allocates space for arguments passed to the kernel.
|
||||
"""
|
||||
if self.kernargs_ptr >= (self.kernargs_page.va_addr + self.kernargs_page.size - alloc_size): self.kernargs_ptr = self.kernargs_page.va_addr
|
||||
self.kernargs_ptr = (res:=self.kernargs_ptr) + alloc_size
|
||||
return res
|
||||
|
||||
def _ensure_shared_time_base(self):
|
||||
if not self.gpu2cpu_compute_time_diff.is_nan(): return
|
||||
|
||||
|
||||
Reference in New Issue
Block a user