mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
replace Tuple with tuple [pr] (#8344)
* replace Tuple with tuple [pr] * replace List with list [pr] * replace Dict with dict [pr] * replace Set with set [pr]
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
import ctypes
|
||||
from typing import Any, Optional, Tuple, Dict, List, cast
|
||||
from typing import Any, Optional, cast
|
||||
import tinygrad.runtime.autogen.cuda as cuda
|
||||
from tinygrad.helpers import init_c_var, dedup
|
||||
from tinygrad.device import Buffer, Device
|
||||
@@ -9,14 +9,14 @@ from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
|
||||
from tinygrad.engine.jit import MultiGraphRunner, GraphException
|
||||
|
||||
class CUDAGraph(MultiGraphRunner):
|
||||
def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]):
|
||||
def __init__(self, jit_cache: list[ExecItem], input_rawbuffers: list[Buffer], var_vals: dict[Variable, int]):
|
||||
super().__init__(jit_cache, input_rawbuffers, var_vals)
|
||||
|
||||
# Check all jit items are compatible.
|
||||
if not all(isinstance(ji.prg, (CompiledRunner, BufferXfer)) for ji in jit_cache): raise GraphException
|
||||
|
||||
self.jc_idx_with_updatable_rawbufs = dedup([x[0] for x in self.input_replace.keys()])
|
||||
self.updatable_nodes: Dict[int, Tuple[Any, Any, Any, bool]] = {} # Dict[jc index] = tuple(graph node, node params, input kernel params, is memcpy)
|
||||
self.updatable_nodes: dict[int, tuple[Any, Any, Any, bool]] = {} # dict[jc index] = tuple(graph node, node params, input kernel params, is memcpy)
|
||||
|
||||
self.graph = init_c_var(cuda.CUgraph(), lambda x: check(cuda.cuGraphCreate(ctypes.byref(x), 0)))
|
||||
|
||||
@@ -48,7 +48,7 @@ class CUDAGraph(MultiGraphRunner):
|
||||
|
||||
self.instance = init_c_var(cuda.CUgraphExec(), lambda x: check(cuda.cuGraphInstantiate_v2(ctypes.byref(x), self.graph, None, None, 0)))
|
||||
|
||||
def __call__(self, input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int], wait=False) -> Optional[float]:
|
||||
def __call__(self, input_rawbuffers: list[Buffer], var_vals: dict[Variable, int], wait=False) -> Optional[float]:
|
||||
# Update rawbuffers in the c_args struct.
|
||||
for (j,i),input_idx in self.input_replace.items():
|
||||
if not self.updatable_nodes[j][3]: setattr(self.updatable_nodes[j][2], f'f{i}', input_rawbuffers[input_idx]._buf)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import collections, time
|
||||
from typing import List, Any, Dict, cast, Optional, Tuple, Set
|
||||
from typing import List, Any, cast, Optional
|
||||
from tinygrad.helpers import round_up, PROFILE
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQSignal, HCQBuffer, HWQueue, HCQArgsState, BumpAllocator
|
||||
from tinygrad.device import Buffer, BufferSpec, Compiled, Device, ProfileGraphEntry, ProfileGraphEvent
|
||||
@@ -9,29 +9,29 @@ from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
|
||||
from tinygrad.engine.jit import MultiGraphRunner
|
||||
|
||||
class HCQGraph(MultiGraphRunner):
|
||||
def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]):
|
||||
def __init__(self, jit_cache: list[ExecItem], input_rawbuffers: list[Buffer], var_vals: dict[Variable, int]):
|
||||
super().__init__(jit_cache, input_rawbuffers, var_vals)
|
||||
self.devices = list(set(cast(HCQCompiled, d) for ji in jit_cache for d in [Device[cast(Buffer, x).device] for x in ji.bufs]))
|
||||
|
||||
# Replace input buffers with variables.
|
||||
self.hcq_bufs = [[cast(Buffer, x)._buf for x in ji.bufs] for ji in jit_cache]
|
||||
self.input_replace_to_var: Dict[Tuple[int, int], Variable] = {}
|
||||
self.input_replace_to_var: dict[tuple[int, int], Variable] = {}
|
||||
|
||||
for (j,i), input_idx in self.input_replace.items():
|
||||
x = self.input_replace_to_var.setdefault((j,i), UOp.variable(f"input_{input_idx}", 0, 0xffffffffffffffff, dtype=dtypes.uint64))
|
||||
self.hcq_bufs[j][i] = HCQBuffer(x, self.hcq_bufs[j][i].size, texture_info=self.hcq_bufs[j][i].texture_info) # Create fake buffer with variable
|
||||
|
||||
# Allocate kernel args.
|
||||
kernargs_size: Dict[Compiled, int] = collections.defaultdict(int)
|
||||
kernargs_size: dict[Compiled, int] = collections.defaultdict(int)
|
||||
for ji in jit_cache:
|
||||
if not isinstance(ji.prg, CompiledRunner): continue
|
||||
kernargs_size[ji.prg.dev] += round_up(ji.prg._prg.kernargs_alloc_size, 16)
|
||||
self.kernargs_bufs: Dict[Compiled, HCQBuffer] = {dev:dev.allocator._alloc(sz, BufferSpec(cpu_access=True)) for dev,sz in kernargs_size.items()}
|
||||
self.kernargs_bufs: dict[Compiled, HCQBuffer] = {dev:dev.allocator._alloc(sz, BufferSpec(cpu_access=True)) for dev,sz in kernargs_size.items()}
|
||||
|
||||
# Fill initial arguments.
|
||||
self.ji_args: Dict[int, HCQArgsState] = {}
|
||||
self.ji_args: dict[int, HCQArgsState] = {}
|
||||
|
||||
kargs_alloc: Dict[Compiled, BumpAllocator] = {dev:BumpAllocator(buf.size, start=cast(int, buf.va_addr)) for dev,buf in self.kernargs_bufs.items()}
|
||||
kargs_alloc: dict[Compiled, BumpAllocator] = {dev:BumpAllocator(buf.size, start=cast(int, buf.va_addr)) for dev,buf in self.kernargs_bufs.items()}
|
||||
for j,ji in enumerate(jit_cache):
|
||||
if not isinstance(ji.prg, CompiledRunner): continue
|
||||
|
||||
@@ -42,24 +42,24 @@ class HCQGraph(MultiGraphRunner):
|
||||
# graph-related tasks. This synchronization uses a global timeline signal per device. Within the graph, the compute queue coordinates with
|
||||
# global operations and sets a kickoff signal. Any queue accessing a buffer from another device waits for this signal from the device’s
|
||||
# compute queue to ensure exclusive access. The compute queue signals the completion of the graph, synchronizing with the device's copy queue.
|
||||
self.ji_schedule: Dict[int, Tuple[HCQCompiled, HWQueue, List, List, HCQSignal, Optional[int]]] = {}
|
||||
self.ji_schedule: dict[int, tuple[HCQCompiled, HWQueue, List, List, HCQSignal, Optional[int]]] = {}
|
||||
|
||||
self.comp_queues: Dict[HCQCompiled, HWQueue] = {dev: dev.hw_compute_queue_t() for dev in self.devices}
|
||||
self.copy_queues: Dict[HCQCompiled, HWQueue] = {} # lazy allocation
|
||||
self.comp_queues: dict[HCQCompiled, HWQueue] = {dev: dev.hw_compute_queue_t() for dev in self.devices}
|
||||
self.copy_queues: dict[HCQCompiled, HWQueue] = {} # lazy allocation
|
||||
|
||||
self.signals: Dict[Any, HCQSignal] = {**{dev: dev.signal_t(value=0) for dev in self.devices}, **{"CPU": self.devices[0].signal_t(value=0)}}
|
||||
self.signals: dict[Any, HCQSignal] = {**{dev: dev.signal_t(value=0) for dev in self.devices}, **{"CPU": self.devices[0].signal_t(value=0)}}
|
||||
self.kickoff_value: int = 0
|
||||
self.kickoff_var = UOp.variable("kickoff_var", 0, 0xffffffff, dtype=dtypes.uint32)
|
||||
|
||||
# When profiling allocate 2 signals for each jit item to measure speed. The jth jit item have signals at 2*j and 2*j+1.
|
||||
# TODO: This logic might allocate a few extra signals...
|
||||
self.prof_signals: List[HCQSignal] = [self.devices[0].signal_t() for i in range(len(jit_cache) * 2)] if PROFILE else []
|
||||
self.prog_graph_deps: List[List[int]] = []
|
||||
self.prof_graph_entries: List[ProfileGraphEntry] = []
|
||||
self.prof_signals: list[HCQSignal] = [self.devices[0].signal_t() for i in range(len(jit_cache) * 2)] if PROFILE else []
|
||||
self.prog_graph_deps: list[list[int]] = []
|
||||
self.prof_graph_entries: list[ProfileGraphEntry] = []
|
||||
|
||||
last_j: Dict[HWQueue, Optional[int]] = collections.defaultdict(lambda: None)
|
||||
queue_access: Dict[HWQueue, Dict[HWQueue, Optional[int]]] = collections.defaultdict(lambda: collections.defaultdict(lambda: None))
|
||||
dev_access: Dict[HWQueue, Set[HCQCompiled]] = collections.defaultdict(set)
|
||||
last_j: dict[HWQueue, Optional[int]] = collections.defaultdict(lambda: None)
|
||||
queue_access: dict[HWQueue, dict[HWQueue, Optional[int]]] = collections.defaultdict(lambda: collections.defaultdict(lambda: None))
|
||||
dev_access: dict[HWQueue, set[HCQCompiled]] = collections.defaultdict(set)
|
||||
|
||||
for dev, queue in self.comp_queues.items(): dev_access[queue].add(dev)
|
||||
|
||||
@@ -120,7 +120,7 @@ class HCQGraph(MultiGraphRunner):
|
||||
self.prof_signal_is_used = [any(ent.st_id == j or ent.en_id == j for ent in self.prof_graph_entries) for j in range(len(self.prof_signals))]
|
||||
|
||||
# Build hardware queues.
|
||||
self.copy_to_devs: Dict[HCQCompiled, Set[HCQCompiled]] = {dev: set() for dev in self.devices}
|
||||
self.copy_to_devs: dict[HCQCompiled, set[HCQCompiled]] = {dev: set() for dev in self.devices}
|
||||
|
||||
# Create variable timeline signals for each device.
|
||||
timeline_sigaddrs = {dev: UOp.variable(f"timeline_sig_{dev.device_id}", 0, 0xffffffffffffffff, dtype=dtypes.uint64) for dev in self.devices}
|
||||
@@ -161,10 +161,10 @@ class HCQGraph(MultiGraphRunner):
|
||||
self.comp_queues[dev].signal(self.virt_timeline_signals[dev], self.virt_timeline_vals[dev] + 1).bind(dev)
|
||||
if dev in self.copy_queues: self.copy_queues[dev].bind(dev)
|
||||
|
||||
self.last_timeline: Dict[HCQCompiled, Tuple[HCQSignal, int]] = {dev: (dev.timeline_signal, 0) for dev in self.devices}
|
||||
self.last_timeline: dict[HCQCompiled, tuple[HCQSignal, int]] = {dev: (dev.timeline_signal, 0) for dev in self.devices}
|
||||
self.queue_signals_to_reset = [self.signals[q] for q in list(self.comp_queues.values()) + list(self.copy_queues.values()) if q in self.signals]
|
||||
|
||||
def __call__(self, input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int], wait=False) -> Optional[float]:
|
||||
def __call__(self, input_rawbuffers: list[Buffer], var_vals: dict[Variable, int], wait=False) -> Optional[float]:
|
||||
# Wait and restore signals
|
||||
self.kickoff_value += 1
|
||||
for dev in self.devices: self.last_timeline[dev][0].wait(self.last_timeline[dev][1])
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from typing import List, Any, Dict, cast, Optional
|
||||
from typing import Any, cast, Optional
|
||||
import ctypes
|
||||
from tinygrad.dtype import dtypes
|
||||
from tinygrad.helpers import dedup, getenv
|
||||
@@ -17,7 +17,7 @@ class MTLResourceUsage:
|
||||
MTLResourceUsageWrite = 0b10
|
||||
|
||||
class MetalGraph(GraphRunner):
|
||||
def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]):
|
||||
def __init__(self, jit_cache: list[ExecItem], input_rawbuffers: list[Buffer], var_vals: dict[Variable, int]):
|
||||
super().__init__(jit_cache, input_rawbuffers, var_vals)
|
||||
if not all(isinstance(ji.prg, CompiledRunner) for ji in jit_cache): raise GraphException
|
||||
|
||||
@@ -58,7 +58,7 @@ class MetalGraph(GraphRunner):
|
||||
if len(self.vars): self.int_buf_view = self.dev.allocator._as_buffer(self.int_buf).cast('i')
|
||||
self.range = to_struct(0, len(jit_cache))
|
||||
|
||||
def __call__(self, input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int], wait=False) -> Optional[float]:
|
||||
def __call__(self, input_rawbuffers: list[Buffer], var_vals: dict[Variable, int], wait=False) -> Optional[float]:
|
||||
|
||||
if self.command_buffer is not None and self.command_buffer in self.dev.mtl_buffers_in_flight: wait_check(self.command_buffer)
|
||||
all_resources = dedup(self.all_resources + [x._buf.buf for x in input_rawbuffers])
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from __future__ import annotations
|
||||
from typing import Tuple, List, Any, Optional, cast
|
||||
from typing import Any, Optional, cast
|
||||
import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, array, contextlib, sys
|
||||
assert sys.platform != 'win32'
|
||||
from dataclasses import dataclass
|
||||
@@ -79,7 +79,7 @@ class AMDComputeQueue(HWQueue):
|
||||
self.acquire_mem()
|
||||
return self
|
||||
|
||||
def exec(self, prg:AMDProgram, args_state:CLikeArgsState, global_size:Tuple[sint, ...], local_size:Tuple[sint, ...]):
|
||||
def exec(self, prg:AMDProgram, args_state:CLikeArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
|
||||
self.bind_args_state(args_state)
|
||||
|
||||
self.acquire_mem(gli=0, gl2=0)
|
||||
@@ -288,8 +288,8 @@ class AMDDevice(HCQCompiled):
|
||||
kfd:int = -1
|
||||
event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
|
||||
signals_page:Any = None
|
||||
signals_pool:List[int] = []
|
||||
gpus:List[pathlib.Path] = []
|
||||
signals_pool:list[int] = []
|
||||
gpus:list[pathlib.Path] = []
|
||||
|
||||
def _gpu_map(self, mem:HCQBuffer):
|
||||
if self.gpu_id in getattr(mem.meta, "mapped_gpu_ids", []): return
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
from typing import Optional, List
|
||||
from typing import Optional
|
||||
import ctypes, subprocess, pathlib, tempfile
|
||||
from tinygrad.device import Compiled, Compiler, MallocAllocator
|
||||
from tinygrad.helpers import cpu_time_execution, cpu_objdump
|
||||
from tinygrad.renderer.cstyle import ClangRenderer
|
||||
|
||||
class ClangCompiler(Compiler):
|
||||
def __init__(self, cachekey="compile_clang", args:Optional[List[str]]=None, objdump_tool='objdump'):
|
||||
def __init__(self, cachekey="compile_clang", args:Optional[list[str]]=None, objdump_tool='objdump'):
|
||||
self.args = ['-march=native'] if args is None else args
|
||||
self.objdump_tool = objdump_tool
|
||||
super().__init__(cachekey)
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
# it should be a secure (example: no use of pickle) boundary. HTTP is used for RPC
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Tuple, Optional, Dict, Any, DefaultDict, List
|
||||
from typing import Optional, Any, DefaultDict
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
import multiprocessing, functools, http.client, hashlib, json, time, os, binascii, struct, ast, contextlib
|
||||
@@ -39,8 +39,8 @@ class ProgramFree(CloudRequest): name: str; datahash: str # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProgramExec(CloudRequest):
|
||||
name: str; datahash: str; bufs: Tuple[int, ...]; vals: Tuple[int, ...] # noqa: E702
|
||||
global_size: Optional[Tuple[int, ...]]; local_size: Optional[Tuple[int, ...]]; wait: bool # noqa: E702
|
||||
name: str; datahash: str; bufs: tuple[int, ...]; vals: tuple[int, ...] # noqa: E702
|
||||
global_size: Optional[tuple[int, ...]]; local_size: Optional[tuple[int, ...]]; wait: bool # noqa: E702
|
||||
|
||||
# for safe deserialization
|
||||
whitelist = {x.__name__:x for x in [BufferAlloc, BufferFree, CopyIn, CopyOut, ProgramAlloc, ProgramFree, ProgramExec, BufferSpec]}
|
||||
@@ -51,8 +51,8 @@ def safe_eval(node): return eval_fxns[node.__class__](node)
|
||||
|
||||
class BatchRequest:
|
||||
def __init__(self):
|
||||
self._q: List[CloudRequest] = []
|
||||
self._h: Dict[str, bytes] = {}
|
||||
self._q: list[CloudRequest] = []
|
||||
self._h: dict[str, bytes] = {}
|
||||
def h(self, d:bytes) -> str:
|
||||
binhash = hashlib.sha256(d).digest()
|
||||
self._h[datahash:=binascii.hexlify(binhash).decode()] = binhash+struct.pack("<Q", len(d))+d
|
||||
@@ -74,9 +74,9 @@ class BatchRequest:
|
||||
|
||||
@dataclass
|
||||
class CloudSession:
|
||||
programs: Dict[Tuple[str, str], Any] = field(default_factory=dict)
|
||||
programs: dict[tuple[str, str], Any] = field(default_factory=dict)
|
||||
# TODO: the buffer should track this internally
|
||||
buffers: Dict[int, Tuple[Any, int, Optional[BufferSpec]]] = field(default_factory=dict)
|
||||
buffers: dict[int, tuple[Any, int, Optional[BufferSpec]]] = field(default_factory=dict)
|
||||
|
||||
class CloudHandler(BaseHTTPRequestHandler):
|
||||
protocol_version = 'HTTP/1.1'
|
||||
@@ -164,7 +164,7 @@ class CloudProgram:
|
||||
super().__init__()
|
||||
def __del__(self): self.dev.req.q(ProgramFree(self.name, self.datahash))
|
||||
|
||||
def __call__(self, *bufs, global_size=None, local_size=None, vals:Tuple[int, ...]=(), wait=False):
|
||||
def __call__(self, *bufs, global_size=None, local_size=None, vals:tuple[int, ...]=(), wait=False):
|
||||
self.dev.req.q(ProgramExec(self.name, self.datahash, bufs, vals, global_size, local_size, wait))
|
||||
if wait: return float(self.dev.batch_submit())
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
import ctypes, ctypes.util, functools
|
||||
from typing import Tuple, Optional, List
|
||||
from typing import Optional
|
||||
from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, init_c_struct_t
|
||||
from tinygrad.device import Compiled, BufferSpec, LRUAllocator
|
||||
from tinygrad.renderer.cstyle import CUDARenderer
|
||||
@@ -12,7 +12,7 @@ if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint:
|
||||
def check(status):
|
||||
if status != 0: raise RuntimeError(f"CUDA Error {status}, {ctypes.string_at(init_c_var(ctypes.POINTER(ctypes.c_char)(), lambda x: cuda.cuGetErrorString(status, ctypes.byref(x)))).decode()}") # noqa: E501
|
||||
|
||||
def encode_args(args, vals) -> Tuple[ctypes.Structure, ctypes.Array]:
|
||||
def encode_args(args, vals) -> tuple[ctypes.Structure, ctypes.Array]:
|
||||
c_args = init_c_struct_t(tuple([(f'f{i}', cuda.CUdeviceptr_v2) for i in range(len(args))] +
|
||||
[(f'v{i}', ctypes.c_int) for i in range(len(vals))]))(*args, *vals)
|
||||
vargs = (ctypes.c_void_p * 5)(ctypes.c_void_p(1), ctypes.cast(ctypes.byref(c_args), ctypes.c_void_p), ctypes.c_void_p(2),
|
||||
@@ -50,7 +50,7 @@ class CUDAProgram:
|
||||
def __del__(self):
|
||||
if hasattr(self, 'module'): check(cuda.cuModuleUnload(self.module))
|
||||
|
||||
def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
|
||||
def __call__(self, *args, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
|
||||
check(cuda.cuCtxSetCurrent(self.dev.context))
|
||||
if not hasattr(self, "vargs"):
|
||||
self.c_args, self.vargs = encode_args(args, vals)
|
||||
@@ -90,7 +90,7 @@ class CUDAAllocator(LRUAllocator):
|
||||
def _offset(self, buf, size:int, offset:int): return cuda.CUdeviceptr_v2(buf.value + offset)
|
||||
|
||||
class CUDADevice(Compiled):
|
||||
devices: List[CUDADevice] = []
|
||||
devices: list[CUDADevice] = []
|
||||
peer_access = False
|
||||
|
||||
def __init__(self, device:str):
|
||||
@@ -110,7 +110,7 @@ class CUDADevice(Compiled):
|
||||
CUDADevice.peer_access = True
|
||||
|
||||
self.arch = f"sm_{major.value}{minor.value}"
|
||||
self.pending_copyin: List[Tuple[int, int, Optional[BufferSpec]]] = []
|
||||
self.pending_copyin: list[tuple[int, int, Optional[BufferSpec]]] = []
|
||||
CUDADevice.devices.append(self)
|
||||
|
||||
from tinygrad.runtime.graph.cuda import CUDAGraph
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
import os, sys, mmap, io, ctypes, ctypes.util, contextlib
|
||||
from typing import Optional, Generator, Tuple, Callable, List
|
||||
from typing import Optional, Generator, Callable
|
||||
from tinygrad.helpers import OSX, round_up
|
||||
from tinygrad.device import Compiled, Allocator
|
||||
with contextlib.suppress(ImportError):
|
||||
@@ -33,12 +33,12 @@ class DiskAllocator(Allocator):
|
||||
else:
|
||||
dest[:] = src._buf()
|
||||
|
||||
def _copyout_sharded(self, src:DiskBuffer, size:int, _get_free_buf:Callable, seg_len:int) -> Generator[Tuple[int, int, int, int], None, None]:
|
||||
def _copyout_sharded(self, src:DiskBuffer, size:int, _get_free_buf:Callable, seg_len:int) -> Generator[tuple[int, int, int, int], None, None]:
|
||||
assert hasattr(DiskDevice, 'io_uring'), "function requires io uring support"
|
||||
|
||||
fd_offset = src.offset - (minor_offset := src.offset % mmap.PAGESIZE)
|
||||
processed_reqs_cnt, copied_in, next_read_offset, total_copy_size = 0, 0, 0, round_up(size + minor_offset, mmap.PAGESIZE)
|
||||
reqs: List[Tuple[int, int, int, int]] = []
|
||||
reqs: list[tuple[int, int, int, int]] = []
|
||||
|
||||
while next_read_offset < total_copy_size or len(reqs) != processed_reqs_cnt:
|
||||
if next_read_offset < total_copy_size and (copy_batch := _get_free_buf()) is not None:
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from __future__ import annotations
|
||||
from typing import Tuple, Optional, List, cast
|
||||
from typing import Optional, cast
|
||||
import ctypes, functools, hashlib, contextlib
|
||||
from tinygrad.runtime.autogen import opencl as cl
|
||||
from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG, getenv, mv_address
|
||||
@@ -44,10 +44,10 @@ class CLProgram:
|
||||
with contextlib.suppress(TypeError, AttributeError): check(cl.clReleaseKernel(self.kernel))
|
||||
with contextlib.suppress(TypeError, AttributeError): check(cl.clReleaseProgram(self.program))
|
||||
|
||||
def __call__(self, *bufs:Tuple[ctypes._CData, BufferSpec], global_size:Tuple[int,int,int]=(1,1,1), local_size:Optional[Tuple[int,int,int]]=None, vals:Tuple[int, ...]=(), wait=False) -> Optional[float]: # noqa: E501
|
||||
def __call__(self, *bufs:tuple[ctypes._CData, BufferSpec], global_size:tuple[int,int,int]=(1,1,1), local_size:Optional[tuple[int,int,int]]=None, vals:tuple[int, ...]=(), wait=False) -> Optional[float]: # noqa: E501
|
||||
for i,(b,_) in enumerate(bufs): cl.clSetKernelArg(self.kernel, i, ctypes.sizeof(b), ctypes.byref(b))
|
||||
for i,v in enumerate(vals,start=len(bufs)): cl.clSetKernelArg(self.kernel, i, 4, ctypes.byref(ctypes.c_int32(v)))
|
||||
if local_size is not None: global_size = cast(Tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size)))
|
||||
if local_size is not None: global_size = cast(tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size)))
|
||||
event = cl.cl_event() if wait else None
|
||||
check(cl.clEnqueueNDRangeKernel(self.dev.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size), (ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event)) # noqa: E501
|
||||
if wait:
|
||||
@@ -62,14 +62,14 @@ class CLAllocator(LRUAllocator):
|
||||
def __init__(self, dev:CLDevice):
|
||||
self.dev = dev
|
||||
super().__init__()
|
||||
def _alloc(self, size:int, options:BufferSpec) -> Tuple[ctypes._CData, BufferSpec]:
|
||||
def _alloc(self, size:int, options:BufferSpec) -> tuple[ctypes._CData, BufferSpec]:
|
||||
if options.image is not None:
|
||||
return (checked(cl.clCreateImage2D(self.dev.context, cl.CL_MEM_READ_WRITE,
|
||||
cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
|
||||
options.image.shape[1], options.image.shape[0], 0, None, status := ctypes.c_int32()), status), options)
|
||||
return (checked(cl.clCreateBuffer(self.dev.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status), options)
|
||||
def _free(self, opaque:Tuple[ctypes._CData, BufferSpec], options:BufferSpec): check(cl.clReleaseMemObject(opaque[0]))
|
||||
def _copyin(self, dest:Tuple[ctypes._CData, BufferSpec], src:memoryview):
|
||||
def _free(self, opaque:tuple[ctypes._CData, BufferSpec], options:BufferSpec): check(cl.clReleaseMemObject(opaque[0]))
|
||||
def _copyin(self, dest:tuple[ctypes._CData, BufferSpec], src:memoryview):
|
||||
if dest[1].image is not None:
|
||||
check(cl.clEnqueueWriteImage(self.dev.queue, dest[0], False, (ctypes.c_size_t * 3)(0,0,0),
|
||||
(ctypes.c_size_t * 3)(dest[1].image.shape[1],dest[1].image.shape[0],1), 0, 0, from_mv(src), 0, None, None))
|
||||
@@ -77,7 +77,7 @@ class CLAllocator(LRUAllocator):
|
||||
if mv_address(src) % 16: src = memoryview(bytearray(src))
|
||||
check(cl.clEnqueueWriteBuffer(self.dev.queue, dest[0], False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
|
||||
self.dev.pending_copyin.append(src) # NOTE: these can't be freed until the GPU actually executes this command
|
||||
def _copyout(self, dest:memoryview, src:Tuple[ctypes._CData, BufferSpec]):
|
||||
def _copyout(self, dest:memoryview, src:tuple[ctypes._CData, BufferSpec]):
|
||||
if src[1].image is not None:
|
||||
check(cl.clEnqueueReadImage(self.dev.queue, src[0], False, (ctypes.c_size_t * 3)(0,0,0),
|
||||
(ctypes.c_size_t * 3)(src[1].image.shape[1],src[1].image.shape[0],1), 0, 0, from_mv(dest), 0, None, None))
|
||||
@@ -103,7 +103,7 @@ class CLDevice(Compiled):
|
||||
if DEBUG >= 1: print(f"CLDevice: opening {self.device_name} with version {self.driver_version}")
|
||||
self.context = checked(cl.clCreateContext(None, 1, self.device_id, cl.clCreateContext.argtypes[3](), None, status := ctypes.c_int32()), status)
|
||||
self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, status), status)
|
||||
self.pending_copyin: List[memoryview] = []
|
||||
self.pending_copyin: list[memoryview] = []
|
||||
self.device_exts = (cl.clGetDeviceInfo(self.device_id, cl.CL_DEVICE_EXTENSIONS, 4096, ctypes.byref(buf := ctypes.create_string_buffer(4096)), ctypes.byref(total := ctypes.c_size_t())), ctypes.string_at(buf, size=total.value).decode())[1] # noqa: E501
|
||||
|
||||
compile_key = hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
import ctypes, functools
|
||||
from typing import Tuple
|
||||
from tinygrad.helpers import init_c_var, from_mv, init_c_struct_t, getenv
|
||||
from tinygrad.device import Compiled, LRUAllocator, BufferSpec
|
||||
from tinygrad.runtime.autogen import hip
|
||||
@@ -21,7 +20,7 @@ class HIPProgram:
|
||||
def __del__(self):
|
||||
if hasattr(self, 'module'): check(hip.hipModuleUnload(self.module))
|
||||
|
||||
def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
|
||||
def __call__(self, *args, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
|
||||
check(hip.hipSetDevice(self.dev.device_id))
|
||||
if not hasattr(self, "vargs"):
|
||||
self.c_args = init_c_struct_t(tuple([(f'f{i}', hip.hipDeviceptr_t) for i in range(len(args))] +
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
import ctypes, functools
|
||||
from typing import Tuple
|
||||
from tinygrad.device import Compiled, Compiler, MallocAllocator
|
||||
from tinygrad.helpers import cpu_time_execution, getenv, cpu_objdump
|
||||
from tinygrad.renderer.llvmir import LLVMRenderer
|
||||
@@ -32,7 +31,7 @@ class LLVMProgram:
|
||||
self.fxn = dev.engine.get_function_address(name)
|
||||
assert self.fxn != 0, "LLVM failed to get function address"
|
||||
|
||||
def __call__(self, *bufs, vals:Tuple[int, ...]=(), wait=False):
|
||||
def __call__(self, *bufs, vals:tuple[int, ...]=(), wait=False):
|
||||
if not hasattr(self, 'cfunc'):
|
||||
self.cfunc = ctypes.CFUNCTYPE(ctypes.c_int, *([ctypes.c_void_p]*len(bufs)), *([ctypes.c_int32]*len(vals)))(self.fxn)
|
||||
return cpu_time_execution(lambda: self.cfunc(*bufs, *vals), enable=wait)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
import os, pathlib, struct, ctypes, tempfile, functools, decimal
|
||||
from typing import List, Any, Union, Tuple, cast
|
||||
from typing import Any, Union, cast
|
||||
from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, T, init_c_struct_t, PROFILE
|
||||
from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, cpu_profile, ProfileDeviceEvent, ProfileRangeEvent
|
||||
from tinygrad.renderer.cstyle import MetalRenderer
|
||||
@@ -125,7 +125,7 @@ class MetalProgram:
|
||||
descriptor, MTLPipelineOption.MTLPipelineOptionNone, None, ctypes.byref(error_pipeline_creation:=objc_instance()), restype=objc_instance)
|
||||
error_check(error_pipeline_creation)
|
||||
|
||||
def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
|
||||
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
|
||||
max_total_threads = msg(self.pipeline_state, "maxTotalThreadsPerThreadgroup", restype=ctypes.c_ulong)
|
||||
if prod(local_size) > cast(int, max_total_threads):
|
||||
exec_width = msg(self.pipeline_state, "threadExecutionWidth", restype=ctypes.c_ulong)
|
||||
@@ -189,7 +189,7 @@ class MetalDevice(Compiled):
|
||||
self.sysdevice = libmetal.MTLCreateSystemDefaultDevice()
|
||||
self.mtl_queue = msg(self.sysdevice, "newCommandQueueWithMaxCommandBufferCount:", 1024, restype=objc_instance)
|
||||
if self.mtl_queue is None: raise RuntimeError("Cannot allocate a new command queue")
|
||||
self.mtl_buffers_in_flight: List[Any] = []
|
||||
self.mtl_buffers_in_flight: list[Any] = []
|
||||
self.timeline_signal = msg(self.sysdevice, "newSharedEvent", restype=objc_instance)
|
||||
self.timeline_value = 0
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, sys
|
||||
assert sys.platform != 'win32'
|
||||
from typing import Tuple, List, Any, cast, Union, Dict, Type, Optional
|
||||
from typing import List, Any, cast, Union, Type, Optional
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
|
||||
from tinygrad.ops import sint
|
||||
@@ -58,7 +58,7 @@ def make_uvm_type():
|
||||
uvm = make_uvm_type()
|
||||
|
||||
def make_qmd_struct_type():
|
||||
fields: List[Tuple[str, Union[Type[ctypes.c_uint64], Type[ctypes.c_uint32]], Any]] = []
|
||||
fields: list[tuple[str, Union[Type[ctypes.c_uint64], Type[ctypes.c_uint32]], Any]] = []
|
||||
bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
|
||||
bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
|
||||
bits = sorted(bits, key=lambda x: x[1][1])
|
||||
@@ -131,7 +131,7 @@ class NVComputeQueue(NVCommandQueue):
|
||||
self.active_qmd = None
|
||||
return self
|
||||
|
||||
def exec(self, prg:NVProgram, args_state:NVArgsState, global_size:Tuple[sint, ...], local_size:Tuple[sint, ...]):
|
||||
def exec(self, prg:NVProgram, args_state:NVArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
|
||||
self.bind_args_state(args_state)
|
||||
|
||||
ctypes.memmove(qmd_addr:=(args_state.ptr + round_up(prg.constbufs[0][1], 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
|
||||
@@ -187,7 +187,7 @@ class NVCopyQueue(NVCommandQueue):
|
||||
def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.dma_gpfifo)
|
||||
|
||||
class NVArgsState(CLikeArgsState):
|
||||
def __init__(self, ptr:int, prg:NVProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
|
||||
def __init__(self, ptr:int, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
|
||||
if MOCKGPU: prg.constbuffer_0[0:2] = [len(bufs), len(vals)]
|
||||
super().__init__(ptr, prg, bufs, vals=vals, prefix=prg.constbuffer_0)
|
||||
|
||||
@@ -202,7 +202,7 @@ class NVProgram(HCQProgram):
|
||||
self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferSpec(cpu_access=True))
|
||||
|
||||
self.prog_addr, self.prog_sz, self.regs_usage, self.shmem_usage, self.lcmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0x400, 0
|
||||
self.constbufs: Dict[int, Tuple[int, int]] = {0: (0, 0x160)} # Dict[constbuf index, Tuple[va_addr, size]]
|
||||
self.constbufs: dict[int, tuple[int, int]] = {0: (0, 0x160)} # dict[constbuf index, tuple[va_addr, size]]
|
||||
for sh in sections:
|
||||
if sh.name == f".nv.shared.{self.name}": self.shmem_usage = round_up(0x400 + sh.header.sh_size, 128)
|
||||
if sh.name == f".text.{self.name}":
|
||||
@@ -254,7 +254,7 @@ class NVProgram(HCQProgram):
|
||||
def __del__(self):
|
||||
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True))
|
||||
|
||||
def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
|
||||
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
|
||||
if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.dev).slm_per_thread:
|
||||
raise RuntimeError("Too many resources requested for launch")
|
||||
if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
|
||||
@@ -287,7 +287,7 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
fd_uvm: int = -1
|
||||
gpus_info: Union[List, ctypes.Array] = []
|
||||
signals_page: Any = None
|
||||
signals_pool: List[int] = []
|
||||
signals_pool: list[int] = []
|
||||
|
||||
# TODO: Need a proper allocator for va addresses
|
||||
# 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
|
||||
@@ -412,7 +412,7 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")
|
||||
|
||||
self._setup_nvclasses()
|
||||
self._debug_mappings: Dict[Tuple[int, int], str] = dict()
|
||||
self._debug_mappings: dict[tuple[int, int], str] = dict()
|
||||
|
||||
rmctrl.perf_boost(self.fd_ctl, self.root, self.subdevice, duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
|
||||
(nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))
|
||||
@@ -427,7 +427,7 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=self.gpu_uuid)
|
||||
uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root, hVaSpace=vaspace)
|
||||
|
||||
for dev in cast(List[NVDevice], self.devices):
|
||||
for dev in cast(list[NVDevice], self.devices):
|
||||
try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid)
|
||||
except RuntimeError as e: raise RuntimeError(str(e) + f". Make sure GPUs #{self.gpu_minor} & #{dev.gpu_minor} have P2P enabled between.") from e
|
||||
|
||||
@@ -541,7 +541,7 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
if sm_errors.mmuFault.valid:
|
||||
mmu_info = rmctrl.debug_read_mmu_fault_info(self.fd_ctl, self.root, self.debugger)
|
||||
for i in range(mmu_info.count):
|
||||
pfinfo = mmu_info.mmuFaultInfoList[i]
|
||||
pfinfo = mmu_info.mmuFaultInfolist[i]
|
||||
report += [f"MMU fault: 0x{pfinfo.faultAddress:X} | {NV_PFAULT_FAULT_TYPE[pfinfo.faultType]} | {NV_PFAULT_ACCESS_TYPE[pfinfo.accessType]}"]
|
||||
if DEBUG >= 5:
|
||||
report += ["GPU mappings:\n"+"\n".join(f"\t0x{x:X} - 0x{x+y-1:X} | {self._debug_mappings[(x,y)]}" for x,y in sorted(self._debug_mappings))]
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
# works to test the tensor cores, and all the uops in general
|
||||
# this is the (living) definition of uops
|
||||
import sys
|
||||
from typing import Tuple, List, Optional, Any, Dict, TYPE_CHECKING
|
||||
from typing import Optional, Any, TYPE_CHECKING
|
||||
import pickle, base64, itertools, time, struct
|
||||
from tinygrad.dtype import DType, dtypes, ImageDType, PtrDType, truncate
|
||||
from tinygrad.helpers import all_same, getenv, flatten, get_single_element
|
||||
@@ -27,18 +27,18 @@ def _store(m, i, v):
|
||||
|
||||
class PythonProgram:
|
||||
def __init__(self, name:str, lib:bytes):
|
||||
self.uops: List[Tuple[Ops, Optional[DType], List[int], Any]] = pickle.loads(lib)
|
||||
def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
|
||||
self.uops: list[tuple[Ops, Optional[DType], list[int], Any]] = pickle.loads(lib)
|
||||
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
|
||||
st = time.perf_counter()
|
||||
warp = list(itertools.product(*[range(x) for x in local_size[::-1]]))
|
||||
warp_size = len(warp)
|
||||
for idxs in itertools.product(*[range(x) for x in global_size[::-1]]):
|
||||
ul: Dict[int, Any] = {}
|
||||
dl: Dict[int, DType] = {}
|
||||
pbufs: List[memoryview] = list(bufs)
|
||||
pvals: List[int] = list(vals)
|
||||
ul: dict[int, Any] = {}
|
||||
dl: dict[int, DType] = {}
|
||||
pbufs: list[memoryview] = list(bufs)
|
||||
pvals: list[int] = list(vals)
|
||||
i = 0
|
||||
loop_ends: Dict[int, int] = {}
|
||||
loop_ends: dict[int, int] = {}
|
||||
while i < len(self.uops):
|
||||
uop, dtype, idp, arg = self.uops[i]
|
||||
void_ops = {Ops.STORE, Ops.ENDRANGE, Ops.BARRIER, Ops.IF, Ops.ENDIF}
|
||||
@@ -184,7 +184,7 @@ class PythonRenderer(Renderer):
|
||||
if getenv("EMULATE_INTEL"): self.device, self.suffix, self.tensor_cores = "INTEL", "INTEL", IntelRenderer.tensor_cores
|
||||
if getenv("EMULATE_AMX"): self.device, self.tensor_cores = "CLANG", ClangRenderer.tensor_cores
|
||||
|
||||
def render(self, name:str, uops:List[UOp]) -> str:
|
||||
def render(self, name:str, uops:list[UOp]) -> str:
|
||||
lops = [(u.op, u.dtype, [uops.index(v) for v in u.src], u.arg) for u in uops]
|
||||
return base64.b64encode(pickle.dumps(lops)).decode()
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
import os, ctypes, functools, mmap, struct, array, math, sys
|
||||
assert sys.platform != 'win32'
|
||||
from types import SimpleNamespace
|
||||
from typing import Tuple, List, Any, cast, Optional
|
||||
from typing import Any, cast, Optional
|
||||
from tinygrad.device import BufferSpec
|
||||
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
|
||||
from tinygrad.runtime.autogen import kgsl, adreno, libc
|
||||
@@ -170,7 +170,7 @@ class QCOMComputeQueue(HWQueue):
|
||||
return self
|
||||
|
||||
class QCOMArgsState(HCQArgsState):
|
||||
def __init__(self, ptr:int, prg:QCOMProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
|
||||
def __init__(self, ptr:int, prg:QCOMProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
|
||||
super().__init__(ptr, prg, bufs, vals=vals)
|
||||
|
||||
if len(bufs) + len(vals) != len(prg.buf_info): raise RuntimeError(f'incorrect args size given={len(bufs)+len(vals)} != want={len(prg.buf_info)}')
|
||||
@@ -208,7 +208,7 @@ class QCOMProgram(HCQProgram):
|
||||
kernargs_alloc_size = round_up(2048 + (self.tex_cnt + self.ibo_cnt) * 0x40 + self.samp_cnt * 0x10, 0x100)
|
||||
super().__init__(QCOMArgsState, self.dev, self.name, kernargs_alloc_size=kernargs_alloc_size)
|
||||
|
||||
def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
|
||||
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
|
||||
if self.max_threads < prod(local_size): raise RuntimeError("Too many resources requested for launch")
|
||||
if any(g*l>mx for g,l,mx in zip(global_size, local_size, [65536, 65536, 65536])) and any(l>mx for l,mx in zip(local_size, [1024, 1024, 1024])):
|
||||
raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
|
||||
@@ -268,7 +268,7 @@ class QCOMProgram(HCQProgram):
|
||||
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, options=BufferSpec(cpu_access=True, nolru=True))
|
||||
|
||||
class QCOMTextureInfo:
|
||||
def __init__(self, pitch:int, real_stride:int, desc:List[int], ibo:List[int]):
|
||||
def __init__(self, pitch:int, real_stride:int, desc:list[int], ibo:list[int]):
|
||||
self.pitch, self.real_stride, self.desc, self.ibo = pitch, real_stride, desc, ibo
|
||||
|
||||
class QCOMAllocator(HCQAllocatorBase):
|
||||
@@ -320,7 +320,7 @@ class QCOMAllocator(HCQAllocatorBase):
|
||||
|
||||
class QCOMDevice(HCQCompiled):
|
||||
signals_page: Any = None
|
||||
signals_pool: List[int] = []
|
||||
signals_pool: list[int] = []
|
||||
gpu_id: int = 0
|
||||
dummy_addr: int = 0
|
||||
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
from typing import Tuple, List, Any
|
||||
from typing import Any
|
||||
from dataclasses import dataclass
|
||||
import tinygrad.runtime.autogen.libc as libc
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ElfSection: name:str; header:libc.Elf64_Shdr; content:bytes # noqa: E702
|
||||
|
||||
def elf_loader(blob:bytes, force_section_align:int=1) -> Tuple[memoryview, List[ElfSection], Any]:
|
||||
def elf_loader(blob:bytes, force_section_align:int=1) -> tuple[memoryview, list[ElfSection], Any]:
|
||||
def _strtab(blob: bytes, idx: int) -> str: return blob[idx:blob.find(b'\x00', idx)].decode('utf-8')
|
||||
|
||||
header = libc.Elf64_Ehdr.from_buffer_copy(blob)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from __future__ import annotations
|
||||
from typing import List, Optional, Dict, Tuple, cast, Type, TypeVar, Generic, Any
|
||||
from typing import Optional, Dict, cast, Type, TypeVar, Generic, Any
|
||||
import contextlib, decimal, statistics, time, ctypes, array
|
||||
from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up
|
||||
from tinygrad.renderer import Renderer
|
||||
@@ -31,10 +31,10 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
|
||||
def __init__(self):
|
||||
self._q:Any = []
|
||||
self.binded_device:Optional[DeviceType] = None
|
||||
self.q_sints:List[Tuple[int, int]] = []
|
||||
self.mv_sints:List[Tuple[memoryview, int, int, Optional[int]]] = []
|
||||
self.syms:List[sint] = []
|
||||
self._prev_resolved_syms:List[Optional[int]] = []
|
||||
self.q_sints:list[tuple[int, int]] = []
|
||||
self.mv_sints:list[tuple[memoryview, int, int, Optional[int]]] = []
|
||||
self.syms:list[sint] = []
|
||||
self._prev_resolved_syms:list[Optional[int]] = []
|
||||
|
||||
def _new_sym(self, sym:sint) -> int:
|
||||
if sym not in self.syms:
|
||||
@@ -91,7 +91,7 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
|
||||
Enqueues a memory barrier command to ensure memory coherence between agents. Only on compute queues.
|
||||
"""
|
||||
|
||||
def exec(self, prg:ProgramType, args_state:ArgsStateType, global_size:Tuple[sint, ...], local_size:Tuple[sint, ...]):
|
||||
def exec(self, prg:ProgramType, args_state:ArgsStateType, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
|
||||
"""
|
||||
Enqueues an execution command for a kernel program. Only on compute queues.
|
||||
|
||||
@@ -142,7 +142,7 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
|
||||
if isinstance(val, int): mv[i] = val if mask is None else ((mv[i] & ~mask) | val)
|
||||
else: self.mv_sints.append((mv, i, self._new_sym(val), mask))
|
||||
|
||||
def _apply_var_vals(self, var_vals:Dict[Variable, int]):
|
||||
def _apply_var_vals(self, var_vals:dict[Variable, int]):
|
||||
resolved_syms = [sym_infer(sym, var_vals) for sym in self.syms]
|
||||
|
||||
for off, sym_idx in self.q_sints:
|
||||
@@ -153,9 +153,9 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
|
||||
if self._prev_resolved_syms[sym_idx] == resolved_syms[sym_idx]: continue
|
||||
mv[off] = resolved_syms[sym_idx] if mask is None else ((mv[off] & ~mask) | resolved_syms[sym_idx])
|
||||
|
||||
self._prev_resolved_syms = cast(List[Optional[int]], resolved_syms)
|
||||
self._prev_resolved_syms = cast(list[Optional[int]], resolved_syms)
|
||||
|
||||
def submit(self, dev:DeviceType, var_vals:Optional[Dict[Variable, int]]=None):
|
||||
def submit(self, dev:DeviceType, var_vals:Optional[dict[Variable, int]]=None):
|
||||
"""
|
||||
Submits the command queue to a specific device for execution.
|
||||
|
||||
@@ -235,14 +235,14 @@ def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Optional[Type[HWQueue
|
||||
if enabled and PROFILE: dev.sig_prof_records.append((cast(HCQSignal, st), cast(HCQSignal, en), desc, queue_type is dev.hw_copy_queue_t))
|
||||
|
||||
class HCQArgsState(Generic[ProgramType]):
|
||||
def __init__(self, ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[sint, ...]=()):
|
||||
def __init__(self, ptr:int, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint, ...]=()):
|
||||
self.ptr, self.prg = ptr, prg
|
||||
self.bind_data:List[Tuple[Tuple[sint, ...], int, str]] = []
|
||||
self.bind_data:list[tuple[tuple[sint, ...], int, str]] = []
|
||||
|
||||
def bind_sints_to_ptr(self, *vals:sint, ptr:int, fmt): self.bind_data.append((vals, ptr, fmt))
|
||||
|
||||
class CLikeArgsState(HCQArgsState[ProgramType]):
|
||||
def __init__(self, ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[sint, ...]=(), prefix:Optional[List[int]]=None):
|
||||
def __init__(self, ptr:int, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint, ...]=(), prefix:Optional[list[int]]=None):
|
||||
super().__init__(ptr, prg, bufs, vals=vals)
|
||||
|
||||
if prefix is not None: to_mv(self.ptr, len(prefix) * 4).cast('I')[:] = array.array('I', prefix)
|
||||
@@ -254,7 +254,7 @@ class HCQProgram(Generic[DeviceType]):
|
||||
def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int):
|
||||
self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size
|
||||
|
||||
def fill_kernargs(self, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=(), kernargs_ptr:Optional[int]=None) -> HCQArgsState:
|
||||
def fill_kernargs(self, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=(), kernargs_ptr:Optional[int]=None) -> HCQArgsState:
|
||||
"""
|
||||
Fills arguments for the kernel, optionally allocating space from the device if `kernargs_ptr` is not provided.
|
||||
Args:
|
||||
@@ -266,8 +266,8 @@ class HCQProgram(Generic[DeviceType]):
|
||||
"""
|
||||
return self.args_state_t(kernargs_ptr or self.dev.kernargs_alloctor.alloc(self.kernargs_alloc_size), self, bufs, vals=vals)
|
||||
|
||||
def __call__(self, *bufs:HCQBuffer, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1),
|
||||
vals:Tuple[int, ...]=(), wait:bool=False) -> Optional[float]:
|
||||
def __call__(self, *bufs:HCQBuffer, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1),
|
||||
vals:tuple[int, ...]=(), wait:bool=False) -> Optional[float]:
|
||||
"""
|
||||
Enqueues the program for execution with the given arguments and dimensions.
|
||||
|
||||
@@ -298,7 +298,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
"""
|
||||
A base class for devices compatible with the HCQ (Hardware Command Queue) API.
|
||||
"""
|
||||
devices: List[HCQCompiled] = []
|
||||
devices: list[HCQCompiled] = []
|
||||
|
||||
def __init__(self, device:str, allocator:HCQAllocatorBase, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[SignalType],
|
||||
comp_queue_t:Type[HWQueue], copy_queue_t:Optional[Type[HWQueue]]):
|
||||
@@ -307,9 +307,9 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
self.timeline_value:int = 1
|
||||
self.timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self)
|
||||
self._shadow_timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self)
|
||||
self.sig_prof_records:List[Tuple[HCQSignal, HCQSignal, str, bool]] = []
|
||||
self.raw_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, str, bool, Optional[Dict]]] = []
|
||||
self.dep_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, HCQCompiled, bool, decimal.Decimal, decimal.Decimal, HCQCompiled, bool]] = []
|
||||
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = []
|
||||
self.raw_prof_records:list[tuple[decimal.Decimal, decimal.Decimal, str, bool, Optional[Dict]]] = []
|
||||
self.dep_prof_records:list[tuple[decimal.Decimal, decimal.Decimal, HCQCompiled, bool, decimal.Decimal, decimal.Decimal, HCQCompiled, bool]] = []
|
||||
|
||||
from tinygrad.runtime.graph.hcq import HCQGraph
|
||||
super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
|
||||
|
||||
Reference in New Issue
Block a user