replace Tuple with tuple [pr] (#8344)

* replace Tuple with tuple [pr]

* replace List with list [pr]

* replace Dict with dict [pr]

* replace Set with set [pr]
This commit is contained in:
George Hotz
2024-12-19 21:27:56 -08:00
committed by GitHub
parent adcdc583a2
commit 9c77e9f9b7
45 changed files with 601 additions and 605 deletions

View File

@@ -1,5 +1,5 @@
import ctypes
from typing import Any, Optional, Tuple, Dict, List, cast
from typing import Any, Optional, cast
import tinygrad.runtime.autogen.cuda as cuda
from tinygrad.helpers import init_c_var, dedup
from tinygrad.device import Buffer, Device
@@ -9,14 +9,14 @@ from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
from tinygrad.engine.jit import MultiGraphRunner, GraphException
class CUDAGraph(MultiGraphRunner):
def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]):
def __init__(self, jit_cache: list[ExecItem], input_rawbuffers: list[Buffer], var_vals: dict[Variable, int]):
super().__init__(jit_cache, input_rawbuffers, var_vals)
# Check all jit items are compatible.
if not all(isinstance(ji.prg, (CompiledRunner, BufferXfer)) for ji in jit_cache): raise GraphException
self.jc_idx_with_updatable_rawbufs = dedup([x[0] for x in self.input_replace.keys()])
self.updatable_nodes: Dict[int, Tuple[Any, Any, Any, bool]] = {} # Dict[jc index] = tuple(graph node, node params, input kernel params, is memcpy)
self.updatable_nodes: dict[int, tuple[Any, Any, Any, bool]] = {} # dict[jc index] = tuple(graph node, node params, input kernel params, is memcpy)
self.graph = init_c_var(cuda.CUgraph(), lambda x: check(cuda.cuGraphCreate(ctypes.byref(x), 0)))
@@ -48,7 +48,7 @@ class CUDAGraph(MultiGraphRunner):
self.instance = init_c_var(cuda.CUgraphExec(), lambda x: check(cuda.cuGraphInstantiate_v2(ctypes.byref(x), self.graph, None, None, 0)))
def __call__(self, input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int], wait=False) -> Optional[float]:
def __call__(self, input_rawbuffers: list[Buffer], var_vals: dict[Variable, int], wait=False) -> Optional[float]:
# Update rawbuffers in the c_args struct.
for (j,i),input_idx in self.input_replace.items():
if not self.updatable_nodes[j][3]: setattr(self.updatable_nodes[j][2], f'f{i}', input_rawbuffers[input_idx]._buf)

View File

@@ -1,5 +1,5 @@
import collections, time
from typing import List, Any, Dict, cast, Optional, Tuple, Set
from typing import List, Any, cast, Optional
from tinygrad.helpers import round_up, PROFILE
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQSignal, HCQBuffer, HWQueue, HCQArgsState, BumpAllocator
from tinygrad.device import Buffer, BufferSpec, Compiled, Device, ProfileGraphEntry, ProfileGraphEvent
@@ -9,29 +9,29 @@ from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
from tinygrad.engine.jit import MultiGraphRunner
class HCQGraph(MultiGraphRunner):
def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]):
def __init__(self, jit_cache: list[ExecItem], input_rawbuffers: list[Buffer], var_vals: dict[Variable, int]):
super().__init__(jit_cache, input_rawbuffers, var_vals)
self.devices = list(set(cast(HCQCompiled, d) for ji in jit_cache for d in [Device[cast(Buffer, x).device] for x in ji.bufs]))
# Replace input buffers with variables.
self.hcq_bufs = [[cast(Buffer, x)._buf for x in ji.bufs] for ji in jit_cache]
self.input_replace_to_var: Dict[Tuple[int, int], Variable] = {}
self.input_replace_to_var: dict[tuple[int, int], Variable] = {}
for (j,i), input_idx in self.input_replace.items():
x = self.input_replace_to_var.setdefault((j,i), UOp.variable(f"input_{input_idx}", 0, 0xffffffffffffffff, dtype=dtypes.uint64))
self.hcq_bufs[j][i] = HCQBuffer(x, self.hcq_bufs[j][i].size, texture_info=self.hcq_bufs[j][i].texture_info) # Create fake buffer with variable
# Allocate kernel args.
kernargs_size: Dict[Compiled, int] = collections.defaultdict(int)
kernargs_size: dict[Compiled, int] = collections.defaultdict(int)
for ji in jit_cache:
if not isinstance(ji.prg, CompiledRunner): continue
kernargs_size[ji.prg.dev] += round_up(ji.prg._prg.kernargs_alloc_size, 16)
self.kernargs_bufs: Dict[Compiled, HCQBuffer] = {dev:dev.allocator._alloc(sz, BufferSpec(cpu_access=True)) for dev,sz in kernargs_size.items()}
self.kernargs_bufs: dict[Compiled, HCQBuffer] = {dev:dev.allocator._alloc(sz, BufferSpec(cpu_access=True)) for dev,sz in kernargs_size.items()}
# Fill initial arguments.
self.ji_args: Dict[int, HCQArgsState] = {}
self.ji_args: dict[int, HCQArgsState] = {}
kargs_alloc: Dict[Compiled, BumpAllocator] = {dev:BumpAllocator(buf.size, start=cast(int, buf.va_addr)) for dev,buf in self.kernargs_bufs.items()}
kargs_alloc: dict[Compiled, BumpAllocator] = {dev:BumpAllocator(buf.size, start=cast(int, buf.va_addr)) for dev,buf in self.kernargs_bufs.items()}
for j,ji in enumerate(jit_cache):
if not isinstance(ji.prg, CompiledRunner): continue
@@ -42,24 +42,24 @@ class HCQGraph(MultiGraphRunner):
# graph-related tasks. This synchronization uses a global timeline signal per device. Within the graph, the compute queue coordinates with
# global operations and sets a kickoff signal. Any queue accessing a buffer from another device waits for this signal from the devices
# compute queue to ensure exclusive access. The compute queue signals the completion of the graph, synchronizing with the device's copy queue.
self.ji_schedule: Dict[int, Tuple[HCQCompiled, HWQueue, List, List, HCQSignal, Optional[int]]] = {}
self.ji_schedule: dict[int, tuple[HCQCompiled, HWQueue, List, List, HCQSignal, Optional[int]]] = {}
self.comp_queues: Dict[HCQCompiled, HWQueue] = {dev: dev.hw_compute_queue_t() for dev in self.devices}
self.copy_queues: Dict[HCQCompiled, HWQueue] = {} # lazy allocation
self.comp_queues: dict[HCQCompiled, HWQueue] = {dev: dev.hw_compute_queue_t() for dev in self.devices}
self.copy_queues: dict[HCQCompiled, HWQueue] = {} # lazy allocation
self.signals: Dict[Any, HCQSignal] = {**{dev: dev.signal_t(value=0) for dev in self.devices}, **{"CPU": self.devices[0].signal_t(value=0)}}
self.signals: dict[Any, HCQSignal] = {**{dev: dev.signal_t(value=0) for dev in self.devices}, **{"CPU": self.devices[0].signal_t(value=0)}}
self.kickoff_value: int = 0
self.kickoff_var = UOp.variable("kickoff_var", 0, 0xffffffff, dtype=dtypes.uint32)
# When profiling allocate 2 signals for each jit item to measure speed. The jth jit item have signals at 2*j and 2*j+1.
# TODO: This logic might allocate a few extra signals...
self.prof_signals: List[HCQSignal] = [self.devices[0].signal_t() for i in range(len(jit_cache) * 2)] if PROFILE else []
self.prog_graph_deps: List[List[int]] = []
self.prof_graph_entries: List[ProfileGraphEntry] = []
self.prof_signals: list[HCQSignal] = [self.devices[0].signal_t() for i in range(len(jit_cache) * 2)] if PROFILE else []
self.prog_graph_deps: list[list[int]] = []
self.prof_graph_entries: list[ProfileGraphEntry] = []
last_j: Dict[HWQueue, Optional[int]] = collections.defaultdict(lambda: None)
queue_access: Dict[HWQueue, Dict[HWQueue, Optional[int]]] = collections.defaultdict(lambda: collections.defaultdict(lambda: None))
dev_access: Dict[HWQueue, Set[HCQCompiled]] = collections.defaultdict(set)
last_j: dict[HWQueue, Optional[int]] = collections.defaultdict(lambda: None)
queue_access: dict[HWQueue, dict[HWQueue, Optional[int]]] = collections.defaultdict(lambda: collections.defaultdict(lambda: None))
dev_access: dict[HWQueue, set[HCQCompiled]] = collections.defaultdict(set)
for dev, queue in self.comp_queues.items(): dev_access[queue].add(dev)
@@ -120,7 +120,7 @@ class HCQGraph(MultiGraphRunner):
self.prof_signal_is_used = [any(ent.st_id == j or ent.en_id == j for ent in self.prof_graph_entries) for j in range(len(self.prof_signals))]
# Build hardware queues.
self.copy_to_devs: Dict[HCQCompiled, Set[HCQCompiled]] = {dev: set() for dev in self.devices}
self.copy_to_devs: dict[HCQCompiled, set[HCQCompiled]] = {dev: set() for dev in self.devices}
# Create variable timeline signals for each device.
timeline_sigaddrs = {dev: UOp.variable(f"timeline_sig_{dev.device_id}", 0, 0xffffffffffffffff, dtype=dtypes.uint64) for dev in self.devices}
@@ -161,10 +161,10 @@ class HCQGraph(MultiGraphRunner):
self.comp_queues[dev].signal(self.virt_timeline_signals[dev], self.virt_timeline_vals[dev] + 1).bind(dev)
if dev in self.copy_queues: self.copy_queues[dev].bind(dev)
self.last_timeline: Dict[HCQCompiled, Tuple[HCQSignal, int]] = {dev: (dev.timeline_signal, 0) for dev in self.devices}
self.last_timeline: dict[HCQCompiled, tuple[HCQSignal, int]] = {dev: (dev.timeline_signal, 0) for dev in self.devices}
self.queue_signals_to_reset = [self.signals[q] for q in list(self.comp_queues.values()) + list(self.copy_queues.values()) if q in self.signals]
def __call__(self, input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int], wait=False) -> Optional[float]:
def __call__(self, input_rawbuffers: list[Buffer], var_vals: dict[Variable, int], wait=False) -> Optional[float]:
# Wait and restore signals
self.kickoff_value += 1
for dev in self.devices: self.last_timeline[dev][0].wait(self.last_timeline[dev][1])

View File

@@ -1,4 +1,4 @@
from typing import List, Any, Dict, cast, Optional
from typing import Any, cast, Optional
import ctypes
from tinygrad.dtype import dtypes
from tinygrad.helpers import dedup, getenv
@@ -17,7 +17,7 @@ class MTLResourceUsage:
MTLResourceUsageWrite = 0b10
class MetalGraph(GraphRunner):
def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]):
def __init__(self, jit_cache: list[ExecItem], input_rawbuffers: list[Buffer], var_vals: dict[Variable, int]):
super().__init__(jit_cache, input_rawbuffers, var_vals)
if not all(isinstance(ji.prg, CompiledRunner) for ji in jit_cache): raise GraphException
@@ -58,7 +58,7 @@ class MetalGraph(GraphRunner):
if len(self.vars): self.int_buf_view = self.dev.allocator._as_buffer(self.int_buf).cast('i')
self.range = to_struct(0, len(jit_cache))
def __call__(self, input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int], wait=False) -> Optional[float]:
def __call__(self, input_rawbuffers: list[Buffer], var_vals: dict[Variable, int], wait=False) -> Optional[float]:
if self.command_buffer is not None and self.command_buffer in self.dev.mtl_buffers_in_flight: wait_check(self.command_buffer)
all_resources = dedup(self.all_resources + [x._buf.buf for x in input_rawbuffers])

View File

@@ -1,5 +1,5 @@
from __future__ import annotations
from typing import Tuple, List, Any, Optional, cast
from typing import Any, Optional, cast
import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, array, contextlib, sys
assert sys.platform != 'win32'
from dataclasses import dataclass
@@ -79,7 +79,7 @@ class AMDComputeQueue(HWQueue):
self.acquire_mem()
return self
def exec(self, prg:AMDProgram, args_state:CLikeArgsState, global_size:Tuple[sint, ...], local_size:Tuple[sint, ...]):
def exec(self, prg:AMDProgram, args_state:CLikeArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
self.bind_args_state(args_state)
self.acquire_mem(gli=0, gl2=0)
@@ -288,8 +288,8 @@ class AMDDevice(HCQCompiled):
kfd:int = -1
event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
signals_page:Any = None
signals_pool:List[int] = []
gpus:List[pathlib.Path] = []
signals_pool:list[int] = []
gpus:list[pathlib.Path] = []
def _gpu_map(self, mem:HCQBuffer):
if self.gpu_id in getattr(mem.meta, "mapped_gpu_ids", []): return

View File

@@ -1,11 +1,11 @@
from typing import Optional, List
from typing import Optional
import ctypes, subprocess, pathlib, tempfile
from tinygrad.device import Compiled, Compiler, MallocAllocator
from tinygrad.helpers import cpu_time_execution, cpu_objdump
from tinygrad.renderer.cstyle import ClangRenderer
class ClangCompiler(Compiler):
def __init__(self, cachekey="compile_clang", args:Optional[List[str]]=None, objdump_tool='objdump'):
def __init__(self, cachekey="compile_clang", args:Optional[list[str]]=None, objdump_tool='objdump'):
self.args = ['-march=native'] if args is None else args
self.objdump_tool = objdump_tool
super().__init__(cachekey)

View File

@@ -5,7 +5,7 @@
# it should be a secure (example: no use of pickle) boundary. HTTP is used for RPC
from __future__ import annotations
from typing import Tuple, Optional, Dict, Any, DefaultDict, List
from typing import Optional, Any, DefaultDict
from collections import defaultdict
from dataclasses import dataclass, field
import multiprocessing, functools, http.client, hashlib, json, time, os, binascii, struct, ast, contextlib
@@ -39,8 +39,8 @@ class ProgramFree(CloudRequest): name: str; datahash: str # noqa: E702
@dataclass(frozen=True)
class ProgramExec(CloudRequest):
name: str; datahash: str; bufs: Tuple[int, ...]; vals: Tuple[int, ...] # noqa: E702
global_size: Optional[Tuple[int, ...]]; local_size: Optional[Tuple[int, ...]]; wait: bool # noqa: E702
name: str; datahash: str; bufs: tuple[int, ...]; vals: tuple[int, ...] # noqa: E702
global_size: Optional[tuple[int, ...]]; local_size: Optional[tuple[int, ...]]; wait: bool # noqa: E702
# for safe deserialization
whitelist = {x.__name__:x for x in [BufferAlloc, BufferFree, CopyIn, CopyOut, ProgramAlloc, ProgramFree, ProgramExec, BufferSpec]}
@@ -51,8 +51,8 @@ def safe_eval(node): return eval_fxns[node.__class__](node)
class BatchRequest:
def __init__(self):
self._q: List[CloudRequest] = []
self._h: Dict[str, bytes] = {}
self._q: list[CloudRequest] = []
self._h: dict[str, bytes] = {}
def h(self, d:bytes) -> str:
binhash = hashlib.sha256(d).digest()
self._h[datahash:=binascii.hexlify(binhash).decode()] = binhash+struct.pack("<Q", len(d))+d
@@ -74,9 +74,9 @@ class BatchRequest:
@dataclass
class CloudSession:
programs: Dict[Tuple[str, str], Any] = field(default_factory=dict)
programs: dict[tuple[str, str], Any] = field(default_factory=dict)
# TODO: the buffer should track this internally
buffers: Dict[int, Tuple[Any, int, Optional[BufferSpec]]] = field(default_factory=dict)
buffers: dict[int, tuple[Any, int, Optional[BufferSpec]]] = field(default_factory=dict)
class CloudHandler(BaseHTTPRequestHandler):
protocol_version = 'HTTP/1.1'
@@ -164,7 +164,7 @@ class CloudProgram:
super().__init__()
def __del__(self): self.dev.req.q(ProgramFree(self.name, self.datahash))
def __call__(self, *bufs, global_size=None, local_size=None, vals:Tuple[int, ...]=(), wait=False):
def __call__(self, *bufs, global_size=None, local_size=None, vals:tuple[int, ...]=(), wait=False):
self.dev.req.q(ProgramExec(self.name, self.datahash, bufs, vals, global_size, local_size, wait))
if wait: return float(self.dev.batch_submit())

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
import ctypes, ctypes.util, functools
from typing import Tuple, Optional, List
from typing import Optional
from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, init_c_struct_t
from tinygrad.device import Compiled, BufferSpec, LRUAllocator
from tinygrad.renderer.cstyle import CUDARenderer
@@ -12,7 +12,7 @@ if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint:
def check(status):
if status != 0: raise RuntimeError(f"CUDA Error {status}, {ctypes.string_at(init_c_var(ctypes.POINTER(ctypes.c_char)(), lambda x: cuda.cuGetErrorString(status, ctypes.byref(x)))).decode()}") # noqa: E501
def encode_args(args, vals) -> Tuple[ctypes.Structure, ctypes.Array]:
def encode_args(args, vals) -> tuple[ctypes.Structure, ctypes.Array]:
c_args = init_c_struct_t(tuple([(f'f{i}', cuda.CUdeviceptr_v2) for i in range(len(args))] +
[(f'v{i}', ctypes.c_int) for i in range(len(vals))]))(*args, *vals)
vargs = (ctypes.c_void_p * 5)(ctypes.c_void_p(1), ctypes.cast(ctypes.byref(c_args), ctypes.c_void_p), ctypes.c_void_p(2),
@@ -50,7 +50,7 @@ class CUDAProgram:
def __del__(self):
if hasattr(self, 'module'): check(cuda.cuModuleUnload(self.module))
def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
def __call__(self, *args, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
check(cuda.cuCtxSetCurrent(self.dev.context))
if not hasattr(self, "vargs"):
self.c_args, self.vargs = encode_args(args, vals)
@@ -90,7 +90,7 @@ class CUDAAllocator(LRUAllocator):
def _offset(self, buf, size:int, offset:int): return cuda.CUdeviceptr_v2(buf.value + offset)
class CUDADevice(Compiled):
devices: List[CUDADevice] = []
devices: list[CUDADevice] = []
peer_access = False
def __init__(self, device:str):
@@ -110,7 +110,7 @@ class CUDADevice(Compiled):
CUDADevice.peer_access = True
self.arch = f"sm_{major.value}{minor.value}"
self.pending_copyin: List[Tuple[int, int, Optional[BufferSpec]]] = []
self.pending_copyin: list[tuple[int, int, Optional[BufferSpec]]] = []
CUDADevice.devices.append(self)
from tinygrad.runtime.graph.cuda import CUDAGraph

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
import os, sys, mmap, io, ctypes, ctypes.util, contextlib
from typing import Optional, Generator, Tuple, Callable, List
from typing import Optional, Generator, Callable
from tinygrad.helpers import OSX, round_up
from tinygrad.device import Compiled, Allocator
with contextlib.suppress(ImportError):
@@ -33,12 +33,12 @@ class DiskAllocator(Allocator):
else:
dest[:] = src._buf()
def _copyout_sharded(self, src:DiskBuffer, size:int, _get_free_buf:Callable, seg_len:int) -> Generator[Tuple[int, int, int, int], None, None]:
def _copyout_sharded(self, src:DiskBuffer, size:int, _get_free_buf:Callable, seg_len:int) -> Generator[tuple[int, int, int, int], None, None]:
assert hasattr(DiskDevice, 'io_uring'), "function requires io uring support"
fd_offset = src.offset - (minor_offset := src.offset % mmap.PAGESIZE)
processed_reqs_cnt, copied_in, next_read_offset, total_copy_size = 0, 0, 0, round_up(size + minor_offset, mmap.PAGESIZE)
reqs: List[Tuple[int, int, int, int]] = []
reqs: list[tuple[int, int, int, int]] = []
while next_read_offset < total_copy_size or len(reqs) != processed_reqs_cnt:
if next_read_offset < total_copy_size and (copy_batch := _get_free_buf()) is not None:

View File

@@ -1,5 +1,5 @@
from __future__ import annotations
from typing import Tuple, Optional, List, cast
from typing import Optional, cast
import ctypes, functools, hashlib, contextlib
from tinygrad.runtime.autogen import opencl as cl
from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG, getenv, mv_address
@@ -44,10 +44,10 @@ class CLProgram:
with contextlib.suppress(TypeError, AttributeError): check(cl.clReleaseKernel(self.kernel))
with contextlib.suppress(TypeError, AttributeError): check(cl.clReleaseProgram(self.program))
def __call__(self, *bufs:Tuple[ctypes._CData, BufferSpec], global_size:Tuple[int,int,int]=(1,1,1), local_size:Optional[Tuple[int,int,int]]=None, vals:Tuple[int, ...]=(), wait=False) -> Optional[float]: # noqa: E501
def __call__(self, *bufs:tuple[ctypes._CData, BufferSpec], global_size:tuple[int,int,int]=(1,1,1), local_size:Optional[tuple[int,int,int]]=None, vals:tuple[int, ...]=(), wait=False) -> Optional[float]: # noqa: E501
for i,(b,_) in enumerate(bufs): cl.clSetKernelArg(self.kernel, i, ctypes.sizeof(b), ctypes.byref(b))
for i,v in enumerate(vals,start=len(bufs)): cl.clSetKernelArg(self.kernel, i, 4, ctypes.byref(ctypes.c_int32(v)))
if local_size is not None: global_size = cast(Tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size)))
if local_size is not None: global_size = cast(tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size)))
event = cl.cl_event() if wait else None
check(cl.clEnqueueNDRangeKernel(self.dev.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size), (ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event)) # noqa: E501
if wait:
@@ -62,14 +62,14 @@ class CLAllocator(LRUAllocator):
def __init__(self, dev:CLDevice):
self.dev = dev
super().__init__()
def _alloc(self, size:int, options:BufferSpec) -> Tuple[ctypes._CData, BufferSpec]:
def _alloc(self, size:int, options:BufferSpec) -> tuple[ctypes._CData, BufferSpec]:
if options.image is not None:
return (checked(cl.clCreateImage2D(self.dev.context, cl.CL_MEM_READ_WRITE,
cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
options.image.shape[1], options.image.shape[0], 0, None, status := ctypes.c_int32()), status), options)
return (checked(cl.clCreateBuffer(self.dev.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status), options)
def _free(self, opaque:Tuple[ctypes._CData, BufferSpec], options:BufferSpec): check(cl.clReleaseMemObject(opaque[0]))
def _copyin(self, dest:Tuple[ctypes._CData, BufferSpec], src:memoryview):
def _free(self, opaque:tuple[ctypes._CData, BufferSpec], options:BufferSpec): check(cl.clReleaseMemObject(opaque[0]))
def _copyin(self, dest:tuple[ctypes._CData, BufferSpec], src:memoryview):
if dest[1].image is not None:
check(cl.clEnqueueWriteImage(self.dev.queue, dest[0], False, (ctypes.c_size_t * 3)(0,0,0),
(ctypes.c_size_t * 3)(dest[1].image.shape[1],dest[1].image.shape[0],1), 0, 0, from_mv(src), 0, None, None))
@@ -77,7 +77,7 @@ class CLAllocator(LRUAllocator):
if mv_address(src) % 16: src = memoryview(bytearray(src))
check(cl.clEnqueueWriteBuffer(self.dev.queue, dest[0], False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
self.dev.pending_copyin.append(src) # NOTE: these can't be freed until the GPU actually executes this command
def _copyout(self, dest:memoryview, src:Tuple[ctypes._CData, BufferSpec]):
def _copyout(self, dest:memoryview, src:tuple[ctypes._CData, BufferSpec]):
if src[1].image is not None:
check(cl.clEnqueueReadImage(self.dev.queue, src[0], False, (ctypes.c_size_t * 3)(0,0,0),
(ctypes.c_size_t * 3)(src[1].image.shape[1],src[1].image.shape[0],1), 0, 0, from_mv(dest), 0, None, None))
@@ -103,7 +103,7 @@ class CLDevice(Compiled):
if DEBUG >= 1: print(f"CLDevice: opening {self.device_name} with version {self.driver_version}")
self.context = checked(cl.clCreateContext(None, 1, self.device_id, cl.clCreateContext.argtypes[3](), None, status := ctypes.c_int32()), status)
self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, status), status)
self.pending_copyin: List[memoryview] = []
self.pending_copyin: list[memoryview] = []
self.device_exts = (cl.clGetDeviceInfo(self.device_id, cl.CL_DEVICE_EXTENSIONS, 4096, ctypes.byref(buf := ctypes.create_string_buffer(4096)), ctypes.byref(total := ctypes.c_size_t())), ctypes.string_at(buf, size=total.value).decode())[1] # noqa: E501
compile_key = hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()

View File

@@ -1,6 +1,5 @@
from __future__ import annotations
import ctypes, functools
from typing import Tuple
from tinygrad.helpers import init_c_var, from_mv, init_c_struct_t, getenv
from tinygrad.device import Compiled, LRUAllocator, BufferSpec
from tinygrad.runtime.autogen import hip
@@ -21,7 +20,7 @@ class HIPProgram:
def __del__(self):
if hasattr(self, 'module'): check(hip.hipModuleUnload(self.module))
def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
def __call__(self, *args, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
check(hip.hipSetDevice(self.dev.device_id))
if not hasattr(self, "vargs"):
self.c_args = init_c_struct_t(tuple([(f'f{i}', hip.hipDeviceptr_t) for i in range(len(args))] +

View File

@@ -1,6 +1,5 @@
from __future__ import annotations
import ctypes, functools
from typing import Tuple
from tinygrad.device import Compiled, Compiler, MallocAllocator
from tinygrad.helpers import cpu_time_execution, getenv, cpu_objdump
from tinygrad.renderer.llvmir import LLVMRenderer
@@ -32,7 +31,7 @@ class LLVMProgram:
self.fxn = dev.engine.get_function_address(name)
assert self.fxn != 0, "LLVM failed to get function address"
def __call__(self, *bufs, vals:Tuple[int, ...]=(), wait=False):
def __call__(self, *bufs, vals:tuple[int, ...]=(), wait=False):
if not hasattr(self, 'cfunc'):
self.cfunc = ctypes.CFUNCTYPE(ctypes.c_int, *([ctypes.c_void_p]*len(bufs)), *([ctypes.c_int32]*len(vals)))(self.fxn)
return cpu_time_execution(lambda: self.cfunc(*bufs, *vals), enable=wait)

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
import os, pathlib, struct, ctypes, tempfile, functools, decimal
from typing import List, Any, Union, Tuple, cast
from typing import Any, Union, cast
from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, T, init_c_struct_t, PROFILE
from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, cpu_profile, ProfileDeviceEvent, ProfileRangeEvent
from tinygrad.renderer.cstyle import MetalRenderer
@@ -125,7 +125,7 @@ class MetalProgram:
descriptor, MTLPipelineOption.MTLPipelineOptionNone, None, ctypes.byref(error_pipeline_creation:=objc_instance()), restype=objc_instance)
error_check(error_pipeline_creation)
def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
max_total_threads = msg(self.pipeline_state, "maxTotalThreadsPerThreadgroup", restype=ctypes.c_ulong)
if prod(local_size) > cast(int, max_total_threads):
exec_width = msg(self.pipeline_state, "threadExecutionWidth", restype=ctypes.c_ulong)
@@ -189,7 +189,7 @@ class MetalDevice(Compiled):
self.sysdevice = libmetal.MTLCreateSystemDefaultDevice()
self.mtl_queue = msg(self.sysdevice, "newCommandQueueWithMaxCommandBufferCount:", 1024, restype=objc_instance)
if self.mtl_queue is None: raise RuntimeError("Cannot allocate a new command queue")
self.mtl_buffers_in_flight: List[Any] = []
self.mtl_buffers_in_flight: list[Any] = []
self.timeline_signal = msg(self.sysdevice, "newSharedEvent", restype=objc_instance)
self.timeline_value = 0

View File

@@ -1,7 +1,7 @@
from __future__ import annotations
import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, sys
assert sys.platform != 'win32'
from typing import Tuple, List, Any, cast, Union, Dict, Type, Optional
from typing import List, Any, cast, Union, Type, Optional
from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
from tinygrad.ops import sint
@@ -58,7 +58,7 @@ def make_uvm_type():
uvm = make_uvm_type()
def make_qmd_struct_type():
fields: List[Tuple[str, Union[Type[ctypes.c_uint64], Type[ctypes.c_uint32]], Any]] = []
fields: list[tuple[str, Union[Type[ctypes.c_uint64], Type[ctypes.c_uint32]], Any]] = []
bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
bits = sorted(bits, key=lambda x: x[1][1])
@@ -131,7 +131,7 @@ class NVComputeQueue(NVCommandQueue):
self.active_qmd = None
return self
def exec(self, prg:NVProgram, args_state:NVArgsState, global_size:Tuple[sint, ...], local_size:Tuple[sint, ...]):
def exec(self, prg:NVProgram, args_state:NVArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
self.bind_args_state(args_state)
ctypes.memmove(qmd_addr:=(args_state.ptr + round_up(prg.constbufs[0][1], 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
@@ -187,7 +187,7 @@ class NVCopyQueue(NVCommandQueue):
def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.dma_gpfifo)
class NVArgsState(CLikeArgsState):
def __init__(self, ptr:int, prg:NVProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
def __init__(self, ptr:int, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
if MOCKGPU: prg.constbuffer_0[0:2] = [len(bufs), len(vals)]
super().__init__(ptr, prg, bufs, vals=vals, prefix=prg.constbuffer_0)
@@ -202,7 +202,7 @@ class NVProgram(HCQProgram):
self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferSpec(cpu_access=True))
self.prog_addr, self.prog_sz, self.regs_usage, self.shmem_usage, self.lcmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0x400, 0
self.constbufs: Dict[int, Tuple[int, int]] = {0: (0, 0x160)} # Dict[constbuf index, Tuple[va_addr, size]]
self.constbufs: dict[int, tuple[int, int]] = {0: (0, 0x160)} # dict[constbuf index, tuple[va_addr, size]]
for sh in sections:
if sh.name == f".nv.shared.{self.name}": self.shmem_usage = round_up(0x400 + sh.header.sh_size, 128)
if sh.name == f".text.{self.name}":
@@ -254,7 +254,7 @@ class NVProgram(HCQProgram):
def __del__(self):
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True))
def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.dev).slm_per_thread:
raise RuntimeError("Too many resources requested for launch")
if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
@@ -287,7 +287,7 @@ class NVDevice(HCQCompiled[NVSignal]):
fd_uvm: int = -1
gpus_info: Union[List, ctypes.Array] = []
signals_page: Any = None
signals_pool: List[int] = []
signals_pool: list[int] = []
# TODO: Need a proper allocator for va addresses
# 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
@@ -412,7 +412,7 @@ class NVDevice(HCQCompiled[NVSignal]):
self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")
self._setup_nvclasses()
self._debug_mappings: Dict[Tuple[int, int], str] = dict()
self._debug_mappings: dict[tuple[int, int], str] = dict()
rmctrl.perf_boost(self.fd_ctl, self.root, self.subdevice, duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
(nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))
@@ -427,7 +427,7 @@ class NVDevice(HCQCompiled[NVSignal]):
uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=self.gpu_uuid)
uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root, hVaSpace=vaspace)
for dev in cast(List[NVDevice], self.devices):
for dev in cast(list[NVDevice], self.devices):
try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid)
except RuntimeError as e: raise RuntimeError(str(e) + f". Make sure GPUs #{self.gpu_minor} & #{dev.gpu_minor} have P2P enabled between.") from e
@@ -541,7 +541,7 @@ class NVDevice(HCQCompiled[NVSignal]):
if sm_errors.mmuFault.valid:
mmu_info = rmctrl.debug_read_mmu_fault_info(self.fd_ctl, self.root, self.debugger)
for i in range(mmu_info.count):
pfinfo = mmu_info.mmuFaultInfoList[i]
pfinfo = mmu_info.mmuFaultInfoList[i]
report += [f"MMU fault: 0x{pfinfo.faultAddress:X} | {NV_PFAULT_FAULT_TYPE[pfinfo.faultType]} | {NV_PFAULT_ACCESS_TYPE[pfinfo.accessType]}"]
if DEBUG >= 5:
report += ["GPU mappings:\n"+"\n".join(f"\t0x{x:X} - 0x{x+y-1:X} | {self._debug_mappings[(x,y)]}" for x,y in sorted(self._debug_mappings))]

View File

@@ -3,7 +3,7 @@
# works to test the tensor cores, and all the uops in general
# this is the (living) definition of uops
import sys
from typing import Tuple, List, Optional, Any, Dict, TYPE_CHECKING
from typing import Optional, Any, TYPE_CHECKING
import pickle, base64, itertools, time, struct
from tinygrad.dtype import DType, dtypes, ImageDType, PtrDType, truncate
from tinygrad.helpers import all_same, getenv, flatten, get_single_element
@@ -27,18 +27,18 @@ def _store(m, i, v):
class PythonProgram:
def __init__(self, name:str, lib:bytes):
self.uops: List[Tuple[Ops, Optional[DType], List[int], Any]] = pickle.loads(lib)
def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
self.uops: list[tuple[Ops, Optional[DType], list[int], Any]] = pickle.loads(lib)
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
st = time.perf_counter()
warp = list(itertools.product(*[range(x) for x in local_size[::-1]]))
warp_size = len(warp)
for idxs in itertools.product(*[range(x) for x in global_size[::-1]]):
ul: Dict[int, Any] = {}
dl: Dict[int, DType] = {}
pbufs: List[memoryview] = list(bufs)
pvals: List[int] = list(vals)
ul: dict[int, Any] = {}
dl: dict[int, DType] = {}
pbufs: list[memoryview] = list(bufs)
pvals: list[int] = list(vals)
i = 0
loop_ends: Dict[int, int] = {}
loop_ends: dict[int, int] = {}
while i < len(self.uops):
uop, dtype, idp, arg = self.uops[i]
void_ops = {Ops.STORE, Ops.ENDRANGE, Ops.BARRIER, Ops.IF, Ops.ENDIF}
@@ -184,7 +184,7 @@ class PythonRenderer(Renderer):
if getenv("EMULATE_INTEL"): self.device, self.suffix, self.tensor_cores = "INTEL", "INTEL", IntelRenderer.tensor_cores
if getenv("EMULATE_AMX"): self.device, self.tensor_cores = "CLANG", ClangRenderer.tensor_cores
def render(self, name:str, uops:List[UOp]) -> str:
def render(self, name:str, uops:list[UOp]) -> str:
lops = [(u.op, u.dtype, [uops.index(v) for v in u.src], u.arg) for u in uops]
return base64.b64encode(pickle.dumps(lops)).decode()

View File

@@ -2,7 +2,7 @@ from __future__ import annotations
import os, ctypes, functools, mmap, struct, array, math, sys
assert sys.platform != 'win32'
from types import SimpleNamespace
from typing import Tuple, List, Any, cast, Optional
from typing import Any, cast, Optional
from tinygrad.device import BufferSpec
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
from tinygrad.runtime.autogen import kgsl, adreno, libc
@@ -170,7 +170,7 @@ class QCOMComputeQueue(HWQueue):
return self
class QCOMArgsState(HCQArgsState):
def __init__(self, ptr:int, prg:QCOMProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
def __init__(self, ptr:int, prg:QCOMProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
super().__init__(ptr, prg, bufs, vals=vals)
if len(bufs) + len(vals) != len(prg.buf_info): raise RuntimeError(f'incorrect args size given={len(bufs)+len(vals)} != want={len(prg.buf_info)}')
@@ -208,7 +208,7 @@ class QCOMProgram(HCQProgram):
kernargs_alloc_size = round_up(2048 + (self.tex_cnt + self.ibo_cnt) * 0x40 + self.samp_cnt * 0x10, 0x100)
super().__init__(QCOMArgsState, self.dev, self.name, kernargs_alloc_size=kernargs_alloc_size)
def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
if self.max_threads < prod(local_size): raise RuntimeError("Too many resources requested for launch")
if any(g*l>mx for g,l,mx in zip(global_size, local_size, [65536, 65536, 65536])) and any(l>mx for l,mx in zip(local_size, [1024, 1024, 1024])):
raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
@@ -268,7 +268,7 @@ class QCOMProgram(HCQProgram):
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, options=BufferSpec(cpu_access=True, nolru=True))
class QCOMTextureInfo:
def __init__(self, pitch:int, real_stride:int, desc:List[int], ibo:List[int]):
def __init__(self, pitch:int, real_stride:int, desc:list[int], ibo:list[int]):
self.pitch, self.real_stride, self.desc, self.ibo = pitch, real_stride, desc, ibo
class QCOMAllocator(HCQAllocatorBase):
@@ -320,7 +320,7 @@ class QCOMAllocator(HCQAllocatorBase):
class QCOMDevice(HCQCompiled):
signals_page: Any = None
signals_pool: List[int] = []
signals_pool: list[int] = []
gpu_id: int = 0
dummy_addr: int = 0

View File

@@ -1,11 +1,11 @@
from typing import Tuple, List, Any
from typing import Any
from dataclasses import dataclass
import tinygrad.runtime.autogen.libc as libc
@dataclass(frozen=True)
class ElfSection: name:str; header:libc.Elf64_Shdr; content:bytes # noqa: E702
def elf_loader(blob:bytes, force_section_align:int=1) -> Tuple[memoryview, List[ElfSection], Any]:
def elf_loader(blob:bytes, force_section_align:int=1) -> tuple[memoryview, list[ElfSection], Any]:
def _strtab(blob: bytes, idx: int) -> str: return blob[idx:blob.find(b'\x00', idx)].decode('utf-8')
header = libc.Elf64_Ehdr.from_buffer_copy(blob)

View File

@@ -1,5 +1,5 @@
from __future__ import annotations
from typing import List, Optional, Dict, Tuple, cast, Type, TypeVar, Generic, Any
from typing import Optional, Dict, cast, Type, TypeVar, Generic, Any
import contextlib, decimal, statistics, time, ctypes, array
from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up
from tinygrad.renderer import Renderer
@@ -31,10 +31,10 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
def __init__(self):
self._q:Any = []
self.binded_device:Optional[DeviceType] = None
self.q_sints:List[Tuple[int, int]] = []
self.mv_sints:List[Tuple[memoryview, int, int, Optional[int]]] = []
self.syms:List[sint] = []
self._prev_resolved_syms:List[Optional[int]] = []
self.q_sints:list[tuple[int, int]] = []
self.mv_sints:list[tuple[memoryview, int, int, Optional[int]]] = []
self.syms:list[sint] = []
self._prev_resolved_syms:list[Optional[int]] = []
def _new_sym(self, sym:sint) -> int:
if sym not in self.syms:
@@ -91,7 +91,7 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
Enqueues a memory barrier command to ensure memory coherence between agents. Only on compute queues.
"""
def exec(self, prg:ProgramType, args_state:ArgsStateType, global_size:Tuple[sint, ...], local_size:Tuple[sint, ...]):
def exec(self, prg:ProgramType, args_state:ArgsStateType, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
"""
Enqueues an execution command for a kernel program. Only on compute queues.
@@ -142,7 +142,7 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
if isinstance(val, int): mv[i] = val if mask is None else ((mv[i] & ~mask) | val)
else: self.mv_sints.append((mv, i, self._new_sym(val), mask))
def _apply_var_vals(self, var_vals:Dict[Variable, int]):
def _apply_var_vals(self, var_vals:dict[Variable, int]):
resolved_syms = [sym_infer(sym, var_vals) for sym in self.syms]
for off, sym_idx in self.q_sints:
@@ -153,9 +153,9 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
if self._prev_resolved_syms[sym_idx] == resolved_syms[sym_idx]: continue
mv[off] = resolved_syms[sym_idx] if mask is None else ((mv[off] & ~mask) | resolved_syms[sym_idx])
self._prev_resolved_syms = cast(List[Optional[int]], resolved_syms)
self._prev_resolved_syms = cast(list[Optional[int]], resolved_syms)
def submit(self, dev:DeviceType, var_vals:Optional[Dict[Variable, int]]=None):
def submit(self, dev:DeviceType, var_vals:Optional[dict[Variable, int]]=None):
"""
Submits the command queue to a specific device for execution.
@@ -235,14 +235,14 @@ def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Optional[Type[HWQueue
if enabled and PROFILE: dev.sig_prof_records.append((cast(HCQSignal, st), cast(HCQSignal, en), desc, queue_type is dev.hw_copy_queue_t))
class HCQArgsState(Generic[ProgramType]):
def __init__(self, ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[sint, ...]=()):
def __init__(self, ptr:int, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint, ...]=()):
self.ptr, self.prg = ptr, prg
self.bind_data:List[Tuple[Tuple[sint, ...], int, str]] = []
self.bind_data:list[tuple[tuple[sint, ...], int, str]] = []
def bind_sints_to_ptr(self, *vals:sint, ptr:int, fmt): self.bind_data.append((vals, ptr, fmt))
class CLikeArgsState(HCQArgsState[ProgramType]):
def __init__(self, ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[sint, ...]=(), prefix:Optional[List[int]]=None):
def __init__(self, ptr:int, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint, ...]=(), prefix:Optional[list[int]]=None):
super().__init__(ptr, prg, bufs, vals=vals)
if prefix is not None: to_mv(self.ptr, len(prefix) * 4).cast('I')[:] = array.array('I', prefix)
@@ -254,7 +254,7 @@ class HCQProgram(Generic[DeviceType]):
def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int):
self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size
def fill_kernargs(self, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=(), kernargs_ptr:Optional[int]=None) -> HCQArgsState:
def fill_kernargs(self, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=(), kernargs_ptr:Optional[int]=None) -> HCQArgsState:
"""
Fills arguments for the kernel, optionally allocating space from the device if `kernargs_ptr` is not provided.
Args:
@@ -266,8 +266,8 @@ class HCQProgram(Generic[DeviceType]):
"""
return self.args_state_t(kernargs_ptr or self.dev.kernargs_alloctor.alloc(self.kernargs_alloc_size), self, bufs, vals=vals)
def __call__(self, *bufs:HCQBuffer, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1),
vals:Tuple[int, ...]=(), wait:bool=False) -> Optional[float]:
def __call__(self, *bufs:HCQBuffer, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1),
vals:tuple[int, ...]=(), wait:bool=False) -> Optional[float]:
"""
Enqueues the program for execution with the given arguments and dimensions.
@@ -298,7 +298,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
"""
A base class for devices compatible with the HCQ (Hardware Command Queue) API.
"""
devices: List[HCQCompiled] = []
devices: list[HCQCompiled] = []
def __init__(self, device:str, allocator:HCQAllocatorBase, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[SignalType],
comp_queue_t:Type[HWQueue], copy_queue_t:Optional[Type[HWQueue]]):
@@ -307,9 +307,9 @@ class HCQCompiled(Compiled, Generic[SignalType]):
self.timeline_value:int = 1
self.timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self)
self._shadow_timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self)
self.sig_prof_records:List[Tuple[HCQSignal, HCQSignal, str, bool]] = []
self.raw_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, str, bool, Optional[Dict]]] = []
self.dep_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, HCQCompiled, bool, decimal.Decimal, decimal.Decimal, HCQCompiled, bool]] = []
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = []
self.raw_prof_records:list[tuple[decimal.Decimal, decimal.Decimal, str, bool, Optional[dict]]] = []
self.dep_prof_records:list[tuple[decimal.Decimal, decimal.Decimal, HCQCompiled, bool, decimal.Decimal, decimal.Decimal, HCQCompiled, bool]] = []
from tinygrad.runtime.graph.hcq import HCQGraph
super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)