diff --git a/test/external/external_test_speed_llama.py b/test/external/external_test_speed_llama.py
index d21d2f82ae..1ae0023ac5 100644
--- a/test/external/external_test_speed_llama.py
+++ b/test/external/external_test_speed_llama.py
@@ -9,7 +9,7 @@ from tinygrad.helpers import Profiling
 
 class FakeProgram:
   def __init__(self, name:str, prg:bytes): pass
-  def __call__(self, *bufs, global_size, local_size, wait=False): pass
+  def __call__(self, *bufs, global_size, local_size, vals=(), wait=False): pass
 
 class FakeAllocator(Allocator):
   def _alloc(self, sz): return None
diff --git a/tinygrad/device.py b/tinygrad/device.py
index a779143b1f..8494eb28d4 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -94,7 +94,7 @@ class Buffer:
   def fromCPU(device:str, x:np.ndarray): return Buffer(device, x.size, dtypes.from_np(x.dtype)).copyin(x.data)
   def toCPU(self) -> np.ndarray:
     # zero copy with as_buffer
-    if hasattr(self.allocator, 'as_buffer'): return np.frombuffer(self.allocator.as_buffer(self._buf), dtype=np.dtype(self.dtype.np, metadata={"backing": self._buf}))
+    if hasattr(self.allocator, 'as_buffer'): return np.frombuffer(self.allocator.as_buffer(self._buf), dtype=np.dtype(self.dtype.np, metadata={"backing": self._buf})) # type: ignore
     ret = np.empty(self.size, self.dtype.np)
     if self.size > 0: self.allocator.copyout(flat_mv(ret.data), self._buf)
     return ret
@@ -265,7 +265,7 @@ class CompiledASTRunner(JITRunner):
     lra = self.runtime_args.copy()
     if global_size: lra['global_size'] = global_size
     if local_size and 'local_size' not in lra: lra['local_size'] = local_size
-    et = self.clprg(*[x._buf for x in rawbufs], *[var_vals[k] for k in self.vars], **lra, wait=wait or DEBUG>=2)
+    et = self.clprg(*[x._buf for x in rawbufs], **lra, vals=tuple(var_vals[k] for k in self.vars), wait=wait or DEBUG>=2)
     update_stats(self.display_name, self.op_estimate, self.mem_estimate, var_vals, et, len(rawbufs), jit, lra=lra)
     return et
 
diff --git a/tinygrad/features/graph/cuda.py b/tinygrad/features/graph/cuda.py
index 989866d8f1..9832a41a3b 100644
--- a/tinygrad/features/graph/cuda.py
+++ b/tinygrad/features/graph/cuda.py
@@ -27,7 +27,7 @@ class CUDAGraph:
       prg: CompiledASTRunner = cast(CompiledASTRunner, ji.prg)
 
       c_deps = (type(graph_node)*1)(*(graph_node,)) if graph_node is not None else None
-      c_kernel_input_config, c_input_params = encode_args_cuda_style([cast(Buffer, x)._buf for x in ji.rawbufs] + [var_vals[x] for x in prg.vars], *self.encode_args_info())
+      c_kernel_input_config, c_input_params = encode_args_cuda_style([cast(Buffer, x)._buf for x in ji.rawbufs], [var_vals[x] for x in prg.vars], *self.encode_args_info())
       c_node_params = self.build_kernel_node_params(prg, *cast(Tuple[List[int], List[int]], prg.launch_dims(var_vals)), c_kernel_input_config)
       graph_node = self.graph_add_kernel_node(self.graph, c_deps, c_node_params)
 
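Aside: the device.py hunk above is the call-site half of this refactor. Symbolic variable values no longer ride along as extra positional arguments after the buffers; they travel in a dedicated vals= keyword tuple. A minimal runnable sketch of the before/after calling convention (toy_prg and the integer buffer handles are hypothetical stand-ins for a real Program and real device pointers):

def toy_prg(*bufs, global_size, local_size, vals=(), wait=False):
  # a backend now sees device pointers in bufs and plain ints in vals
  print(f"bufs={bufs} vals={vals} global={global_size} local={local_size}")

rawbufs, var_vals, prg_vars = (0x1000, 0x2000), {"i": 3}, ["i"]
# before: toy_prg(*rawbufs, *[var_vals[k] for k in prg_vars], global_size=(1,1,1), local_size=(1,1,1))
# after:
toy_prg(*rawbufs, global_size=(1,1,1), local_size=(1,1,1), vals=tuple(var_vals[k] for k in prg_vars))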
diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py
index 254ced7985..a6a9341c51 100644
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -293,8 +293,8 @@ def compile_cuda_style(prg, compile_options, prog_t, create_prog, compile_prog,
   if status != 0: raise RuntimeError(f"compile failed: {get_bytes(prog, get_log_size, get_log, check).decode()}")
   return get_bytes(prog, get_code_size, get_code, check)
 
-def encode_args_cuda_style(args, device_ptr_t, marks) -> Tuple[ctypes.Array, ctypes.Structure]:
-  c_args = init_c_struct_t(tuple([(f'f{i}', device_ptr_t if not isinstance(x, int) else ctypes.c_int) for i,x in enumerate(args)]))(*args)
+def encode_args_cuda_style(bufs, vals, device_ptr_t, marks) -> Tuple[ctypes.Array, ctypes.Structure]:
+  c_args = init_c_struct_t(tuple([(f'f{i}', device_ptr_t) for i in range(len(bufs))] + [(f'f{i}', ctypes.c_int) for i in range(len(bufs), len(bufs)+len(vals))]))(*bufs, *vals)
   return (ctypes.c_void_p * 5)(ctypes.c_void_p(marks[0]), ctypes.cast(ctypes.pointer(c_args), ctypes.c_void_p), ctypes.c_void_p(marks[1]), ctypes.cast(ctypes.pointer(ctypes.c_size_t(ctypes.sizeof(c_args))), ctypes.c_void_p), ctypes.c_void_p(marks[2])), c_args
 
 def time_execution_cuda_style(cb, ev_t, evcreate, evrecord, evsync, evdestroy, evtime, enable=False) -> Optional[float]:
diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py
index 6b6c97886b..d9293022a0 100644
--- a/tinygrad/runtime/ops_clang.py
+++ b/tinygrad/runtime/ops_clang.py
@@ -22,7 +22,7 @@ class ClangProgram:
       pathlib.Path(cached_file_path.name).write_bytes(lib)
       self.fxn: Any = ctypes.CDLL(str(cached_file_path.name))[name]
 
-  def __call__(self, *args, wait=False): return cpu_time_execution(lambda: self.fxn(*args), enable=wait)
+  def __call__(self, *bufs, vals=(), wait=False): return cpu_time_execution(lambda: self.fxn(*bufs, *vals), enable=wait)
 
 renderer = functools.partial(uops_to_cstyle, CStyleLanguage(buffer_suffix=" restrict", arg_int_prefix="const int"))
 ClangDevice = Compiled(MallocAllocator, LinearizerOptions(supports_float4=False, has_local=False), renderer, compile_clang, ClangProgram)
diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py
index 22927b0e52..92857411ba 100644
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@@ -43,9 +43,9 @@ class CUDAProgram:
 
   def __del__(self):
     if not CUDACPU: check(cuda.cuModuleUnload(self.module))
-  def __call__(self, *args, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int], wait=False):
+  def __call__(self, *bufs, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int], vals:Tuple[int, ...]=(), wait=False):
     if not CUDACPU: check(cuda.cuCtxSetCurrent(self.device.context))
-    c_kernel_input_config = encode_args_cuda_style(args, cuda.CUdeviceptr_v2, (1,2,0))[0] if not CUDACPU else args
+    c_kernel_input_config = encode_args_cuda_style(bufs, vals, cuda.CUdeviceptr_v2, (1,2,0))[0] if not CUDACPU else (bufs+vals)
     return cu_time_execution(lambda: check(cuda.cuLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, c_kernel_input_config)), enable=wait)
 
 class CUDAAllocator(LRUAllocator):
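The CUDA-style packing consequence of that signature change: encode_args_cuda_style can now lay out the ctypes struct as pointer fields followed by int fields, with no per-argument isinstance checks. A self-contained sketch of the packing idea using only stock ctypes (pack_args and Args are illustrative stand-ins, not the tinygrad helper):

import ctypes

def pack_args(bufs, vals, device_ptr_t=ctypes.c_void_p):
  # one pointer-typed field per buffer, then one c_int field per integer val
  fields = [(f'f{i}', device_ptr_t) for i in range(len(bufs))] \
         + [(f'f{i}', ctypes.c_int) for i in range(len(bufs), len(bufs)+len(vals))]
  return type("Args", (ctypes.Structure,), {"_fields_": fields})(*bufs, *vals)

args = pack_args([0x1000], [42])
print(args.f0, args.f1)  # 4096 42 -- pointers first, ints after, C struct layout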
diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py
index 81544ed74b..39e4cbb4ea 100644
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import Tuple, Optional, Union, List, cast
+from typing import Tuple, Optional, List
 import ctypes, functools
 import gpuctypes.opencl as cl
 from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, diskcache, OSX, ImageDType, DEBUG
@@ -40,10 +40,9 @@ class CLProgram:
     check(cl.clReleaseKernel(self.kernel))
     check(cl.clReleaseProgram(self.program))
 
-  def __call__(self, *bufs:Union[cl.cl_mem, int], global_size:Tuple[int,...], local_size:Optional[Tuple[int,...]]=None, wait=False) -> Optional[float]:
-    for i,b in enumerate(bufs):
-      bc = ctypes.c_int32(b) if isinstance(b, int) else cast(cl.cl_mem, b)
-      cl.clSetKernelArg(self.kernel, i, ctypes.sizeof(bc), ctypes.byref(bc))
+  def __call__(self, *bufs:cl.cl_mem, global_size:Tuple[int,...], local_size:Optional[Tuple[int,...]]=None, vals:Tuple[int, ...]=(), wait=False) -> Optional[float]:
+    for i,b in enumerate(bufs): cl.clSetKernelArg(self.kernel, i, ctypes.sizeof(b), ctypes.byref(b))
+    for i,b in enumerate(vals,start=len(bufs)): cl.clSetKernelArg(self.kernel, i, 4, ctypes.byref(ctypes.c_int32(b)))
     if local_size is not None: global_size = tuple(int(g*l) for g,l in zip(global_size, local_size))
     event = cl.cl_event() if wait else None
     check(cl.clEnqueueNDRangeKernel(self.device.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size), (ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event))
diff --git a/tinygrad/runtime/ops_hip.py b/tinygrad/runtime/ops_hip.py
index 534a80779c..dc02969ca0 100644
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@@ -38,10 +38,10 @@ class HIPProgram:
 
   def __del__(self):
     if not MOCKHIP: check(hip.hipModuleUnload(self.module))
-  def __call__(self, *args, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int], wait=False):
+  def __call__(self, *args, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int], vals:Tuple[int, ...]=(), wait=False):
     if MOCKHIP: return float("inf")
     check(hip.hipSetDevice(self.device))
-    return hip_time_execution(lambda: check(hip.hipModuleLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, encode_args_cuda_style(args, hip.hipDeviceptr_t, marks=(1,2,3))[0])), enable=wait)
+    return hip_time_execution(lambda: check(hip.hipModuleLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, encode_args_cuda_style(args, vals, hip.hipDeviceptr_t, marks=(1,2,3))[0])), enable=wait)
 
 T = TypeVar("T")
 class HIPAllocator(LRUAllocator):
diff --git a/tinygrad/runtime/ops_llvm.py b/tinygrad/runtime/ops_llvm.py
index af0d9e4a8f..9f92402659 100644
--- a/tinygrad/runtime/ops_llvm.py
+++ b/tinygrad/runtime/ops_llvm.py
@@ -1,5 +1,5 @@
 import ctypes
-from typing import ClassVar
+from typing import ClassVar, Tuple
 from tinygrad.device import Compiled, MallocAllocator
 from tinygrad.helpers import getenv, DEBUG, diskcache, cpu_time_execution
 from ctypes import CFUNCTYPE
@@ -59,8 +59,8 @@ class LLVMProgram:
     LLVM().engine.add_object_file(llvm.object_file.ObjectFileRef.from_data(lib))
     self.fxn = LLVM.engine.get_function_address(name)
 
-  def __call__(self, *bufs, wait=False):
-    self.cfunc = CFUNCTYPE(ctypes.c_int, *[ctypes.c_int32 if isinstance(b, int) else ctypes.c_void_p for b in bufs])(self.fxn)
-    return cpu_time_execution(lambda: self.cfunc(*bufs), enable=wait)
+  def __call__(self, *bufs, vals:Tuple[int, ...]=(), wait=False):
+    self.cfunc = CFUNCTYPE(ctypes.c_int, *([ctypes.c_void_p]*len(bufs)), *([ctypes.c_int32]*len(vals)))(self.fxn)
+    return cpu_time_execution(lambda: self.cfunc(*bufs, *vals), enable=wait)
 
 LLVMDevice = Compiled(MallocAllocator, LinearizerOptions(supports_float4=False, has_local=False, has_shared=False), uops_to_llvm_ir, compile_llvm, LLVMProgram)
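On the LLVM backend the same split means the CFUNCTYPE prototype is fixed up front: all pointer parameters, then all int32 parameters, no inspection of each argument. A sketch of that prototype construction, exercised through a Python callback so it runs anywhere (fake_kernel is a hypothetical stand-in for the JIT-compiled function address that LLVMProgram wraps):

import ctypes

def make_proto(n_bufs, n_vals):
  # c_void_p for every buffer, c_int32 for every val -- the new fixed layout
  return ctypes.CFUNCTYPE(ctypes.c_int, *([ctypes.c_void_p]*n_bufs), *([ctypes.c_int32]*n_vals))

@make_proto(1, 2)
def fake_kernel(buf, a, b):  # buf arrives as int/None, a and b as plain ints
  return a + b

print(fake_kernel(None, 3, 4))  # 7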
diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py
index 604db8d26b..1f83296439 100644
--- a/tinygrad/runtime/ops_metal.py
+++ b/tinygrad/runtime/ops_metal.py
@@ -21,24 +21,23 @@ def compile_metal(prg, use_xcode=bool(getenv("METAL_XCODE"))) -> bytes:
 class MetalProgram:
   def __init__(self, device:MetalDevice, name:str, lib:bytes):
     self.device, self.name, self.lib = device, name, lib
-    data = libdispatch.dispatch_data_create(lib, len(lib), None, None)
-    self.library = unwrap2(self.device.device.newLibraryWithData_error_(data, None))
-    self.fxn = self.library.newFunctionWithName_(name)
     if DEBUG >= 6:
       with tempfile.NamedTemporaryFile(delete=True) as shader:
         shader.write(lib)
         shader.flush()
         os.system(f"cd {pathlib.Path(__file__).parents[2]}/disassemblers/applegpu && python3 compiler_explorer.py {shader.name}")
+    data = libdispatch.dispatch_data_create(lib, len(lib), None, None)
+    self.library = unwrap2(self.device.device.newLibraryWithData_error_(data, None))
+    self.fxn = self.library.newFunctionWithName_(name)
     self.pipeline_state = unwrap2(self.device.device.newComputePipelineStateWithFunction_error_(self.fxn, None))
 
-  def __call__(self, *bufs, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int], wait=False):
+  def __call__(self, *bufs, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int], vals:Tuple[int, ...]=(), wait=False):
     assert prod(local_size) <= self.pipeline_state.maxTotalThreadsPerThreadgroup(), f"local size {local_size} bigger than {self.pipeline_state.maxTotalThreadsPerThreadgroup()} with exec width {self.pipeline_state.threadExecutionWidth()} memory length {self.pipeline_state.staticThreadgroupMemoryLength()}"
     command_buffer = self.device.mtl_queue.commandBuffer()
     encoder = command_buffer.computeCommandEncoder()
     encoder.setComputePipelineState_(self.pipeline_state)
-    for i,a in enumerate(bufs):
-      if isinstance(a, int): encoder.setBytes_length_atIndex_((arg:=ctypes.c_int32(a)), ctypes.sizeof(arg), i)
-      else: encoder.setBuffer_offset_atIndex_(a, 0, i)
+    for i,a in enumerate(bufs): encoder.setBuffer_offset_atIndex_(a, 0, i)
+    for i,a in enumerate(vals,start=len(bufs)): encoder.setBytes_length_atIndex_(ctypes.c_int32(a), 4, i)
     encoder.dispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size), Metal.MTLSize(*local_size))
     encoder.endEncoding()
     command_buffer.commit()
diff --git a/tinygrad/runtime/ops_webgpu.py b/tinygrad/runtime/ops_webgpu.py
index b3103a941c..dc6cea8fed 100644
--- a/tinygrad/runtime/ops_webgpu.py
+++ b/tinygrad/runtime/ops_webgpu.py
@@ -8,7 +8,7 @@ wgpu_device = get_default_device()
 
 class WebGPUProgram:
   def __init__(self, name:str, lib:bytes): self.name, self.lib, self.prg = name, lib, wgpu_device.create_shader_module(code=lib) # NOTE: this is the compiler
-  def __call__(self, *bufs, global_size, local_size, wait=False):
+  def __call__(self, *bufs, global_size, local_size, vals=(), wait=False):
     assert len(bufs) <= 8, "WEBGPU only supports 8 buffers"
     binding_layouts = [{"binding": i, "visibility": wgpu.ShaderStage.COMPUTE, "buffer": {"type": wgpu.BufferBindingType.storage}} for i in range(len(bufs))]
     bindings = [{"binding": i, "resource": {"buffer": x, "offset": 0, "size": x.size}} for i, x in enumerate(bufs)]
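Across all the backends touched here, Program.__call__ now binds arguments with the same two-loop shape: buffers occupy the first argument slots, ints from vals occupy the slots immediately after. A neutral sketch of that shared pattern (bind_args, set_buffer_arg, and set_int_arg are hypothetical stand-ins for clSetKernelArg, encoder.setBuffer_offset_atIndex_, and friends):

def bind_args(set_buffer_arg, set_int_arg, bufs, vals):
  for i, b in enumerate(bufs): set_buffer_arg(i, b)                 # slots 0..len(bufs)-1
  for i, v in enumerate(vals, start=len(bufs)): set_int_arg(i, v)   # slots continue after bufs

bound = []
bind_args(lambda i, b: bound.append((i, "buf", b)),
          lambda i, v: bound.append((i, "int", v)),
          bufs=("A", "B"), vals=(7,))
print(bound)  # [(0, 'buf', 'A'), (1, 'buf', 'B'), (2, 'int', 7)]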