From ed8a32722a4ac387c83a9ccf5b3a8b820432a528 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Wed, 24 Jan 2024 13:23:09 -0800
Subject: [PATCH] hip mutex signal (#3234)

* hip mutex

* hip mutex 2

* sync
---
 tinygrad/device.py          |  4 +++-
 tinygrad/helpers.py         |  2 +-
 tinygrad/lazy.py            |  3 ++-
 tinygrad/realize.py         | 28 ++++------------------
 tinygrad/runtime/ops_gpu.py |  9 +++----
 tinygrad/runtime/ops_hip.py | 47 +++++++++++++++++++++++--------------
 6 files changed, 46 insertions(+), 47 deletions(-)

diff --git a/tinygrad/device.py b/tinygrad/device.py
index e0289f85d7..683fe66166 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -68,6 +68,8 @@ def update_stats(name:str, op_estimate:sint, mem_estimate:int, var_vals: Optiona
 class BufferOptions:
   image: Optional[ImageDType] = None
   uncached: bool = False
+  host: bool = False
+  signal: bool = False
 
 class Buffer:
   def __init__(self, device:str, size:int, dtype:DType, opaque:Any=None, options:Optional[BufferOptions]=None):
@@ -156,7 +158,7 @@ class LRUAllocator(Allocator):  # pylint: disable=abstract-method
       for opaque in opaques: self._free(opaque)
       opaques.clear()
   def free(self, opaque:Any, size:int, options:Optional[BufferOptions]=None):
-    if getenv("LRU", 1): self.cache[(size, options)].append(opaque)
+    if getenv("LRU", 1) and (options is None or not options.signal): self.cache[(size, options)].append(opaque)
     else: self._free(opaque)
 
 class _MallocAllocator(LRUAllocator):
diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py
index 7b54390e5a..d85d368d20 100644
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -100,7 +100,7 @@ class Timing(contextlib.ContextDecorator):
     self.et = time.perf_counter_ns() - self.st
     if self.enabled: print(f"{self.prefix}{self.et*1e-6:6.2f} ms"+(self.on_exit(self.et) if self.on_exit else ""))
 
-def _format_fcn(fcn): return f"{fcn[0]}:{fcn[2]}" if fcn[2] != "" else f"{fcn[0]}:{fcn[1]}"
+def _format_fcn(fcn): return f"{fcn[0]}:{fcn[1]}:{fcn[2]}"
 class Profiling(contextlib.ContextDecorator):
   def __init__(self, enabled=True, sort='cumtime', frac=0.2, fn=None, ts=1):
     self.enabled, self.sort, self.frac, self.fn, self.time_scale = enabled, sort, frac, fn, 1e3/ts
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index 2a2b4df827..b5102cb20e 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -78,7 +78,8 @@ class LazyBuffer:
   def schedule(self, seen=None): return create_schedule([self], seen)
 
   def _copy(self, device:str) -> LazyBuffer:
-    sync = LazyBuffer.loadop(LoadOps.SYNC, (0,), dtypes.uint32, self.device, src=self, enable_cache=True)
+    sync_size = 1 if self.device.startswith("HIP") else 0
+    sync = LazyBuffer.loadop(LoadOps.SYNC, (sync_size,), dtypes.uint32, self.device, src=self, enable_cache=True)
     wait = LazyBuffer.loadop(LoadOps.WAIT, (0,), dtypes.uint32, device, src=sync, enable_cache=True)
     return create_lazybuffer(device, ShapeTracker.from_shape(self.shape), self.dtype, LoadOps.COPY, None, (self, wait), enable_cache=False)
 
diff --git a/tinygrad/realize.py b/tinygrad/realize.py
index b4151ae085..1a186984c5 100644
--- a/tinygrad/realize.py
+++ b/tinygrad/realize.py
@@ -1,6 +1,6 @@
 from typing import List, Dict, Optional, cast
 from tinygrad.ops import LoadOps, ScheduleItem, BufferOps, GlobalCounters
-from tinygrad.device import Device, Buffer, BufferCopy, BufferXfer, BufferRead, JITRunner, update_stats, InterpretedASTRunner, Compiled
+from tinygrad.device import Device, Buffer, BufferCopy, BufferXfer, BufferRead, JITRunner, update_stats, InterpretedASTRunner, Compiled, BufferOptions
 from tinygrad.graph import print_tree, realized_lazybuffer
 from tinygrad.helpers import colored, getenv, GRAPH, cpu_time_execution, DEBUG
 from tinygrad.shape.symbolic import Variable
@@ -21,26 +21,6 @@ class SyncOp(JITRunner):
     et = cpu_time_execution(self.device.synchronize, enable=wait or DEBUG >= 1)
     update_stats(colored("synchronize", "RED"), 0, 0, {}, et, 1, device=self.dname)
 
-class SyncEvent(JITRunner):
-  def __init__(self, lb):
-    self.lb, self.device, self.dname = lb, Device[lb.device], lb.device
-    assert hasattr(self.device, "event_create")
-    setattr(self.lb, "event", self.device.event_create())
-    super().__init__()
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False, jit=False):
-    assert hasattr(self.device, "event_record")
-    self.device.event_record(self.lb.event)
-    update_stats(colored("sync", "red"), 0, 0, {}, None, 1, device=self.dname)
-
-class WaitEvent(JITRunner):
-  def __init__(self, device, lb_sync):
-    self.lb_sync, self.device, self.dname = lb_sync, Device[device], device
-    super().__init__()
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False, jit=False):
-    assert hasattr(self.device, "event_wait")
-    self.device.event_wait(self.lb_sync.event)
-    update_stats(colored("wait", "RED"), 0, 0, {}, None, 1, device=self.dname)
-
 def lower_schedule_item(si:ScheduleItem) -> Optional[JITRunner]:
   assert all(si.out.device == x.device for x in si.inputs) or si.ast.op in {LoadOps.COPY, LoadOps.WAIT}, \
     f"all devices must be the same, {si.out.device} != {[x.device for x in si.inputs]} {print_tree(si.ast) or ''}"
@@ -52,8 +32,9 @@ def lower_schedule_item(si:ScheduleItem) -> Optional[JITRunner]:
   if si.ast.op is LoadOps.CUSTOM: return CustomOp(si.ast.arg)
   # TODO: this doesn't have to be only HIP, check if it has the event functions
   if si.ast.op in {LoadOps.SYNC, LoadOps.WAIT} and si.out.device.startswith("HIP") and si.inputs[0].device.startswith("HIP"):
+    from tinygrad.runtime.ops_hip import SyncEvent, WaitEvent
     if si.ast.op is LoadOps.SYNC: return SyncEvent(si.out)
-    if si.ast.op is LoadOps.WAIT: return WaitEvent(si.out.device, si.inputs[0])
+    if si.ast.op is LoadOps.WAIT: return WaitEvent(si.out.device)
   else:
     if si.ast.op is LoadOps.SYNC: return SyncOp(si.out.device) if isinstance(Device[si.out.device], Compiled) else None
     if si.ast.op is LoadOps.WAIT: return None
@@ -78,8 +59,9 @@ def run_schedule(schedule:List[ScheduleItem]):
 
     # we don't have an output buffer, we have to create it, and create to max size if it has symbolic shape
     if si.out.size > 0:
+      options = BufferOptions(host=True, signal=True) if si.ast.op is LoadOps.SYNC else None
       si.out.realized = si.out.output_buffer if si.out.output_buffer is not None else \
-        Buffer(si.out.device, si.out.size, si.out.dtype, "PLACEHOLDER" if isinstance(prg, InterpretedASTRunner) else None)
+        Buffer(si.out.device, si.out.size, si.out.dtype, "PLACEHOLDER" if isinstance(prg, InterpretedASTRunner) else None, options=options)
     del si.out.srcs
 
     # run the function (put it in JIT)
diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py
index c4aa69599e..c36b33ba2e 100644
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -61,10 +61,11 @@ class CLAllocator(LRUAllocator):
   def _alloc(self, size:int) -> cl.cl_mem:
     return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, ctypes.byref(status := ctypes.c_int32())), status)
   def _alloc_with_options(self, size:int, options:BufferOptions) -> cl.cl_mem:
-    assert options.image is not None
-    return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
-                                      cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
-                                      options.image.shape[1], options.image.shape[0], 0, None, ctypes.byref(status := ctypes.c_int32())), status)
+    if options.image is not None:
+      return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
+                                        cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
+                                        options.image.shape[1], options.image.shape[0], 0, None, ctypes.byref(status := ctypes.c_int32())), status)
+    else: return self._alloc(size)
   def _free(self, buf:cl.cl_mem): check(cl.clReleaseMemObject(buf))
   def copyin(self, dest:cl.cl_mem, src:memoryview):
     check(cl.clEnqueueWriteBuffer(self.device.queue, dest, False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
diff --git a/tinygrad/runtime/ops_hip.py b/tinygrad/runtime/ops_hip.py
index 9a9cde7466..bdbf59f355 100644
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@@ -1,10 +1,10 @@
 from __future__ import annotations
 import ctypes, functools, subprocess, io
-from typing import Tuple, TypeVar, List, Any
+from typing import Tuple, TypeVar, List, Any, cast
 import gpuctypes.hip as hip
 from tinygrad.helpers import DEBUG, getenv, init_c_var, compile_cuda_style, encode_args_cuda_style, time_execution_cuda_style
-from tinygrad.helpers import from_mv, round_up, to_mv
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions
+from tinygrad.helpers import from_mv, round_up, to_mv, colored
+from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions, JITRunner, Device, Buffer, update_stats
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.codegen.kernel import LinearizerOptions
 
@@ -55,15 +55,18 @@ class HIPAllocator(LRUAllocator):
     check(hip.hipSetDevice(self.device.device))
     return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
   def _alloc_with_options(self, size:int, options:BufferOptions):
-    assert options.uncached
     check(hip.hipSetDevice(self.device.device))
-    return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipExtMallocWithFlags(ctypes.byref(x), size, 3)))  # hipDeviceMallocUncached = 3
+    if options.uncached:
+      return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipExtMallocWithFlags(ctypes.byref(x), size, 3)))  # hipDeviceMallocUncached = 3
+    elif options.host:
+      return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipHostMalloc(ctypes.byref(x), size, 2 if options.signal else 0)))
+    else:
+      raise Exception("no options")
   def _free(self, opaque:T): check(hip.hipFree(opaque))
-  def _hostalloc(self, size:int): return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipHostMalloc(ctypes.byref(x), size, 0)))
   def copy_from_fd(self, dest, fd, offset, size):
     check(hip.hipSetDevice(self.device.device))
     if not hasattr(self, 'hb'):
-      self.hb = [self._hostalloc(CHUNK_SIZE) for _ in range(2)]
+      self.hb = [self._alloc_with_options(CHUNK_SIZE, BufferOptions(host=True)) for _ in range(2)]
       self.hb_events = [None, None]
       self.hb_polarity = 0
     fo = io.FileIO(fd, "a+b", closefd=False)
@@ -86,7 +89,7 @@ class HIPAllocator(LRUAllocator):
       minor_offset = 0 # only on the first
   def copyin(self, dest:T, src: memoryview):
     check(hip.hipSetDevice(self.device.device))
-    host_mem = self._hostalloc(len(src))
+    host_mem = self._alloc_with_options(len(src), BufferOptions(host=True))
    self.device.pending_copyin.append(host_mem)
     ctypes.memmove(host_mem, from_mv(src), len(src))
     check(hip.hipMemcpyAsync(dest, host_mem, len(src), hip.hipMemcpyHostToDevice, None))
@@ -114,12 +117,22 @@ class HIPDevice(Compiled):
     for opaque in self.pending_copyin: check(hip.hipFree(opaque))
     self.track_cross_buffer.clear()
     self.pending_copyin.clear()
-  def event_create(self):
-    check(hip.hipSetDevice(self.device))
-    return init_c_var(hip.hipEvent_t(), lambda x: check(hip.hipEventCreate(ctypes.byref(x))))
-  def event_record(self, evt):
-    check(hip.hipSetDevice(self.device))
-    check(hip.hipEventRecord(evt, None))
-  def event_wait(self, evt):
-    check(hip.hipSetDevice(self.device))
-    check(hip.hipStreamWaitEvent(None, evt, 0))
\ No newline at end of file
+
+class SyncEvent(JITRunner):
+  def __init__(self, lb):
+    self.lb, self.device, self.dname = lb, cast(HIPDevice, Device[lb.device]), lb.device
+    super().__init__()
+  def __call__(self, rawbufs:List[Buffer], var_vals, wait=False, jit=False):
+    to_mv(rawbufs[0]._buf, 4).cast("I")[0] = 0
+    check(hip.hipSetDevice(self.device.device))
+    check(hip.hipStreamWriteValue32(None, rawbufs[0]._buf, 1, 0))
+    update_stats(colored("sync", "red"), 0, 0, {}, None, 1, device=self.dname)
+
+class WaitEvent(JITRunner):
+  def __init__(self, device):
+    self.device, self.dname = cast(HIPDevice, Device[device]), device
+    super().__init__()
+  def __call__(self, rawbufs:List[Buffer], var_vals, wait=False, jit=False):
+    check(hip.hipSetDevice(self.device.device))
+    check(hip.hipStreamWaitValue32(None, rawbufs[0]._buf, 1, 1, 0xFFFFFFFF))
+    update_stats(colored("wait", "RED"), 0, 0, {}, None, 1, device=self.dname)
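Note on the mechanism, for reviewers: instead of hipEvent_t record/wait, the SYNC output is now a real 4-byte host-visible Buffer (BufferOptions(host=True, signal=True), excluded from the LRU cache). SyncEvent clears the word from the CPU and enqueues hipStreamWriteValue32 to set it to 1 once the producer's stream drains; WaitEvent enqueues hipStreamWaitValue32 so the consumer's stream blocks until the word equals 1 before the COPY kernel runs. Below is a minimal standalone sketch of that signal/wait pattern; the device ids, the use of null streams, and the check() helper are illustrative assumptions, not part of this patch.

import ctypes
import gpuctypes.hip as hip

def check(status):
  if status != 0: raise RuntimeError(f"HIP error {status}")

# 4-byte host-visible signal word; flag 2 is the same hipHostMalloc flag the
# patch passes when BufferOptions.signal is set.
sig = hip.hipDeviceptr_t()
check(hip.hipSetDevice(0))
check(hip.hipHostMalloc(ctypes.byref(sig), 4, 2))

# CPU clears the flag, then the producer stream (device 0, null stream here)
# writes 1 after all work previously enqueued on it has completed.
ctypes.memset(sig, 0, 4)
check(hip.hipStreamWriteValue32(None, sig, 1, 0))

# The consumer stream (device 1) blocks until the word equals 1
# (flags=1 selects the "equal" compare, mask 0xFFFFFFFF uses all 32 bits),
# so anything enqueued after this wait cannot start before the signal.
check(hip.hipSetDevice(1))
check(hip.hipStreamWaitValue32(None, sig, 1, 1, 0xFFFFFFFF))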