From ed8a32722a4ac387c83a9ccf5b3a8b820432a528 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Wed, 24 Jan 2024 13:23:09 -0800
Subject: [PATCH] hip mutex signal (#3234)

* hip mutex

* hip mutex 2

* sync
---
 tinygrad/device.py          |  4 +++-
 tinygrad/helpers.py         |  2 +-
 tinygrad/lazy.py            |  3 ++-
 tinygrad/realize.py         | 28 ++++------------------
 tinygrad/runtime/ops_gpu.py |  9 +++----
 tinygrad/runtime/ops_hip.py | 47 +++++++++++++++++++++++--------------
 6 files changed, 46 insertions(+), 47 deletions(-)

diff --git a/tinygrad/device.py b/tinygrad/device.py
index e0289f85d7..683fe66166 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -68,6 +68,8 @@ def update_stats(name:str, op_estimate:sint, mem_estimate:int, var_vals: Optiona
 class BufferOptions:
   image: Optional[ImageDType] = None
   uncached: bool = False
+  host: bool = False
+  signal: bool = False
 
 class Buffer:
   def __init__(self, device:str, size:int, dtype:DType, opaque:Any=None, options:Optional[BufferOptions]=None):
@@ -156,7 +158,7 @@ class LRUAllocator(Allocator):  # pylint: disable=abstract-method
       for opaque in opaques: self._free(opaque)
       opaques.clear()
   def free(self, opaque:Any, size:int, options:Optional[BufferOptions]=None):
-    if getenv("LRU", 1): self.cache[(size, options)].append(opaque)
+    if getenv("LRU", 1) and (options is None or not options.signal): self.cache[(size, options)].append(opaque)
     else: self._free(opaque)
 
 class _MallocAllocator(LRUAllocator):
diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py
index 7b54390e5a..d85d368d20 100644
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -100,7 +100,7 @@ class Timing(contextlib.ContextDecorator):
     self.et = time.perf_counter_ns() - self.st
     if self.enabled: print(f"{self.prefix}{self.et*1e-6:6.2f} ms"+(self.on_exit(self.et) if self.on_exit else ""))
 
-def _format_fcn(fcn): return f"{fcn[0]}:{fcn[2]}" if fcn[2] != "" else f"{fcn[0]}:{fcn[1]}"
+def _format_fcn(fcn): return f"{fcn[0]}:{fcn[1]}:{fcn[2]}"
 class Profiling(contextlib.ContextDecorator):
   def __init__(self, enabled=True, sort='cumtime', frac=0.2, fn=None, ts=1):
     self.enabled, self.sort, self.frac, self.fn, self.time_scale = enabled, sort, frac, fn, 1e3/ts
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index 2a2b4df827..b5102cb20e 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -78,7 +78,8 @@ class LazyBuffer:
   def schedule(self, seen=None): return create_schedule([self], seen)
 
   def _copy(self, device:str) -> LazyBuffer:
-    sync = LazyBuffer.loadop(LoadOps.SYNC, (0,), dtypes.uint32, self.device, src=self, enable_cache=True)
+    sync_size = 1 if self.device.startswith("HIP") else 0
+    sync = LazyBuffer.loadop(LoadOps.SYNC, (sync_size,), dtypes.uint32, self.device, src=self, enable_cache=True)
     wait = LazyBuffer.loadop(LoadOps.WAIT, (0,), dtypes.uint32, device, src=sync, enable_cache=True)
     return create_lazybuffer(device, ShapeTracker.from_shape(self.shape), self.dtype, LoadOps.COPY, None, (self, wait), enable_cache=False)
 
diff --git a/tinygrad/realize.py b/tinygrad/realize.py
index b4151ae085..1a186984c5 100644
--- a/tinygrad/realize.py
+++ b/tinygrad/realize.py
@@ -1,6 +1,6 @@
 from typing import List, Dict, Optional, cast
 from tinygrad.ops import LoadOps, ScheduleItem, BufferOps, GlobalCounters
-from tinygrad.device import Device, Buffer, BufferCopy, BufferXfer, BufferRead, JITRunner, update_stats, InterpretedASTRunner, Compiled
+from tinygrad.device import Device, Buffer, BufferCopy, BufferXfer, BufferRead, JITRunner, update_stats, InterpretedASTRunner, Compiled, BufferOptions
 from tinygrad.graph import print_tree, realized_lazybuffer
 from tinygrad.helpers import colored, getenv, GRAPH, cpu_time_execution, DEBUG
 from tinygrad.shape.symbolic import Variable
@@ -21,26 +21,6 @@ class SyncOp(JITRunner):
     et = cpu_time_execution(self.device.synchronize, enable=wait or DEBUG >= 1)
     update_stats(colored("synchronize", "RED"), 0, 0, {}, et, 1, device=self.dname)
 
-class SyncEvent(JITRunner):
-  def __init__(self, lb):
-    self.lb, self.device, self.dname = lb, Device[lb.device], lb.device
-    assert hasattr(self.device, "event_create")
-    setattr(self.lb, "event", self.device.event_create())
-    super().__init__()
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False, jit=False):
-    assert hasattr(self.device, "event_record")
-    self.device.event_record(self.lb.event)
-    update_stats(colored("sync", "red"), 0, 0, {}, None, 1, device=self.dname)
-
-class WaitEvent(JITRunner):
-  def __init__(self, device, lb_sync):
-    self.lb_sync, self.device, self.dname = lb_sync, Device[device], device
-    super().__init__()
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False, jit=False):
-    assert hasattr(self.device, "event_wait")
-    self.device.event_wait(self.lb_sync.event)
-    update_stats(colored("wait", "RED"), 0, 0, {}, None, 1, device=self.dname)
-
 def lower_schedule_item(si:ScheduleItem) -> Optional[JITRunner]:
   assert all(si.out.device == x.device for x in si.inputs) or si.ast.op in {LoadOps.COPY, LoadOps.WAIT}, \
     f"all devices must be the same, {si.out.device} != {[x.device for x in si.inputs]} {print_tree(si.ast) or ''}"
@@ -52,8 +32,9 @@ def lower_schedule_item(si:ScheduleItem) -> Optional[JITRunner]:
   if si.ast.op is LoadOps.CUSTOM: return CustomOp(si.ast.arg)
   # TODO: this doesn't have to be only HIP, check if it has the event functions
   if si.ast.op in {LoadOps.SYNC, LoadOps.WAIT} and si.out.device.startswith("HIP") and si.inputs[0].device.startswith("HIP"):
+    from tinygrad.runtime.ops_hip import SyncEvent, WaitEvent
     if si.ast.op is LoadOps.SYNC: return SyncEvent(si.out)
-    if si.ast.op is LoadOps.WAIT: return WaitEvent(si.out.device, si.inputs[0])
+    if si.ast.op is LoadOps.WAIT: return WaitEvent(si.out.device)
   else:
     if si.ast.op is LoadOps.SYNC: return SyncOp(si.out.device) if isinstance(Device[si.out.device], Compiled) else None
     if si.ast.op is LoadOps.WAIT: return None
@@ -78,8 +59,9 @@ def run_schedule(schedule:List[ScheduleItem]):
 
     # we don't have an output buffer, we have to create it, and create to max size if it has symbolic shape
     if si.out.size > 0:
+      options = BufferOptions(host=True, signal=True) if si.ast.op is LoadOps.SYNC else None
       si.out.realized = si.out.output_buffer if si.out.output_buffer is not None else \
-        Buffer(si.out.device, si.out.size, si.out.dtype, "PLACEHOLDER" if isinstance(prg, InterpretedASTRunner) else None)
+        Buffer(si.out.device, si.out.size, si.out.dtype, "PLACEHOLDER" if isinstance(prg, InterpretedASTRunner) else None, options=options)
     del si.out.srcs
 
     # run the function (put it in JIT)
diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py
index c4aa69599e..c36b33ba2e 100644
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -61,10 +61,11 @@ class CLAllocator(LRUAllocator):
   def _alloc(self, size:int) -> cl.cl_mem:
     return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, ctypes.byref(status := ctypes.c_int32())), status)
   def _alloc_with_options(self, size:int, options:BufferOptions) -> cl.cl_mem:
-    assert options.image is not None
-    return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
-                                      cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
-                                      options.image.shape[1], options.image.shape[0], 0, None, ctypes.byref(status := ctypes.c_int32())), status)
+    if options.image is not None:
+      return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
+                                        cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
+                                        options.image.shape[1], options.image.shape[0], 0, None, ctypes.byref(status := ctypes.c_int32())), status)
+    else: return self._alloc(size)
   def _free(self, buf:cl.cl_mem): check(cl.clReleaseMemObject(buf))
   def copyin(self, dest:cl.cl_mem, src:memoryview):
     check(cl.clEnqueueWriteBuffer(self.device.queue, dest, False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
diff --git a/tinygrad/runtime/ops_hip.py b/tinygrad/runtime/ops_hip.py
index 9a9cde7466..bdbf59f355 100644
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@@ -1,10 +1,10 @@
 from __future__ import annotations
 import ctypes, functools, subprocess, io
-from typing import Tuple, TypeVar, List, Any
+from typing import Tuple, TypeVar, List, Any, cast
 import gpuctypes.hip as hip
 from tinygrad.helpers import DEBUG, getenv, init_c_var, compile_cuda_style, encode_args_cuda_style, time_execution_cuda_style
-from tinygrad.helpers import from_mv, round_up, to_mv
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions
+from tinygrad.helpers import from_mv, round_up, to_mv, colored
+from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions, JITRunner, Device, Buffer, update_stats
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.codegen.kernel import LinearizerOptions
 
@@ -55,15 +55,18 @@ class HIPAllocator(LRUAllocator):
     check(hip.hipSetDevice(self.device.device))
     return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
   def _alloc_with_options(self, size:int, options:BufferOptions):
-    assert options.uncached
     check(hip.hipSetDevice(self.device.device))
-    return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipExtMallocWithFlags(ctypes.byref(x), size, 3)))  # hipDeviceMallocUncached = 3
+    if options.uncached:
+      return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipExtMallocWithFlags(ctypes.byref(x), size, 3)))  # hipDeviceMallocUncached = 3
+    elif options.host:
+      return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipHostMalloc(ctypes.byref(x), size, 2 if options.signal else 0)))
+    else:
+      raise Exception("no options")
   def _free(self, opaque:T): check(hip.hipFree(opaque))
-  def _hostalloc(self, size:int): return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipHostMalloc(ctypes.byref(x), size, 0)))
   def copy_from_fd(self, dest, fd, offset, size):
     check(hip.hipSetDevice(self.device.device))
     if not hasattr(self, 'hb'):
-      self.hb = [self._hostalloc(CHUNK_SIZE) for _ in range(2)]
+      self.hb = [self._alloc_with_options(CHUNK_SIZE, BufferOptions(host=True)) for _ in range(2)]
       self.hb_events = [None, None]
       self.hb_polarity = 0
     fo = io.FileIO(fd, "a+b", closefd=False)
@@ -86,7 +89,7 @@ class HIPAllocator(LRUAllocator):
       minor_offset = 0 # only on the first
   def copyin(self, dest:T, src: memoryview):
     check(hip.hipSetDevice(self.device.device))
-    host_mem = self._hostalloc(len(src))
+    host_mem = self._alloc_with_options(len(src), BufferOptions(host=True))
    self.device.pending_copyin.append(host_mem)
     ctypes.memmove(host_mem, from_mv(src), len(src))
     check(hip.hipMemcpyAsync(dest, host_mem, len(src), hip.hipMemcpyHostToDevice, None))
@@ -114,12 +117,22 @@ class HIPDevice(Compiled):
     for opaque in self.pending_copyin: check(hip.hipFree(opaque))
     self.track_cross_buffer.clear()
     self.pending_copyin.clear()
-  def event_create(self):
-    check(hip.hipSetDevice(self.device))
-    return init_c_var(hip.hipEvent_t(), lambda x: check(hip.hipEventCreate(ctypes.byref(x))))
-  def event_record(self, evt):
-    check(hip.hipSetDevice(self.device))
-    check(hip.hipEventRecord(evt, None))
-  def event_wait(self, evt):
-    check(hip.hipSetDevice(self.device))
-    check(hip.hipStreamWaitEvent(None, evt, 0))
\ No newline at end of file
+
+class SyncEvent(JITRunner):
+  def __init__(self, lb):
+    self.lb, self.device, self.dname = lb, cast(HIPDevice, Device[lb.device]), lb.device
+    super().__init__()
+  def __call__(self, rawbufs:List[Buffer], var_vals, wait=False, jit=False):
+    to_mv(rawbufs[0]._buf, 4).cast("I")[0] = 0
+    check(hip.hipSetDevice(self.device.device))
+    check(hip.hipStreamWriteValue32(None, rawbufs[0]._buf, 1, 0))
+    update_stats(colored("sync", "red"), 0, 0, {}, None, 1, device=self.dname)
+
+class WaitEvent(JITRunner):
+  def __init__(self, device):
+    self.device, self.dname = cast(HIPDevice, Device[device]), device
+    super().__init__()
+  def __call__(self, rawbufs:List[Buffer], var_vals, wait=False, jit=False):
+    check(hip.hipSetDevice(self.device.device))
+    check(hip.hipStreamWaitValue32(None, rawbufs[0]._buf, 1, 1, 0xFFFFFFFF))
+    update_stats(colored("wait", "RED"), 0, 0, {}, None, 1, device=self.dname)
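Note on the mechanism, for reviewers: instead of hipEvent_t record/wait, the SYNC output is now a real 4-byte host-visible Buffer (BufferOptions(host=True, signal=True), excluded from the LRU cache). SyncEvent clears the word from the CPU and enqueues hipStreamWriteValue32 to set it to 1 once the producer's stream drains; WaitEvent enqueues hipStreamWaitValue32 so the consumer's stream blocks until the word equals 1 before the COPY kernel runs. Below is a minimal standalone sketch of that signal/wait pattern; the device ids, the use of null streams, and the check() helper are illustrative assumptions, not part of this patch.

import ctypes
import gpuctypes.hip as hip

def check(status):
  if status != 0: raise RuntimeError(f"HIP error {status}")

# 4-byte host-visible signal word; flag 2 is the same hipHostMalloc flag the
# patch passes when BufferOptions.signal is set.
sig = hip.hipDeviceptr_t()
check(hip.hipSetDevice(0))
check(hip.hipHostMalloc(ctypes.byref(sig), 4, 2))

# CPU clears the flag, then the producer stream (device 0, null stream here)
# writes 1 after all work previously enqueued on it has completed.
ctypes.memset(sig, 0, 4)
check(hip.hipStreamWriteValue32(None, sig, 1, 0))

# The consumer stream (device 1) blocks until the word equals 1
# (flags=1 selects the "equal" compare, mask 0xFFFFFFFF uses all 32 bits),
# so anything enqueued after this wait cannot start before the signal.
check(hip.hipSetDevice(1))
check(hip.hipStreamWaitValue32(None, sig, 1, 1, 0xFFFFFFFF))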