event driven hip (#3160)

* event driven hip * simpler, src makes copy * pass mypy
2026-01-09 06:58:11 -05:00 · 2024-01-18 14:35:18 -08:00
parent d2aab65958
commit c80884884e
4 changed files with 40 additions and 11 deletions
--- a/examples/llama.py
+++ b/examples/llama.py
@@ -424,10 +424,11 @@ After you are done speaking, output [EOS]. You are not Chad.
      st = GlobalCounters.time_sum_s
      with Profiling(enabled=args.profile):
        with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/sec"):
-          with Timing("ran model in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
+          with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
                      f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
                      (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_count*1e-9*2/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
-            tok = llama.model(Tensor([toks[start_pos:]], device=device), start_pos, args.temperature).item()
+            tok_tensor = llama.model(Tensor([toks[start_pos:]], device=device), start_pos, args.temperature)
+          tok = tok_tensor.item()

      # use the kv cache
      start_pos = len(toks)
--- a/extra/hip_events.py
+++ b/extra/hip_events.py
@@ -0,0 +1,12 @@
+import ctypes
+import gpuctypes.hip as hip
+from tinygrad.runtime.ops_hip import check
+from tinygrad.helpers import init_c_var
+
+if __name__ == "__main__":
+  check(hip.hipSetDevice(0))
+  evt = init_c_var(hip.hipEvent_t(), lambda x: check(hip.hipEventCreate(ctypes.byref(x))))
+  check(hip.hipSetDevice(1))
+  check(hip.hipStreamWaitEvent(None, evt, 0))
+  check(hip.hipSetDevice(0))
+  check(hip.hipEventRecord(evt, None))
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -68,8 +68,8 @@ def update_stats(name:str, op_estimate:sint, mem_estimate:int, var_vals: Optiona
 class Buffer:
  def __init__(self, device:str, size:int, dtype:DType, opaque:Any=None):
    assert isinstance(dtype, DType)
-    self.device, self.size, self.dtype = device, size, dtype
-    self.allocator = Device[self.device].allocator
+    self.device, self.size, self.dtype, self.d = device, size, dtype, Device[device]
+    self.allocator = self.d.allocator
    # TODO: image hack shouldn't be here. where should it be?
    self._buf = opaque if opaque is not None else self.allocator.alloc(dtype if isinstance(dtype, ImageDType) else size * dtype.itemsize)
    # TODO: mem_used for all devices
@@ -97,11 +97,11 @@ class Buffer:
    return mv

 def _internal_buffer_copy(dest:Buffer, src:Buffer):
-  if hasattr(dest.allocator, 'transfer') and type(dest.allocator) is type(src.allocator):  # noqa: E721
+  if hasattr(src.allocator, 'transfer') and type(dest.allocator) is type(src.allocator):  # noqa: E721
    # fast path, used on HIP between GPUs
    # NOTE: it's important we use the dest device here to ensure the transfer is ready
-    Device[src.device].synchronize()   # TODO: async this
-    dest.allocator.transfer(dest._buf, src._buf, dest.size*dest.dtype.itemsize)
+    src.allocator.transfer(dest._buf, src._buf, dest.size*dest.dtype.itemsize)
+    if hasattr(dest.d, "block") and hasattr(src.d, "event"): dest.d.block(src.d.event())
    return
  if getenv("FROM_BUFFER") and hasattr(dest.allocator, 'from_buffer') and hasattr(dest.allocator, 'transfer') and hasattr(src.allocator, 'as_buffer'):
    # fast path, used on Metal in OS X Sonoma
@@ -128,7 +128,7 @@ class _BufferCopy(JITRunner):
    _internal_buffer_copy(dest, src)
    et = None
    if wait or DEBUG >= 2:
-      Device[dest.device].synchronize()
+      dest.d.synchronize()
      et = time.perf_counter() - st
    update_stats(colored(f"copy {dest.size*dest.dtype.itemsize:8d}, {dest.device[:7]:>7s} <- {src.device[:7]:7s}", "yellow"),
                 0, dest.size*dest.dtype.itemsize, {}, et, 2, jit, device=dest.device)
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@@ -46,6 +46,9 @@ class HIPAllocator(LRUAllocator):
  def __init__(self, device:HIPDevice):
    self.device = device
    super().__init__()
+  def free_cache(self):
+    self.device.synchronize()
+    return super().free_cache()
  def _alloc(self, size:int):
    check(hip.hipSetDevice(self.device.device))
    return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
@@ -63,6 +66,7 @@ class HIPAllocator(LRUAllocator):
    for local_offset in range(0, size+minor_offset, CHUNK_SIZE):
      local_size = min(round_up(size+minor_offset, PAGE_SIZE)-local_offset, CHUNK_SIZE)
      if self.hb_events[self.hb_polarity] is not None:
+        # NOTE: block doesn't work here because we modify the CPU memory
        check(hip.hipEventSynchronize(self.hb_events[self.hb_polarity]))
        check(hip.hipEventDestroy(self.hb_events[self.hb_polarity]))
        self.hb_events[self.hb_polarity] = None
@@ -81,18 +85,19 @@ class HIPAllocator(LRUAllocator):
    ctypes.memmove(host_mem, from_mv(src), len(src))
    check(hip.hipMemcpyAsync(dest, host_mem, len(src), hip.hipMemcpyHostToDevice, None))
  def copyout(self, dest:memoryview, src:T):
+    self.device.synchronize()
    check(hip.hipSetDevice(self.device.device))
    check(hip.hipMemcpy(from_mv(dest), src, len(dest), hip.hipMemcpyDeviceToHost))
  def transfer(self, dest:T, src:T, sz:int):
    check(hip.hipSetDevice(self.device.device))
-    # TODO: hipMemcpyAsync, but you have to track the "src" buffer to not free it
-    check(hip.hipMemcpy(dest, src, sz, hip.hipMemcpyDeviceToDevice))
+    check(hip.hipMemcpyAsync(dest, src, sz, hip.hipMemcpyDeviceToDevice, None))

 class HIPDevice(Compiled):
  def __init__(self, device:str=""):
    self.device = int(device.split(":")[1]) if ":" in device else 0
    self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device))).gcnArchName.decode() if not MOCKHIP else "gfx1100"  # noqa: E501
    self.pending_copyin: List[hip.hipDeviceptr_t] = []
+    self.pending_events: List[hip.hipEvent_t] = []

    from tinygrad.runtime.graph.hip import HIPGraph
    super().__init__(MallocAllocator if MOCKHIP else HIPAllocator(self), LinearizerOptions("HIP"), HIPRenderer,
@@ -101,4 +106,15 @@ class HIPDevice(Compiled):
    check(hip.hipSetDevice(self.device))
    check(hip.hipDeviceSynchronize())
    for opaque in self.pending_copyin: check(hip.hipFree(opaque))
-    self.pending_copyin.clear()
+    for opaque in self.pending_events: check(hip.hipEventDestroy(opaque))
+    self.pending_copyin.clear()
+    self.pending_events.clear()
+  def event(self):
+    check(hip.hipSetDevice(self.device))
+    evt = init_c_var(hip.hipEvent_t(), lambda x: check(hip.hipEventCreate(ctypes.byref(x))))
+    self.pending_events.append(evt)
+    check(hip.hipEventRecord(evt, None))
+    return evt
+  def block(self, evt):
+    check(hip.hipSetDevice(self.device))
+    check(hip.hipStreamWaitEvent(None, evt, 0))