event driven hip (#3160)

* event driven hip

* simpler, src makes copy

* pass mypy
This commit is contained in:
George Hotz
2024-01-18 14:35:18 -08:00
committed by GitHub
parent d2aab65958
commit c80884884e
4 changed files with 40 additions and 11 deletions

View File

@@ -424,10 +424,11 @@ After you are done speaking, output [EOS]. You are not Chad.
st = GlobalCounters.time_sum_s
with Profiling(enabled=args.profile):
with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/sec"):
with Timing("ran model in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_count*1e-9*2/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
tok = llama.model(Tensor([toks[start_pos:]], device=device), start_pos, args.temperature).item()
tok_tensor = llama.model(Tensor([toks[start_pos:]], device=device), start_pos, args.temperature)
tok = tok_tensor.item()
# use the kv cache
start_pos = len(toks)

12
extra/hip_events.py Normal file
View File

@@ -0,0 +1,12 @@
import ctypes
import gpuctypes.hip as hip
from tinygrad.runtime.ops_hip import check
from tinygrad.helpers import init_c_var
if __name__ == "__main__":
check(hip.hipSetDevice(0))
evt = init_c_var(hip.hipEvent_t(), lambda x: check(hip.hipEventCreate(ctypes.byref(x))))
check(hip.hipSetDevice(1))
check(hip.hipStreamWaitEvent(None, evt, 0))
check(hip.hipSetDevice(0))
check(hip.hipEventRecord(evt, None))

View File

@@ -68,8 +68,8 @@ def update_stats(name:str, op_estimate:sint, mem_estimate:int, var_vals: Optiona
class Buffer:
def __init__(self, device:str, size:int, dtype:DType, opaque:Any=None):
assert isinstance(dtype, DType)
self.device, self.size, self.dtype = device, size, dtype
self.allocator = Device[self.device].allocator
self.device, self.size, self.dtype, self.d = device, size, dtype, Device[device]
self.allocator = self.d.allocator
# TODO: image hack shouldn't be here. where should it be?
self._buf = opaque if opaque is not None else self.allocator.alloc(dtype if isinstance(dtype, ImageDType) else size * dtype.itemsize)
# TODO: mem_used for all devices
@@ -97,11 +97,11 @@ class Buffer:
return mv
def _internal_buffer_copy(dest:Buffer, src:Buffer):
if hasattr(dest.allocator, 'transfer') and type(dest.allocator) is type(src.allocator): # noqa: E721
if hasattr(src.allocator, 'transfer') and type(dest.allocator) is type(src.allocator): # noqa: E721
# fast path, used on HIP between GPUs
# NOTE: it's important we use the dest device here to ensure the transfer is ready
Device[src.device].synchronize() # TODO: async this
dest.allocator.transfer(dest._buf, src._buf, dest.size*dest.dtype.itemsize)
src.allocator.transfer(dest._buf, src._buf, dest.size*dest.dtype.itemsize)
if hasattr(dest.d, "block") and hasattr(src.d, "event"): dest.d.block(src.d.event())
return
if getenv("FROM_BUFFER") and hasattr(dest.allocator, 'from_buffer') and hasattr(dest.allocator, 'transfer') and hasattr(src.allocator, 'as_buffer'):
# fast path, used on Metal in OS X Sonoma
@@ -128,7 +128,7 @@ class _BufferCopy(JITRunner):
_internal_buffer_copy(dest, src)
et = None
if wait or DEBUG >= 2:
Device[dest.device].synchronize()
dest.d.synchronize()
et = time.perf_counter() - st
update_stats(colored(f"copy {dest.size*dest.dtype.itemsize:8d}, {dest.device[:7]:>7s} <- {src.device[:7]:7s}", "yellow"),
0, dest.size*dest.dtype.itemsize, {}, et, 2, jit, device=dest.device)

View File

@@ -46,6 +46,9 @@ class HIPAllocator(LRUAllocator):
def __init__(self, device:HIPDevice):
self.device = device
super().__init__()
def free_cache(self):
self.device.synchronize()
return super().free_cache()
def _alloc(self, size:int):
check(hip.hipSetDevice(self.device.device))
return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
@@ -63,6 +66,7 @@ class HIPAllocator(LRUAllocator):
for local_offset in range(0, size+minor_offset, CHUNK_SIZE):
local_size = min(round_up(size+minor_offset, PAGE_SIZE)-local_offset, CHUNK_SIZE)
if self.hb_events[self.hb_polarity] is not None:
# NOTE: block doesn't work here because we modify the CPU memory
check(hip.hipEventSynchronize(self.hb_events[self.hb_polarity]))
check(hip.hipEventDestroy(self.hb_events[self.hb_polarity]))
self.hb_events[self.hb_polarity] = None
@@ -81,18 +85,19 @@ class HIPAllocator(LRUAllocator):
ctypes.memmove(host_mem, from_mv(src), len(src))
check(hip.hipMemcpyAsync(dest, host_mem, len(src), hip.hipMemcpyHostToDevice, None))
def copyout(self, dest:memoryview, src:T):
self.device.synchronize()
check(hip.hipSetDevice(self.device.device))
check(hip.hipMemcpy(from_mv(dest), src, len(dest), hip.hipMemcpyDeviceToHost))
def transfer(self, dest:T, src:T, sz:int):
check(hip.hipSetDevice(self.device.device))
# TODO: hipMemcpyAsync, but you have to track the "src" buffer to not free it
check(hip.hipMemcpy(dest, src, sz, hip.hipMemcpyDeviceToDevice))
check(hip.hipMemcpyAsync(dest, src, sz, hip.hipMemcpyDeviceToDevice, None))
class HIPDevice(Compiled):
def __init__(self, device:str=""):
self.device = int(device.split(":")[1]) if ":" in device else 0
self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device))).gcnArchName.decode() if not MOCKHIP else "gfx1100" # noqa: E501
self.pending_copyin: List[hip.hipDeviceptr_t] = []
self.pending_events: List[hip.hipEvent_t] = []
from tinygrad.runtime.graph.hip import HIPGraph
super().__init__(MallocAllocator if MOCKHIP else HIPAllocator(self), LinearizerOptions("HIP"), HIPRenderer,
@@ -101,4 +106,15 @@ class HIPDevice(Compiled):
check(hip.hipSetDevice(self.device))
check(hip.hipDeviceSynchronize())
for opaque in self.pending_copyin: check(hip.hipFree(opaque))
self.pending_copyin.clear()
for opaque in self.pending_events: check(hip.hipEventDestroy(opaque))
self.pending_copyin.clear()
self.pending_events.clear()
def event(self):
check(hip.hipSetDevice(self.device))
evt = init_c_var(hip.hipEvent_t(), lambda x: check(hip.hipEventCreate(ctypes.byref(x))))
self.pending_events.append(evt)
check(hip.hipEventRecord(evt, None))
return evt
def block(self, evt):
check(hip.hipSetDevice(self.device))
check(hip.hipStreamWaitEvent(None, evt, 0))