Mirror of https://github.com/tinygrad/tinygrad.git (synced 2026-01-09 06:58:11 -05:00).
event driven hip (#3160)
* event driven hip * simpler, src makes copy * pass mypy
This commit is contained in:
@@ -424,10 +424,11 @@
|
||||
st = GlobalCounters.time_sum_s
|
||||
with Profiling(enabled=args.profile):
|
||||
with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/sec"):
|
||||
with Timing("ran model in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
|
||||
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
|
||||
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
|
||||
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_count*1e-9*2/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
|
||||
tok = llama.model(Tensor([toks[start_pos:]], device=device), start_pos, args.temperature).item()
|
||||
tok_tensor = llama.model(Tensor([toks[start_pos:]], device=device), start_pos, args.temperature)
|
||||
tok = tok_tensor.item()
|
||||
|
||||
# use the kv cache
|
||||
start_pos = len(toks)
|
||||
|
||||
12
extra/hip_events.py
Normal file
12
extra/hip_events.py
Normal file
@@ -0,0 +1,12 @@
|
||||
import ctypes
|
||||
import gpuctypes.hip as hip
|
||||
from tinygrad.runtime.ops_hip import check
|
||||
from tinygrad.helpers import init_c_var
|
||||
|
||||
if __name__ == "__main__":
|
||||
check(hip.hipSetDevice(0))
|
||||
evt = init_c_var(hip.hipEvent_t(), lambda x: check(hip.hipEventCreate(ctypes.byref(x))))
|
||||
check(hip.hipSetDevice(1))
|
||||
check(hip.hipStreamWaitEvent(None, evt, 0))
|
||||
check(hip.hipSetDevice(0))
|
||||
check(hip.hipEventRecord(evt, None))
|
||||
@@ -68,8 +68,8 @@ def update_stats(name:str, op_estimate:sint, mem_estimate:int, var_vals: Optiona
|
||||
class Buffer:
|
||||
def __init__(self, device:str, size:int, dtype:DType, opaque:Any=None):
|
||||
assert isinstance(dtype, DType)
|
||||
self.device, self.size, self.dtype = device, size, dtype
|
||||
self.allocator = Device[self.device].allocator
|
||||
self.device, self.size, self.dtype, self.d = device, size, dtype, Device[device]
|
||||
self.allocator = self.d.allocator
|
||||
# TODO: image hack shouldn't be here. where should it be?
|
||||
self._buf = opaque if opaque is not None else self.allocator.alloc(dtype if isinstance(dtype, ImageDType) else size * dtype.itemsize)
|
||||
# TODO: mem_used for all devices
|
||||
@@ -97,11 +97,11 @@ class Buffer:
|
||||
return mv
|
||||
|
||||
def _internal_buffer_copy(dest:Buffer, src:Buffer):
|
||||
if hasattr(dest.allocator, 'transfer') and type(dest.allocator) is type(src.allocator): # noqa: E721
|
||||
if hasattr(src.allocator, 'transfer') and type(dest.allocator) is type(src.allocator): # noqa: E721
|
||||
# fast path, used on HIP between GPUs
|
||||
# NOTE: it's important we use the dest device here to ensure the transfer is ready
|
||||
Device[src.device].synchronize() # TODO: async this
|
||||
dest.allocator.transfer(dest._buf, src._buf, dest.size*dest.dtype.itemsize)
|
||||
src.allocator.transfer(dest._buf, src._buf, dest.size*dest.dtype.itemsize)
|
||||
if hasattr(dest.d, "block") and hasattr(src.d, "event"): dest.d.block(src.d.event())
|
||||
return
|
||||
if getenv("FROM_BUFFER") and hasattr(dest.allocator, 'from_buffer') and hasattr(dest.allocator, 'transfer') and hasattr(src.allocator, 'as_buffer'):
|
||||
# fast path, used on Metal in OS X Sonoma
|
||||
@@ -128,7 +128,7 @@ class _BufferCopy(JITRunner):
|
||||
_internal_buffer_copy(dest, src)
|
||||
et = None
|
||||
if wait or DEBUG >= 2:
|
||||
Device[dest.device].synchronize()
|
||||
dest.d.synchronize()
|
||||
et = time.perf_counter() - st
|
||||
update_stats(colored(f"copy {dest.size*dest.dtype.itemsize:8d}, {dest.device[:7]:>7s} <- {src.device[:7]:7s}", "yellow"),
|
||||
0, dest.size*dest.dtype.itemsize, {}, et, 2, jit, device=dest.device)
|
||||
|
||||
@@ -46,6 +46,9 @@ class HIPAllocator(LRUAllocator):
|
||||
def __init__(self, device:HIPDevice):
|
||||
self.device = device
|
||||
super().__init__()
|
||||
def free_cache(self):
|
||||
self.device.synchronize()
|
||||
return super().free_cache()
|
||||
def _alloc(self, size:int):
|
||||
check(hip.hipSetDevice(self.device.device))
|
||||
return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
|
||||
@@ -63,6 +66,7 @@ class HIPAllocator(LRUAllocator):
|
||||
for local_offset in range(0, size+minor_offset, CHUNK_SIZE):
|
||||
local_size = min(round_up(size+minor_offset, PAGE_SIZE)-local_offset, CHUNK_SIZE)
|
||||
if self.hb_events[self.hb_polarity] is not None:
|
||||
# NOTE: block doesn't work here because we modify the CPU memory
|
||||
check(hip.hipEventSynchronize(self.hb_events[self.hb_polarity]))
|
||||
check(hip.hipEventDestroy(self.hb_events[self.hb_polarity]))
|
||||
self.hb_events[self.hb_polarity] = None
|
||||
@@ -81,18 +85,19 @@ class HIPAllocator(LRUAllocator):
|
||||
ctypes.memmove(host_mem, from_mv(src), len(src))
|
||||
check(hip.hipMemcpyAsync(dest, host_mem, len(src), hip.hipMemcpyHostToDevice, None))
|
||||
def copyout(self, dest:memoryview, src:T):
|
||||
self.device.synchronize()
|
||||
check(hip.hipSetDevice(self.device.device))
|
||||
check(hip.hipMemcpy(from_mv(dest), src, len(dest), hip.hipMemcpyDeviceToHost))
|
||||
def transfer(self, dest:T, src:T, sz:int):
|
||||
check(hip.hipSetDevice(self.device.device))
|
||||
# TODO: hipMemcpyAsync, but you have to track the "src" buffer to not free it
|
||||
check(hip.hipMemcpy(dest, src, sz, hip.hipMemcpyDeviceToDevice))
|
||||
check(hip.hipMemcpyAsync(dest, src, sz, hip.hipMemcpyDeviceToDevice, None))
|
||||
|
||||
class HIPDevice(Compiled):
|
||||
def __init__(self, device:str=""):
|
||||
self.device = int(device.split(":")[1]) if ":" in device else 0
|
||||
self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device))).gcnArchName.decode() if not MOCKHIP else "gfx1100" # noqa: E501
|
||||
self.pending_copyin: List[hip.hipDeviceptr_t] = []
|
||||
self.pending_events: List[hip.hipEvent_t] = []
|
||||
|
||||
from tinygrad.runtime.graph.hip import HIPGraph
|
||||
super().__init__(MallocAllocator if MOCKHIP else HIPAllocator(self), LinearizerOptions("HIP"), HIPRenderer,
|
||||
@@ -101,4 +106,15 @@ class HIPDevice(Compiled):
|
||||
check(hip.hipSetDevice(self.device))
|
||||
check(hip.hipDeviceSynchronize())
|
||||
for opaque in self.pending_copyin: check(hip.hipFree(opaque))
|
||||
self.pending_copyin.clear()
|
||||
for opaque in self.pending_events: check(hip.hipEventDestroy(opaque))
|
||||
self.pending_copyin.clear()
|
||||
self.pending_events.clear()
|
||||
def event(self):
|
||||
check(hip.hipSetDevice(self.device))
|
||||
evt = init_c_var(hip.hipEvent_t(), lambda x: check(hip.hipEventCreate(ctypes.byref(x))))
|
||||
self.pending_events.append(evt)
|
||||
check(hip.hipEventRecord(evt, None))
|
||||
return evt
|
||||
def block(self, evt):
|
||||
check(hip.hipSetDevice(self.device))
|
||||
check(hip.hipStreamWaitEvent(None, evt, 0))
|
||||
Reference in New Issue
Block a user