diff --git a/tinygrad/device.py b/tinygrad/device.py index 0c6c7f84f5..2b9babb560 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -99,9 +99,13 @@ class Buffer: def _internal_buffer_copy(dest:Buffer, src:Buffer): if hasattr(src.allocator, 'transfer') and type(dest.allocator) is type(src.allocator): # noqa: E721 # fast path, used on HIP between GPUs - # NOTE: it's important we use the dest device here to ensure the transfer is ready + # NOTE: we have to block here so the data isn't copied too early. this is probably due to buffer reuse + if hasattr(src.d, "block") and hasattr(dest.d, "event"): src.d.block(dest.d.event()) + else: dest.d.synchronize() src.allocator.transfer(dest._buf, src._buf, dest.size*dest.dtype.itemsize) + # NOTE: we have to block here so the data is ready on dest when dest needs it if hasattr(dest.d, "block") and hasattr(src.d, "event"): dest.d.block(src.d.event()) + else: src.d.synchronize() return if getenv("FROM_BUFFER") and hasattr(dest.allocator, 'from_buffer') and hasattr(dest.allocator, 'transfer') and hasattr(src.allocator, 'as_buffer'): # fast path, used on Metal in OS X Sonoma