cuda transfer + async copyin (#3873)

nimlgen
2024-03-22 19:01:37 +03:00
committed by GitHub
parent 624bc89910
commit 8ef5490ec8
2 changed files with 36 additions and 10 deletions


@@ -31,14 +31,13 @@ def lower_schedule_item(si:ScheduleItem) -> Optional[JITRunner]:
   if si.ast[0].op is BufferOps.STORE: return Device[si.outputs[0].device].get_runner(*si.ast)
   assert len(si.ast) == 1 and len(si.outputs) == 1, "only ASTRunner supports multioutput"
   out, ast = si.outputs[0], si.ast[0]
-  if ast.op in {LoadOps.SYNC, LoadOps.WAIT} and out.device.startswith("HSA") and si.inputs[0].device.startswith("HSA"):
-    # Our HSA runtime handles synchronization
-    if ast.op is LoadOps.SYNC: return None
   if ast.op is LoadOps.COPY:
     if hasattr(Device[out.device].allocator, 'transfer') and type(Device[out.device]) is type(Device[si.inputs[0].device]): return BufferXfer()
     if si.inputs[0].device.startswith("DISK"): return BufferRead()
     return BufferCopy()
   if ast.op is LoadOps.CUSTOM: return CustomOp(ast.arg)
+  if ast.op is LoadOps.SYNC and out.device.startswith("CUDA") and si.inputs[0].device.startswith("CUDA"): return None
+  if ast.op is LoadOps.SYNC and out.device.startswith("HSA") and si.inputs[0].device.startswith("HSA"): return None
   if ast.op is LoadOps.SYNC: return SyncOp(out.device) if isinstance(Device[out.device], Compiled) else None
   return None
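
Note: with CUDAAllocator gaining a transfer method (second file below), a LoadOps.COPY between two devices of the same type now lowers to BufferXfer, a direct device-to-device copy, instead of a BufferCopy staged through host memory, and a LoadOps.SYNC between two CUDA buffers becomes a no-op. A minimal usage sketch of the new path, assuming a CUDA build of tinygrad with at least two visible GPUs and a driver that allows copies between their contexts (device strings and shapes here are arbitrary):

from tinygrad import Tensor

a = Tensor.rand(1024, 1024, device="CUDA").realize()  # materialized on the default CUDA device
b = a.to("CUDA:1").realize()                          # LoadOps.COPY, same device type + allocator.transfer -> BufferXfer
print(b.device)                                       # "CUDA:1"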


@@ -1,10 +1,10 @@
 from __future__ import annotations
 import subprocess, hashlib, tempfile, ctypes, ctypes.util, functools, re
 from pathlib import Path
-from typing import Tuple, Optional
+from typing import Tuple, Optional, List
 import tinygrad.runtime.autogen.cuda as cuda
 from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler
+from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler, BufferOptions
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import CUDARenderer
 from tinygrad.renderer.assembly import PTXRenderer
@@ -122,15 +122,32 @@ class CUDAAllocator(LRUAllocator):
   def _alloc(self, size):
     check(cuda.cuCtxSetCurrent(self.device.context))
     return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
+  def _alloc_with_options(self, size:int, options:BufferOptions):
+    if options.host:
+      return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0)))
+    else:
+      raise Exception("no options")
   def _free(self, opaque): check(cuda.cuMemFree_v2(opaque))
   def copyin(self, dest, src:memoryview):
     check(cuda.cuCtxSetCurrent(self.device.context))
-    check(cuda.cuMemcpyHtoD_v2(dest, from_mv(src), len(src), None))
+    host_mem = self._alloc_with_options(len(src), BufferOptions(host=True))
+    self.device.pending_copyin.append(host_mem.value)
+    ctypes.memmove(host_mem, from_mv(src), len(src))
+    check(cuda.cuMemcpyHtoDAsync_v2(dest, host_mem, len(src), None))
   def copyout(self, dest:memoryview, src):
+    CUDADevice.synchronize_system()
     check(cuda.cuCtxSetCurrent(self.device.context))
     check(cuda.cuMemcpyDtoH_v2(from_mv(dest), src, len(dest)))
+  def transfer(self, dest, src, sz:int, src_dev, dest_dev):
+    check(cuda.cuCtxSetCurrent(src_dev.context))
+    check(cuda.cuEventCreate(ctypes.byref(sync_event := cuda.CUevent()), 0))
+    check(cuda.cuMemcpyDtoDAsync_v2(dest, src, sz, None))
+    check(cuda.cuEventRecord(sync_event, None))
+    check(cuda.cuCtxSetCurrent(dest_dev.context))
+    check(cuda.cuStreamWaitEvent(None, sync_event, 0)) # sync the default stream on the dest dev
 class CUDADevice(Compiled):
+  devices: List[CUDADevice] = []
   def __init__(self, device:str):
     device_id = int(device.split(":")[1]) if ":" in device else 0
     if not CUDACPU:
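
Note: copyin is now asynchronous. cuMemcpyHtoDAsync_v2 only overlaps with host execution (and the caller's memoryview is only safe to reuse) when the source is page-locked, so the data is first memmoved into a cuMemHostAlloc'd staging buffer. That buffer cannot be freed until the DMA has finished, so its pointer is parked in the device's pending_copyin list and released on the next synchronize(). A sketch of the resulting lifetime, assuming a CUDA build of tinygrad (the Tensor/Device calls below should exercise copyin, synchronize, and copyout; names and sizes are arbitrary):

import numpy as np
from tinygrad import Tensor, Device

src = np.ones(4096, dtype=np.float32)
t = Tensor(src, device="CUDA").realize()   # copyin: memmove into a pinned staging buffer, async HtoD enqueued
src[:] = 0                                 # safe: the upload reads the staging copy, not this array
Device["CUDA"].synchronize()               # drains the context, then cuMemFreeHost()s everything in pending_copyin
assert t.numpy().sum() == 4096             # copyout first calls CUDADevice.synchronize_system()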
@@ -138,13 +155,23 @@ class CUDADevice(Compiled):
       check(cuda.cuDeviceGet(ctypes.byref(cu_device := cuda.CUdevice()), device_id))
       self.context = init_c_var(cuda.CUcontext(), lambda x: check(cuda.cuCtxCreate_v2(ctypes.byref(x), 0, cu_device)))
       check(cuda.cuDeviceComputeCapability(ctypes.byref(major := ctypes.c_int()), ctypes.byref(minor := ctypes.c_int()), device_id))
     self.arch = f"sm_{major.value}{minor.value}" if not CUDACPU else "sm_35"
+    self.pending_copyin: List[int] = []
+    CUDADevice.devices.append(self)
     from tinygrad.runtime.graph.cuda import CUDAGraph
     super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator,
                      PTXCompiler(self.arch) if getenv("PTX") else CUDACompiler(self.arch),
                      functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None)
   def synchronize(self):
-    if not CUDACPU:
-      check(cuda.cuCtxSetCurrent(self.context))
-      check(cuda.cuCtxSynchronize())
+    if CUDACPU: return
+    check(cuda.cuCtxSetCurrent(self.context))
+    check(cuda.cuCtxSynchronize())
+    for opaque in self.pending_copyin: check(cuda.cuMemFreeHost(opaque))
+    self.pending_copyin.clear()
+  @staticmethod
+  def synchronize_system():
+    for d in CUDADevice.devices: d.synchronize()
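
Note on the new bookkeeping: every copyin appends one pinned host buffer to pending_copyin, and nothing frees it except synchronize() (which copyout reaches via synchronize_system()). A workload that only uploads data and never reads results back should therefore synchronize occasionally to keep host pinned memory bounded. A sketch of that pattern, assuming a CUDA build of tinygrad (shapes and interval are arbitrary):

import numpy as np
from tinygrad import Tensor, Device

for step in range(100):
  batch = np.random.rand(32, 64).astype(np.float32)
  x = Tensor(batch, device="CUDA").realize()        # each upload appends one pinned buffer to pending_copyin
  # ... enqueue work on x ...
  if step % 10 == 9: Device["CUDA"].synchronize()   # frees the accumulated cuMemHostAlloc'd staging buffers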