multi cl_queue (#762)

* multi cl_queue
* only platforms 1
* gpus first, then cpus
* put device on underlying buffer
* cl_queue array
2026-02-10 22:54:59 -05:00 · 2023-05-03 12:15:28 -07:00
parent 7757f5fed2
commit 7ecf4dff68
4 changed files with 28 additions and 25 deletions
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -14,16 +14,17 @@ FLOAT16 = getenv("FLOAT16", 0)

 class _CL:
  def __init__(self):
-    devices: List[cl.Device] = sum([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()], [])
-    if len(devices) == 0: devices = sum([x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()], []) # settle for CPU
-    if len(devices) > 1 or DEBUG >= 1: print(f"using {devices[getenv('CL_DEVICE', 0)]}")
-    self.cl_ctx: cl.Context = cl.Context(devices=[devices[getenv("CL_DEVICE", 0)]])
-    self.cl_queue: cl.CommandQueue = cl.CommandQueue(self.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)  # this is an in-order command queue
+    platforms: List[List[cl.Device]] = [y for y in ([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()] + [x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()]) if len(y)]
+    if DEBUG >= 1: print(f"using {platforms[getenv('CL_PLATFORM', 0)]}")
+    self.cl_ctx: cl.Context = cl.Context(devices=platforms[getenv('CL_PLATFORM', 0)])
+    self.cl_queue: List[cl.CommandQueue] = [cl.CommandQueue(self.cl_ctx, device=device, properties=cl.command_queue_properties.PROFILING_ENABLE) for device in self.cl_ctx.devices]
+  def synchronize(self):
+    for q in self.cl_queue: q.finish()
 CL = _CL()

 # TODO: merge CLImage in here
 class CLBuffer(RawBufferCopyInOut):
-  def __init__(self, size, dtype):
+  def __init__(self, size, dtype, device=0):
    if isinstance(dtype, ImageDType):
      fmt = cl.ImageFormat(cl.channel_order.RGBA, {2: cl.channel_type.HALF_FLOAT, 4: cl.channel_type.FLOAT}[dtype.itemsize])
      buf = cl.Image(CL.cl_ctx, cl.mem_flags.READ_WRITE, fmt, shape=(dtype.shape[1], dtype.shape[0]))
@@ -31,13 +32,14 @@ class CLBuffer(RawBufferCopyInOut):
      # NOTE: the memory is a bit off here due to padding, it's buf.row_pitch * buf.height * 4 * dtype.itemsize
    else:
      buf = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, size * dtype.itemsize)
+    setattr(buf, 'device', device)  # device is tracked on the underlying buffer
    super().__init__(size, dtype, buf)
  def _copyin(self, x:np.ndarray):
    assert not self.dtype.name.startswith("image"), f"can't copyin images {self.dtype}"
-    cl.enqueue_copy(CL.cl_queue, self._buf, x, is_blocking=False)
+    cl.enqueue_copy(CL.cl_queue[self._buf.device], self._buf, x, is_blocking=False)
  def _copyout(self, x:np.ndarray):
    assert not self.dtype.name.startswith("image"), f"can't copyout images {self.dtype}"
-    cl.enqueue_copy(CL.cl_queue, x, self._buf, is_blocking=True)
+    cl.enqueue_copy(CL.cl_queue[self._buf.device], x, self._buf, is_blocking=True)

 class CLProgram:
  def __init__(self, name:str, prg:str, binary=False, argdtypes=None, options=None):
@@ -63,7 +65,8 @@ class CLProgram:
  def max_work_group_size(): return CL.cl_ctx.devices[0].max_work_group_size

  def __call__(self, global_size, local_size, *bufs, wait=False) -> Optional[float]:
-    e = self.clprg(CL.cl_queue, global_size, local_size, *[x._buf if isinstance(x, CLBuffer) else x for x in bufs])
+    cl_bufs = [x._buf if isinstance(x, CLBuffer) else x for x in bufs]
+    e = self.clprg(CL.cl_queue[cl_bufs[0].device], global_size, local_size, *cl_bufs)
    if wait:
      e.wait()
      return ((e.profile.end - e.profile.start) * OSX_TIMING_RATIO) * 1e-9
@@ -76,4 +79,4 @@ class CLCodegen(CStyleCodegen):
    barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)",
    gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)], uses_vload=True)

-GPUBuffer = Compiled(CLBuffer, CLCodegen, CLProgram, CL.cl_queue.finish)
+GPUBuffer = Compiled(CLBuffer, CLCodegen, CLProgram, CL.synchronize)