global -> group (#1007)

* global -> group * allow None for local_size in custom function * lil local * comment on shape * fix cuda * smart local cast * better local heuristic * fix ptx, and work_dim cleanup * fix metal * fix ops test * fix openpilot jit * no more optlocal * might fix metal tests * try metal now * see generated metal code * test free removal. REVERT THIS * mergable
2026-02-10 22:54:59 -05:00 · 2023-06-21 11:50:43 -07:00
parent aab9ee0fca
commit 18892242b0
17 changed files with 81 additions and 90 deletions
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -76,7 +76,7 @@ class CLProgram:

  def __call__(self, global_size, local_size, *bufs, wait=False) -> Optional[float]:
    cl_bufs = [x._buf if isinstance(x, CLBuffer) else x for x in bufs]
-    e = self.clprg(CL.cl_queue[cl_bufs[0].device], global_size, local_size, *cl_bufs)
+    e = self.clprg(CL.cl_queue[cl_bufs[0].device], [g*l for g,l in zip(global_size, local_size)] if local_size is not None else global_size, local_size, *cl_bufs)
    if wait:
      e.wait()
      try:
@@ -91,6 +91,6 @@ class CLCodegen(CStyleCodegen):
    double_prekernel="#ifdef cl_khr_fp64\n#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n#elif defined(cl_amd_fp64)\n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n#endif",
    half_prekernel = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable",
    barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)",
-    gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)], uses_vload=True)
+    gid = [f'get_group_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)], uses_vload=True)

 GPUBuffer = Compiled(CLBuffer, fromimport("tinygrad.codegen.assembly_rdna", "RDNACodegen") if getenv("RDNA") else CLCodegen, CLProgram, CL.synchronize)