limit gl*lc (#15359)

nimlgen authored 2026-03-19 12:38:55 +08:00, committed by GitHub
parent b39816e998
commit 86eec01f97
5 changed files with 19 additions and 3 deletions

test/unit/test_gpudims.py

@@ -1,9 +1,10 @@
 import unittest, math
 import z3
-from tinygrad.codegen.gpudims import get_grouped_dims
-from tinygrad.uop.ops import UOp, Ops
+from tinygrad.codegen.gpudims import get_grouped_dims, add_gpudims
+from tinygrad.uop.ops import UOp, Ops, KernelInfo, AxisType
 from tinygrad.uop.validate import uops_to_z3
 from tinygrad.dtype import dtypes
+from tinygrad.renderer import Renderer
 from tinygrad.helpers import flatten, dedup

 class TestGroupedDims(unittest.TestCase):
@@ -93,6 +94,14 @@ class TestGroupedDims(unittest.TestCase):
     assert idxs[2].op is Ops.SPECIAL, f"expected SPECIAL for direct-mapped dim, got {idxs[2].op}"
     assert idxs[3].op is Ops.SPECIAL, f"expected SPECIAL for direct-mapped dim, got {idxs[3].op}"

+  def test_global_prod_max(self):
+    g, l = UOp.range(256, 0, AxisType.GLOBAL), UOp.range(256, 1, AxisType.LOCAL)
+    sink = UOp(Ops.PARAM, dtypes.float.ptr(), (), 0).index(g + l).store(UOp.const(dtypes.float, 1.0)).end(g, l).sink(arg=KernelInfo())
+    class R(Renderer): global_max, local_max, global_prod_max = (256, 256, 256), (128, 128, 128), (128, 128, 128)
+    specials = [u for u in add_gpudims(R(), sink).toposort() if u.op is Ops.SPECIAL]
+    self.assertGreater(len([s for s in specials if "lidx" in s.arg]), 1)
+    self.assertGreater(len([s for s in specials if "gidx" in s.arg]), 1)
+
   def test_max_sizes_none(self):
     self._check_grouped_dims("gidx", (2,3,4), None, False, [2,3,4])
     self._check_grouped_dims("gidx", (100,), None, False, [100])

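A rough walk-through of the numbers the new test pins down, with the caveat that the exact factorization get_grouped_dims picks is an implementation detail (the (128, 2) split below is an assumption):

# Sketch of the clamp arithmetic behind test_global_prod_max, assuming the
# local range of 256 is grouped as (128, 2) under local_max=(128, 128, 128).
hw_local = [128, 2]
global_max, global_prod_max = (256, 256, 256), (128, 128, 128)
eff = tuple(min(gm, pm // l) for gm, pm, l in zip(global_max, global_prod_max, hw_local + [1] * 3))
print(eff)  # (1, 64, 128): the global range of 256 exceeds every per-dim cap,
            # so it must also be split, giving the >1 gidx and >1 lidx SPECIALs asserted above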
tinygrad/codegen/gpudims.py

@@ -81,7 +81,11 @@ def add_gpudims(ctx:Renderer, s:UOp):
     idxs = get_grouped_dims("idx", global_shape, ctx.global_max, reverse=True)
   else:
     # define indexes for GPU-like execution
-    idxs = get_grouped_dims("gidx", global_shape, ctx.global_max, reverse=True) + get_grouped_dims("lidx", local_shape, ctx.local_max)
+    local_idxs = get_grouped_dims("lidx", local_shape, ctx.local_max)
+    hw_local = [_dim_max(u.src[0]) for u in local_idxs if u.op is Ops.SPECIAL]
+    global_max = ctx.global_max if ctx.global_prod_max is None else \
+      tuple(min(gm, pm//l) for gm,pm,l in zip(ctx.global_max or ctx.global_prod_max, ctx.global_prod_max, hw_local+[1]*3))
+    idxs = get_grouped_dims("gidx", global_shape, global_max, reverse=True) + local_idxs
   # apply to multiple ranges
   subs = {}

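Pulled out of add_gpudims, the new cap reduces to the sketch below; clamp_global_max is a hypothetical name, and the hardware local sizes are passed as plain ints instead of being read from the lidx SPECIALs via _dim_max:

def clamp_global_max(global_max, global_prod_max, hw_local):
  # renderer declares no product limit: keep the plain per-dim cap
  if global_prod_max is None: return global_max
  # otherwise cap each global dim so that global[i] * local[i] <= global_prod_max[i];
  # pad with 1s since fewer than three lidx dims may be in use
  return tuple(min(gm, pm // l) for gm, pm, l in
               zip(global_max or global_prod_max, global_prod_max, list(hw_local) + [1] * 3))

# e.g. with the 32-bit product cap AMDHIPRenderer declares below and a 256-wide local dim:
assert clamp_global_max((2147483647, 65535, 65535), (0xFFFFFFFF,) * 3, [256, 1, 1]) == (16777215, 65535, 65535)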
tinygrad/renderer/__init__.py

@@ -142,6 +142,7 @@ class Renderer:
   # NOTE: these two should be in (x,y,z) order to match the max_sizes argument in get_grouped_dims
   global_max: tuple[int, ...]|None = (0x8FFFFFFF,) * (3) # TODO: Ops.SPECIAL int32 indexes right now
   local_max: tuple[int, ...]|None = (0x8FFFFFFF,) * (3) # TODO: Ops.SPECIAL int32 indexes right now
+  global_prod_max: tuple[int, ...]|None = None
   shared_max: int = 32768
   tensor_cores: list[TensorCore] = []
   pre_matcher: PatternMatcher|None = None

tinygrad/renderer/cstyle.py

@@ -468,6 +468,7 @@ class AMDHIPRenderer(CStyleLanguage):
   shared_max = 65536
   # NOTE: this is only really needed on gfx12, even though gfx11 reports the same limitation
   global_max = (2147483647, 65535, 65535)
+  global_prod_max = (0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
   @staticmethod
   def get_tensor_cores(arch):

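A hedged reading of why both limits coexist here: global_max bounds each dispatch dimension on its own, while global_prod_max bounds the global*local product per dimension, presumably because the grid-size fields that count work-items on this hardware are 32-bit. A quick sanity check of the arithmetic:

g, l, cap = 2**31 - 1, 256, 0xFFFFFFFF
assert g * l > cap                  # the raw launch would overflow a 32-bit work-item count
assert min(g, cap // l) * l <= cap  # the clamped global size keeps the product in range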
tinygrad/renderer/llvmir.py

@@ -216,6 +216,7 @@ class AMDLLVMRenderer(LLVMRenderer):
   has_local = True
   shared_max = AMDHIPRenderer.shared_max
   global_max = AMDHIPRenderer.global_max
+  global_prod_max = AMDHIPRenderer.global_prod_max
   abi = "amdgpu_kernel"
   code_for_op = {**LLVMRenderer.code_for_op, **{op: lambda: None for op in llvm_intrinsics}}
   string_rewrite = PatternMatcher([