limit gl*lc (#15359)

nimlgen authored 2026-03-19 12:38:55 +08:00, committed by GitHub
parent b39816e998
commit 86eec01f97
5 changed files with 19 additions and 3 deletions

test/unit/test_gpudims.py

@@ -1,9 +1,10 @@
 import unittest, math
 import z3
-from tinygrad.codegen.gpudims import get_grouped_dims
-from tinygrad.uop.ops import UOp, Ops
+from tinygrad.codegen.gpudims import get_grouped_dims, add_gpudims
+from tinygrad.uop.ops import UOp, Ops, KernelInfo, AxisType
 from tinygrad.uop.validate import uops_to_z3
 from tinygrad.dtype import dtypes
+from tinygrad.renderer import Renderer
 from tinygrad.helpers import flatten, dedup

 class TestGroupedDims(unittest.TestCase):
@@ -93,6 +94,14 @@ class TestGroupedDims(unittest.TestCase):
     assert idxs[2].op is Ops.SPECIAL, f"expected SPECIAL for direct-mapped dim, got {idxs[2].op}"
     assert idxs[3].op is Ops.SPECIAL, f"expected SPECIAL for direct-mapped dim, got {idxs[3].op}"

+  def test_global_prod_max(self):
+    g, l = UOp.range(256, 0, AxisType.GLOBAL), UOp.range(256, 1, AxisType.LOCAL)
+    sink = UOp(Ops.PARAM, dtypes.float.ptr(), (), 0).index(g + l).store(UOp.const(dtypes.float, 1.0)).end(g, l).sink(arg=KernelInfo())
+    class R(Renderer): global_max, local_max, global_prod_max = (256, 256, 256), (128, 128, 128), (128, 128, 128)
+    specials = [u for u in add_gpudims(R(), sink).toposort() if u.op is Ops.SPECIAL]
+    self.assertGreater(len([s for s in specials if "lidx" in s.arg]), 1)
+    self.assertGreater(len([s for s in specials if "gidx" in s.arg]), 1)
+
   def test_max_sizes_none(self):
     self._check_grouped_dims("gidx", (2,3,4), None, False, [2,3,4])
     self._check_grouped_dims("gidx", (100,), None, False, [100])

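A rough walk-through of the numbers the new test pins down, with the caveat that the exact factorization get_grouped_dims picks is an implementation detail (the (128, 2) split below is an assumption):

# Sketch of the clamp arithmetic behind test_global_prod_max, assuming the
# local range of 256 is grouped as (128, 2) under local_max=(128, 128, 128).
hw_local = [128, 2]
global_max, global_prod_max = (256, 256, 256), (128, 128, 128)
eff = tuple(min(gm, pm // l) for gm, pm, l in zip(global_max, global_prod_max, hw_local + [1] * 3))
print(eff)  # (1, 64, 128): the global range of 256 exceeds every per-dim cap,
            # so it must also be split, giving the >1 gidx and >1 lidx SPECIALs asserted above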
tinygrad/codegen/gpudims.py

@@ -81,7 +81,11 @@ def add_gpudims(ctx:Renderer, s:UOp):
     idxs = get_grouped_dims("idx", global_shape, ctx.global_max, reverse=True)
   else:
     # define indexes for GPU-like execution
-    idxs = get_grouped_dims("gidx", global_shape, ctx.global_max, reverse=True) + get_grouped_dims("lidx", local_shape, ctx.local_max)
+    local_idxs = get_grouped_dims("lidx", local_shape, ctx.local_max)
+    hw_local = [_dim_max(u.src[0]) for u in local_idxs if u.op is Ops.SPECIAL]
+    global_max = ctx.global_max if ctx.global_prod_max is None else \
+      tuple(min(gm, pm//l) for gm,pm,l in zip(ctx.global_max or ctx.global_prod_max, ctx.global_prod_max, hw_local+[1]*3))
+    idxs = get_grouped_dims("gidx", global_shape, global_max, reverse=True) + local_idxs
   # apply to multiple ranges
   subs = {}

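Pulled out of add_gpudims, the new cap reduces to the sketch below; clamp_global_max is a hypothetical name, and the hardware local sizes are passed as plain ints instead of being read from the lidx SPECIALs via _dim_max:

def clamp_global_max(global_max, global_prod_max, hw_local):
  # renderer declares no product limit: keep the plain per-dim cap
  if global_prod_max is None: return global_max
  # otherwise cap each global dim so that global[i] * local[i] <= global_prod_max[i];
  # pad with 1s since fewer than three lidx dims may be in use
  return tuple(min(gm, pm // l) for gm, pm, l in
               zip(global_max or global_prod_max, global_prod_max, list(hw_local) + [1] * 3))

# e.g. with the 32-bit product cap AMDHIPRenderer declares below and a 256-wide local dim:
assert clamp_global_max((2147483647, 65535, 65535), (0xFFFFFFFF,) * 3, [256, 1, 1]) == (16777215, 65535, 65535)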
tinygrad/renderer/__init__.py

@@ -142,6 +142,7 @@ class Renderer:
   # NOTE: these two should be in (x,y,z) order to match the max_sizes argument in get_grouped_dims
   global_max: tuple[int, ...]|None = (0x8FFFFFFF,) * (3) # TODO: Ops.SPECIAL int32 indexes right now
   local_max: tuple[int, ...]|None = (0x8FFFFFFF,) * (3) # TODO: Ops.SPECIAL int32 indexes right now
+  global_prod_max: tuple[int, ...]|None = None
   shared_max: int = 32768
   tensor_cores: list[TensorCore] = []
   pre_matcher: PatternMatcher|None = None

tinygrad/renderer/cstyle.py

@@ -468,6 +468,7 @@ class AMDHIPRenderer(CStyleLanguage):
   shared_max = 65536
   # NOTE: this is only really needed on gfx12, even though gfx11 reports the same limitation
   global_max = (2147483647, 65535, 65535)
+  global_prod_max = (0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
   @staticmethod
   def get_tensor_cores(arch):

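A hedged reading of why both limits coexist here: global_max bounds each dispatch dimension on its own, while global_prod_max bounds the global*local product per dimension, presumably because the grid-size fields that count work-items on this hardware are 32-bit. A quick sanity check of the arithmetic:

g, l, cap = 2**31 - 1, 256, 0xFFFFFFFF
assert g * l > cap                  # the raw launch would overflow a 32-bit work-item count
assert min(g, cap // l) * l <= cap  # the clamped global size keeps the product in range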
tinygrad/renderer/llvmir.py

@@ -216,6 +216,7 @@ class AMDLLVMRenderer(LLVMRenderer):
   has_local = True
   shared_max = AMDHIPRenderer.shared_max
   global_max = AMDHIPRenderer.global_max
+  global_prod_max = AMDHIPRenderer.global_prod_max
   abi = "amdgpu_kernel"
   code_for_op = {**LLVMRenderer.code_for_op, **{op: lambda: None for op in llvm_intrinsics}}
   string_rewrite = PatternMatcher([