Add an optional per-axis `global_prod_max` limit to Renderer (default None).
When it is set, add_gpudims builds the local indexes first and caps each global
dimension at min(global_max, global_prod_max // local_size), taking local_size
from the local indexes that are plain SPECIAL ops. The AMD HIP and AMD LLVM
renderers set the limit to 0xFFFFFFFF on every axis, and a test is added.

NOTE(review): this patch was recovered from a copy whose newlines had been
collapsed. Leading indentation (2 spaces assumed) and blank context lines were
reconstructed to match the hunk line counts; confirm against the target files
or apply with whitespace-insensitive matching.

diff --git a/test/null/test_gpudims.py b/test/null/test_gpudims.py
index b18ff6c1ac..a3e9956a7f 100644
--- a/test/null/test_gpudims.py
+++ b/test/null/test_gpudims.py
@@ -1,9 +1,10 @@
 import unittest, math
 import z3
-from tinygrad.codegen.gpudims import get_grouped_dims
-from tinygrad.uop.ops import UOp, Ops
+from tinygrad.codegen.gpudims import get_grouped_dims, add_gpudims
+from tinygrad.uop.ops import UOp, Ops, KernelInfo, AxisType
 from tinygrad.uop.validate import uops_to_z3
 from tinygrad.dtype import dtypes
+from tinygrad.renderer import Renderer
 from tinygrad.helpers import flatten, dedup
 
 class TestGroupedDims(unittest.TestCase):
@@ -93,6 +94,14 @@ class TestGroupedDims(unittest.TestCase):
     assert idxs[2].op is Ops.SPECIAL, f"expected SPECIAL for direct-mapped dim, got {idxs[2].op}"
     assert idxs[3].op is Ops.SPECIAL, f"expected SPECIAL for direct-mapped dim, got {idxs[3].op}"
 
+  def test_global_prod_max(self):
+    g, l = UOp.range(256, 0, AxisType.GLOBAL), UOp.range(256, 1, AxisType.LOCAL)
+    sink = UOp(Ops.PARAM, dtypes.float.ptr(), (), 0).index(g + l).store(UOp.const(dtypes.float, 1.0)).end(g, l).sink(arg=KernelInfo())
+    class R(Renderer): global_max, local_max, global_prod_max = (256, 256, 256), (128, 128, 128), (128, 128, 128)
+    specials = [u for u in add_gpudims(R(), sink).toposort() if u.op is Ops.SPECIAL]
+    self.assertGreater(len([s for s in specials if "lidx" in s.arg]), 1)
+    self.assertGreater(len([s for s in specials if "gidx" in s.arg]), 1)
+
   def test_max_sizes_none(self):
     self._check_grouped_dims("gidx", (2,3,4), None, False, [2,3,4])
     self._check_grouped_dims("gidx", (100,), None, False, [100])
diff --git a/tinygrad/codegen/gpudims.py b/tinygrad/codegen/gpudims.py
index 91677ef001..dc37a1d503 100644
--- a/tinygrad/codegen/gpudims.py
+++ b/tinygrad/codegen/gpudims.py
@@ -81,7 +81,11 @@ def add_gpudims(ctx:Renderer, s:UOp):
     idxs = get_grouped_dims("idx", global_shape, ctx.global_max, reverse=True)
   else:
     # define indexes for GPU-like execution
-    idxs = get_grouped_dims("gidx", global_shape, ctx.global_max, reverse=True) + get_grouped_dims("lidx", local_shape, ctx.local_max)
+    local_idxs = get_grouped_dims("lidx", local_shape, ctx.local_max)
+    hw_local = [_dim_max(u.src[0]) for u in local_idxs if u.op is Ops.SPECIAL]
+    global_max = ctx.global_max if ctx.global_prod_max is None else \
+      tuple(min(gm, pm//l) for gm,pm,l in zip(ctx.global_max or ctx.global_prod_max, ctx.global_prod_max, hw_local+[1]*3))
+    idxs = get_grouped_dims("gidx", global_shape, global_max, reverse=True) + local_idxs
 
   # apply to multiple ranges
   subs = {}
diff --git a/tinygrad/renderer/__init__.py b/tinygrad/renderer/__init__.py
index c417f817d1..e17e489242 100644
--- a/tinygrad/renderer/__init__.py
+++ b/tinygrad/renderer/__init__.py
@@ -142,6 +142,7 @@ class Renderer:
   # NOTE: these two should be in (x,y,z) order to match the max_sizes argument in get_grouped_dims
   global_max: tuple[int, ...]|None = (0x8FFFFFFF,) * (3) # TODO: Ops.SPECIAL int32 indexes right now
   local_max: tuple[int, ...]|None = (0x8FFFFFFF,) * (3) # TODO: Ops.SPECIAL int32 indexes right now
+  global_prod_max: tuple[int, ...]|None = None
   shared_max: int = 32768
   tensor_cores: list[TensorCore] = []
   pre_matcher: PatternMatcher|None = None
diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py
index f9bdba33d9..cc6d98e3be 100644
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -468,6 +468,7 @@ class AMDHIPRenderer(CStyleLanguage):
   shared_max = 65536
   # NOTE: this is only really needed on gfx12, even though gfx11 reports the same limitation
   global_max = (2147483647, 65535, 65535)
+  global_prod_max = (0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
 
   @staticmethod
   def get_tensor_cores(arch):
diff --git a/tinygrad/renderer/llvmir.py b/tinygrad/renderer/llvmir.py
index 4cb71aa571..0b0cc72140 100644
--- a/tinygrad/renderer/llvmir.py
+++ b/tinygrad/renderer/llvmir.py
@@ -216,6 +216,7 @@ class AMDLLVMRenderer(LLVMRenderer):
   has_local = True
   shared_max = AMDHIPRenderer.shared_max
   global_max = AMDHIPRenderer.global_max
+  global_prod_max = AMDHIPRenderer.global_prod_max
   abi = "amdgpu_kernel"
   code_for_op = {**LLVMRenderer.code_for_op, **{op: lambda: None for op in llvm_intrinsics}}
   string_rewrite = PatternMatcher([