mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
limit gl*lc (#15359)
This commit is contained in:
@@ -1,9 +1,10 @@
|
||||
import unittest, math
|
||||
import z3
|
||||
from tinygrad.codegen.gpudims import get_grouped_dims
|
||||
from tinygrad.uop.ops import UOp, Ops
|
||||
from tinygrad.codegen.gpudims import get_grouped_dims, add_gpudims
|
||||
from tinygrad.uop.ops import UOp, Ops, KernelInfo, AxisType
|
||||
from tinygrad.uop.validate import uops_to_z3
|
||||
from tinygrad.dtype import dtypes
|
||||
from tinygrad.renderer import Renderer
|
||||
from tinygrad.helpers import flatten, dedup
|
||||
|
||||
class TestGroupedDims(unittest.TestCase):
|
||||
@@ -93,6 +94,14 @@ class TestGroupedDims(unittest.TestCase):
|
||||
assert idxs[2].op is Ops.SPECIAL, f"expected SPECIAL for direct-mapped dim, got {idxs[2].op}"
|
||||
assert idxs[3].op is Ops.SPECIAL, f"expected SPECIAL for direct-mapped dim, got {idxs[3].op}"
|
||||
|
||||
def test_global_prod_max(self):
  """global_prod_max below the kernel's axis sizes must split work across multiple gidx and lidx SPECIALs."""
  # one 256-wide global axis and one 256-wide local axis
  g = UOp.range(256, 0, AxisType.GLOBAL)
  l = UOp.range(256, 1, AxisType.LOCAL)
  buf = UOp(Ops.PARAM, dtypes.float.ptr(), (), 0)
  sink = buf.index(g + l).store(UOp.const(dtypes.float, 1.0)).end(g, l).sink(arg=KernelInfo())
  # renderer whose per-dim max (256) exceeds its product max (128): a single SPECIAL per axis cannot fit
  class R(Renderer): global_max, local_max, global_prod_max = (256, 256, 256), (128, 128, 128), (128, 128, 128)
  specials = [u for u in add_gpudims(R(), sink).toposort() if u.op is Ops.SPECIAL]
  # each 256-range must have been split over more than one hardware index
  self.assertGreater(sum(1 for s in specials if "lidx" in s.arg), 1)
  self.assertGreater(sum(1 for s in specials if "gidx" in s.arg), 1)
|
||||
|
||||
def test_max_sizes_none(self):
  """max_sizes=None means no limit: dims pass through to the expected index list unchanged."""
  cases = [((2, 3, 4), [2, 3, 4]), ((100,), [100])]
  for dims, expected in cases:
    self._check_grouped_dims("gidx", dims, None, False, expected)
|
||||
|
||||
@@ -81,7 +81,11 @@ def add_gpudims(ctx:Renderer, s:UOp):
|
||||
idxs = get_grouped_dims("idx", global_shape, ctx.global_max, reverse=True)
|
||||
else:
|
||||
# define indexes for GPU-like execution
|
||||
idxs = get_grouped_dims("gidx", global_shape, ctx.global_max, reverse=True) + get_grouped_dims("lidx", local_shape, ctx.local_max)
|
||||
local_idxs = get_grouped_dims("lidx", local_shape, ctx.local_max)
|
||||
hw_local = [_dim_max(u.src[0]) for u in local_idxs if u.op is Ops.SPECIAL]
|
||||
global_max = ctx.global_max if ctx.global_prod_max is None else \
|
||||
tuple(min(gm, pm//l) for gm,pm,l in zip(ctx.global_max or ctx.global_prod_max, ctx.global_prod_max, hw_local+[1]*3))
|
||||
idxs = get_grouped_dims("gidx", global_shape, global_max, reverse=True) + local_idxs
|
||||
|
||||
# apply to multiple ranges
|
||||
subs = {}
|
||||
|
||||
@@ -142,6 +142,7 @@ class Renderer:
|
||||
# NOTE: these two should be in (x,y,z) order to match the max_sizes argument in get_grouped_dims
|
||||
global_max: tuple[int, ...]|None = (0x8FFFFFFF,) * (3) # TODO: Ops.SPECIAL int32 indexes right now
|
||||
local_max: tuple[int, ...]|None = (0x8FFFFFFF,) * (3) # TODO: Ops.SPECIAL int32 indexes right now
|
||||
global_prod_max: tuple[int, ...]|None = None
|
||||
shared_max: int = 32768
|
||||
tensor_cores: list[TensorCore] = []
|
||||
pre_matcher: PatternMatcher|None = None
|
||||
|
||||
@@ -468,6 +468,7 @@ class AMDHIPRenderer(CStyleLanguage):
|
||||
shared_max = 65536
|
||||
# NOTE: this is only really needed on gfx12, even though gfx11 reports the same limitation
|
||||
global_max = (2147483647, 65535, 65535)
|
||||
global_prod_max = (0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
|
||||
|
||||
@staticmethod
|
||||
def get_tensor_cores(arch):
|
||||
|
||||
@@ -216,6 +216,7 @@ class AMDLLVMRenderer(LLVMRenderer):
|
||||
has_local = True
|
||||
shared_max = AMDHIPRenderer.shared_max
|
||||
global_max = AMDHIPRenderer.global_max
|
||||
global_prod_max = AMDHIPRenderer.global_prod_max
|
||||
abi = "amdgpu_kernel"
|
||||
code_for_op = {**LLVMRenderer.code_for_op, **{op: lambda: None for op in llvm_intrinsics}}
|
||||
string_rewrite = PatternMatcher([
|
||||
|
||||
Reference in New Issue
Block a user