From a02998472bf169987db3b5cba0628e69bff79181 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Fri, 19 Jul 2024 14:35:09 -0700 Subject: [PATCH] fix no locals behavior (#5593) --- tinygrad/codegen/kernel.py | 10 +++++++--- tinygrad/codegen/lowerer.py | 10 +++++++--- tinygrad/ops.py | 1 + 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py index c7bb981ac7..cf76d0ef9e 100644 --- a/tinygrad/codegen/kernel.py +++ b/tinygrad/codegen/kernel.py @@ -725,7 +725,7 @@ class Kernel: local_load = LazyOp(BufferOps.LOAD, (local_store,), local_buffer) return LazyOp(op.op, (local_load,), tuple(range(self.first_reduce, self.first_reduce+self.group_for_reduces))) elif op.op is MetaOps.KERNEL: - arg = KernelInfo(self.local_dims, self.upcasted) + arg = KernelInfo(self.local_dims, self.upcasted, self.dont_use_locals) else: arg = op.arg return LazyOp(op.op, tuple(fixup_ast(x, apply_to_st) for x in op.src), arg) @@ -749,8 +749,12 @@ class Kernel: self.local_size: Optional[List[int]] = [1,1,1] for u in uop_sink.parents: if u.op is UOps.SPECIAL: - if u.arg[1][0] == 'l': self.local_size[u.arg[0]] = u.arg[2] - else: self.global_size[u.arg[0]] = u.arg[2] + if u.arg[1][0] == 'i': self.local_size = None + if u.arg[1][0] == 'l': + assert self.local_size is not None + self.local_size[u.arg[0]] = u.arg[2] + else: + self.global_size[u.arg[0]] = u.arg[2] else: self.global_size, self.local_size = None, None diff --git a/tinygrad/codegen/lowerer.py b/tinygrad/codegen/lowerer.py index ca27b5aee3..d4987c6a77 100644 --- a/tinygrad/codegen/lowerer.py +++ b/tinygrad/codegen/lowerer.py @@ -104,9 +104,13 @@ class IndependentLowerer: global_dims = first_reduce-ki.local_dims if opts.has_local: - # define indexes for GPU-like execution - self.idxs = get_grouped_dims("gidx", full_shape[:global_dims], opts.global_max, reverse=True) + \ - get_grouped_dims("lidx", full_shape[global_dims:first_reduce+group_for_reduces], opts.local_max) + if ki.dont_use_locals: + assert ki.local_dims == 0, "can't use locals if there's no local dims" + self.idxs = get_grouped_dims("idx", full_shape[:global_dims], opts.global_max, reverse=True) + else: + # define indexes for GPU-like execution + self.idxs = get_grouped_dims("gidx", full_shape[:global_dims], opts.global_max, reverse=True) + \ + get_grouped_dims("lidx", full_shape[global_dims:first_reduce+group_for_reduces], opts.local_max) else: # all loops are RANGES self.idxs = [UOp(UOps.RANGE, dtypes.bigint, (UOp.const(dtypes.bigint, 0), variable_to_uop(g)), (i, False)) diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 084551cc8f..c80ac03e0b 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -49,6 +49,7 @@ class ConstBuffer: class KernelInfo: local_dims: int = 0 # number of local dimensions (this is remapping RANGE to SPECIAL) upcasted: int = 0 # count that are upcasted (this is remapping RANGE to EXPAND) + dont_use_locals: bool = False # don't use local indexing @dataclass(frozen=True, eq=False) class LazyOp: