map local aliases to reduceop (#4766)

* map

* ugh

* save one line

* concerning, does this pass

* Revert "concerning, does this pass"

This reverts commit 64d4664f17.

* use local_alias
This commit is contained in:
qazal
2024-05-29 09:11:25 +08:00
committed by GitHub
parent 7624ad3ddd
commit 6e5fa5fd92
2 changed files with 15 additions and 14 deletions

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
+from collections import defaultdict
import math, itertools
-from typing import NamedTuple, Optional, List, Tuple, cast, Dict, Union
+from typing import DefaultDict, NamedTuple, Optional, List, Tuple, cast, Dict, Union
from tinygrad.ops import LazyOp, UnaryOps, BinaryOps, ReduceOps, MemBuffer, ConstBuffer, BufferOps, UNSAFE_PAD_OPS
from tinygrad.device import Device
from tinygrad.renderer import Renderer, TensorCore
@@ -87,7 +88,7 @@ class Kernel:
self.group_for_reduces: int = 0
self.upcasted: int = 0
self.local_dims: int = 0
-self.local_alias: Dict[int, LocalBuffer] = {}
+self.local_alias: DefaultDict[LazyOp, Dict[int, LocalBuffer]] = defaultdict(dict)
self.tensor_core: Optional[TensorCore] = None
self.tensor_core_opts: Optional[TensorCoreOptions] = None
# the local aliased buffers for A and B
@@ -115,7 +116,7 @@ class Kernel:
# parameters for optimizations
ret.applied_opts, ret.group_for_reduces, ret.upcasted, ret.local_dims, ret.dont_use_locals = \
self.applied_opts[:], self.group_for_reduces, self.upcasted, self.local_dims, self.dont_use_locals
-ret.tensor_core, ret.tensor_core_opts, ret.local_alias, ret.bufs_for_tensor_core = self.tensor_core, self.tensor_core_opts, {}, \
+ret.tensor_core, ret.tensor_core_opts, ret.local_alias, ret.bufs_for_tensor_core = self.tensor_core, self.tensor_core_opts, defaultdict(dict), \
self.bufs_for_tensor_core
# uncached since linearize didn't run
@@ -306,7 +307,7 @@ class Kernel:
self.reshape_and_permute(None, order)
if DEBUG >= 3: print("permuted global dim", order, "due to allocation exceeds global limit")
-def alias_buffer(self, i, pattern):
+def alias_buffer(self, op:LazyOp, i:int, pattern:List[int]) -> None:
assert len(pattern) == len(self.sts[i].shape), f"must include a pattern for each shape {pattern} {self.sts[i].shape}"
bst = 1
@@ -321,7 +322,7 @@ class Kernel:
self.sts.append(ShapeTracker((View.create(tuple(shp), tuple(stride)),)))
self.bufs.append(LocalBuffer(name=f"ldata{i}", size=self.sts[-1].size))
if DEBUG >= 4: print("aliasing buffer", self.sts[i])
-self.local_alias[i] = cast(LocalBuffer, self.bufs[-1])
+self.local_alias[op][i] = cast(LocalBuffer, self.bufs[-1])
# ******************** high level optimizers ********************

View File

@@ -201,11 +201,11 @@ class Linearizer(Kernel):
# compute local aliases - modify idxs if necessary for TC
alias_buf_idxs = []
-for i in self.local_alias:
-localbuf_idx = self.bufs.index(self.local_alias[i])
+for i in (local_alias:=self.local_alias[reduceop]):
+localbuf_idx = self.bufs.index(local_alias[i])
buf_idxs = [idx*0 if s == 0 else idx for idx,s in zip(global_idxs+local_idxs+reduce_idxs+full_upcast_idxs,self.sts[i].real_strides())]
if (tc:=self.tensor_core):
-min_alias_idx = min(self.local_alias.keys())
+min_alias_idx = min(local_alias.keys())
replace_input_idxs = calc_tc_idxs(tc.thread_local_sizes[i-min_alias_idx], tc.thread_local_aliases[i-min_alias_idx])
for n in range(len(tc.threads)):
buf_idxs[self.global_dims+n] = replace_input_idxs[n] # replace locals
@@ -258,7 +258,7 @@ class Linearizer(Kernel):
assert not locals_to_store, "storing locals isn't supported here"
# load earlybufs
-loaded_buffers.update({b:self.global_load(self.bufs.index(self.local_alias[i]) if i in self.local_alias else i,
+loaded_buffers.update({b:self.global_load(self.bufs.index(local_alias[i]) if i in self.local_alias else i,
global_idxs+local_idxs+reduce_idxs+full_upcast_idxs) for i,b in enumerate(self.bufs) if b in self.earlybufs})
# run early AST (with reduce)
@@ -322,8 +322,8 @@ class Linearizer(Kernel):
# late alias the tensor core buffers
if (tc:=self.tensor_core) and self.tensor_core_opts is not None:
alias_pattern = [0]*(self.global_dims) + [2]*(len(tc.threads)) + [0]*(self.local_dims-len(tc.threads)) + [0]*(self.shape_len-self.upcasted-self.first_reduce) + [1,1] + [3]*(self.upcasted-2) # noqa: E501
-for _,tc_bufs in self.bufs_for_tensor_core.items():
-for tc_buf in tc_bufs: self.alias_buffer(tc_buf, alias_pattern) # TODO aliased buffers should map to the reduceop
+for op, tc_bufs in self.bufs_for_tensor_core.items():
+for tc_buf in tc_bufs: self.alias_buffer(op, tc_buf, alias_pattern)
# save backups
sts_backup, gfr_backup, upc_backup = self.sts[:], self.group_for_reduces, self.upcasted
@@ -350,9 +350,9 @@ class Linearizer(Kernel):
assert var.expr is not None
self.loop_uops[var.expr] = self.uops.add(UOps.DEFINE_VAR, dtypes.int32, (), var)
# define local buffers
-for lb in self.local_alias.values():
-self.buf_uops[self.bufs.index(lb)] = self.uops.add(UOps.DEFINE_LOCAL,
-PtrDType(dtypes.float32), (), (lb.name, self.sts[self.bufs.index(lb)].size))
+for aliases in self.local_alias.values():
+for lb in aliases.values(): self.buf_uops[self.bufs.index(lb)] = self.uops.add(UOps.DEFINE_LOCAL, PtrDType(lb.dtype),
+(), (lb.name, self.sts[self.bufs.index(lb)].size))
# add a local buffer for multistage reduce. # TODO: use local alias
if self.group_for_reduces:
# TODO: the strides of this can be controlled