diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py
index 5eeda2742e..422d03e86b 100644
--- a/tinygrad/codegen/kernel.py
+++ b/tinygrad/codegen/kernel.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
+from collections import defaultdict
 import math, itertools
-from typing import NamedTuple, Optional, List, Tuple, cast, Dict, Union
+from typing import DefaultDict, NamedTuple, Optional, List, Tuple, cast, Dict, Union
 from tinygrad.ops import LazyOp, UnaryOps, BinaryOps, ReduceOps, MemBuffer, ConstBuffer, BufferOps, UNSAFE_PAD_OPS
 from tinygrad.device import Device
 from tinygrad.renderer import Renderer, TensorCore
@@ -87,7 +88,7 @@ class Kernel:
     self.group_for_reduces: int = 0
     self.upcasted: int = 0
     self.local_dims: int = 0
-    self.local_alias: Dict[int, LocalBuffer] = {}
+    self.local_alias: DefaultDict[LazyOp, Dict[int, LocalBuffer]] = defaultdict(dict)
     self.tensor_core: Optional[TensorCore] = None
     self.tensor_core_opts: Optional[TensorCoreOptions] = None
     # the local aliased buffers for A and B
@@ -115,7 +116,7 @@ class Kernel:
     # parameters for optimizations
     ret.applied_opts, ret.group_for_reduces, ret.upcasted, ret.local_dims, ret.dont_use_locals = \
       self.applied_opts[:], self.group_for_reduces, self.upcasted, self.local_dims, self.dont_use_locals
-    ret.tensor_core, ret.tensor_core_opts, ret.local_alias, ret.bufs_for_tensor_core = self.tensor_core, self.tensor_core_opts, {}, \
+    ret.tensor_core, ret.tensor_core_opts, ret.local_alias, ret.bufs_for_tensor_core = self.tensor_core, self.tensor_core_opts, defaultdict(dict), \
       self.bufs_for_tensor_core

     # uncached since linearize didn't run
@@ -306,7 +307,7 @@ class Kernel:
           self.reshape_and_permute(None, order)
           if DEBUG >= 3: print("permuted global dim", order, "due to allocation exceeds global limit")

-  def alias_buffer(self, i, pattern):
+  def alias_buffer(self, op:LazyOp, i:int, pattern:List[int]) -> None:
     assert len(pattern) == len(self.sts[i].shape), f"must include a pattern for each shape {pattern} {self.sts[i].shape}"

     bst = 1
@@ -321,7 +322,7 @@ class Kernel:
     self.sts.append(ShapeTracker((View.create(tuple(shp), tuple(stride)),)))
     self.bufs.append(LocalBuffer(name=f"ldata{i}", size=self.sts[-1].size))
     if DEBUG >= 4: print("aliasing buffer", self.sts[i])
-    self.local_alias[i] = cast(LocalBuffer, self.bufs[-1])
+    self.local_alias[op][i] = cast(LocalBuffer, self.bufs[-1])

   # ******************** high level optimizers ********************

diff --git a/tinygrad/codegen/linearizer.py b/tinygrad/codegen/linearizer.py
index 9f1098f95a..ee1b54e403 100644
--- a/tinygrad/codegen/linearizer.py
+++ b/tinygrad/codegen/linearizer.py
@@ -201,11 +201,11 @@ class Linearizer(Kernel):

     # compute local aliases - modify idxs if necessary for TC
     alias_buf_idxs = []
-    for i in self.local_alias:
-      localbuf_idx = self.bufs.index(self.local_alias[i])
+    for i in (local_alias:=self.local_alias[reduceop]):
+      localbuf_idx = self.bufs.index(local_alias[i])
       buf_idxs = [idx*0 if s == 0 else idx for idx,s in zip(global_idxs+local_idxs+reduce_idxs+full_upcast_idxs,self.sts[i].real_strides())]
       if (tc:=self.tensor_core):
-        min_alias_idx = min(self.local_alias.keys())
+        min_alias_idx = min(local_alias.keys())
         replace_input_idxs = calc_tc_idxs(tc.thread_local_sizes[i-min_alias_idx], tc.thread_local_aliases[i-min_alias_idx])
         for n in range(len(tc.threads)): buf_idxs[self.global_dims+n] = replace_input_idxs[n] # replace locals
@@ -258,7 +258,7 @@ class Linearizer(Kernel):
       assert not locals_to_store, "storing locals isn't supported here"

       # load earlybufs
-      loaded_buffers.update({b:self.global_load(self.bufs.index(self.local_alias[i]) if i in self.local_alias else i,
+      loaded_buffers.update({b:self.global_load(self.bufs.index(local_alias[i]) if i in self.local_alias else i,
         global_idxs+local_idxs+reduce_idxs+full_upcast_idxs) for i,b in enumerate(self.bufs) if b in self.earlybufs})

       # run early AST (with reduce)
@@ -322,8 +322,8 @@ class Linearizer(Kernel):
     # late alias the tensor core buffers
     if (tc:=self.tensor_core) and self.tensor_core_opts is not None:
       alias_pattern = [0]*(self.global_dims) + [2]*(len(tc.threads)) + [0]*(self.local_dims-len(tc.threads)) + [0]*(self.shape_len-self.upcasted-self.first_reduce) + [1,1] + [3]*(self.upcasted-2) # noqa: E501
-      for _,tc_bufs in self.bufs_for_tensor_core.items():
-        for tc_buf in tc_bufs: self.alias_buffer(tc_buf, alias_pattern) # TODO aliased buffers should map to the reduceop
+      for op, tc_bufs in self.bufs_for_tensor_core.items():
+        for tc_buf in tc_bufs: self.alias_buffer(op, tc_buf, alias_pattern)

     # save backups
     sts_backup, gfr_backup, upc_backup = self.sts[:], self.group_for_reduces, self.upcasted
@@ -350,9 +350,9 @@ class Linearizer(Kernel):
       assert var.expr is not None
       self.loop_uops[var.expr] = self.uops.add(UOps.DEFINE_VAR, dtypes.int32, (), var)
     # define local buffers
-    for lb in self.local_alias.values():
-      self.buf_uops[self.bufs.index(lb)] = self.uops.add(UOps.DEFINE_LOCAL,
-                                                         PtrDType(dtypes.float32), (), (lb.name, self.sts[self.bufs.index(lb)].size))
+    for aliases in self.local_alias.values():
+      for lb in aliases.values(): self.buf_uops[self.bufs.index(lb)] = self.uops.add(UOps.DEFINE_LOCAL, PtrDType(lb.dtype),
+                                                                                     (), (lb.name, self.sts[self.bufs.index(lb)].size))
     # add a local buffer for multistage reduce. # TODO: use local alias
     if self.group_for_reduces:
       # TODO: the strides of this can be controlled
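
Note (not part of the diff): a minimal sketch of the new local_alias shape introduced above. The mapping is now keyed by the reduce op first and the buffer index second, which is what alias_buffer(op, i, pattern) writes into and what the linearizer reads back per reduceop via self.local_alias[reduceop]. FakeReduceOp and FakeLocalBuffer below are hypothetical stand-ins for tinygrad's LazyOp and LocalBuffer, used only to keep the sketch self-contained and runnable.

from collections import defaultdict
from typing import DefaultDict, Dict, NamedTuple

class FakeReduceOp(NamedTuple):  # stand-in for LazyOp
  op: str

class FakeLocalBuffer(NamedTuple):  # stand-in for LocalBuffer
  name: str
  size: int

# before this change: one flat dict, buffer index -> local buffer
flat_alias: Dict[int, FakeLocalBuffer] = {}

# after this change: reduceop -> (buffer index -> local buffer), so each reduce op
# owns its own aliased buffers and defaultdict(dict) creates the inner dict lazily
local_alias: DefaultDict[FakeReduceOp, Dict[int, FakeLocalBuffer]] = defaultdict(dict)

r0, r1 = FakeReduceOp("SUM"), FakeReduceOp("MAX")
local_alias[r0][1] = FakeLocalBuffer("ldata1", 256)  # what alias_buffer(op=r0, i=1, ...) now records
local_alias[r1][1] = FakeLocalBuffer("ldata1", 128)  # same buffer index under a different reduceop

# per-reduceop lookup, mirroring `local_alias := self.local_alias[reduceop]` in the linearizer
for reduceop in (r0, r1):
  for i, lb in local_alias[reduceop].items():
    print(reduceop.op, i, lb.name, lb.size)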