mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-26 23:38:58 -05:00
map local aliases to reduceop (#4766)
* map
* ugh
* save one line
* concerning, does this pass
* Revert "concerning, does this pass"
This reverts commit 64d4664f17.
* use local_alias
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
from collections import defaultdict
|
||||
import math, itertools
|
||||
from typing import NamedTuple, Optional, List, Tuple, cast, Dict, Union
|
||||
from typing import DefaultDict, NamedTuple, Optional, List, Tuple, cast, Dict, Union
|
||||
from tinygrad.ops import LazyOp, UnaryOps, BinaryOps, ReduceOps, MemBuffer, ConstBuffer, BufferOps, UNSAFE_PAD_OPS
|
||||
from tinygrad.device import Device
|
||||
from tinygrad.renderer import Renderer, TensorCore
|
||||
@@ -87,7 +88,7 @@ class Kernel:
|
||||
self.group_for_reduces: int = 0
|
||||
self.upcasted: int = 0
|
||||
self.local_dims: int = 0
|
||||
self.local_alias: Dict[int, LocalBuffer] = {}
|
||||
self.local_alias: DefaultDict[LazyOp, Dict[int, LocalBuffer]] = defaultdict(dict)
|
||||
self.tensor_core: Optional[TensorCore] = None
|
||||
self.tensor_core_opts: Optional[TensorCoreOptions] = None
|
||||
# the local aliased buffers for A and B
|
||||
@@ -115,7 +116,7 @@ class Kernel:
|
||||
# parameters for optimizations
|
||||
ret.applied_opts, ret.group_for_reduces, ret.upcasted, ret.local_dims, ret.dont_use_locals = \
|
||||
self.applied_opts[:], self.group_for_reduces, self.upcasted, self.local_dims, self.dont_use_locals
|
||||
ret.tensor_core, ret.tensor_core_opts, ret.local_alias, ret.bufs_for_tensor_core = self.tensor_core, self.tensor_core_opts, {}, \
|
||||
ret.tensor_core, ret.tensor_core_opts, ret.local_alias, ret.bufs_for_tensor_core = self.tensor_core, self.tensor_core_opts, defaultdict(dict), \
|
||||
self.bufs_for_tensor_core
|
||||
|
||||
# uncached since linearize didn't run
|
||||
@@ -306,7 +307,7 @@ class Kernel:
|
||||
self.reshape_and_permute(None, order)
|
||||
if DEBUG >= 3: print("permuted global dim", order, "due to allocation exceeds global limit")
|
||||
|
||||
def alias_buffer(self, i, pattern):
|
||||
def alias_buffer(self, op:LazyOp, i:int, pattern:List[int]) -> None:
|
||||
assert len(pattern) == len(self.sts[i].shape), f"must include a pattern for each shape {pattern} {self.sts[i].shape}"
|
||||
|
||||
bst = 1
|
||||
@@ -321,7 +322,7 @@ class Kernel:
|
||||
self.sts.append(ShapeTracker((View.create(tuple(shp), tuple(stride)),)))
|
||||
self.bufs.append(LocalBuffer(name=f"ldata{i}", size=self.sts[-1].size))
|
||||
if DEBUG >= 4: print("aliasing buffer", self.sts[i])
|
||||
self.local_alias[i] = cast(LocalBuffer, self.bufs[-1])
|
||||
self.local_alias[op][i] = cast(LocalBuffer, self.bufs[-1])
|
||||
|
||||
# ******************** high level optimizers ********************
|
||||
|
||||
|
||||
@@ -201,11 +201,11 @@ class Linearizer(Kernel):
|
||||
|
||||
# compute local aliases - modify idxs if necessary for TC
|
||||
alias_buf_idxs = []
|
||||
for i in self.local_alias:
|
||||
localbuf_idx = self.bufs.index(self.local_alias[i])
|
||||
for i in (local_alias:=self.local_alias[reduceop]):
|
||||
localbuf_idx = self.bufs.index(local_alias[i])
|
||||
buf_idxs = [idx*0 if s == 0 else idx for idx,s in zip(global_idxs+local_idxs+reduce_idxs+full_upcast_idxs,self.sts[i].real_strides())]
|
||||
if (tc:=self.tensor_core):
|
||||
min_alias_idx = min(self.local_alias.keys())
|
||||
min_alias_idx = min(local_alias.keys())
|
||||
replace_input_idxs = calc_tc_idxs(tc.thread_local_sizes[i-min_alias_idx], tc.thread_local_aliases[i-min_alias_idx])
|
||||
for n in range(len(tc.threads)):
|
||||
buf_idxs[self.global_dims+n] = replace_input_idxs[n] # replace locals
|
||||
@@ -258,7 +258,7 @@ class Linearizer(Kernel):
|
||||
assert not locals_to_store, "storing locals isn't supported here"
|
||||
|
||||
# load earlybufs
|
||||
loaded_buffers.update({b:self.global_load(self.bufs.index(self.local_alias[i]) if i in self.local_alias else i,
|
||||
loaded_buffers.update({b:self.global_load(self.bufs.index(local_alias[i]) if i in self.local_alias else i,
|
||||
global_idxs+local_idxs+reduce_idxs+full_upcast_idxs) for i,b in enumerate(self.bufs) if b in self.earlybufs})
|
||||
|
||||
# run early AST (with reduce)
|
||||
@@ -322,8 +322,8 @@ class Linearizer(Kernel):
|
||||
# late alias the tensor core buffers
|
||||
if (tc:=self.tensor_core) and self.tensor_core_opts is not None:
|
||||
alias_pattern = [0]*(self.global_dims) + [2]*(len(tc.threads)) + [0]*(self.local_dims-len(tc.threads)) + [0]*(self.shape_len-self.upcasted-self.first_reduce) + [1,1] + [3]*(self.upcasted-2) # noqa: E501
|
||||
for _,tc_bufs in self.bufs_for_tensor_core.items():
|
||||
for tc_buf in tc_bufs: self.alias_buffer(tc_buf, alias_pattern) # TODO aliased buffers should map to the reduceop
|
||||
for op, tc_bufs in self.bufs_for_tensor_core.items():
|
||||
for tc_buf in tc_bufs: self.alias_buffer(op, tc_buf, alias_pattern)
|
||||
|
||||
# save backups
|
||||
sts_backup, gfr_backup, upc_backup = self.sts[:], self.group_for_reduces, self.upcasted
|
||||
@@ -350,9 +350,9 @@ class Linearizer(Kernel):
|
||||
assert var.expr is not None
|
||||
self.loop_uops[var.expr] = self.uops.add(UOps.DEFINE_VAR, dtypes.int32, (), var)
|
||||
# define local buffers
|
||||
for lb in self.local_alias.values():
|
||||
self.buf_uops[self.bufs.index(lb)] = self.uops.add(UOps.DEFINE_LOCAL,
|
||||
PtrDType(dtypes.float32), (), (lb.name, self.sts[self.bufs.index(lb)].size))
|
||||
for aliases in self.local_alias.values():
|
||||
for lb in aliases.values(): self.buf_uops[self.bufs.index(lb)] = self.uops.add(UOps.DEFINE_LOCAL, PtrDType(lb.dtype),
|
||||
(), (lb.name, self.sts[self.bufs.index(lb)].size))
|
||||
# add a local buffer for multistage reduce. # TODO: use local alias
|
||||
if self.group_for_reduces:
|
||||
# TODO: the strides of this can be controlled
|
||||
|
||||
Reference in New Issue
Block a user