From 94679322a30b088f27fd293a3b3e0053f8a9f9fa Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Mon, 4 Mar 2024 16:28:28 +0200 Subject: [PATCH] simpler float4 direct store and locals support (#3592) * swap vins instead * delete the upcast * leave it to remove_childless try 1 * Revert "leave it to remove_childless try 1" This reverts commit bf25e935f8ecef78bbd07ed9fa10edc5331ef0a5. * try 2, simpler * Revert "try 2, simpler" This reverts commit d2472af711af04e3108ddf6e76c4f7ca307d61b0. * add note --- test/test_linearizer.py | 23 +++++++++++++++++++++-- tinygrad/codegen/uops.py | 5 ++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/test/test_linearizer.py b/test/test_linearizer.py index 9e4fa537b3..3870c5581b 100644 --- a/test/test_linearizer.py +++ b/test/test_linearizer.py @@ -783,7 +783,6 @@ class TestLinearizerUOptimize(unittest.TestCase): store_val = [u.vin[-1] for u in k.uops if u.uop is UOps.STORE][0] assert store_val.dtype == dtypes.float.vec(4) and store_val.uop != UOps.CAST - @unittest.skip("TODO: support locals replacement across the uop graph") def test_grouped_store_locals_and_globals(self): if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared: self.skipTest("Only Compiled uses linearizer with locals and shared") @@ -804,8 +803,28 @@ class TestLinearizerUOptimize(unittest.TestCase): # check that the float4 cast collapses for all stores for store in local_stores+global_stores: assert store.vin[-1].dtype == dtypes.float.vec(2) and store.vin[-1].uop != UOps.CAST - # check that the barrier uses the new stores + # check the children's vins assert barrier.vin == tuple(local_stores) + assert len([u for u in k.uops if u.uop is UOps.IF and u.vin[-1] == barrier]) == 1 + + def test_grouped_store_local_only(self): + if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared: + self.skipTest("Only Compiled uses linearizer with locals and shared") + + x, y = Tensor.rand(1,128), Tensor.rand(128, 128) + r = (x@y).relu() + k = Linearizer(create_schedule([r.lazydata])[-1].ast) + k.hand_coded_optimizations() + k.linearize() + + stores = [u for u in k.uops if u.uop == UOps.STORE] + + # the float4 value stores directly in lds and we skip upcast + assert stores[0].vin[-1].dtype == dtypes.float.vec(4) + assert stores[0].vin[-1].uop != UOps.CAST + + # the global store doesn't change + assert stores[1].vin[-1].dtype == dtypes.float if __name__ == '__main__': unittest.main() diff --git a/tinygrad/codegen/uops.py b/tinygrad/codegen/uops.py index 91b0e74900..57f303fc06 100644 --- a/tinygrad/codegen/uops.py +++ b/tinygrad/codegen/uops.py @@ -224,12 +224,11 @@ class UOpGraph: replaced_stores: Dict[UOp,UOp] = {} for u in self.uops: if u.uop is not UOps.STORE or (val:=u.vin[-1]).uop is not UOps.CAST or cast(DType,val.dtype).count == 1: continue - if u.vin[0].uop is UOps.DEFINE_LOCAL: continue # TODO add support for local store if all(el.uop is UOps.GEP for el in val.vin): replaced_stores[u] = val.vin[0].vin[0] elif all(el.uop is UOps.PHI for el in val.vin): replaced_stores[u] = phi_resolve_acc(val) for prev,new in replaced_stores.items(): - self.add(UOps.STORE, prev.dtype, (prev.vin[0],prev.vin[1],new), insert_before=self.uops.index(prev)) - self.uops.remove(prev) + self.uops.remove(prev.vin[-1]) # remove the old upcast NOTE: the upcast's vins become childless now + self.uops[self.uops.index(prev)].vin = (prev.vin[0],prev.vin[1],new) # replace with the float4 value # add UOps.END* self.add_ends()