From e8fcd2f3dbc085a524cc94635cea59bec13ed8ab Mon Sep 17 00:00:00 2001 From: George Hotz Date: Mon, 16 Oct 2023 14:32:22 -0700 Subject: [PATCH] Revert "limit metal buffers and revert the 207 fix (#2087)" This reverts commit 2fb10f6a1970ba15db7bb4cc86b9586ebd5cf6ad. --- .github/workflows/test.yml | 2 +- test/external/external_test_opt.py | 6 +++--- test/test_schedule.py | 6 ------ tinygrad/lazy.py | 22 +++++++--------------- 4 files changed, 11 insertions(+), 25 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4fe9cd7b0f..184faf6e66 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -154,7 +154,7 @@ jobs: - if: ${{ matrix.task == 'openpilot' }} name: Test openpilot model compile and size run: | - DEBUG=2 ALLOWED_KERNEL_COUNT=207 VALIDTEST=1 FLOAT16=1 DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile.py + DEBUG=2 ALLOWED_KERNEL_COUNT=209 VALIDTEST=1 FLOAT16=1 DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile.py python -c 'import os; assert os.path.getsize("/tmp/output.thneed") < 100_000_000' - if: ${{ matrix.task == 'openpilot' }} name: Test openpilot model correctness (float32) diff --git a/test/external/external_test_opt.py b/test/external/external_test_opt.py index 25a0b0cc6f..48dc84e490 100644 --- a/test/external/external_test_opt.py +++ b/test/external/external_test_opt.py @@ -64,7 +64,7 @@ class TestInferenceMinKernels(unittest.TestCase): for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np)) img = Tensor.randn(1, 3, 224, 224) # TODO: this seems very high - with CLCache(115): + with CLCache(116): model.forward(img).realize() def test_resnet(self): @@ -78,7 +78,7 @@ class TestInferenceMinKernels(unittest.TestCase): model = ViT(embed_dim=192, num_heads=3) for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np)) img = Tensor.randn(1, 3, 224, 224) - with CLCache(222): # NOTE: this is way too high + with CLCache(223): # NOTE: this is way too high out = model.forward(img) assert len(CacheCollector.cache) == 0, "ViT prerealized?" out.realize() @@ -88,7 +88,7 @@ class TestInferenceMinKernels(unittest.TestCase): args_tiny = {"dim": 512, "multiple_of": 256, "n_heads": 8, "n_layers": 4, "norm_eps": 1e-05, "vocab_size": 1000} model = Transformer(**args_tiny) for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np)) - with CLCache(85): + with CLCache(94): model(Tensor([[1,2,3,4]]), 0).realize() @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented") diff --git a/test/test_schedule.py b/test/test_schedule.py index dfd2c6c8d9..d5dc2b217c 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -326,11 +326,5 @@ class TestSchedule(unittest.TestCase): out = x.to('cpu') check_schedule(out, 0, filter_loadops=False) - @unittest.skipUnless(Device.DEFAULT == "METAL", "only for metal") - def test_metal_limit_buffers(self): - t = sum([Tensor([1,2,3,4]) for _ in range(40)]) - for si in t.lazydata.schedule(): - assert len(si.inputs) <= 30 - if __name__ == '__main__': unittest.main(verbosity=2) diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py index f3b8e9c4ad..c073f13ae9 100644 --- a/tinygrad/lazy.py +++ b/tinygrad/lazy.py @@ -202,9 +202,9 @@ class LazyBuffer: # *** elementwise ops *** - def e(self:LazyBuffer, op:Union[UnaryOps, BinaryOps, TernaryOps], *_srcs:LazyBuffer, arg:Optional[Any]=None) -> LazyBuffer: + def e(self:LazyBuffer, op:Union[UnaryOps, BinaryOps, TernaryOps], *srcs:LazyBuffer, arg:Optional[Any]=None) -> LazyBuffer: # srcs includes self - srcs:Tuple[LazyBuffer, ...] = (self,)+_srcs + srcs = (self,)+srcs # if we are separated from other binary ops by movement ops, we push those movement ops above those binaryops if SHUFFLE_MOVEMENT_OPS: srcs = _push_movement_ops(srcs) @@ -225,13 +225,9 @@ class LazyBuffer: if MERGE_ELEMENTWISE_OPS: # remove the buffers from any (childless) BinaryOps that feed into this - merged_srcs:Tuple[Union[LazyOp, LazyBuffer], ...] = tuple([x.op if x.optype == BinaryOps and not x.children and not x.realized else x for x in srcs]) # type: ignore - # NOTE: this is incompete, you can still fuse with reduce ops and exceed the limit - merged_srcs = merged_srcs if self.device != "METAL" or sum(len(x.buffers) for x in merged_srcs) < 30 else srcs - else: - merged_srcs = srcs + srcs = tuple([x.op if x.optype == BinaryOps and not x.children and not x.realized else x for x in srcs]) # type: ignore - return create_lazybuffer(out_device, ShapeTracker.from_shape(out_shape), BinaryOps, LazyOp(op, merged_srcs, arg), out_dtype) + return create_lazybuffer(out_device, ShapeTracker.from_shape(out_shape), BinaryOps, LazyOp(op, srcs, arg), out_dtype) # *** reduce ops *** @@ -251,12 +247,8 @@ class LazyBuffer: # *** movement ops *** def _movement_op(self, st: ShapeTracker, op: MovementOps, arg: Union[Tuple[sint, ...], Tuple[Tuple[sint, sint], ...]]) -> LazyBuffer: - if SHUFFLE_MOVEMENT_OPS and not self.realized and self.optype == BinaryOps and not self.children: - base_bufs = (x.base for x in self.op.buffers) - # don't push if all ast buffers (.base) are realized or sourceless - push_reshape_safe = (self.op.op in UnaryOps) or (any(isinstance(x, LazyOp) or not x.children for x in self.op.src) and not all(x.realized or len(x.op.src) == 0 for x in base_bufs)) - if op not in {MovementOps.EXPAND, MovementOps.PAD} and (op is not MovementOps.RESHAPE or push_reshape_safe): - return self.op.replace_with_movement_ops([(op, arg)]) + if SHUFFLE_MOVEMENT_OPS and self.optype == BinaryOps and not self.realized and (op in {MovementOps.SHRINK, MovementOps.STRIDE, MovementOps.PERMUTE} or (op == MovementOps.RESHAPE and self.op.op in UnaryOps)) and not self.children: + return self.op.replace_with_movement_ops([(op, arg)]) if REMOVE_MOVEMENT_NOPS and not self.realized and st.contiguous: # MovementOps aren't stacked any more, they each have one parent, find the root root = get_movementroot(self) @@ -332,7 +324,7 @@ def _push_movement_ops(srcs:Tuple[LazyBuffer, ...]) -> Tuple[LazyBuffer, ...]: assert isinstance(bx.op.src[0], LazyBuffer) bx = bx.op.src[0] # NOTE: can't push pads past anything where f(0, 0) != 0 or f(0) != 0 - if mops and not bx.realized and bx.optype is BinaryOps and len(bx.children) <= 1 and (all(y[0] is not MovementOps.PAD for y in mops) or all(y.op not in UNSAFE_PAD_OPS for y in bx.op.get_lazyops())): + if mops and not bx.realized and bx.optype is BinaryOps and len(bx.children) <= 1 and (all(x[0] is not MovementOps.PAD for x in mops) or all(x.op not in UNSAFE_PAD_OPS for x in bx.op.get_lazyops())): new_srcs.append(bx.op.replace_with_movement_ops(mops[::-1])) else: new_srcs.append(x)