From c2504357afbc3caf77e06249f190781df70d3e4e Mon Sep 17 00:00:00 2001
From: nimlgen <138685161+nimlgen@users.noreply.github.com>
Date: Mon, 13 Jan 2025 23:53:13 +0300
Subject: [PATCH 01/29] am: lock to access dev (#8594)

* am: lock to access dev

* wording

* just works

* disable

---
 tinygrad/runtime/support/am/amdev.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/tinygrad/runtime/support/am/amdev.py b/tinygrad/runtime/support/am/amdev.py
index a070ba0b99..aaf4a4ab74 100644
--- a/tinygrad/runtime/support/am/amdev.py
+++ b/tinygrad/runtime/support/am/amdev.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
-import ctypes, collections, time, dataclasses, pathlib
-from tinygrad.helpers import to_mv, mv_address, getenv, round_up, DEBUG
+import ctypes, collections, time, dataclasses, pathlib, fcntl, os, signal
+from tinygrad.helpers import to_mv, mv_address, getenv, round_up, DEBUG, temp
 from tinygrad.runtime.autogen.am import am, mp_11_0, mp_13_0_0, nbio_4_3_0, mmhub_3_0_0, gc_11_0_0, osssys_6_0_0
 from tinygrad.runtime.support.allocator import TLSFAllocator
 from tinygrad.runtime.support.am.ip import AM_SOC21, AM_GMC, AM_IH, AM_PSP, AM_SMU, AM_GFX, AM_SDMA
@@ -255,6 +255,15 @@ class AMDev:
     self.pcidev, self.devfmt = pcidev, devfmt
     self.vram, self.doorbell64, self.mmio = vram_bar, doorbell_bar, mmio_bar

+    os.umask(0) # Set umask to 0 to allow creating files with 0666 permissions
+
+    # Avoid O_CREAT because we don't want to re-create/replace an existing file (triggers extra perms checks) when opening as non-owner.
+    if os.path.exists(lock_name:=temp(f"am_{self.devfmt}.lock")): self.lock_fd = os.open(lock_name, os.O_RDWR)
+    else: self.lock_fd = os.open(lock_name, os.O_RDWR | os.O_CREAT, 0o666)
+
+    try: fcntl.flock(self.lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+    except OSError: raise RuntimeError(f"Failed to open AM device {self.devfmt}. It's already in use.")
+
     self._run_discovery()
     self._build_regs()

@@ -284,14 +293,17 @@ class AMDev:
     self.sdma:AM_SDMA = AM_SDMA(self)

     if self.partial_boot and (self.reg("regCP_MEC_RS64_CNTL").read() & gc_11_0_0.CP_MEC_RS64_CNTL__MEC_HALT_MASK == 0):
-      print(f"am {self.devfmt}: MEC is active. Someone might be using the GPU? Issue a full reset.")
+      if DEBUG >= 2: print(f"am {self.devfmt}: MEC is active. Issue a full reset.")
       self.partial_boot = False

     if not self.partial_boot:
-      if self.psp.is_sos_alive(): self.smu.mode1_reset()
-      for ip in [self.soc21, self.gmc, self.ih, self.psp, self.smu]:
-        ip.init()
-        if DEBUG >= 2: print(f"am {self.devfmt}: {ip.__class__.__name__} initialized")
+      try: # do not interrupt the boot process
+        signal.signal(signal.SIGINT, signal.SIG_IGN)
+        if self.psp.is_sos_alive(): self.smu.mode1_reset()
+        for ip in [self.soc21, self.gmc, self.ih, self.psp, self.smu]:
+          ip.init()
+          if DEBUG >= 2: print(f"am {self.devfmt}: {ip.__class__.__name__} initialized")
+      finally: signal.signal(signal.SIGINT, signal.default_int_handler)

     # Booting done
     self.is_booting = False
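The locking scheme this patch adds is self-contained enough to try outside the driver. A minimal sketch, assuming a Unix host; the lock path and the acquire_dev_lock name are placeholders invented here, not part of the patch:

```python
import fcntl, os

def acquire_dev_lock(lock_name="/tmp/am_test.lock"):
  os.umask(0)  # let the 0o666 mode below survive so other users can open the lock
  # avoid O_CREAT on an existing file: re-creating it as a non-owner trips extra permission checks
  if os.path.exists(lock_name): fd = os.open(lock_name, os.O_RDWR)
  else: fd = os.open(lock_name, os.O_RDWR | os.O_CREAT, 0o666)
  try: fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)  # non-blocking: raise instead of waiting
  except OSError: raise RuntimeError(f"device already in use (lock file {lock_name})")
  return fd  # hold the fd open; the lock lives as long as the process does

fd = acquire_dev_lock()
print("lock acquired")
```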
From ae2229d7275e6a52568a4e7909a322fbac1755f8 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Mon, 13 Jan 2025 16:32:07 -0500
Subject: [PATCH 02/29] assert kernel buffer limit at compile time [pr] (#8595)

* remove the BUF_LIMIT assert

* skip the base one

---
 test/external/external_model_benchmark.py | 5 +++--
 test/test_linearizer.py | 3 +--
 test/test_schedule.py | 5 +++--
 tinygrad/engine/schedule.py | 6 ------
 4 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/test/external/external_model_benchmark.py b/test/external/external_model_benchmark.py
index 1ebe97641e..3ed7b82746 100644
--- a/test/external/external_model_benchmark.py
+++ b/test/external/external_model_benchmark.py
@@ -9,6 +9,7 @@ from onnx2torch import convert
 from extra.onnx import get_run_onnx
 from tinygrad.helpers import OSX, DEBUG, fetch
 from tinygrad import Tensor, Device
+from tinygrad.device import CompileError

 MODELS = {
   "resnet50": "https://github.com/onnx/models/raw/main/validated/vision/classification/resnet/model/resnet50-caffe2-v1-9.onnx",
@@ -72,10 +73,10 @@ def benchmark_model(m, devices, validate_outs=False):
       for _ in range(3): {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}
       benchmark(m, f"tinygrad_{device.lower()}_jit", lambda: {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}) # noqa: F821
       del inputs, tinygrad_model, tinygrad_jitted_model
-    except RuntimeError as e:
+    except CompileError as e:
       # TODO: we don't run the dm model on METAL for now
       if Device.DEFAULT == "METAL":
-        assert "buffer count limit" in str(e)
+        assert "no 'buffer' resource location available" in str(e)
         return
       else: raise e

diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index a4d8da48ca..cd274b6aaf 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -12,7 +12,6 @@ from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
 # from tinygrad.ops import Variable
 from tinygrad.tensor import Tensor, _to_np_dtype
-from tinygrad.engine.schedule import BUF_LIMIT
 from tinygrad.engine.realize import run_schedule, lower_schedule, CompiledRunner
 from tinygrad.helpers import prod, Context, getenv, CI, flatten, dedup, AMX
 from tinygrad.dtype import DType, dtypes
@@ -1701,7 +1700,7 @@ class TestHandCodedOpts(unittest.TestCase):
     # float4/other hcopt shouldn't upcast last axis, since we already have 7 upcast, and the last axis is not very contiguous
     assert k.upcasted == 1 and k.full_shape[-1] == 7

-  @unittest.skipIf((buf_max:=BUF_LIMIT.get(Device.DEFAULT)) is not None and buf_max <= 37, "this test uses too many bufs")
+  @unittest.skipIf(Device.DEFAULT == "METAL", "METAL can only run kernels with up to 32 buffers")
   def test_masked_upcast_wino(self):
     monster = Tensor.stack(*[Tensor.stack(*[Tensor.rand(16) for _ in range(6)]) for _ in range(6)])

diff --git a/test/test_schedule.py b/test/test_schedule.py
index d69e9dbaa6..e83ae13c28 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -16,7 +16,7 @@ from tinygrad.shape.view import View
 from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, view_supported_devices, symbolic_simple, merge_views
 from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, GlobalCounters, getenv, SPLIT_REDUCEOP, unwrap, prod, Context
 from tinygrad.codegen.kernel import verify_ast
-from tinygrad.engine.schedule import BUF_LIMIT, ScheduleItem, create_schedule_with_vars, view_right, view_left, remove_movement_ops
+from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars, view_right, view_left, remove_movement_ops
 from tinygrad.engine.realize import CompiledRunner, run_schedule, lower_schedule
 from extra.models.llama import precompute_freqs_cis
@@ -1363,8 +1363,9 @@ class TestSchedule(unittest.TestCase):
   @unittest.expectedFailure
   def test_conv2d_fused_half(self): _test_conv2d(5, dtype=dtypes.half)

+  @unittest.skip("splitting kernels exceeding device buffer count is not yet supported")
   def _test_buf_cnt(self, cnt:int, allowed:int):
-    if (m:=BUF_LIMIT.get(Device.DEFAULT)) is None or m != 32: self.skipTest(f"test needs a buf_max of 32 {Device.DEFAULT}")
+    #if (m:=BUF_LIMIT.get(Device.DEFAULT)) is None or m != 32: self.skipTest(f"test needs a buf_max of 32 {Device.DEFAULT}")
     alu = functools.reduce(lambda x,y: x+y, [Tensor.ones((1, 1)).contiguous().realize() for _ in range(cnt-1)])
     s = alu.schedule()
     assert len(s) == allowed

diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 34cea54d8d..26c789aeb2 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -13,8 +13,6 @@ from tinygrad.device import Buffer
 # creation can recurse a lot
 sys.setrecursionlimit(10000)

-BUF_LIMIT = {"METAL":32}
-
 # **** big graph spec

 tensor_uop_spec = PatternMatcher([
@@ -236,10 +234,6 @@ def schedule_uop(pre:UOp, ctx:ScheduleContext) -> ScheduleItem:
   sink = graph_rewrite(graph_rewrite(sink, view_left), view_right)
   # convert to AST
   sink = graph_rewrite(graph_rewrite(sink, to_si+check_preload if len(si_ctx.assigns) != 0 else to_si, si_ctx), append_bufs, si_ctx)
-  # assert buffer count limit
-  if (limit:=BUF_LIMIT.get(device:=si_ctx.bufs[0].device)) is not None and len(si_ctx.bufs) >= limit:
-    if DEBUG >= 3: print(sink)
-    raise RuntimeError(f"Kernel for {si_ctx.metadata} exceeded the {limit} buffer count limit for {device} with {len(si_ctx.bufs)} buffers.")
   # we also allow masked views. if it has a single view and it's equal when you shrink a contig, it's fine
   for ubuf,ops in si_ctx.assign_adj.items():
     if si_ctx.sinked.get(ubuf) is not None and not all((s:=x.st_arg).contiguous or (len(s.views) == 1 and (m:=s.views[0].mask) is not None \

From c4e33048c6c3a6896e8ec04a69f055b00f64366b Mon Sep 17 00:00:00 2001
From: chenyu
Date: Mon, 13 Jan 2025 20:13:44 -0500
Subject: [PATCH 03/29] test Tensor.clone has a different lazydata [pr] (#8600)

---
 test/test_tensor.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/test_tensor.py b/test/test_tensor.py
index 610fa7c6af..a5152caa95 100644
--- a/test/test_tensor.py
+++ b/test/test_tensor.py
@@ -646,11 +646,20 @@ class TestZeroShapeTensor(unittest.TestCase):

   def test_clone(self):
     a = Tensor.rand(16, 16).realize()
+    self.assertIsNot(a.lazydata, a.clone().lazydata)
     np.testing.assert_allclose(a.numpy(), a.clone().numpy())

     a = Tensor.rand(16, 16).mul(5.0).add(5.0)
+    self.assertIsNot(a.lazydata, a.clone().lazydata)
     np.testing.assert_allclose(a.numpy(), a.clone().numpy())

+  def test_clone_with_shrink(self):
+    a = Tensor.empty(16, 16)
+    self.assertIsNot(a.lazydata, a.clone().lazydata)
+
+    b = a.shrink(((2, 10), None))
+    self.assertIsNot(b.lazydata, b.clone().lazydata)
+
   def test_clone_with_grad(self):
     a = Tensor.rand(16, 16, requires_grad=True)
     a.mul(5.0).add(5.0).mean().backward()

From 227d96d7a30734ba110efa61846e3a1ebcc00d9e Mon Sep 17 00:00:00 2001
From: chenyu
Date: Mon, 13 Jan 2025 20:28:14 -0500
Subject: [PATCH 04/29] remove unused src from metaop [pr] (#8601)

---
 tinygrad/ops.py | 2 +-
 tinygrad/tensor.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index ee810a4c57..6a039d54bf 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -440,7 +440,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
   # *** from LazyBuffer ***

   @staticmethod
-  def metaop(op:Ops, shape:tuple[sint, ...], dtype:DType, device:str, arg=None, src:tuple[UOp, ...]=()) -> UOp:
+  def metaop(op:Ops, shape:tuple[sint, ...], dtype:DType, device:str, arg=None) -> UOp:
     from tinygrad.shape.shapetracker import ShapeTracker
     # Tensor const is CONST(VIEW(DEVICE)) -> RESHAPE -> EXPAND
     if op is Ops.CONST:

diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 250f262102..70fcac9274 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -41,9 +41,9 @@ class Function:

 import tinygrad.function as F

-def _metaop(op, shape:tuple[sint,...], dtype:DType, device:Union[str, tuple[str, ...]], arg=None, src:tuple[UOp, ...]=()):
-  if isinstance(device, str): return UOp.metaop(op, shape, dtype, device, arg, src)
-  return MultiLazyBuffer([UOp.metaop(op, shape, dtype, d, arg, src) for d in device], None)
+def _metaop(op, shape:tuple[sint,...], dtype:DType, device:Union[str, tuple[str, ...]], arg=None):
+  if isinstance(device, str): return UOp.metaop(op, shape, dtype, device, arg)
+  return MultiLazyBuffer([UOp.metaop(op, shape, dtype, d, arg) for d in device], None)

 def _from_np_dtype(npdtype:'np.dtype') -> DType: # type: ignore [name-defined] # noqa: F821
   import numpy as np
From d443e91d82598a862dfc44af868e3af99e3782de Mon Sep 17 00:00:00 2001
From: chenyu
Date: Mon, 13 Jan 2025 21:29:13 -0500
Subject: [PATCH 05/29] remove custom splits in Tensor.shard [pr] (#8602)

towards even split only
---
 test/test_multitensor.py | 18 ++++++++----------
 tinygrad/tensor.py | 19 ++++++++-----------
 2 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index a80014c084..67cb9757e1 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -460,18 +460,15 @@ class TestMultiTensor(unittest.TestCase):

   def test_uneven_shard_with_empty(self):
     N = 4
-    X = Tensor.rand(16, 1, 17).contiguous().realize()
+    X = Tensor.rand(16, 1, 3).contiguous().realize()
     np_x = X.numpy()
     devices = tuple(f"{Device.DEFAULT}:{i}" for i in range(N))

     # test empty shard
-    np.testing.assert_equal(X.shard(devices, 0, (2, 2, 12, 0)).numpy(), np_x)
+    np.testing.assert_equal(X.shard(devices, 0).numpy(), np_x)

     # test reshape with empty shard
-    np.testing.assert_equal(X.shard(devices, 0, (2, 2, 12, 0)).reshape(8, 1, 34).numpy(), np_x.reshape(8, 1, 34))
-
-    # test elementwise with empty shard
-    np.testing.assert_equal((X.shard(devices, 0, (2, 2, 12, 0)) + X.shard(devices, 0, (0, 0, 1, 15))).numpy(), np_x + np_x)
+    np.testing.assert_equal(X.shard(devices, 0).reshape(8, 1, 6).numpy(), np_x.reshape(8, 1, 6))

   def test_multiple_uneven_shard(self):
     N = 4
@@ -479,8 +476,8 @@ class TestMultiTensor(unittest.TestCase):
     Y = Tensor.rand(4, 1, 257).contiguous().realize()
     np_x, np_y = X.numpy(), Y.numpy()
     devices = tuple(f"{Device.DEFAULT}:{i}" for i in range(N))
-    X.shard_(devices, 2, (2, 38, 47, 170))
-    Y.shard_(devices, 2, (34, 53, 51, 119))
+    X.shard_(devices, 2)
+    Y.shard_(devices, 2)
     np.testing.assert_equal(X.numpy(), np_x)
     np.testing.assert_equal(Y.numpy(), np_y)
     np.testing.assert_equal((X + Y).numpy(), np_x + np_y)
@@ -534,6 +531,7 @@ class TestMultiTensor(unittest.TestCase):
     with self.assertRaises((AssertionError, ValueError)):
       t0.reshape((26*15,7))

+  @unittest.skip("no longer supports splits")
   def test_reshape_on_axis_uneven(self):
     def reshape_helper(t0, t, t_axis):
       np.testing.assert_allclose(t0.reshape(t.shape).numpy(), t.numpy())
@@ -606,7 +604,7 @@ class TestMultiTensor(unittest.TestCase):
     self.assertEqual(t.lazydata.axis, t2.lazydata.axis)

   def test_rand_like_uneven_shard(self):
-    t = Tensor.empty((4, 42, 15)).shard(devices_3, axis=1, splits=(14, 7, 21))
+    t = Tensor.empty((4, 42, 15)).shard(devices_3, axis=1)
     t2 = Tensor.rand_like(t)
     self.assertEqual(t.shape, t2.shape)
     self.assertEqual(t.device, t2.device)
@@ -657,7 +655,7 @@ class TestMultiTensor(unittest.TestCase):

   def test_dropout_on_uneven_shard_axis(self):
     with Tensor.train():
-      X = Tensor.ones(256).shard(devices_3, axis=0, splits=(100, 50, 106))
+      X = Tensor.ones(256).shard(devices_3, axis=0)
       output = X.dropout(0.5).numpy()
       unique, counts = np.unique(output, return_counts=True)
       assert set(unique) == {0, 2}, unique

diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 70fcac9274..6ca7bcb652 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -394,33 +394,30 @@ class Tensor(SimpleMathTrait):
       if self.grad is not None and real.grad is not None: self.grad.replace(real.grad)
       return self.replace(real)

-  def shard(self, devices:tuple[str, ...], axis:Optional[int]=None, splits:Optional[tuple[int, ...]]=None) -> Tensor:
+  def shard(self, devices:tuple[str, ...], axis:Optional[int]=None) -> Tensor:
     """
-    Shards the tensor across the given devices. Optionally specify which axis to shard on, and how to split it across devices.
+    Shards the tensor across the given devices. Optionally specify which axis to shard on.

     ```python exec="true" source="above" session="tensor" result="python"
     t = Tensor.empty(2, 3)
-    print(t.shard((t.device, t.device), axis=1, splits=(2, 1)).lazydata)
+    print(t.shard((t.device, t.device), axis=1).lazydata)
     ```
-
     """
     assert isinstance(self.lazydata, UOp), "can't shard a MultiLazyBuffer"
     devices, bounds = tuple(Device.canonicalize(x) for x in devices), None
     if axis is not None:
       axis = self._resolve_dim(axis)
-      if splits is None:
-        if not isinstance(total:=self.shape[axis], int): raise RuntimeError(f"cannot shard symbolic shape {self.shape=}, {axis=}")
-        sz = ceildiv(total, len(devices))
-        splits = tuple([max(0, min(sz, total - sz*i)) for i in range(len(devices))])
-      assert sum(splits) == self.shape[axis], "specified splits do not sum up to axis shape"
+      if not isinstance(total:=self.shape[axis], int): raise RuntimeError(f"cannot shard symbolic shape {self.shape=}, {axis=}")
+      sz = ceildiv(total, len(devices))
+      splits = tuple([max(0, min(sz, total - sz*i)) for i in range(len(devices))])
       bounds = tuple(itertools.pairwise(itertools.accumulate(splits, initial=0)))
     return Tensor(MultiLazyBuffer.from_sharded(self.lazydata, devices, axis, bounds), device=devices, requires_grad=self.requires_grad)

-  def shard_(self, devices:tuple[str, ...], axis:Optional[int]=None, splits:Optional[tuple[int, ...]]=None):
+  def shard_(self, devices:tuple[str, ...], axis:Optional[int]=None):
     """
     Shards the tensor across the given devices in place.
     """
-    return self.replace(self.shard(devices, axis, splits))
+    return self.replace(self.shard(devices, axis))

   @staticmethod
   def from_uop(y:UOp, **kwargs) -> Tensor:
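The split computation this patch inlines is easy to sanity-check on its own. A small sketch of the even-split rule shard() now always applies; ceildiv here is a stand-in for tinygrad.helpers.ceildiv:

```python
import itertools

def ceildiv(a, b): return -(a // -b)  # ceiling division for positive ints

def even_splits(total, n):
  # ceildiv-sized chunks; trailing shards shrink, possibly to zero
  sz = ceildiv(total, n)
  return tuple(max(0, min(sz, total - sz*i)) for i in range(n))

assert even_splits(16, 4) == (4, 4, 4, 4)
assert even_splits(3, 4) == (1, 1, 1, 0)      # a trailing shard can be empty
assert even_splits(257, 4) == (65, 65, 65, 62)
```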
From 05e54f00d38c7a13df35359b7432488fb53bc9fb Mon Sep 17 00:00:00 2001
From: chenyu
Date: Mon, 13 Jan 2025 23:40:05 -0500
Subject: [PATCH 06/29] remove bounds from MultiLazyBuffer.from_sharded [pr] (#8603)

without a custom bound, the bound is uniquely determined by shape and axis
---
 tinygrad/multi.py | 10 +++++++---
 tinygrad/tensor.py | 13 ++++---------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/tinygrad/multi.py b/tinygrad/multi.py
index 20f49b3877..97a17fe714 100644
--- a/tinygrad/multi.py
+++ b/tinygrad/multi.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 import functools, itertools, operator
-from tinygrad.helpers import all_same, all_int, dedup, prod, DEBUG, RING, getenv
+from tinygrad.helpers import all_same, all_int, dedup, prod, DEBUG, RING, getenv, ceildiv
 from tinygrad.dtype import DType
 from tinygrad.ops import Ops, MathTrait, UOp, sint
@@ -62,8 +62,12 @@ class MultiLazyBuffer(MathTrait):
   def __repr__(self): return f""

   @staticmethod
-  def from_sharded(lb:UOp, devices:tuple[str, ...], axis:int|None, bounds:tuple[tuple[int, int], ...]|None):
-    assert (axis is None) == (bounds is None), "must specify bounds iff axis is specified"
+  def from_sharded(lb:UOp, devices:tuple[str, ...], axis:int|None):
+    if axis is not None:
+      if not isinstance(total:=lb.shape[axis], int): raise RuntimeError(f"cannot shard symbolic shape {lb.shape=}, {axis=}")
+      sz = ceildiv(total, len(devices))
+      splits = tuple([max(0, min(sz, total - sz*i)) for i in range(len(devices))])
+      bounds = tuple(itertools.pairwise(itertools.accumulate(splits, initial=0)))
     lbs = [lb] * len(devices)
     sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(to_sharded(lbs, axis, bounds) if axis is not None and bounds is not None else lbs, devices)]
     # NOTE: this contiguous is making it impossible for the scheduler to do late const folding

diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 6ca7bcb652..60b9735bd9 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -179,7 +179,7 @@ class Tensor(SimpleMathTrait):
       # data might be on a different device
       if isinstance(device, str): self.lazydata:Union[UOp, MultiLazyBuffer] = data if data.device == device else data.copy_to_device(device)
       # if device is a tuple, we should have/construct a MultiLazyBuffer
-      elif isinstance(data, UOp): self.lazydata = MultiLazyBuffer.from_sharded(data, device, None, None)
+      elif isinstance(data, UOp): self.lazydata = MultiLazyBuffer.from_sharded(data, device, None)
       else:
         assert data.device == device, f"MultiLazyBuffer device mismatch, {data.device} != {device}"
         self.lazydata = data
@@ -404,14 +404,9 @@ class Tensor(SimpleMathTrait):
     ```
     """
     assert isinstance(self.lazydata, UOp), "can't shard a MultiLazyBuffer"
-    devices, bounds = tuple(Device.canonicalize(x) for x in devices), None
-    if axis is not None:
-      axis = self._resolve_dim(axis)
-      if not isinstance(total:=self.shape[axis], int): raise RuntimeError(f"cannot shard symbolic shape {self.shape=}, {axis=}")
-      sz = ceildiv(total, len(devices))
-      splits = tuple([max(0, min(sz, total - sz*i)) for i in range(len(devices))])
-      bounds = tuple(itertools.pairwise(itertools.accumulate(splits, initial=0)))
-    return Tensor(MultiLazyBuffer.from_sharded(self.lazydata, devices, axis, bounds), device=devices, requires_grad=self.requires_grad)
+    devices = tuple(Device.canonicalize(x) for x in devices)
+    if axis is not None: axis = self._resolve_dim(axis)
+    return Tensor(MultiLazyBuffer.from_sharded(self.lazydata, devices, axis), device=devices, requires_grad=self.requires_grad)

   def shard_(self, devices:tuple[str, ...], axis:Optional[int]=None):
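The bounds that from_sharded now derives from those splits are just cumulative offsets paired into (start, end) ranges. A quick sketch, assuming Python 3.10+ for itertools.pairwise:

```python
import itertools

splits = (65, 65, 65, 62)  # e.g. a size-257 axis on 4 devices
bounds = tuple(itertools.pairwise(itertools.accumulate(splits, initial=0)))
assert bounds == ((0, 65), (65, 130), (130, 195), (195, 257))
```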
From 863abc71401c496fa34ca0ab49b0e68a9f4ffff2 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Tue, 14 Jan 2025 03:01:59 -0500
Subject: [PATCH 07/29] scheduling graph_rewrite prereqs for BLOCK in ASSIGN (#8598)

* remove the BUF_LIMIT assert

* skip the base one

* work

* work

* good error

* ok comment

* shorter check

---
 .../external/process_replay/process_replay.py | 6 +-
 tinygrad/engine/schedule.py | 57 +++++++++----------
 2 files changed, 29 insertions(+), 34 deletions(-)

diff --git a/test/external/process_replay/process_replay.py b/test/external/process_replay/process_replay.py
index b5432bb6a8..1bc597c6b9 100755
--- a/test/external/process_replay/process_replay.py
+++ b/test/external/process_replay/process_replay.py
@@ -2,7 +2,7 @@
 # compare kernels created by HEAD against master
 from collections import defaultdict
 import os, multiprocessing, logging, pickle, sqlite3, difflib, functools, warnings
-from typing import Callable, List, Set, Tuple, Union, cast
+from typing import Callable, List, Tuple, Union, cast
 from tinygrad.helpers import VERSION, Context, ContextVar, colored, db_connection, getenv, tqdm
 from tinygrad.engine.schedule import ScheduleContext, schedule_uop
 from tinygrad.codegen.kernel import Kernel, Opt
@@ -30,9 +30,9 @@ class ProcessReplayWarning(Warning): pass

 # *** recreators

-def recreate_sched(ast:UOp, assigns:Set[UOp]) -> UOp:
+def recreate_sched(ast:UOp) -> UOp:
   # NOTE: process replay isn't meant to actually schedule anything
-  return schedule_uop(ast, ScheduleContext(assigns=assigns, tensor_uops=defaultdict(list))).ast
+  return schedule_uop(ast, ScheduleContext(tensor_uops=defaultdict(list))).ast
 def recreate_kernel(ast:UOp, opts:Renderer, applied_opts:List[Opt], name:str) -> str:
   k = Kernel(ast, opts=opts)
   for opt in applied_opts: k.apply_opt(opt)

diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 26c789aeb2..5803e6c655 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -183,14 +183,9 @@ view_right = merge_views+PatternMatcher([

 @dataclass(frozen=True)
 class ScheduleItemContext:
-  ops_metadata: dict[UOp, Metadata]
-  assigns: set[UOp]
   var_vals: dict[Variable, int]
-  sinked: dict[UOp, UOp]
   sts: set[ShapeTracker] = field(default_factory=set)
   bufs: list[UOp] = field(default_factory=list)
-  metadata: set[Metadata] = field(default_factory=set)
-  assign_adj: dict[UOp, list[UOp]] = field(default_factory=dict)

 def _append_st_vars(ctx:ScheduleItemContext, x:UOp) -> UOp|None:
   if (st:=unwrap(x.st)) in ctx.sts: return None
@@ -204,47 +199,47 @@ def _append_buf(ctx:ScheduleItemContext, x:UOp) -> UOp:
   return UOp(Ops.DEFINE_GLOBAL, x.dtype.ptr(size=x.size), (), len(ctx.bufs)-1)
 append_bufs = PatternMatcher([(UPat(Ops.BUFFER, name="x"), _append_buf)])

-def _append_preload(ctx:ScheduleItemContext, x:UOp, b:UOp) -> UOp:
-  (adj_loads:=ctx.assign_adj.setdefault(b, [])).append(x)
-  if not all_same([x.op for x in adj_loads]): raise RuntimeError(f"Detected cycle when fusing {adj_loads}. Can only fuse PRELOAD or LOAD of {b}")
-  return x.replace(op=Ops.LOAD)
-check_preload = PatternMatcher([(UPat(Ops.PRELOAD, src=(UPat.var("b"), UPat()), name="x"), _append_preload),])
-
 to_si = PatternMatcher([
   (UPat(Ops.VIEW, name="x"), _append_st_vars),
   (UPat(Ops.SINK, src=(UPat.store(UPat.var("b"), UPat(), UPat(GroupOp.Meta, name="x")),)), lambda b,x: x.replace(src=(b, *x.src))),
   # don't need contiguous or assign anymore
   (UPat(Ops.CONTIGUOUS, src=(UPat.var("x"),)), lambda x: x),
   (UPat(Ops.ASSIGN, src=(UPat(), UPat.var("x"),)), lambda x: x),
+  # PRELOAD becomes LOAD
+  (UPat(Ops.PRELOAD, name="root"), lambda root:root.replace(op=Ops.LOAD)),
 ])

-add_metadata = PatternMatcher([(UPat(tuple(Ops), name="x"), lambda ctx,x: None if (m:=ctx.ops_metadata.get(x)) is None else ctx.metadata.add(m)),])
 add_assign_adjacents = PatternMatcher([(UPat.load(UPat.var("b"), UPat(), name="x"), lambda ctx,b,x: ctx.assign_adj.setdefault(b, []).append(x)
   if b in ctx.assigns else None)])

-# late folding for multi output kernels
-multioutput = PatternMatcher([(UPat.load(UPat.var("b"), UPat()), lambda ctx,b: ctx.sinked.get(b)),])
+# LOAD(BUFFER) -> the STORE value if we're doing the STORE in the same kernel
+multioutput = PatternMatcher([(UPat.load(UPat.var("b"), UPat()), lambda ctx,b: ctx.get(b)),])

 def schedule_uop(pre:UOp, ctx:ScheduleContext) -> ScheduleItem:
-  # create the ast context
-  si_ctx = ScheduleItemContext(ctx.ops_metadata, ctx.assigns, ctx.var_vals, {x.buf_uop:x.src[2] for x in pre.src})
-  create_ctx = add_metadata if len(si_ctx.assigns) == 0 else add_metadata+add_assign_adjacents
-  sink = graph_rewrite(pre, create_ctx if len(si_ctx.sinked) == 1 else multioutput+create_ctx, si_ctx)
-  # do movement ops
-  sink = graph_rewrite(graph_rewrite(sink, view_left), view_right)
-  # convert to AST
-  sink = graph_rewrite(graph_rewrite(sink, to_si+check_preload if len(si_ctx.assigns) != 0 else to_si, si_ctx), append_bufs, si_ctx)
-  # we also allow masked views. if it has a single view and it's equal when you shrink a contig, it's fine
-  for ubuf,ops in si_ctx.assign_adj.items():
-    if si_ctx.sinked.get(ubuf) is not None and not all((s:=x.st_arg).contiguous or (len(s.views) == 1 and (m:=s.views[0].mask) is not None \
-        and ShapeTracker.from_shape(s.shape).shrink(m) == s.shrink(m)) for x in ops):
-      raise RuntimeError("self operand of augmented assign must be contiguous.\nhelp: consider using .contiguous():\n"
-                         +colored(" - a += a.T\n", "red")+colored(" + a += a.T.contiguous()", "green"))
+  # remove movement ops + substitute LOAD of fused STORE with just the value
+  sink = graph_rewrite(graph_rewrite(pre, multioutput+view_left, store_bufs:={x.buf_uop:x.src[2] for x in pre.src}), view_right)
+  # remove extra uops from SINK + substitute BUFFER with DEFINE_GLOBAL
+  ast = graph_rewrite(sink, to_si+append_bufs, si_ctx:=ScheduleItemContext(ctx.var_vals))
   # capture process replay
   if CAPTURE_PROCESS_REPLAY:
-    with Context(PICKLE_BUFFERS=0): PROCESS_REPLAY_CAPTURE[str(pre.key)] = pickle.dumps((pre, si_ctx.assigns, ContextVar._cache, sink))
-  return ScheduleItem(sink, tuple(u.buffer for u in si_ctx.bufs if u.size != 0), tuple(si_ctx.metadata),
-                      tuple(ubuf for ubuf,ops in si_ctx.assign_adj.items() if any(x.op is Ops.PRELOAD for x in ops)))
+    with Context(PICKLE_BUFFERS=0): PROCESS_REPLAY_CAPTURE[str(pre.key)] = pickle.dumps((pre, ContextVar._cache, sink))
+  # deal with ASSIGN
+  assign_preloads: list[UOp] = []
+  if len(ctx.assigns) != 0:
+    for x in list(sink.toposort)[::-1]:
+      # we only allow a kernel to depend on either the before ASSIGN or after ASSIGN version of a BUFFER
+      if x.op is Ops.LOAD and x.buf_uop in assign_preloads: raise RuntimeError("cycle detected in graph")
+      # PRELOAD tells the toposort this kernel should run before ASSIGN
+      if x.op is Ops.PRELOAD:
+        assign_preloads.append(x.buf_uop)
+        # if this kernel also assigns to the buffer, we only allow either contiguous or masked views for the LOAD
+        if x.buf_uop in store_bufs and not (st:=x.st_arg).contiguous:
+          # if it has a single view and it's equal when you shrink a contig, it's fine
+          if len(st.views) != 1 or (mask:=st.views[0].mask) is None or ShapeTracker.from_shape(st.shape).shrink(mask) != st.shrink(mask):
+            raise RuntimeError("self operand of augmented assign must be contiguous.\nhelp: consider using .contiguous():\n"
+                               +colored(" - a += a.T\n", "red")+colored(" + a += a.T.contiguous()", "green"))
+  return ScheduleItem(ast, tuple(u.buffer for u in si_ctx.bufs if u.size != 0),
+                      tuple(dedup(m for x in pre.toposort if (m:=ctx.ops_metadata.get(x)) is not None)), tuple(dedup(assign_preloads)))

 PROCESS_REPLAY_CAPTURE: dict[str, bytes] = {}
 if CAPTURE_PROCESS_REPLAY:
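The new ASSIGN handling walks the kernel in reverse toposort order and treats a LOAD of an already-PRELOADed buffer as a cycle. A toy sketch of that scan, with (kind, buffer) tuples standing in for real UOps:

```python
def check_assign(ops_reverse_toposort):
  assign_preloads = []
  for kind, buf in ops_reverse_toposort:
    # a LOAD of a buffer we already PRELOADed would read both the pre- and post-assign version
    if kind == "LOAD" and buf in assign_preloads: raise RuntimeError("cycle detected in graph")
    if kind == "PRELOAD": assign_preloads.append(buf)
  return assign_preloads

assert check_assign([("PRELOAD", "b0"), ("LOAD", "b1")]) == ["b0"]
try: check_assign([("PRELOAD", "b0"), ("LOAD", "b0")])
except RuntimeError as e: assert "cycle" in str(e)
```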
From 5aab2806f0dab1f3f6087ab5f2ab08f38c06d465 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Tue, 14 Jan 2025 05:09:56 -0500
Subject: [PATCH 08/29] rename to test_tensor_uop + use upats for asserting [pr] (#8604)

* rename to test_tensor_uop + use upats for asserting [pr]

* fix pr

---
 test/{test_lazybuffer.py => test_tensor_uop.py} | 13 +++++++------
 tinygrad/engine/schedule.py | 6 +++---
 2 files changed, 10 insertions(+), 9 deletions(-)
 rename test/{test_lazybuffer.py => test_tensor_uop.py} (90%)

diff --git a/test/test_lazybuffer.py b/test/test_tensor_uop.py
similarity index 90%
rename from test/test_lazybuffer.py
rename to test/test_tensor_uop.py
index 7d702a82be..a3b01604e5 100644
--- a/test/test_lazybuffer.py
+++ b/test/test_tensor_uop.py
@@ -3,9 +3,9 @@
 import numpy as np
 import unittest
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.engine.realize import run_schedule
-from tinygrad.ops import Ops, UOp
+from tinygrad.ops import Ops, UOp, UPat

-class TestLazyBuffer(unittest.TestCase):
+class TestTensorUOp(unittest.TestCase):
   def test_fromcpu_shape_tracker(self):
     def helper(a: np.ndarray):
       print(a.shape, a.strides, a.flags.c_contiguous)
@@ -68,7 +68,7 @@ class TestTensorUOp(unittest.TestCase):
     assert lb.const_like(1).const_arg == 1.0
     assert type(lb.const_like(1).const_arg) is float

-  def test_forced_realized_alu(self):
+  def test_contiguous_alu(self):
     a = Tensor.randn(2, 2).realize()
     b = Tensor.randn(2, 2).realize()
     add = (a+b).contiguous()
@@ -84,13 +84,14 @@ class TestTensorUOp(unittest.TestCase):
     sched = empty.schedule()
     self.assertEqual(len(sched), 0)

+reduce_kernel = UPat(Ops.SINK, src=(UPat(Ops.STORE, src=(UPat(), UPat(), UPat(Ops.REDUCE_AXIS)))))
 class TestReduceOp(unittest.TestCase):
   def test_no_split_reduce_kernel(self):
     a = Tensor.rand(4, 4).realize()
     a = a.sum()
     sched = a.schedule()
     assert len(sched) == 1
-    self.assertIs(sched[0].ast.src[0].src[2].op, Ops.REDUCE_AXIS)
+    assert reduce_kernel.match(sched[0].ast, {})

   def test_split_reduce_kernel_dim0(self):
     a = Tensor.rand(256, 255).realize()
@@ -98,7 +99,7 @@ class TestReduceOp(unittest.TestCase):
     sched = a.schedule()
     assert len(sched) == 2
     for s in sched:
-      self.assertIs(s.ast.src[0].src[2].op, Ops.REDUCE_AXIS)
+      assert reduce_kernel.match(s.ast, {})

   def test_split_reduce_kernel_dim1(self):
     a = Tensor.rand(255, 256).realize()
@@ -106,7 +107,7 @@ class TestReduceOp(unittest.TestCase):
     sched = a.schedule()
     assert len(sched) == 2
     for s in sched:
-      self.assertIs(s.ast.src[0].src[2].op, Ops.REDUCE_AXIS)
+      assert reduce_kernel.match(s.ast, {})

 if __name__ == "__main__":
   unittest.main()

diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 5803e6c655..972a7a2b18 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -220,9 +220,6 @@ def schedule_uop(pre:UOp, ctx:ScheduleContext) -> ScheduleItem:
   sink = graph_rewrite(graph_rewrite(pre, multioutput+view_left, store_bufs:={x.buf_uop:x.src[2] for x in pre.src}), view_right)
   # remove extra uops from SINK + substitute BUFFER with DEFINE_GLOBAL
   ast = graph_rewrite(sink, to_si+append_bufs, si_ctx:=ScheduleItemContext(ctx.var_vals))
-  # capture process replay
-  if CAPTURE_PROCESS_REPLAY:
-    with Context(PICKLE_BUFFERS=0): PROCESS_REPLAY_CAPTURE[str(pre.key)] = pickle.dumps((pre, ContextVar._cache, sink))
   # deal with ASSIGN
   assign_preloads: list[UOp] = []
   if len(ctx.assigns) != 0:
@@ -238,6 +235,9 @@ def schedule_uop(pre:UOp, ctx:ScheduleContext) -> ScheduleItem:
           if len(st.views) != 1 or (mask:=st.views[0].mask) is None or ShapeTracker.from_shape(st.shape).shrink(mask) != st.shrink(mask):
             raise RuntimeError("self operand of augmented assign must be contiguous.\nhelp: consider using .contiguous():\n"
                                +colored(" - a += a.T\n", "red")+colored(" + a += a.T.contiguous()", "green"))
+  # capture process replay
+  if CAPTURE_PROCESS_REPLAY:
+    with Context(PICKLE_BUFFERS=0): PROCESS_REPLAY_CAPTURE[str(pre.key)] = pickle.dumps((pre, ContextVar._cache, ast))
   return ScheduleItem(ast, tuple(u.buffer for u in si_ctx.bufs if u.size != 0),
                       tuple(dedup(m for x in pre.toposort if (m:=ctx.ops_metadata.get(x)) is not None)), tuple(dedup(assign_preloads)))

From 97ec564b03ec5f7253c9f247b854de5d964d1ccc Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Tue, 14 Jan 2025 07:47:17 -0500
Subject: [PATCH 09/29] noop changes from the block_assign branch [pr] (#8606)

---
 tinygrad/engine/schedule.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 972a7a2b18..d4f12e3975 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -197,10 +197,13 @@ def _append_st_vars(ctx:ScheduleItemContext, x:UOp) -> UOp|None:
 def _append_buf(ctx:ScheduleItemContext, x:UOp) -> UOp:
   ctx.bufs.append(x)
   return UOp(Ops.DEFINE_GLOBAL, x.dtype.ptr(size=x.size), (), len(ctx.bufs)-1)
-append_bufs = PatternMatcher([(UPat(Ops.BUFFER, name="x"), _append_buf)])

 to_si = PatternMatcher([
+  # BUFFER -> DEFINE_GLOBAL
+  (UPat(Ops.BUFFER, name="x"), _append_buf),
+  # simplify and unbind the final VIEWs
   (UPat(Ops.VIEW, name="x"), _append_st_vars),
+  # don't need SINK on COPY or BUFFER_VIEW
   (UPat(Ops.SINK, src=(UPat.store(UPat.var("b"), UPat(), UPat(GroupOp.Meta, name="x")),)), lambda b,x: x.replace(src=(b, *x.src))),
   # don't need contiguous or assign anymore
   (UPat(Ops.CONTIGUOUS, src=(UPat.var("x"),)), lambda x: x),
@@ -209,9 +212,6 @@ to_si = PatternMatcher([
   (UPat(Ops.PRELOAD, name="root"), lambda root:root.replace(op=Ops.LOAD)),
 ])

-add_assign_adjacents = PatternMatcher([(UPat.load(UPat.var("b"), UPat(), name="x"), lambda ctx,b,x: ctx.assign_adj.setdefault(b, []).append(x)
-  if b in ctx.assigns else None)])
-
 # LOAD(BUFFER) -> the STORE value if we're doing the STORE in the same kernel
 multioutput = PatternMatcher([(UPat.load(UPat.var("b"), UPat()), lambda ctx,b: ctx.get(b)),])

 def schedule_uop(pre:UOp, ctx:ScheduleContext) -> ScheduleItem:
   # remove movement ops + substitute LOAD of fused STORE with just the value
   sink = graph_rewrite(graph_rewrite(pre, multioutput+view_left, store_bufs:={x.buf_uop:x.src[2] for x in pre.src}), view_right)
   # remove extra uops from SINK + substitute BUFFER with DEFINE_GLOBAL
-  ast = graph_rewrite(sink, to_si+append_bufs, si_ctx:=ScheduleItemContext(ctx.var_vals))
+  ast = graph_rewrite(sink, to_si, si_ctx:=ScheduleItemContext(ctx.var_vals))
   # deal with ASSIGN
   assign_preloads: list[UOp] = []
   if len(ctx.assigns) != 0:
@@ -540,12 +540,13 @@ def create_schedule_with_vars(outs:list[UOp], skip_check:bool=not __debug__) ->
   # preschedule realize groups
   prescheduled: list[ScheduleItem] = []
   for store_uops in store_groups:
-    if len(stores:=[ctx.realizes[u] for u in store_uops if ctx.realizes[u].op is Ops.STORE]) != 0:
-      prescheduled.append(schedule_uop(UOp.sink(*stores), ctx))
-      # can only schedule once
-      for buf_uop in store_uops:
-        for luop in ctx.tensor_uops[buf_uop]: ctx.becomes_map[luop] = buf_uop.view(unwrap(luop.st))
-  # do BFS
+    if len(stores:=[ctx.realizes[u] for u in store_uops if ctx.realizes[u].op is Ops.STORE]) == 0: continue
+    prescheduled.append(schedule_uop(UOp.sink(*stores), ctx))
+    # can only schedule once
+    for buf_uop in store_uops:
+      for luop in ctx.tensor_uops[buf_uop]: ctx.becomes_map[luop] = buf_uop.view(unwrap(luop.st))
+
+  # add kernel children
   schedule_targets = {out:si for si in prescheduled for out in si.outputs}
   graph: defaultdict[ScheduleItem, list[ScheduleItem]] = defaultdict(list)
   in_degree: defaultdict[ScheduleItem, int] = defaultdict(int)
@@ -560,6 +561,8 @@ def create_schedule_with_vars(outs:list[UOp], skip_check:bool=not __debug__) ->
     for x in scheduled_parents:
       graph[x].append(si)
       in_degree[si] += 1
+
+  # do BFS
   queue = deque(si for si in prescheduled if in_degree[si] == 0)
   schedule: list[ScheduleItem] = []
   while queue:
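The BFS this patch labels in create_schedule_with_vars is Kahn's algorithm over the kernel-dependency graph. A distilled sketch with strings in place of ScheduleItems:

```python
from collections import defaultdict, deque

def bfs_schedule(graph, in_degree, items):
  # repeatedly pop items whose in-degree has reached zero
  queue = deque(x for x in items if in_degree[x] == 0)
  order = []
  while queue:
    x = queue.popleft()
    order.append(x)
    for child in graph[x]:
      in_degree[child] -= 1
      if in_degree[child] == 0: queue.append(child)
  return order

g = defaultdict(list, {"a": ["b", "c"], "b": ["c"]})
deg = defaultdict(int, {"b": 1, "c": 2})
assert bfs_schedule(g, deg, ["a", "b", "c"]) == ["a", "b", "c"]
```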
From 1ff6862a3d1cc63bbc551bed513a8557a4f7f576 Mon Sep 17 00:00:00 2001
From: nimlgen <138685161+nimlgen@users.noreply.github.com>
Date: Tue, 14 Jan 2025 15:55:23 +0300
Subject: [PATCH 10/29] ci: sleep a bit to let the driver unload the prev pid (#8605)

---
 .github/workflows/benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index be6c4ace11..1d5236d835 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -372,7 +372,7 @@ jobs:
     #- name: Fuzz Padded Tensor Core GEMM
     #  run: HSA=1 M_START=12 M_STOP=20 M_STEP=1 N_START=12 N_STOP=20 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 DEBUG=2 python3 ./extra/gemm/fuzz_matmul.py
     - name: Remove amdgpu
-      run: sudo rmmod amdgpu
+      run: sleep 5 && sudo rmmod amdgpu # sleep a bit to let the driver unload the prev pid.
     - name: Run Stable Diffusion
       run: AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
     - name: Run SDXL

From 4057b98f7ff599310311add7057126940698a8b6 Mon Sep 17 00:00:00 2001
From: ignaciosica
Date: Tue, 14 Jan 2025 13:27:05 -0300
Subject: [PATCH 11/29] rename i and j into k and row/col (#8607)

---
 tinygrad/runtime/ops_python.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py
index 650f6a8e12..618a356af5 100644
--- a/tinygrad/runtime/ops_python.py
+++ b/tinygrad/runtime/ops_python.py
@@ -138,31 +138,31 @@ class PythonProgram:
           ul[i] = wmma_helper(32, 8, 2, 2, 2, a_b_elem, a_b_elem, c_map)
         elif arg[4] == "AMD":
           # A (16 elements on 32 threads): col major, lane 16-32 == lane 0-15
-          def a_elem(x, i, j, goff):
-            assert x[i][goff+j] == x[i][goff+j+16], "warp elements not duplicated properly across lanes"
-            return x[i][goff+j]
+          def a_elem(x, k, row, goff):
+            assert x[k][goff+row] == x[k][goff+row+16], "warp elements not duplicated properly across lanes"
+            return x[k][goff+row]
           # B (16 elements on 32 threads): row major, lane 16-32 == lane 0-15
-          def b_elem(x, i, j, goff): return a_elem(x, j, i, goff) # pylint: disable=arguments-out-of-order
+          def b_elem(x, col, k, goff): return a_elem(x, k, col, goff) # pylint: disable=arguments-out-of-order
           def c_map(lane, elem): return (lane%16, lane//16+elem*2)
          # (i, j), C, D (8 elements on 32 threads): row major
           ul[i] = wmma_helper(32, 16, 16, 16, 8, a_elem, b_elem, c_map)
         elif arg[4] == "CUDA":
           # A (8 elements on 32 threads)
-          def a_elem(x, i, j, goff): return x[(i%2)+(j//8)*2+(i//8)*4][goff+((i//2)%4)+(j%8)*4]
+          def a_elem(x, k, row, goff): return x[(k%2)+(row//8)*2+(k//8)*4][goff+((k//2)%4)+(row%8)*4]
           # B (4 elements on 32 threads)
-          def b_elem(x, i, j, goff): return x[(j%2)+(j//8)*2][goff+(j//2)%4+(i)*4]
+          def b_elem(x, col, k, goff): return x[(k%2)+(k//8)*2][goff+(k//2)%4+(col)*4]
           # (i, j), C, D (4 elements on 32 threads)
           def c_map(lane, elem): return ((elem%2)+(lane%4)*2, (lane//4)+(elem//2)*8)
           ul[i] = wmma_helper(32, 16, 8, 4, 4, a_elem, b_elem, c_map)
         elif arg[4] == "INTEL":
           # A (16 elements on 8 threads)
-          def a_elem(x, i, j, goff): return x[i%2+j*2][goff+i//2]
+          def a_elem(x, k, row, goff): return x[k%2+row*2][goff+k//2]
           # B (16 elements on 8 threads)
-          def b_elem(x, i, j, goff): return x[j][goff+i]
+          def b_elem(x, col, k, goff): return x[k][goff+col]
           # C, D (8 elements on 8 threads)
           def c_map(lane, elem): return (lane, elem)
           ul[i] = wmma_helper(8, 16, 16, 16, 8, a_elem, b_elem, c_map)
         elif arg[4] == "CLANG":
-          def elem(x, i, j, _): return x[i+j][0]
+          def elem(x, col, row, _): return x[col+row][0] # k is always 0
           def c_map(_, elem): return (elem%16, elem//16)
           ul[i] = wmma_helper(1, 1, 16, 16, 256, elem, elem, c_map)
         else: raise NotImplementedError(f"unimplemented tensor core {arg}")

From 76a03e950a0a930bedfe41ccf265ce3310dc3317 Mon Sep 17 00:00:00 2001
From: Francis Lata
Date: Tue, 14 Jan 2025 11:27:45 -0500
Subject: [PATCH 12/29] make kits19 dataset samples have small sizes (#8591)

---
 test/external/external_test_datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/external/external_test_datasets.py b/test/external/external_test_datasets.py
index 005336b68c..6abd5fe6cc 100644
--- a/test/external/external_test_datasets.py
+++ b/test/external/external_test_datasets.py
@@ -19,7 +19,7 @@ class ExternalTestDatasets(unittest.TestCase):
   def _create_samples(self, val, num_samples=2):
     self._set_seed()

-    img, lbl = np.random.rand(190, 392, 392).astype(np.float32), np.random.randint(0, 100, size=(190, 392, 392)).astype(np.uint8)
+    img, lbl = np.random.rand(8, 8, 8).astype(np.float32), np.random.randint(0, 100, size=(8, 8, 8)).astype(np.uint8)
     img, lbl = nib.Nifti1Image(img, np.eye(4)), nib.Nifti1Image(lbl, np.eye(4))
     dataset = "val" if val else "train"
     preproc_pth = Path(tempfile.gettempdir() + f"/{dataset}")

From 52e7003414633f4b813d1a340e5cbc618328f946 Mon Sep 17 00:00:00 2001
From: chenyu
Date: Tue, 14 Jan 2025 12:24:27 -0500
Subject: [PATCH 13/29] Revert "make kits19 dataset samples have small sizes (#8591)" (#8610)

This reverts commit 76a03e950a0a930bedfe41ccf265ce3310dc3317.
---
 test/external/external_test_datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/external/external_test_datasets.py b/test/external/external_test_datasets.py
index 6abd5fe6cc..005336b68c 100644
--- a/test/external/external_test_datasets.py
+++ b/test/external/external_test_datasets.py
@@ -19,7 +19,7 @@ class ExternalTestDatasets(unittest.TestCase):
   def _create_samples(self, val, num_samples=2):
     self._set_seed()

-    img, lbl = np.random.rand(8, 8, 8).astype(np.float32), np.random.randint(0, 100, size=(8, 8, 8)).astype(np.uint8)
+    img, lbl = np.random.rand(190, 392, 392).astype(np.float32), np.random.randint(0, 100, size=(190, 392, 392)).astype(np.uint8)
     img, lbl = nib.Nifti1Image(img, np.eye(4)), nib.Nifti1Image(lbl, np.eye(4))
     dataset = "val" if val else "train"
     preproc_pth = Path(tempfile.gettempdir() + f"/{dataset}")

From cbfd51f5a5030e1d10ebd31e802c45ec88653403 Mon Sep 17 00:00:00 2001
From: chenyu
Date: Tue, 14 Jan 2025 13:25:54 -0500
Subject: [PATCH 14/29] make MultiLazyBuffer.bounds a property [pr] (#8614)

determined by lbs shapes and axis
---
 tinygrad/multi.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tinygrad/multi.py b/tinygrad/multi.py
index 97a17fe714..07a3430f7b 100644
--- a/tinygrad/multi.py
+++ b/tinygrad/multi.py
@@ -46,9 +46,6 @@ class MultiLazyBuffer(MathTrait):
     assert all(isinstance(x, UOp) for x in lbs) and len(lbs), "all lbs must be LazyBuffers, and we need at least one of them"
     assert all_same([x.dtype for x in lbs]), f"all multilazybuffer needs same dtype, getting {[x.dtype for x in lbs]}"
     self.lbs, self.axis, self.dtype, self.device, self.real = lbs, axis, lbs[0].dtype, tuple(x.device for x in lbs), real or [True]*len(lbs)
-    if axis is not None:
-      splits = list(itertools.accumulate([lb.shape[axis] for lb in lbs], initial=0))
-      self.bounds = tuple(zip(splits, splits[1:]))

   @property
   def shape(self): return tuple(sum(y.shape[a] for y in self.real_lbs) if a == self.axis else s for a,s in enumerate(self.real_lbs[0].shape))
@@ -59,6 +56,11 @@ class MultiLazyBuffer(MathTrait):
   @property
   def real_lbs(self): return [lb for lb,r in zip(self.lbs, self.real) if r]

+  @property
+  def bounds(self):
+    if self.axis is None: raise RuntimeError("bounds is not defined when axis is None")
+    return tuple(itertools.pairwise(itertools.accumulate([lb.shape[self.axis] for lb in self.lbs], initial=0)))
+
   def __repr__(self): return f""

   @staticmethod

From d5a646d492071bedd0800542eacf6711d088a8e1 Mon Sep 17 00:00:00 2001
From: ignaciosica
Date: Tue, 14 Jan 2025 15:35:14 -0300
Subject: [PATCH 15/29] CUDA Turing TC (#8597)

* init turing tc

* reorder tc

* hotfix: remove some spaces

* revert var name to x

* consistent order of factors

* revert order of terms to match old stuff

---------

Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
---
 .github/workflows/test.yml | 1 +
 tinygrad/renderer/cstyle.py | 16 +++++++++++-----
 tinygrad/renderer/ptx.py | 5 +++--
 tinygrad/runtime/ops_python.py | 24 ++++++++++++++--------
 4 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 596f219b64..ef18f559f8 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -166,6 +166,7 @@ jobs:
     - name: Test emulated CUDA tensor cores
       run: |
         DEBUG=2 EMULATE_CUDA=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16
+        DEBUG=2 EMULATE_CUDA_SM75=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16
        PYTHONPATH="." DEBUG=2 EMULATE_CUDA=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
     - name: Test emulated INTEL OpenCL tensor cores
       run: DEBUG=2 EMULATE_INTEL=1 FORWARD_ONLY=1 PYTHON=1 HALF=1 N=64 python3 ./extra/gemm/simple_matmul.py

diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py
index 65855c5a34..5171839cef 100644
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -291,17 +291,23 @@ class MetalRenderer(CStyleLanguage):
     return super().render_kernel(function_name, kernel, bufs, uops, prefix)

 _nms = "xyzwabcdefghijkl"
+cuda_tc_opts = ("u0","l0","l0","l1","l1","l1","u1") # shared by all shapes with M=16 N=8

 class CUDARenderer(CStyleLanguage):
   device = "CUDA"
   global_max = (2147483647, 65535, 65535)
   local_max = (1024, 1024, 64)
   shared_max = 49152
-  # https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-fragment-mma-16816-float
-  tensor_cores = [TensorCore(dims=(8,16,16), threads=32, elements_per_thread=(8,4,4), dtype_in=di, dtype_out=do,
-    opts=("u0","l0","l0","l1","l1","l1","u1"), swizzle=(((6,7,2,3,4),(0,1,9,5,10,8)), ((6,7,9,0,1),(2,3,4,10,5,8))))
-    for di,do in ([(dtypes.half,dtypes.float),(dtypes.bfloat16,dtypes.float)])]
-  def __init__(self, arch:str): self.tensor_cores, self.arch = CUDARenderer.tensor_cores if int(arch[3:]) >= 80 else [], arch
+  # https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-multiply-accumulate-instructions
+  tc_81616 = [TensorCore(dims=(8,16,16), threads=32, elements_per_thread=(8,4,4), dtype_in=di,dtype_out=do, opts=cuda_tc_opts,
+    swizzle=(((6,7,2,3,4),(0,1,9,5,10,8)), ((6,7,9,0,1),(2,3,4,10,5,8)))) for di,do in [(dtypes.half,dtypes.float), (dtypes.bfloat16,dtypes.float)]]
+  tc_8168_f16 = [TensorCore(dims=(8,16,8), threads=32, elements_per_thread=(4,2,4), dtype_in=dtypes.half, dtype_out=dtypes.float, opts=cuda_tc_opts,
+    swizzle=(((6,7,2,3,4),(0,1,8,5,9)), ((6,7,8,0,1),(2,3,4,9,5))))]
+
+  tc_sm80 = tc_81616 + tc_8168_f16
+  tc_sm75 = tc_8168_f16
+  def __init__(self, arch:str):
+    self.tensor_cores, self.arch = CUDARenderer.tc_sm80 if int(arch[3:]) >= 80 else CUDARenderer.tc_sm75 if int(arch[3:]) >= 75 else [], arch
   def __reduce__(self): return self.__class__, (self.arch,)

   # language options

diff --git a/tinygrad/renderer/ptx.py b/tinygrad/renderer/ptx.py
index c7eaf6f32f..c7bb49fdfb 100644
--- a/tinygrad/renderer/ptx.py
+++ b/tinygrad/renderer/ptx.py
@@ -124,11 +124,12 @@ class PTXRenderer(Renderer):
   device = "CUDA"
   suffix = "PTX"
   global_max, local_max, shared_max = CUDARenderer.global_max, CUDARenderer.local_max, CUDARenderer.shared_max
-  tensor_cores = [tc for tc in CUDARenderer.tensor_cores if tc.dtype_in == dtypes.half]
+  tc_sm80 = [tc for tc in CUDARenderer.tc_sm80 if tc.dtype_in == dtypes.half]
   code_for_op = asm_for_op
   extra_matcher = ptx_matcher
   def __init__(self, arch:str, device="CUDA"):
-    self.device, self.tensor_cores, self.arch = device, PTXRenderer.tensor_cores if int(arch[3:]) >= 80 else [], arch
+    self.device, self.arch = device, arch
+    self.tensor_cores = PTXRenderer.tc_sm80 if int(arch[3:]) >= 80 else CUDARenderer.tc_sm75 if int(arch[3:]) >= 75 else []
   def __reduce__(self): return self.__class__, (self.arch, self.device)

   # language options

diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py
index 618a356af5..4961104492 100644
--- a/tinygrad/runtime/ops_python.py
+++ b/tinygrad/runtime/ops_python.py
@@ -146,13 +146,20 @@ class PythonProgram:
           def c_map(lane, elem): return (lane%16, lane//16+elem*2)
           # (i, j), C, D (8 elements on 32 threads): row major
           ul[i] = wmma_helper(32, 16, 16, 16, 8, a_elem, b_elem, c_map)
         elif arg[4] == "CUDA":
-          # A (8 elements on 32 threads)
-          def a_elem(x, k, row, goff): return x[(k%2)+(row//8)*2+(k//8)*4][goff+((k//2)%4)+(row%8)*4]
-          # B (4 elements on 32 threads)
-          def b_elem(x, col, k, goff): return x[(k%2)+(k//8)*2][goff+(k//2)%4+(col)*4]
-          # (i, j), C, D (4 elements on 32 threads)
-          def c_map(lane, elem): return ((elem%2)+(lane%4)*2, (lane//4)+(elem//2)*8)
-          ul[i] = wmma_helper(32, 16, 8, 4, 4, a_elem, b_elem, c_map)
+          # (col, row) given (lane, elem) for C & D (4 elements on 32 threads); shared by all tc shapes with M=16 N=8
+          def c_map(lane, elem): return (elem%2 + (lane%4)*2, lane//4 + (elem//2)*8)
+
+          if arg[1] == (8,16,16):
+            def a_elem(x, k, row, goff): return x[k%2 + (row//8)*2 + (k//8)*4][goff + (k//2)%4 + (row%8)*4]
+            def b_elem(x, col, k, goff): return x[k%2 + (k//8)*2][goff + (k//2)%4 + col*4]
+            ul[i] = wmma_helper(32, 16, 8, 4, 4, a_elem, b_elem, c_map)
+
+          elif arg[1] == (8,16,8):
+            def a_elem(x, k, row, goff): return x[k%2 + (row//8)*2][goff + k//2 + (row%8)*4]
+            def b_elem(x, col, k, goff): return x[k%2][goff + k//2 + col*4]
+            ul[i] = wmma_helper(32, 8, 4, 2, 4, a_elem, b_elem, c_map)
+
+          else: raise NotImplementedError(f"unimplemented tensor core {arg}")
         elif arg[4] == "INTEL":
           # A (16 elements on 8 threads)
@@ -179,7 +186,8 @@ class PythonRenderer(Renderer):
   def __init__(self):
     if getenv("EMULATE_METAL"): self.device, self.tensor_cores = "METAL", MetalRenderer.tensor_cores
     if getenv("EMULATE_AMD"): self.device, self.tensor_cores = "AMD", AMDRenderer.tensor_cores
-    if getenv("EMULATE_CUDA"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tensor_cores
+    if getenv("EMULATE_CUDA"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tc_sm80
+    if getenv("EMULATE_CUDA_SM75"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tc_sm75
     if getenv("EMULATE_INTEL"): self.device, self.suffix, self.tensor_cores = "INTEL", "INTEL", IntelRenderer.tensor_cores
     if getenv("EMULATE_AMX"): self.device, self.tensor_cores = "CLANG", ClangRenderer.tensor_cores
--count 10 --temperature 0 --timing | tee llama_2_70B.txt + # - name: Run LLaMA-2 70B + # run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt - name: Run Mixtral 8x7B run: time AMD=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt - name: Run GPT2 diff --git a/test/test_multitensor.py b/test/test_multitensor.py index 67cb9757e1..f53056c406 100644 --- a/test/test_multitensor.py +++ b/test/test_multitensor.py @@ -157,6 +157,7 @@ class TestMultiTensor(unittest.TestCase): strat.sampled_from((Ops.ADD, Ops.MUL, Ops.MAX)), strat.sampled_from((None, 0, 1)), strat.sampled_from((None, 0, 1)), strat.sampled_from((1, 0, -1))) def test_simple_reduce(self, N, devices, rop, shard_axis, reduce_axis, sign): + N = N * len(devices) X = Tensor.rand(N*N).reshape(N, N).mul(sign) n = X.numpy() X.shard_(devices, shard_axis) @@ -438,6 +439,7 @@ class TestMultiTensor(unittest.TestCase): assert isinstance(jf.jit_cache[4].prg, BufferCopy) assert isinstance(jf.jit_cache[5].prg, graph_d1) + @unittest.skip("no longer supports uneven shard") def test_uneven_shard(self): for N in range(1, 6): X = Tensor.rand(4, 1, 257).contiguous().realize() @@ -450,6 +452,7 @@ class TestMultiTensor(unittest.TestCase): np.testing.assert_equal(X.expand((4, 4, 257)).numpy(), np.tile(n, (1, 4, 1))) np.testing.assert_equal(X.permute((0, 2, 1)).numpy(), np.transpose(n, (0, 2, 1))) + @unittest.skip("no longer supports uneven shard") def test_uneven_multiple_zeros(self): for data in ([1, 2, 3, 4], [1, 2, 3], [1, 2], [1], []): for N in (1, 2, 3, 4): @@ -458,6 +461,7 @@ class TestMultiTensor(unittest.TestCase): X = ((Tensor(data).shard(devices, axis=0) + 1).realize() - 1).realize() np.testing.assert_equal(X.numpy(), data) + @unittest.skip("no longer supports uneven shard") def test_uneven_shard_with_empty(self): N = 4 X = Tensor.rand(16, 1, 3).contiguous().realize() @@ -470,6 +474,7 @@ class TestMultiTensor(unittest.TestCase): # test reshape with empty shard np.testing.assert_equal(X.shard(devices, 0).reshape(8, 1, 6).numpy(), np_x.reshape(8, 1, 6)) + @unittest.skip("no longer supports uneven shard") def test_multiple_uneven_shard(self): N = 4 X = Tensor.rand(4, 1, 257).contiguous().realize() @@ -531,7 +536,7 @@ class TestMultiTensor(unittest.TestCase): with self.assertRaises((AssertionError, ValueError)): t0.reshape((26*15,7)) - @unittest.skip("no longer supports splits") + @unittest.skip("no longer supports uneven shard") def test_reshape_on_axis_uneven(self): def reshape_helper(t0, t, t_axis): np.testing.assert_allclose(t0.reshape(t.shape).numpy(), t.numpy()) @@ -603,6 +608,7 @@ class TestMultiTensor(unittest.TestCase): self.assertEqual(t.dtype, t2.dtype) self.assertEqual(t.lazydata.axis, t2.lazydata.axis) + @unittest.skip("no longer supports uneven shard") def test_rand_like_uneven_shard(self): t = Tensor.empty((4, 42, 15)).shard(devices_3, axis=1) t2 = Tensor.rand_like(t) @@ -653,6 +659,7 @@ class TestMultiTensor(unittest.TestCase): assert set(unique) == {0, 2}, unique assert 200 < counts[0] < 312, counts[0] + @unittest.skip("no longer supports uneven shard") def test_dropout_on_uneven_shard_axis(self): with Tensor.train(): X = Tensor.ones(256).shard(devices_3, axis=0) @@ -814,6 +821,7 @@ class TestShrinkMultiTensorShardedAxis(unittest.TestCase): np.testing.assert_allclose(a.reshape((2, 1, 8)).expand((2, 5, 8)).numpy(), b.reshape((2, 1, 8)).expand((2, 5, 8)).numpy(), rtol=1e-7, atol=1e-3) 
np.testing.assert_allclose(a.flip(-1).numpy(), b.flip(-1).numpy(), rtol=1e-7, atol=1e-3) + @unittest.skip("no longer supports uneven shard") def test_uneven(self): t = Tensor.arange(24).reshape(3, 8).contiguous().realize() t.shard_([f"{Device.DEFAULT}:{i}" for i in range(2)], axis=0) diff --git a/test/test_nn.py b/test/test_nn.py index e36b805c48..738574a989 100755 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -551,7 +551,7 @@ class TestNN(unittest.TestCase): @unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA", "METAL"}, "no GPU CI") def test_load_state_dict_sharded_model(self): - devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2") + devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2", f"{Device.DEFAULT}:3") layer = Conv2d(3, 5, kernel_size=3) layer.weight.shard_(devices, 3) @@ -572,7 +572,7 @@ class TestNN(unittest.TestCase): @unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA", "METAL"}, "no GPU CI") def test_load_state_dict_sharded_dict(self): - devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2") + devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2", f"{Device.DEFAULT}:3") layer = Conv2d(3, 5, kernel_size=3) state_dict = { @@ -589,7 +589,7 @@ class TestNN(unittest.TestCase): @unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA", "METAL"}, "no GPU CI") def test_load_state_dict_sharded_model_dict_same_axis(self): - devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2") + devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2", f"{Device.DEFAULT}:3") layer = Conv2d(3, 5, kernel_size=3) layer.weight.shard_(devices, 3) @@ -610,7 +610,8 @@ class TestNN(unittest.TestCase): @unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA", "METAL"}, "no GPU CI") def test_load_state_dict_sharded_model_dict_different_axis(self): - devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2") + devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2", f"{Device.DEFAULT}:3") + devices5 = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2", f"{Device.DEFAULT}:3", f"{Device.DEFAULT}:4", f"{Device.DEFAULT}:5") layer = Conv2d(3, 5, kernel_size=3) layer.weight.shard_(devices, 3) @@ -619,14 +620,14 @@ class TestNN(unittest.TestCase): # different shard axis state_dict = { 'weight': Tensor.randn(5, 3, 3, 3).shard(devices, None), - 'bias': Tensor.randn(5).shard(devices, 0), + 'bias': Tensor.randn(5).shard(devices5, 0), } load_state_dict(layer, state_dict) # NOTE: model and state_dict shard differently, use the state_dict sharding # TODO: revisit this? 
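    # an illustration of the NOTE above: the model's weight was sharded on axis=3 across three
    # devices, while the incoming state_dict has weight replicated (axis=None) and bias sharded on
    # axis=0 across five devices; after load_state_dict the layer adopts the state_dict's placement,
    # which the asserts below check.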
self.assertEqual(layer.weight.device, devices) self.assertEqual(layer.weight.lazydata.axis, None) - self.assertEqual(layer.bias.device, devices) + self.assertEqual(layer.bias.device, devices5) self.assertEqual(layer.bias.lazydata.axis, 0) np.testing.assert_allclose(layer.weight.numpy(), state_dict['weight'].numpy()) np.testing.assert_allclose(layer.bias.numpy(), state_dict['bias'].numpy()) diff --git a/tinygrad/multi.py b/tinygrad/multi.py index 07a3430f7b..919d33a97f 100644 --- a/tinygrad/multi.py +++ b/tinygrad/multi.py @@ -38,7 +38,7 @@ def all_reduce(bop: Ops, lbs: list[UOp]) -> list[UOp]: return [functools.reduce(operator.add, [c.pad(pad) for pad,c in zip(pads,lb_c)]).reshape(shape) for lb_c in chunked] def to_sharded(lbs:list[UOp], axis:int, bounds: tuple[tuple[int, int], ...]) -> list[UOp]: - if DEBUG >= 3 and lbs[0].shape[axis] % len(lbs) != 0: print(f"multi axis uneven: {lbs[0].shape=} {axis=} {len(lbs)=}, bounds={bounds}") + if lbs[0].shape[axis] % len(lbs) != 0: raise RuntimeError(f"multi axis uneven: {lbs[0].shape=} {axis=} {len(lbs)=}, bounds={bounds}") return [lb.shrink(tuple((0,s) if a != axis else bound for a,s in enumerate(lb.shape))) for i, (bound, lb) in enumerate(zip(bounds, lbs))] class MultiLazyBuffer(MathTrait): diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 60b9735bd9..171c7afe31 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -399,7 +399,7 @@ class Tensor(SimpleMathTrait): Shards the tensor across the given devices. Optionally specify which axis to shard on. ```python exec="true" source="above" session="tensor" result="python" - t = Tensor.empty(2, 3) + t = Tensor.empty(2, 4) print(t.shard((t.device, t.device), axis=1).lazydata) ``` """ From bfbe81df717e2dfec8a128a28b26c73f5ec8088c Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Tue, 14 Jan 2025 12:04:58 -0800 Subject: [PATCH 17/29] remove cast before view (#8613) * remove cast before view * greener * indexing * that passes too * openpilot too * ack --------- Co-authored-by: qazal --- .github/workflows/benchmark.yml | 6 +++--- .github/workflows/test.yml | 2 +- test/test_arange.py | 3 ++- test/test_const_folding.py | 5 +++-- test/test_image_dtype.py | 2 +- test/test_schedule.py | 12 ++++++++---- test/test_tiny.py | 2 ++ tinygrad/engine/schedule.py | 2 +- tinygrad/ops.py | 13 +------------ 9 files changed, 22 insertions(+), 25 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 0596f22055..27b2764556 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -91,7 +91,7 @@ jobs: - name: Run GPT2 w HALF run: HALF=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt - name: Run GPT2 w HALF/BEAM - run: HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAST_BEFORE_VIEW=0 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt + run: HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt - name: Train MNIST run: time PYTHONPATH=. 
TARGET_EVAL_ACC_PCT=96.0 python3.11 examples/beautiful_mnist.py | tee beautiful_mnist.txt - name: Run 10 CIFAR training steps @@ -217,7 +217,7 @@ jobs: - name: Run GPT2 w HALF run: NV=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt - name: Run GPT2 w HALF/BEAM - run: NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt + run: NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt - uses: actions/upload-artifact@v4 with: name: Speed (NVIDIA) @@ -406,7 +406,7 @@ jobs: - name: Run GPT2 w HALF run: AMD=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt - name: Run GPT2 w HALF/BEAM - run: AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt + run: AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt - uses: actions/upload-artifact@v4 with: name: Speed (AMD) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ef18f559f8..417b222f50 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -297,7 +297,7 @@ jobs: - if: ${{ matrix.task == 'optimage' }} name: Test openpilot model kernel count and gate usage run: | - PYTHONPATH="." ALLOWED_KERNEL_COUNT=208 ALLOWED_READ_IMAGE=2105 ALLOWED_GATED_READ_IMAGE=29 FLOAT16=0 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx + PYTHONPATH="." ALLOWED_KERNEL_COUNT=208 ALLOWED_READ_IMAGE=2104 ALLOWED_GATED_READ_IMAGE=29 FLOAT16=0 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx - if: ${{ matrix.task == 'optimage' }} name: Test openpilot alt model correctness (float32) run: PYTHONPATH="." 
FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx diff --git a/test/test_arange.py b/test/test_arange.py index d8a215faab..a5c8b535bb 100644 --- a/test/test_arange.py +++ b/test/test_arange.py @@ -66,7 +66,8 @@ class TestArange(unittest.TestCase): return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)]) class TestIndexing(unittest.TestCase): - @unittest.expectedFailure + # update: passing after CAST_BEFORE_VIEW=1 deletion + # @unittest.expectedFailure def test_arange_2_reduce(self): needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous() needle[1337] = 1 diff --git a/test/test_const_folding.py b/test/test_const_folding.py index 2a3215935f..4ca2359912 100644 --- a/test/test_const_folding.py +++ b/test/test_const_folding.py @@ -132,11 +132,12 @@ class TestMovedConstFolding(unittest.TestCase): def test_cast_padded(self): # NOTE: this is folded due to CAST_BEFORE_VIEW + # update: CAST_BEFORE_VIEW=1 is no longer supported if is_dtype_supported(dtypes.int16): - _check_ast_count(0, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16)) + _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16)) np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16).numpy(), [0, 1, 1, 1, 1, 0]) if is_dtype_supported(dtypes.uint16): - _check_ast_count(0, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16)) + _check_ast_count(1, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16)) np.testing.assert_equal(Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16).numpy(), [0, 65535, 65535, 65535, 65535, 0]) # not folded if is_dtype_supported(dtypes.int64): diff --git a/test/test_image_dtype.py b/test/test_image_dtype.py index 2a8c719aff..686d60ca9b 100644 --- a/test/test_image_dtype.py +++ b/test/test_image_dtype.py @@ -120,7 +120,7 @@ class TestImageDType(unittest.TestCase): loss = x.image_dot(w1).image_dot(w2).float().max() loss.backward() sched = unwrap(w1.grad).schedule() - self.assertEqual(len(sched), 10) + self.assertEqual(len(sched), 9) for s,ei in zip(sched, lower_schedule(sched[:])): ei.run() if s.outputs[0].dtype == dtypes.float: diff --git a/test/test_schedule.py b/test/test_schedule.py index e83ae13c28..2448c9f8a1 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -1436,6 +1436,7 @@ class TestSchedule(unittest.TestCase): def test_late_fusion_post_expand(self): self._test_fusion([(32, 32)], lambda a:a-a.sum(1), 2) + @unittest.skip("CAST_BEFORE_VIEW=1 is not supported") def test_cast_padded_view(self): a = Tensor.arange(4).reshape(1, 4) casted_view = a.pad(((0, 1), (0, 0))).cast(dtypes.float) @@ -1446,6 +1447,7 @@ class TestSchedule(unittest.TestCase): self.assertListEqual(realized_view.tolist(), [[0.0, 1.0, 2.0, 3.0], [0.0, 0.0, 0.0, 0.0]]) # NOTE: we might want to reconsider pushing this cast before the shrink + @unittest.skip("CAST_BEFORE_VIEW=1 is not supported") def test_cast_after_shrink(self): a = Tensor.arange(4).reshape(1, 4) casted_view = a.shrink(((0, 1), (0, 2))).cast(dtypes.float) @@ -1455,6 +1457,7 @@ class TestSchedule(unittest.TestCase): self.assertEqual(realized_view.lazydata.base.realized.size, 2) self.assertListEqual(realized_view.tolist(), [[0, 1]]) + @unittest.skip("CAST_BEFORE_VIEW=1 is not supported") def test_cast_const_view(self): a = Tensor.ones((4, 4), dtype=dtypes.float32) casted_view = 
a.cast(dtypes.int32) @@ -1464,6 +1467,7 @@ class TestSchedule(unittest.TestCase): run_schedule(check_schedule(realized_const_view, 1)) self.assertListEqual(realized_const_view.tolist(), [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]) + @unittest.skip("CAST_BEFORE_VIEW=1 is not supported") def test_cast_padded_const(self): a = Tensor(1, dtype=dtypes.int32).reshape(1, 1).pad(((1, 1), None)) casted_view = a.cast(dtypes.float32) @@ -1566,7 +1570,7 @@ class TestIndexing(unittest.TestCase): x = Tensor.randn(5, 2).realize() a = Tensor.arange(10) out = (x + a[2]).sum() - self.check_schedule(out, 1) + self.check_schedule(out, 2) np.testing.assert_allclose(out.numpy(), (x.numpy()+np.arange(10)[2]).sum(), atol=1e-5, rtol=1e-6) def test_arange_index_contiguous(self): @@ -1574,7 +1578,7 @@ class TestIndexing(unittest.TestCase): x = Tensor.randn(5, 2).realize() a = Tensor.arange(10).contiguous() out = (x + a[2]).sum() - self.check_schedule(out, 2) + self.check_schedule(out, 3) np.testing.assert_allclose(out.numpy(), (x.numpy()+np.arange(10)[2]).sum(), atol=1e-5, rtol=1e-6) def test_arange_index_child(self): @@ -1582,7 +1586,7 @@ class TestIndexing(unittest.TestCase): x = Tensor.randn(5, 2).realize() a = Tensor.arange(10)+1 out = (x + a[2]).sum() - self.check_schedule(out, 1) + self.check_schedule(out, 2) np.testing.assert_allclose(out.numpy(), (x.numpy()+(np.arange(10)+1)[2]).sum(), atol=1e-5, rtol=1e-6) def test_arange_index_contiguous_child(self): @@ -1590,7 +1594,7 @@ class TestIndexing(unittest.TestCase): x = Tensor.randn(5, 2).realize() a = (Tensor.arange(10)+1).contiguous() out = (x + a[2]).sum() - self.check_schedule(out, 2) + self.check_schedule(out, 3) np.testing.assert_allclose(out.numpy(), (x.numpy()+(np.arange(10)+1)[2]).sum(), atol=1e-5, rtol=1e-6) def test_arange_childless_base(self): diff --git a/test/test_tiny.py b/test/test_tiny.py index e256d05e0e..6b3ff6ed06 100644 --- a/test/test_tiny.py +++ b/test/test_tiny.py @@ -81,6 +81,8 @@ class TestTiny(unittest.TestCase): # *** a model *** + # TODO: this is failing because of how swizzling rewrites the ShapeTracker of the final STORE + @unittest.skipIf(IMAGE>0, "failing because of make things that can't be images not images") def test_mnist_model(self): layers = [ nn.Conv2d(1, 32, 5), Tensor.relu, diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py index d4f12e3975..7dc36febb8 100644 --- a/tinygrad/engine/schedule.py +++ b/tinygrad/engine/schedule.py @@ -500,7 +500,7 @@ break_sched = PatternMatcher([ def append_uop(ctx:ScheduleContext, view:UOp, buf_uop:UOp) -> None: ctx.allbufs[buf_uop] = view if (op:=uval(view)).op is Ops.ASSIGN: ctx.assigns.add(buf_uop) - for x in op.src: + for x in op.base.src: if is_scheduled(x.base): ctx.children.setdefault(x.base.buf_uop, {})[buf_uop] = None # BUFFER_VIEW overrides the underlying buffer # TODO: this should be a shrink on the buffer diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 6a039d54bf..a818ecc985 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -361,17 +361,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass): def cast(self, dtype:DType, bitcast=False): if bitcast: return self.bitcast(dtype) if self._device is not None and self._device.startswith("DISK"): raise RuntimeError("CAST isn't supported on DISK") - if getenv("CAST_BEFORE_VIEW", 1) and dtype.itemsize <= self.dtype.itemsize and self is not self.base: - # NOTE: we have to apply the movementops here, we can't use VIEW (yet) - # TODO: move this to the scheduler - ret = self.base.cast(dtype, bitcast) - op_arg 
= [] - mop = self - while mop is not self.base: - op_arg.append((mop.op, mop.arg)) - mop = mop.src[0] - for op,arg in reversed(op_arg): ret = UOp(op, ret.dtype, (ret,), arg) - return ret return UOp(Ops.CAST, dtype, (self,)) def bitcast(self, dtype:DType): if self.st is not None and self.shape and ((self.shape[-1]*self.dtype.itemsize)%dtype.itemsize != 0): @@ -477,7 +466,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass): @property def base(self) -> UOp: if self.op in GroupOp.Movement: return self.src[0].base - return self.src[0] if self.op is Ops.VIEW and len(self.src) == 1 and self.src[0].op is not Ops.BUFFER else self + return self.src[0].base if self.op is Ops.VIEW and len(self.src) == 1 and self.src[0].op is not Ops.BUFFER else self def view(self, new_st:ShapeTracker) -> UOp: if self.st is None: return UOp(Ops.VIEW, self.dtype.base if not isinstance(self.dtype, ImageDType) else self.dtype, (self,), new_st) ret = UOp(Ops.VIEW, self.dtype, (self.base,), new_st) From c5782e85d2e8ad3d42ddabbfe60099f38af377cf Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Tue, 14 Jan 2025 23:48:07 +0300 Subject: [PATCH 18/29] tlsf: optimize alloc (#8608) --- tinygrad/runtime/support/allocator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tinygrad/runtime/support/allocator.py b/tinygrad/runtime/support/allocator.py index 4e6dbd8a41..8da3ec259a 100644 --- a/tinygrad/runtime/support/allocator.py +++ b/tinygrad/runtime/support/allocator.py @@ -14,6 +14,7 @@ class TLSFAllocator: def __init__(self, size:int, base:int=0, block_size:int=16, lv2_cnt:int=16): self.size, self.base, self.block_size, self.l2_cnt = size, base, block_size, lv2_cnt.bit_length() self.storage:list = [collections.defaultdict(list) for _ in range(size.bit_length() + 1)] + self.lv1_entries:list[int] = [0] * len(self.storage) # self.blocks is more like a linked list, where each entry is a contigous block. self.blocks:dict[int, tuple[int, int|None, int|None, bool]] = {0: (size, None, None, True)} # size, next, prev, is_free @@ -25,12 +26,14 @@ class TLSFAllocator: def _insert_block(self, start:int, size:int, prev:int|None=None): if prev is None: prev = self.blocks[start][2] self.storage[self.lv1(size)][self.lv2(size)].append(start) + self.lv1_entries[self.lv1(size)] += 1 self.blocks[start] = (size, start + size, prev, True) return self def _remove_block(self, start:int, size:int, prev:int|None=None): if prev is None: prev = self.blocks[start][2] self.storage[self.lv1(size)][self.lv2(size)].remove(start) + self.lv1_entries[self.lv1(size)] -= 1 self.blocks[start] = (size, start + size, prev, False) return self @@ -67,6 +70,7 @@ class TLSFAllocator: # Search for the smallest block that can fit the requested size. Start with the it's bucket and go up until any block is found. 
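    # storage[lv1][lv2] keeps the start offsets of free blocks: lv1 is the coarse power-of-two size
    # class, lv2 one of (1 << self.l2_cnt) finer buckets within it. The lv1_entries counters added by
    # this patch track how many free blocks each first-level class holds, so the loop below can skip
    # an empty class with a single integer check instead of probing every lv2 bucket in it.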
for l1 in range(self.lv1(size), len(self.storage)): + if self.lv1_entries[l1] == 0: continue for l2 in range(self.lv2(size) if l1 == size.bit_length() else 0, (1 << self.l2_cnt)): if len(self.storage[l1][l2]) > 0: nsize = self.blocks[self.storage[l1][l2][0]][0] From dddd4e5f9fc7b45e03de36707ddffd60773f0d0c Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Tue, 14 Jan 2025 16:03:17 -0500 Subject: [PATCH 19/29] hotfix: remove duplicate TestTensorMutates [pr] (#8619) * hotfix: remove duplicate TestTensorMutates [pr] * imports --- test/unit/test_rewrite_map.py | 35 +---------------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/test/unit/test_rewrite_map.py b/test/unit/test_rewrite_map.py index 0cafb9b9ab..f0b833dd19 100644 --- a/test/unit/test_rewrite_map.py +++ b/test/unit/test_rewrite_map.py @@ -1,39 +1,6 @@ import unittest -from tinygrad import dtypes, Tensor +from tinygrad import dtypes from tinygrad.ops import UOp, symbolic, graph_rewrite_map, _substitute -from test.unit.test_tensor_uop_representation import is_pattern, realized_pattern, is_pattern_uop - -class TestTensorMutates(unittest.TestCase): - def test_mutate_add(self): - a = Tensor([1,2,3]) - b = Tensor([4,5,6]) - ret = a+b - pa = a.lazydata - pb = b.lazydata - pr = ret.lazydata - ret.schedule() - self.assertIsNot(pa, a.lazydata) - self.assertIsNot(pb, b.lazydata) - self.assertIsNot(pr, ret.lazydata) - for t in [a,b,ret]: is_pattern(t, realized_pattern) - - def test_reshape_is_same_parent(self): - a = Tensor([1,2,3]) - b = Tensor([4,5,6]) - c = a+b - d = (a+b).reshape(3,1) - d.realize() - is_pattern_uop(d.lazydata.base, realized_pattern) - is_pattern_uop(c.lazydata.base, realized_pattern) - - def test_reshape_is_same_child(self): - a = Tensor([1,2,3]) - b = Tensor([4,5,6]) - c = a+b - d = (a+b).reshape(3,1) - c.realize() - is_pattern_uop(c.lazydata.base, realized_pattern) - is_pattern_uop(d.lazydata.base, realized_pattern) class TestRewriteMap(unittest.TestCase): def test_substitute(self): From fdd46c9f284969d10a9a865992991d2eea8904c8 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Tue, 14 Jan 2025 13:15:13 -0800 Subject: [PATCH 20/29] delete view instant rule (#8616) * remove cast before view * greener * indexing * delete view instant rule * that passes too * openpilot too * ack * base on cast_before_view * add it as a rewrite rule * VIEW(DEVICE) is also fine * test_shard_memory depends on forced_realize removal * put that back, will go soon * UOp representations change once we don't instantly fold things * do not duplicate tests --------- Co-authored-by: qazal Co-authored-by: qazal <77887910+Qazalin@users.noreply.github.com> --- test/test_multitensor.py | 1 + test/unit/test_tensor_uop_representation.py | 12 ++++++++---- tinygrad/ops.py | 12 +++++------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/test/test_multitensor.py b/test/test_multitensor.py index f53056c406..ff21906016 100644 --- a/test/test_multitensor.py +++ b/test/test_multitensor.py @@ -694,6 +694,7 @@ class TestMultiTensor(unittest.TestCase): assert ast.src[2].src[0].op is Ops.LOAD assert ast.src[2].src[1].src[1].op is Ops.CONST and ast.src[2].src[1].src[1].arg == 3 + @unittest.skip("TODO: this requires forced_realize to be deleted.") def test_shard_memory(self): devices = (d0, d1, d2, d3) t = Tensor.zeros(16, 16).contiguous() diff --git a/test/unit/test_tensor_uop_representation.py b/test/unit/test_tensor_uop_representation.py index 
867acc1a60..dc8d0b64aa 100644 --- a/test/unit/test_tensor_uop_representation.py +++ b/test/unit/test_tensor_uop_representation.py @@ -19,7 +19,10 @@ class TestTensorMutates(unittest.TestCase): self.assertIsNot(pa, a.lazydata) self.assertIsNot(pb, b.lazydata) self.assertIsNot(pr, ret.lazydata) - for t in [a,b,ret]: is_pattern(t, realized_pattern) + # NOTE: this becomes a VIEW(VIEW(BUFFER)) because UOp.view no longer instantly folds contiguous VIEW of the same shape + # this is fine because realized exists on the base. + # TODO: we can make this always be a VIEW(BUFFER) once BUFFER has a ShapeTracker of shape=(N,) + for t in [a,b,ret]: is_pattern_uop(t.lazydata.base, realized_pattern) def test_reshape_is_same_parent(self): a = Tensor([1,2,3]) @@ -43,14 +46,14 @@ class TestTensorUopRepresentation(unittest.TestCase): def test_realized(self): a = Tensor([1.,2,3]).realize() print(a.lazydata) - is_pattern(a, realized_pattern) + is_pattern_uop(a.lazydata.base, realized_pattern) def test_add_realized(self): a = Tensor([1.,2,3]).realize() b = Tensor([4.,5,6]).realize() c = a+b print(c.lazydata) - is_pattern(c, UPat(Ops.ADD, src=(realized_pattern, realized_pattern))) + is_pattern(c, UPat(Ops.ADD, src=(UPat(Ops.VIEW, src=(realized_pattern,)), UPat(Ops.VIEW, src=(realized_pattern,))))) def test_const_pattern(self): a = Tensor(1) @@ -107,7 +110,8 @@ class TestTensorUopRepresentation(unittest.TestCase): a = Tensor([1.,2,3]).realize() c = a.to("TEST") # NOTE: this isn't checked print(c.lazydata) - is_pattern(c, UPat(Ops.COPY, src=(UPat(Ops.DEVICE), realized_pattern,))) + # TODO: COPY on a Tensor becomes a VIEW(COPY), this should be done in the scheduler not in ops + is_pattern(c, UPat(Ops.VIEW, src=(UPat(Ops.COPY, src=(UPat(Ops.DEVICE), realized_pattern,)),))) if __name__ == '__main__': unittest.main() diff --git a/tinygrad/ops.py b/tinygrad/ops.py index a818ecc985..2c23541a64 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -467,12 +467,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass): def base(self) -> UOp: if self.op in GroupOp.Movement: return self.src[0].base return self.src[0].base if self.op is Ops.VIEW and len(self.src) == 1 and self.src[0].op is not Ops.BUFFER else self - def view(self, new_st:ShapeTracker) -> UOp: - if self.st is None: return UOp(Ops.VIEW, self.dtype.base if not isinstance(self.dtype, ImageDType) else self.dtype, (self,), new_st) - ret = UOp(Ops.VIEW, self.dtype, (self.base,), new_st) - # instant folding rules - if new_st.contiguous and self.base.shape == new_st.shape: return self.base - return ret + def view(self, new_st:ShapeTracker) -> UOp: return UOp(Ops.VIEW, self.dtype, (self.base,), new_st) def _mop(self, op:Ops, arg): ret = UOp(op, self.dtype, (self,), arg) @@ -1293,7 +1288,10 @@ ConstLike = Union[ConstType, Variable, tuple[ConstType, ...]] # *** uop swizzling *** -merge_views = PatternMatcher([(UPat(Ops.VIEW, name="s0").view(name="s1"), lambda s0,s1: s0.replace(arg=s0.st+s1.st))]) +merge_views = PatternMatcher([ + (UPat(Ops.VIEW, name="s0").view(name="s1"), lambda s0,s1: s0.replace(arg=s0.st+s1.st)), + (UPat(Ops.VIEW, name="mv", src=(UPat.var("x"),)), lambda mv,x: x if mv.st.contiguous and x.st is not None and x.shape == mv.shape else None), +]) # push VIEW to loads view_left = merge_views+PatternMatcher([ From c85737c200dc4fdf28cda3846a8e00e60eb8a6c3 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Tue, 14 Jan 2025 13:26:56 -0800 Subject: [PATCH 21/29] assert to prepare for grad uop [pr] (#8280) * assert to prepare for grad 
uop [pr]

* fix test_nn

* fix most of test_tensor

* few more tests

* fix multi

* uniform gradient

* acc_dtype

* any for multi

* fix typing

* fix assert, CAST_BEFORE_VIEW is still the issue

* explicit test for CAST_BEFORE_VIEW

---------

Co-authored-by: qazal <77887910+Qazalin@users.noreply.github.com>
---
 test/test_dtype.py         |  3 ++-
 test/test_tensor.py        |  2 +-
 test/unit/test_gradient.py | 12 ++++++++++++
 tinygrad/multi.py          |  2 ++
 tinygrad/tensor.py         |  3 +++
 5 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/test/test_dtype.py b/test/test_dtype.py
index a09ae0c7f9..7a1677bdd0 100644
--- a/test/test_dtype.py
+++ b/test/test_dtype.py
@@ -781,7 +781,8 @@ class TestAutoCastType(unittest.TestCase):
       if DEBUG >= 2: print(f"testing {default_dtype=}, {dtype=}")
       a = Tensor([1, 2, 3], dtype=dtype, requires_grad=True)
-      b = (a * 5).sum()
+      # NOTE: this is broken without default_dtype because of CAST_BEFORE_VIEW
+      b = (a * 5).sum(acc_dtype=default_dtype)
       b.backward()
       # if there is dtype mismatch, lazy should assert
       assert a.grad.dtype == a.dtype
       np.testing.assert_allclose(a.grad.numpy(), [5, 5, 5])
diff --git a/test/test_tensor.py b/test/test_tensor.py
index a5152caa95..82f4065593 100644
--- a/test/test_tensor.py
+++ b/test/test_tensor.py
@@ -464,7 +464,7 @@ class TestTinygrad(unittest.TestCase):
   def test_repr_with_grad(self):
     a = Tensor([1], requires_grad=True)
     b = Tensor([1])
-    c = (a + b).mean().backward()
+    c = (a + b).sum().backward()
     print(a)
     print(c)
 
diff --git a/test/unit/test_gradient.py b/test/unit/test_gradient.py
index 21182874f2..b36b81f243 100644
--- a/test/unit/test_gradient.py
+++ b/test/unit/test_gradient.py
@@ -93,6 +93,12 @@ class TestTensorGradient(unittest.TestCase):
     dx = z.gradient(x, gradient=dz)[0]
     self.assertListEqual(dx.tolist(), [2.0, 4.0, 6.0])
 
+  def test_cast_before_view(self):
+    x = Tensor([1.0, 1, 1, 1])
+    x_reshaped = x.reshape(2,2)
+    x_casted = x_reshaped.cast(dtypes.float16)
+    x_casted.mean().gradient(x_reshaped)
+
 class TestRealizeMeansRealize(unittest.TestCase):
   def test_randn_realizes(self):
     x = Tensor.randn(2, 3, 64, 64, requires_grad=True).realize()
@@ -104,5 +110,11 @@ class TestRealizeMeansRealize(unittest.TestCase):
     print(x.lazydata)
     self.assertEqual(x.lazydata.op, Ops.VIEW)
 
+  # NOTE: even though it doesn't realize, this seems fine
+  def test_uniform_gradient(self):
+    x = Tensor.uniform(16, 3, 3, 3, requires_grad=True).realize()
+    y = x * 2
+    y.sum().gradient(x)[0].realize()
+
 if __name__ == '__main__':
   unittest.main()
diff --git a/tinygrad/multi.py b/tinygrad/multi.py
index 919d33a97f..ab521e9978 100644
--- a/tinygrad/multi.py
+++ b/tinygrad/multi.py
@@ -97,6 +97,8 @@ class MultiLazyBuffer(MathTrait):
   def contiguous(self): return MultiLazyBuffer([x.contiguous() for x in self.lbs], self.axis, self.real)
   def clone(self) -> MultiLazyBuffer: return MultiLazyBuffer([lb.clone() for lb in self.lbs], self.axis, self.real)
   def detach(self) -> MultiLazyBuffer: return MultiLazyBuffer([lb.detach() for lb in self.lbs], self.axis, self.real)
+  @property
+  def toposort(self) -> dict[UOp, None]: return {l:None for x in self.lbs for l in x.toposort}
 
   # elementwise is simple
   def alu(self, op:Ops, *in_srcs:MultiLazyBuffer) -> MultiLazyBuffer:
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 171c7afe31..53ebb9b86c 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -946,6 +946,7 @@ class Tensor(SimpleMathTrait):
       # this is "implicit gradient creation"
       gradient = Tensor(1.0, dtype=self.dtype, device=self.device, requires_grad=False)
 
+    toposort_uop = 
self.lazydata.toposort
     assert self.shape == gradient.shape, f"grad shape must match tensor shape, {gradient.shape!r} != {self.shape!r}"
     self.grad = gradient
     for t0 in reversed(toposorted):
@@ -958,6 +959,8 @@ class Tensor(SimpleMathTrait):
       for t, g in zip(t0._ctx.parents, grads):
         if g is not None and t.requires_grad:
           assert g.shape == t.shape, f"grad shape must match tensor shape, {g.shape!r} != {t.shape!r}"
+          assert t.lazydata in toposort_uop or (isinstance(t.lazydata, MultiLazyBuffer) and any(x in toposort_uop for x in t.lazydata.lbs)), \
+            f"grad uop must have a path from self\ngrad uop: {t.lazydata}"
           t.grad = g if t.grad is None else (t.grad + g)
       if not retain_graph: del t0._ctx
     return self

From 0790d8059fb6157977b50b7752bc5742ed720650 Mon Sep 17 00:00:00 2001
From: chenyu
Date: Tue, 14 Jan 2025 18:00:49 -0500
Subject: [PATCH 22/29] remove MultiLazyBuffer.from_sharded [pr] (#8620)

it's equivalent to taking the lazydata from Tensor.split, then copying to devices
---
 test/test_multitensor.py |  2 +-
 tinygrad/multi.py        | 14 +-------------
 tinygrad/tensor.py       | 14 +++++++++++---
 3 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index ff21906016..2de7052e66 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -75,7 +75,7 @@ class TestMultiTensor(unittest.TestCase):
         ei.run()
     assert names[-2] == names[-1], "function was relinearized"
 
-  @unittest.skip("this doesn't fold because from_sharded calls contiguous on all lbs")
+  @unittest.skip("this doesn't fold because shard_ calls contiguous on all lbs")
   def test_sharded_memory(self):
     # Buffer may be stuck in track_cross_buffer
     for x in (d0, d1, d2, d3, d4): Device[x].synchronize()
diff --git a/tinygrad/multi.py b/tinygrad/multi.py
index ab521e9978..8e65f46ac9 100644
--- a/tinygrad/multi.py
+++ b/tinygrad/multi.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 import functools, itertools, operator
-from tinygrad.helpers import all_same, all_int, dedup, prod, DEBUG, RING, getenv, ceildiv
+from tinygrad.helpers import all_same, all_int, dedup, prod, DEBUG, RING, getenv
 from tinygrad.dtype import DType
 from tinygrad.ops import Ops, MathTrait, UOp, sint
 
@@ -63,18 +63,6 @@ class MultiLazyBuffer(MathTrait):
 
   def __repr__(self): return f""
 
-  @staticmethod
-  def from_sharded(lb:UOp, devices:tuple[str, ...], axis:int|None):
-    if axis is not None:
-      if not isinstance(total:=lb.shape[axis], int): raise RuntimeError(f"cannot shard symbolic shape {lb.shape=}, {axis=}")
-      sz = ceildiv(total, len(devices))
-      splits = tuple([max(0, min(sz, total - sz*i)) for i in range(len(devices))])
-      bounds = tuple(itertools.pairwise(itertools.accumulate(splits, initial=0)))
-    lbs = [lb] * len(devices)
-    sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(to_sharded(lbs, axis, bounds) if axis is not None and bounds is not None else lbs, devices)]
-    # NOTE: this contiguous is making it impossible for the scheduler to do late const folding
-    return MultiLazyBuffer([lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis)
-
   def copy_to_device(self, device:str) -> UOp:
     if self.axis is None:
       # if we already have a copy on the device, return that
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 53ebb9b86c..71fe8b5693 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -179,7 +179,7 @@ class Tensor(SimpleMathTrait):
         # data might be on a different device
         if isinstance(device, str): self.lazydata:Union[UOp, MultiLazyBuffer] = data if data.device == device else 
data.copy_to_device(device) # if device is a tuple, we should have/construct a MultiLazyBuffer - elif isinstance(data, UOp): self.lazydata = MultiLazyBuffer.from_sharded(data, device, None) + elif isinstance(data, UOp): self.lazydata = Tensor(data).shard(device).lazydata else: assert data.device == device, f"MultiLazyBuffer device mismatch, {data.device} != {device}" self.lazydata = data @@ -405,8 +405,16 @@ class Tensor(SimpleMathTrait): """ assert isinstance(self.lazydata, UOp), "can't shard a MultiLazyBuffer" devices = tuple(Device.canonicalize(x) for x in devices) - if axis is not None: axis = self._resolve_dim(axis) - return Tensor(MultiLazyBuffer.from_sharded(self.lazydata, devices, axis), device=devices, requires_grad=self.requires_grad) + if axis is None: lbs = [self.lazydata] * len(devices) + else: + axis = self._resolve_dim(axis) + sz = ceildiv(self.shape[axis], len(devices)) + sizes = [max(0, min(sz, self.shape[axis] - sz*i)) for i in range(len(devices))] + lbs = [cast(UOp, t.lazydata) for t in self.split(sizes, axis)] + sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(lbs, devices)] + # NOTE: this contiguous is making it impossible for the scheduler to do late const folding + mlb = MultiLazyBuffer([lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis) + return Tensor(mlb, device=devices, requires_grad=self.requires_grad) def shard_(self, devices:tuple[str, ...], axis:Optional[int]=None): """ From 930728c06997adee49c61b0db96ced7e51f45a88 Mon Sep 17 00:00:00 2001 From: chenyu Date: Tue, 14 Jan 2025 18:41:41 -0500 Subject: [PATCH 23/29] bert BS 72->66 [pr] (#8621) 72 does not fit now --- .../benchmarks/bert/implementations/tinybox_green/dev_beam.sh | 2 +- .../benchmarks/bert/implementations/tinybox_green/dev_run.sh | 2 +- .../bert/implementations/tinybox_green/run_and_time.sh | 2 +- .../benchmarks/bert/implementations/tinybox_red/dev_beam.sh | 2 +- .../benchmarks/bert/implementations/tinybox_red/dev_run.sh | 2 +- .../benchmarks/bert/implementations/tinybox_red/run_and_time.sh | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh index 7318b3bb29..99b99f7e89 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." export MODEL="bert" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=36 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36 export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=512 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh index 555ed80cb4..70a3b6a6cb 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." 
export MODEL="bert" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=36 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36 export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=512 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh index 4e732799ae..a213f4d682 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh @@ -3,7 +3,7 @@ export PYTHONPATH="." export MODEL="bert" export SUBMISSION_PLATFORM="tinybox_green" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=36 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36 export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=512 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh index 692e0b49a1..08ffd354b3 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." export MODEL="bert" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=36 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36 export BEAM=3 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh index 86221ca594..c42c1f65b6 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." export MODEL="bert" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=36 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36 export BEAM=3 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh index 272b0ddb12..d6ff9fd2cc 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh @@ -3,7 +3,7 @@ export PYTHONPATH="." 
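# context for the BS change (hedged): per the commit message, BS=72 "does not fit" after the earlier
# changes in this series; 66 keeps the batch evenly divisible across GPUS=6 (11 per device), which
# the uneven-shard removal in patch 16 now requires when sharding on the batch axis.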
export MODEL="bert" export SUBMISSION_PLATFORM="tinybox_red" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=36 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36 export BEAM=3 export IGNORE_JIT_FIRST_BEAM=1 From 7860a808015568d4324985a481a97174be338ae5 Mon Sep 17 00:00:00 2001 From: chenyu Date: Tue, 14 Jan 2025 19:19:13 -0500 Subject: [PATCH 24/29] simpler MultiLazyBuffer alu [pr] (#8622) --- tinygrad/multi.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tinygrad/multi.py b/tinygrad/multi.py index 8e65f46ac9..c9990534ce 100644 --- a/tinygrad/multi.py +++ b/tinygrad/multi.py @@ -102,12 +102,10 @@ class MultiLazyBuffer(MathTrait): assert any(new_real), "output contains no real lb" for mlb in msrcs: if (mlb.axis == axis and (mlb.axis is None or mlb.bounds == bounds)) or not_all_real: srcs.append(mlb.lbs) - elif mlb.axis is None and axis is not None: - assert bounds is not None - srcs.append(to_sharded(mlb.lbs, axis, bounds)) else: assert axis is not None and bounds is not None - srcs.append(to_sharded([mlb.copy_to_device(lb.device) for lb in mlb.lbs], axis, bounds)) + if mlb.axis is None: srcs.append(to_sharded(mlb.lbs, axis, bounds)) + else: srcs.append(to_sharded([mlb.copy_to_device(lb.device) for lb in mlb.lbs], axis, bounds)) new_real_lbs:dict[int,UOp] = {i:lsrcs[0].alu(op, *lsrcs[1:]) for i,(lsrcs,r) in enumerate(zip(zip(*srcs), new_real)) if r} # NOTE: const dtype should match real new_dtype = next(iter(new_real_lbs.values())).dtype From 4ee3243c93cd43982683ed341da9a19848bfd9c3 Mon Sep 17 00:00:00 2001 From: chenyu Date: Tue, 14 Jan 2025 19:52:38 -0500 Subject: [PATCH 25/29] JITBEAM=2 for LLaMA-3 8B on 4 GPUs [pr] (#8623) is it fast? --- .github/workflows/benchmark.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 27b2764556..cccf231dee 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -202,8 +202,8 @@ jobs: # run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt - name: Run LLaMA-3 8B BEAM run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt - - name: Run LLaMA-3 8B on 4 GPUs - run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt + - name: Run LLaMA-3 8B on 4 GPUs with BEAM + run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt # - name: Run LLaMA-3 8B on 6 GPUs # run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt # - name: Run LLaMA-2 70B @@ -389,8 +389,8 @@ jobs: # run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." 
--count 10 --temperature 0 --timing | tee llama_six_gpu.txt - name: Run LLaMA-3 8B BEAM run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt - - name: Run LLaMA-3 8B on 4 GPUs - run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt + - name: Run LLaMA-3 8B on 4 GPUs with BEAM + run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt # - name: Run LLaMA-3 8B on 6 GPUs # run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt - name: Restore amdgpu From f29d6f54b874c3579b56d66638bd3e90128c702a Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Tue, 14 Jan 2025 18:33:33 -0800 Subject: [PATCH 26/29] support multilb gradient [pr] (#8624) --- test/models/test_real_world.py | 2 +- test/test_multitensor.py | 6 ++++++ tinygrad/gradient.py | 2 ++ tinygrad/tensor.py | 22 ++++++++++++++-------- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py index c22ed5bb73..a213e5ec88 100644 --- a/test/models/test_real_world.py +++ b/test/models/test_real_world.py @@ -33,7 +33,7 @@ def helper_test(nm, gen, model, max_memory_allowed, max_kernels_allowed, all_jit kernels_used = len(model.jit_cache) if hasattr(model, "jit_cache") else None print(f"{nm}: used {mem_used/1e9:.2f} GB and {kernels_used} kernels in {min(tms)/1e6:.2f} ms") assert mem_used/1e9 < max_memory_allowed, f"{nm} used more than {max_memory_allowed:.2f} GB - {mem_used/1e9:.2} GB used" - assert not kernels_used or kernels_used <= max_kernels_allowed, f"{nm} used more than {max_kernels_allowed} kernels" + assert not kernels_used or kernels_used <= max_kernels_allowed, f"{nm} used more than {max_kernels_allowed} kernels, it used {kernels_used}" if all_jitted: assert kernels_used > 0 and kernels_used == GlobalCounters.kernel_count or (kernels_used <= GlobalCounters.kernel_count and getattr(Device[Device.DEFAULT], "graph", None)), f"only {kernels_used} out of {GlobalCounters.kernel_count} were jitted" # noqa: E501 diff --git a/test/test_multitensor.py b/test/test_multitensor.py index 2de7052e66..c8325137d7 100644 --- a/test/test_multitensor.py +++ b/test/test_multitensor.py @@ -43,6 +43,12 @@ class TestMultiTensor(unittest.TestCase): assert lb.shape == (256,) (X + X).realize() + def test_gradient(self): + X = Tensor.ones(256).contiguous().realize() + X.to_(devices_2) + grad = X.sum().gradient(X)[0] + grad.realize() + def test_shard(self): X = Tensor.ones(256).contiguous().realize() X.shard_(devices_2, 0) diff --git a/tinygrad/gradient.py b/tinygrad/gradient.py index c9ea1d8028..1b93e9374b 100644 --- a/tinygrad/gradient.py +++ b/tinygrad/gradient.py @@ -39,6 +39,8 @@ pm_gradient = PatternMatcher([ # there's no gradient for...is this ASSIGN? 
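  # a None entry in the returned tuple means "this input gets no gradient"; the BITCAST rule added
  # below gets the same treatment, since reinterpreting raw bits as another dtype is not differentiable.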
(UPat(Ops.VIEW, src=(UPat(Ops.BUFFER), UPat(Ops.BUFFER_VIEW))), lambda: (None, None)), + # also no gradient for bitcast + (UPat(Ops.BITCAST), lambda ctx: (None,)), ]) # copied from tensor.py, get relevant toposort of gradients diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 71fe8b5693..7d00e441d4 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -915,15 +915,21 @@ class Tensor(SimpleMathTrait): print(dy.tolist()) # dz/dy ``` """ - assert isinstance(self.lazydata, UOp), "multi isn't supported yet" - target_uops: list[UOp] = [x.lazydata for x in targets if isinstance(x.lazydata, UOp)] assert gradient is not None or self.shape == tuple(), "when no gradient is provided, backward must be called on a scalar tensor" - grads = compute_gradient(self.lazydata, self.lazydata.const_like(1) if gradient is None else cast(UOp, gradient.lazydata), target_uops) - ret = [] - for x in target_uops: - if (y:=grads.get(x)) is None: raise RuntimeError(f"{x}\n\nnot found in\n\n{self.lazydata}") - ret.append(Tensor(y, device=x.device)) - return ret + if gradient is None: gradient = Tensor(1.0, dtype=self.dtype, device=self.device, requires_grad=False) + rets = [] + for i,(uop,grad) in enumerate(zip(self.lazydata.lbs, gradient.lazydata.lbs)): + target_uops = [x.lazydata.lbs[i] for x in targets] + grads = compute_gradient(uop, grad, target_uops) + ret = [] + for x in target_uops: + if (y:=grads.get(x)) is None: raise RuntimeError(f"{x}\n\nnot found in\n\n{uop}") + ret.append(y) + rets.append(ret) + # create returned Tensors + if isinstance(self.lazydata, UOp): return [Tensor(u, device=t.device) for t,u in zip(targets, rets[0])] + return [Tensor(MultiLazyBuffer(list(u), cast(MultiLazyBuffer, t.lazydata).axis, cast(MultiLazyBuffer, t.lazydata).real), + device=t.device) for t,u in zip(targets, zip(*rets))] def _deepwalk(self): def _walk(node, visited): From 504ad08e736636fd8581e9473602836eb029ac16 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Tue, 14 Jan 2025 19:03:17 -0800 Subject: [PATCH 27/29] hotfix: add test_example_matmul_same --- test/test_schedule.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/test_schedule.py b/test/test_schedule.py index 2448c9f8a1..2d531f0170 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -585,6 +585,15 @@ class TestSchedule(unittest.TestCase): run_schedule(check_schedule(out, 2)) np.testing.assert_allclose(out.numpy(), np.ones((64,64))) + def test_example_matmul_same(self): + x = Tensor.eye(64, requires_grad=True) + z = x.matmul(x).sum() + z.backward() + out = x.grad.contiguous() + run_schedule(check_schedule(out, 2)) + # NOTE: the gradient flows twice + np.testing.assert_allclose(out.numpy(), 2*np.ones((64,64))) + def test_contiguous_add(self): x = Tensor.empty(32) y = Tensor.empty(32) From 7fb1c7af6101f757880e83bc5c007b85afb29c06 Mon Sep 17 00:00:00 2001 From: chenyu Date: Tue, 14 Jan 2025 22:25:23 -0500 Subject: [PATCH 28/29] minor multi cleanups [pr] (#8625) --- tinygrad/multi.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tinygrad/multi.py b/tinygrad/multi.py index c9990534ce..0dbb3520cc 100644 --- a/tinygrad/multi.py +++ b/tinygrad/multi.py @@ -14,11 +14,10 @@ def all_reduce(bop: Ops, lbs: list[UOp]) -> list[UOp]: if DEBUG >= 2: print(f"{'RING ALLREDUCE' if use_ring else 'NAIVE ALLREDUCE'} {n_lbs}x{numel} | {lbs[0].dtype}") if not use_ring: return [functools.reduce(lambda x,y: x.alu(bop, y), [x.copy_to_device(lb.device) for x in lbs]) for lb in lbs] - factor = next(f for f in [32, 16, 8, 
4, 2, 1] if numel % f == 0) + factor = next((f for f in [32, 16, 8, 4, 2] if numel % f == 0), 1) base, left = (numel // factor) // n_lbs, (numel // factor) % n_lbs chunk_sizes = [(base + 1) * factor] * left + [base * factor] * (n_lbs - left) - acc = 0 - chunks = [(acc, (acc := acc + i)) for i in chunk_sizes if i > 0] + chunks = list(itertools.pairwise(itertools.accumulate(chunk_sizes, initial=0))) chunked = [[lb.reshape((numel,)).shrink(((s,e),)) for s,e in chunks] for lb in lbs] # scatter-reduce @@ -64,9 +63,8 @@ class MultiLazyBuffer(MathTrait): def __repr__(self): return f"" def copy_to_device(self, device:str) -> UOp: - if self.axis is None: - # if we already have a copy on the device, return that - return next((lb for lb in self.real_lbs if lb.device == device), self.real_lbs[0].copy_to_device(device)) + # if we already have a copy on the device, return that + if self.axis is None: return next((lb for lb in self.real_lbs if lb.device == device), self.real_lbs[0].copy_to_device(device)) # copy lbs to device, pad to final shape, and sum llbs:list[UOp] = [] for lb,real,(start,end) in zip(self.lbs, self.real, self.bounds): @@ -78,8 +76,7 @@ class MultiLazyBuffer(MathTrait): # passthroughs @property def is_realized(self) -> bool: return all(lb.base.realized is not None for lb in self.real_lbs) - def cast(self, dtype:DType, bitcast:bool=False): - return MultiLazyBuffer([x.cast(dtype, bitcast) for x in self.lbs], self.axis, self.real) + def cast(self, dtype:DType, bitcast:bool=False): return MultiLazyBuffer([x.cast(dtype, bitcast) for x in self.lbs], self.axis, self.real) def const_like(self, b) -> MultiLazyBuffer: return MultiLazyBuffer([x.const_like(b) for x in self.lbs], self.axis, self.real) def assign(self, x:MultiLazyBuffer): return MultiLazyBuffer([s.assign(d) for s,d in zip(self.lbs, x.lbs)], self.axis, self.real) def contiguous(self): return MultiLazyBuffer([x.contiguous() for x in self.lbs], self.axis, self.real) From e1f7c90459200fd93ef142f899f4d9a4167625c4 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Tue, 14 Jan 2025 20:48:23 -0800 Subject: [PATCH 29/29] gradient is a set [pr] (#8626) * gradient is a set [pr] * typing for deepwalk --- tinygrad/gradient.py | 8 ++++---- tinygrad/tensor.py | 17 +++++++++-------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/tinygrad/gradient.py b/tinygrad/gradient.py index 1b93e9374b..a2fa71a98d 100644 --- a/tinygrad/gradient.py +++ b/tinygrad/gradient.py @@ -1,4 +1,4 @@ -from typing import cast +from typing import cast, Iterator import math, functools from tinygrad.dtype import dtypes, sum_acc_dtype from tinygrad.ops import UOp, PatternMatcher, UPat, Ops @@ -44,10 +44,10 @@ pm_gradient = PatternMatcher([ ]) # copied from tensor.py, get relevant toposort of gradients -def _deepwalk(root:UOp, targets:list[UOp]): +def _deepwalk(root:UOp, targets:set[UOp]) -> list[UOp]: @functools.lru_cache(None) def is_in_target_path(x:UOp) -> bool: return any(u in targets or is_in_target_path(u) for u in x.src) - def _walk(node:UOp, visited:set[UOp]): + def _walk(node:UOp, visited:set[UOp]) -> Iterator[UOp]: visited.add(node) if node.op is Ops.DETACH: return if is_in_target_path(node): @@ -56,7 +56,7 @@ def _deepwalk(root:UOp, targets:list[UOp]): yield node return list(_walk(root, set())) -def compute_gradient(root:UOp, root_grad:UOp, targets:list[UOp]) -> dict[UOp, UOp]: +def compute_gradient(root:UOp, root_grad:UOp, targets:set[UOp]) -> dict[UOp, UOp]: grads = {root: root_grad} for t0 in 
reversed(_deepwalk(root, targets)): if t0 not in grads: continue diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 7d00e441d4..e3d87f51eb 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -920,7 +920,7 @@ class Tensor(SimpleMathTrait): rets = [] for i,(uop,grad) in enumerate(zip(self.lazydata.lbs, gradient.lazydata.lbs)): target_uops = [x.lazydata.lbs[i] for x in targets] - grads = compute_gradient(uop, grad, target_uops) + grads = compute_gradient(uop, grad, set(target_uops)) ret = [] for x in target_uops: if (y:=grads.get(x)) is None: raise RuntimeError(f"{x}\n\nnot found in\n\n{uop}") @@ -931,13 +931,13 @@ class Tensor(SimpleMathTrait): return [Tensor(MultiLazyBuffer(list(u), cast(MultiLazyBuffer, t.lazydata).axis, cast(MultiLazyBuffer, t.lazydata).real), device=t.device) for t,u in zip(targets, zip(*rets))] - def _deepwalk(self): - def _walk(node, visited): + def _deepwalk(self) -> list[Tensor]: + def _walk(node:Tensor, visited:set[Tensor]): visited.add(node) # if tensor is not leaf, reset grad if (ctx := getattr(node, "_ctx", None)) is not None and len(ctx.parents) != 0: node.grad = None if ctx: - for i in node._ctx.parents: + for i in cast(Function, node._ctx).parents: if i not in visited: yield from _walk(i, visited) yield node return list(_walk(self, set())) @@ -965,12 +965,13 @@ class Tensor(SimpleMathTrait): self.grad = gradient for t0 in reversed(toposorted): if t0.grad is None: raise RuntimeError(f"tensor {t0} has no grad") - token = _METADATA.set(dataclasses.replace(md, backward=True) if (md := t0._ctx.metadata) is not None else None) - grads = t0._ctx.backward(t0.grad.lazydata) + ctx = cast(Function, t0._ctx) + token = _METADATA.set(dataclasses.replace(md, backward=True) if (md := ctx.metadata) is not None else None) + grads = ctx.backward(t0.grad.lazydata) _METADATA.reset(token) grads = [Tensor(g, device=self.device, requires_grad=False) if g is not None else None - for g in ([grads] if len(t0._ctx.parents) == 1 else grads)] - for t, g in zip(t0._ctx.parents, grads): + for g in ([grads] if len(ctx.parents) == 1 else grads)] + for t, g in zip(ctx.parents, grads): if g is not None and t.requires_grad: assert g.shape == t.shape, f"grad shape must match tensor shape, {g.shape!r} != {t.shape!r}" assert t.lazydata in toposort_uop or (isinstance(t.lazydata, MultiLazyBuffer) and any(x in toposort_uop for x in t.lazydata.lbs)), \