From e13f2a30920f58f2b5754c435225553722329274 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Fri, 16 May 2025 23:14:23 -0700
Subject: [PATCH] multi is O(1) (#10183)

* multi is O(1)

* allreduce

* no new uops needed

* junk

* something

* simple

* that's really what i want

* closer

* inject _device_num

* pretty print

* cleanups

* this

* early dnum

* ops allreduce is good

* ish

* device is the tuple and this is fine

* simpler

* progress

* copy_multi

* work

* more tests

* more tests pass

* work

* no None axis

* tests

* no none multi

* type fixes

* pre commit passes

* lil

* remove this

* mlperf dataloader on mac

* that test was wrong

* unbind

* support DEBUG=2

* realize

* only unbind bound vars

* don't include fixedvars

* graph test

* one test

* fixedvars in hcq

* new ring reduce

* ring reduce

* simpler ring

* mselect

* mselect doesn't work

* Revert "mselect doesn't work"

This reverts commit c78b77bd7d30d14c375300f462da97617de72c74.

* Revert "mselect"

This reverts commit bb2e430ac36cd1d644c0c00c24ea8b44664228e5.

* simpler

* fixups

* no optional

* fix jit

* move things around

* cleanup multi

* simpler multi

* simpler reshape
---
 test/test_multitensor.py    |  12 ++--
 test/unit/test_allreduce.py |   1 +
 tinygrad/device.py          |   9 +++
 tinygrad/engine/grouper.py  |   3 +-
 tinygrad/engine/jit.py      |   9 +--
 tinygrad/engine/multi.py    | 128 ++++++++++++++++++------------------
 tinygrad/engine/schedule.py |  35 ++++++++--
 tinygrad/ops.py             |  62 ++++++++---------
 tinygrad/tensor.py          |  20 +++---
 9 files changed, 157 insertions(+), 122 deletions(-)

diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index b19bef4783..a059899835 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -5,7 +5,6 @@ from tinygrad.ops import Ops, UOp
 from tinygrad.helpers import CI, getenv, prod, Context, OSX
 from tinygrad.nn.state import get_parameters, get_state_dict
 from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner, run_schedule
-from tinygrad.engine.multi import all_reduce
 import numpy as np
 from hypothesis import given, strategies as strat, settings
 from tinygrad.device import is_dtype_supported
@@ -31,7 +30,7 @@ N = 128
 def _test_allreduce(t:Tensor):
   aa = (t[0:64] + t[64:128] + t[128:192] + t[192:256]).repeat([4,1]).realize()
   ts = t.shard(devices_4, 0).realize()
-  b = Tensor(UOp.multi(*all_reduce(Ops.ADD, ts.lazydata.src), axis=0))
+  b = Tensor(UOp.allreduce(ts.lazydata, Ops.ADD, ts.device))
   b.realize()
   return aa, b
 
@@ -84,7 +83,7 @@ class TestMultiTensor(unittest.TestCase):
     for si, ei in lower_schedule(sched):
       if isinstance(ei.prg, CompiledRunner): names.append(ei.prg.p.name)
       ei.run()
-    self.assertEqual(len(set(names)), 2), "function was relinearized"
+    self.assertEqual(len(set(names)), 1), "function was relinearized"
 
   @unittest.skip("this doesn't fold because shard_ calls contiguous on all lbs")
   def test_sharded_memory(self):
@@ -226,17 +225,16 @@ class TestMultiTensor(unittest.TestCase):
       out = f(tt)
       assert out.item() == 1+2+3+4
 
-  @unittest.skip("slow")
   def test_fuzz_allreduce(self):
     random.seed(41)
-    for it in range(100):
+    for it in range(2):
       for n in range(2, 4+1):
         shape = tuple([(n if i == 0 else 1) * random.randint(1, 10) for i in range(random.randint(1, 4))])
         t = Tensor.rand(shape).shard_(tuple([d0, d1, d2, d3][:n]), 0)
         with Context(RING=0):
-          a = Tensor(UOp.multi(*all_reduce(Ops.ADD, t.lazydata.src), axis=0))
+          a = Tensor(UOp.allreduce(t.lazydata, Ops.ADD, t.device))
         with Context(RING=2):
-          b = Tensor(UOp.multi(*all_reduce(Ops.ADD, t.lazydata.src), axis=0))
+          b = Tensor(UOp.allreduce(t.lazydata, Ops.ADD, t.device))
         diff = a - b
         mean_err = diff.reshape((prod(diff.shape),)).abs().mean().numpy()
         max_err = diff.reshape((prod(diff.shape),)).abs().max().numpy()
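A note on the API swap visible in these tests: the old `all_reduce(Ops.ADD, ts.lazydata.src)` helper returned a list of per-device UOps that had to be rewrapped with `UOp.multi(...)`, while the new `UOp.allreduce(ts.lazydata, Ops.ADD, ts.device)` is a single op on the multi tensor itself. A pure-Python sketch of the value semantics `_test_allreduce` checks, with made-up numbers and no tinygrad imports:

```python
# Each inner list is one device's shard; an ADD allreduce leaves every device
# holding the elementwise sum of all shards.
shards = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]
summed = [sum(col) for col in zip(*shards)]  # elementwise ADD across devices
result = [summed] * len(shards)              # every device gets the reduced copy
assert result == [[16.0, 20.0]] * 4
```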
diff --git a/test/unit/test_allreduce.py b/test/unit/test_allreduce.py
index c45b28ab7f..e99a44b1c6 100644
--- a/test/unit/test_allreduce.py
+++ b/test/unit/test_allreduce.py
@@ -4,6 +4,7 @@ from tinygrad.helpers import Context
 from tinygrad.ops import Ops
 
 class TestRingAllReduce(unittest.TestCase):
+  @unittest.skip("still broken")
   def test_schedule_ring(self):
     with Context(RING=2):
       N = 4
diff --git a/tinygrad/device.py b/tinygrad/device.py
index c57bb12dea..281983ab0d 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -91,6 +91,15 @@ class BufferSpec:
   nolru: bool = False
   external_ptr: Optional[int] = None
 
+class MultiBuffer:
+  def __init__(self, device:tuple[str, ...], size:int, dtype:DType):
+    self.bufs = [Buffer(d, size, dtype) for d in device]
+    self.dtype = dtype
+  def ref(self, cnt):
+    for b in self.bufs: b.ref(cnt)
+    return self
+  def is_allocated(self): return all(x.is_allocated() for x in self.bufs)
+
 class Buffer:
   def __init__(self, device:str, size:int, dtype:DType, opaque:Any=None, options:Optional[BufferSpec]=None, initial_value:Optional[bytes]=None,
                lb_refcount=0, base:Optional[Buffer]=None, offset:int=0, preallocate=False):
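`MultiBuffer` is the storage-side half of this change: a single handle over one `Buffer` per device, so a multi tensor needs one BUFFER UOp instead of N. A minimal standalone sketch of the shape of the class (the `Fake*` names are illustrative stand-ins, not tinygrad types):

```python
class FakeBuffer:
  def __init__(self, device:str, size:int):
    self.device, self.size = device, size
  def is_allocated(self) -> bool: return True  # real Buffers track allocation state

class FakeMultiBuffer:
  def __init__(self, device:tuple[str, ...], size:int):
    # one plain buffer per device, all the same size
    self.bufs = [FakeBuffer(d, size) for d in device]
  def is_allocated(self) -> bool: return all(b.is_allocated() for b in self.bufs)

mb = FakeMultiBuffer(("DEV:0", "DEV:1"), 1024)
assert mb.is_allocated() and [b.device for b in mb.bufs] == ["DEV:0", "DEV:1"]
```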
diff --git a/tinygrad/engine/grouper.py b/tinygrad/engine/grouper.py
index 50e41b4c17..8f0df3547e 100644
--- a/tinygrad/engine/grouper.py
+++ b/tinygrad/engine/grouper.py
@@ -7,6 +7,7 @@ from tinygrad.codegen.symbolic import symbolic_simple
 from tinygrad.helpers import Metadata, all_int, all_same, colored, prod, dedup, unwrap, getenv, pluralize, ContextVar, Context, diskcache_put
 from tinygrad.helpers import FUSE_CONV_BW, FUSE_ARANGE, DEBUG, DONT_REALIZE_EXPAND, DONT_GROUP_REDUCES, SPLIT_REDUCEOP, CAPTURE_PROCESS_REPLAY
 from tinygrad.dtype import ImageDType
+from tinygrad.engine.multi import replace_allreduce
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View, strides_for_shape
 from tinygrad.spec import type_verify, sched_spec
@@ -485,7 +486,7 @@ def get_name(becomes_map:dict[UOp, UOp]) -> str:
 @track_rewrites(name_fxn=get_name)
 def get_becomes_map(big_sink:UOp) -> dict[UOp, UOp]:
   # merge_views + simplify
-  tensor_map = graph_rewrite_map(big_sink, insert_fuse+do_fuse+merge_views+sym+replace_contiguous, ctx={}, name="merge_views")
+  tensor_map = graph_rewrite_map(big_sink, replace_allreduce+insert_fuse+do_fuse+merge_views+sym+replace_contiguous, ctx={}, name="merge_views")
 
   # display the cleaned up tensor graph
   if getenv("VIZ"): graph_rewrite(tensor_map[big_sink], PatternMatcher([]), name="View Tensor Graph")
diff --git a/tinygrad/engine/jit.py b/tinygrad/engine/jit.py
index a0489646c2..936159b3d5 100644
--- a/tinygrad/engine/jit.py
+++ b/tinygrad/engine/jit.py
@@ -2,7 +2,7 @@ from typing import TypeVar, Generic, Callable, Union, cast, Optional, Any
 import functools, collections
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import flatten, merge_dicts, DEBUG, Context, BEAM, getenv, colored, JIT, dedup, partition, unwrap
-from tinygrad.device import Buffer, Compiled, Device
+from tinygrad.device import Buffer, Compiled, Device, MultiBuffer
 from tinygrad.dtype import DType
 from tinygrad.ops import UOp, Variable, sym_infer, Ops
 from tinygrad.shape.shapetracker import ShapeTracker
@@ -90,7 +90,7 @@ class GraphRunner(Runner):
     for j,ji in enumerate(jit_cache):
       estimates += ji.prg.estimates
       if isinstance(ji.prg, CompiledRunner):
-        if ji.prg.p.vars: self.var_vals_replace[j] = [self.vars.index(v) for v in ji.prg.p.vars]
+        if ji.prg.p.vars: self.var_vals_replace[j] = [self.vars.index(v) for v in ji.prg.p.vars if v not in ji.fixedvars]
 
         global_dim_idx, local_dim_idx = find_symbolic_dim(ji.prg.p.global_size), find_symbolic_dim(ji.prg.p.local_size)
         if global_dim_idx is not None or local_dim_idx is not None:
@@ -211,9 +211,10 @@ def _prepare_jit_inputs(args, kwargs):
   input_tensors: list[tuple[int|str, Tensor]] = [(name,t) for name,t in list(enumerate(args))+sorted(kwargs.items()) if t.__class__ is Tensor]
   names, tensors = [name for name,_ in input_tensors], [t for _,t in input_tensors]
   if len(unrealized_tensors := [x for x in tensors if not x.lazydata.is_realized]): Tensor.realize(*unrealized_tensors)
-  # TODO: should we be unpacking multi here?
+  # TODO: this multi unpack stuff is not well tested.
   lbs: list[UOp] = flatten([t.lazydata.src if t.lazydata.op is Ops.MULTI else [t.lazydata] for t in tensors])
-  input_buffers: list[Buffer] = [lb.base.realized for lb in lbs if lb.base.realized is not None]
+  input_buffers: list[Buffer] = flatten([rb.bufs if isinstance(rb:=lb.base.realized, MultiBuffer) else [rb]
+                                         for lb in lbs if lb.base.realized is not None])
   assert len(set(input_buffers)) == len(input_buffers), "duplicate inputs to JIT"
   st_varval_dtype_device = [(*unwrap(lb.st).unbind(), lb.dtype, lb.device) for lb in lbs]
   var_vals = merge_dicts([x[1] for x in st_varval_dtype_device] + [dict(v.unbind() for v in (args + tuple(kwargs.values())) if isinstance(v, UOp))])
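The `_prepare_jit_inputs` change exists because a realized multi tensor now resolves to one `MultiBuffer`; to dedup and track JIT inputs per allocation, it has to be unpacked into its per-device buffers. A standalone sketch of that flattening, using duck-typed stand-ins rather than the real classes:

```python
from types import SimpleNamespace

def flatten_inputs(realized:list) -> list:
  # stands in for: rb.bufs if isinstance(rb, MultiBuffer) else [rb]
  return [b for rb in realized for b in (rb.bufs if hasattr(rb, "bufs") else [rb])]

single = "buf0"                                 # a plain per-device buffer
multi = SimpleNamespace(bufs=["buf1", "buf2"])  # a MultiBuffer-like wrapper
assert flatten_inputs([single, multi]) == ["buf0", "buf1", "buf2"]
```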
diff --git a/tinygrad/engine/multi.py b/tinygrad/engine/multi.py
index c6f1e380e6..c3013047ea 100644
--- a/tinygrad/engine/multi.py
+++ b/tinygrad/engine/multi.py
@@ -1,98 +1,103 @@
 import functools, itertools, operator
-from tinygrad.helpers import all_same, all_int, dedup, prod, DEBUG, RING, getenv
-from tinygrad.ops import Ops, UOp, sint
+from tinygrad.helpers import all_same, all_int, prod, DEBUG, RING, getenv
+from tinygrad.ops import Ops, UOp, sint, PatternMatcher, UPat, GroupOp, graph_rewrite_map, track_rewrites
 
-def all_reduce(bop: Ops, lbs: list[UOp]) -> list[UOp]:
-  assert all_int(lbs[0].shape), f"does not support symbolic shape {lbs[0].shape}"
-  assert all_same([lb.shape[0] for lb in lbs]), "allreduce with uneven shards is undefined"
-  n_lbs, shape, numel = len(lbs), lbs[0].shape, prod(lbs[0].shape)
+# *** allreduce implementation ***
+
+def handle_allreduce(buf:UOp, red:UOp) -> UOp|None:
+  if not isinstance(buf.device, tuple): return None
+  assert all_int(buf.shape), f"does not support symbolic shape {buf.shape}"
+  n_lbs, shape, numel = len(buf.device), buf.shape, prod(buf.shape)
   # ring allreduce doesn't provide a benefit with only 2 nodes or where number of elements is less than 256k (empirically)
   # fallback to naive allreduce to save on kernel dispatch, chunking and reassembling chunks.
   use_ring = (RING >= 2 or (n_lbs > 2 and numel > getenv("RING_ALLREDUCE_THRESHOLD", 256_000) and RING >= 1))
-  if DEBUG >= 2: print(f"{'RING ALLREDUCE' if use_ring else 'NAIVE ALLREDUCE'} {n_lbs}x{numel} | {lbs[0].dtype}")
-  if not use_ring: return [functools.reduce(lambda x,y: x.alu(bop, y), [x.copy_to_device(lb.device) for x in lbs]) for lb in lbs]
+  if DEBUG >= 2: print(f"{'RING ALLREDUCE' if use_ring else 'NAIVE ALLREDUCE'} {n_lbs}x{numel} | {buf.dtype}")
+  # contiguous before we copy it
+  buf = buf.contiguous()
+
+  # copy to all devices. if you shrink later, that'll be handled
+  if not use_ring: return functools.reduce(lambda x,y: x.alu(red.arg, y),
+                                           [UOp(Ops.COPY, buf.dtype, (buf, red.src[1]), arg=i) for i in range(len(buf.device))])
+
+  # new ring reduce
   factor = next((f for f in [32, 16, 8, 4, 2] if numel % f == 0), 1)
   base, left = (numel // factor) // n_lbs, (numel // factor) % n_lbs
   chunk_sizes = [(base + 1) * factor] * left + [base * factor] * (n_lbs - left)
   chunks = list(itertools.pairwise(itertools.accumulate(chunk_sizes, initial=0)))
-  chunked = [[lb.reshape((numel,)).shrink(((s,e),)) for s,e in chunks] for lb in lbs]
 
-  # scatter-reduce
-  for step in range(n_lbs-1):
-    for i in range(len(chunks)):
+  # extract chunks and scatter-reduce
+  reduced_chunks = []
+  for i,(s,e) in enumerate(chunks):
+    chunk = buf.reshape((numel,)).shrink(((s,e),))
+    reduced_chunk = chunk
+    for step in range(n_lbs-1):
       src, dest = (i+step)%n_lbs, (i+step+1)%n_lbs
-      chunked[dest][i] = chunked[dest][i].alu(bop, chunked[src][i].copy_to_device(chunked[dest][i].device))
+      # copy the chunk from the src device to the dest (operating device), and select the chunk on the dest device
+      reduced_chunk = reduced_chunk.copy_to_device(buf.device[dest], src).alu(red.arg, chunk.copy_to_device(buf.device[dest], dest))
+    reduced_chunks.append(reduced_chunk)
 
-  # allgather
-  for step in range(n_lbs-1):
-    for i in range(len(chunks)):
-      src, dest = (i+step-1)%n_lbs, (i+step)%n_lbs
-      chunked[dest][i] = chunked[src][i].copy_to_device(chunked[dest][i].device)
-
-  # assemble chunks back
+  # allgather + reassemble
   pads = [((s,numel-e),) for s,e in chunks]
-  return [functools.reduce(operator.add, [c.pad(pad) for pad,c in zip(pads,lb_c)]).reshape(shape) for lb_c in chunked]
+  return functools.reduce(operator.add, [c.copy_to_device(buf.device).pad(pad) for pad,c in zip(pads, reduced_chunks)]).reshape(shape)
 
-def to_sharded(lbs:list[UOp], axis:int, bounds: tuple[tuple[int, int], ...]) -> list[UOp]:
-  if lbs[0].shape[axis] % len(lbs) != 0: raise RuntimeError(f"multi axis uneven: {lbs[0].shape=} {axis=} {len(lbs)=}, bounds={bounds}")
-  return [lb.shrink(tuple((0,s) if a != axis else bound for a,s in enumerate(lb.shape))) for i, (bound, lb) in enumerate(zip(bounds, lbs))]
+replace_allreduce = PatternMatcher([(UPat(Ops.ALLREDUCE, src=(UPat.var("buf"), UPat()), name="red"), handle_allreduce),])
 
 # ***** multi functions *****
 
-from tinygrad.ops import PatternMatcher, UPat, GroupOp, graph_rewrite_map, track_rewrites
-
 def alu_multi(root:UOp):
   msrcs = root.src
-  assert all(x.op is Ops.MULTI for x in msrcs), f"all buffers must be MultiLazyBuffer {[x.op for x in msrcs]}"
   assert all_same([x.device for x in msrcs]), f"all buffers must have the same device {[x.device for x in msrcs]}"
   axis = root.axis
-  bounds = dedup([x.bounds for x in root.src if x.axis == axis])[-1] if axis is not None else None
-  srcs:list[list[UOp]] = []
+  assert axis is not None
+
+  srcs = []
   for mlb in msrcs:
-    if mlb.axis == axis and (mlb.axis is None or mlb.bounds == bounds): srcs.append(list(mlb.src))
+    if mlb.axis == axis:
+      # same axis, just copy through
+      assert mlb.op is Ops.MULTI
+      srcs.append(mlb.src[0])
+    elif mlb.axis is None:
+      # no axis, shard it
+      assert mlb.op is not Ops.MULTI
+      srcs.append(mlb._shard(axis))
     else:
-      assert axis is not None and bounds is not None
-      if mlb.axis is None: srcs.append(to_sharded(list(mlb.src), axis, bounds))
-      else: srcs.append(to_sharded([mlb.copy_to_device(lb.device) for lb in mlb.src], axis, bounds))
-  new_lbs = [lsrcs[0].alu(root.op, *lsrcs[1:]) for lsrcs in zip(*srcs)]
-  return UOp.multi(*new_lbs, axis=axis)
+      # axis mismatch, unshard it, send it to all devices, and shard it correctly
+      assert mlb.op is Ops.MULTI
+      srcs.append(mlb.src[0]._unshard(mlb.axis).allreduce(Ops.ADD, mlb.device)._shard(axis))
+  return srcs[0].alu(root.op, *srcs[1:]).multi(axis)
 
 def reduce_multi(root:UOp, multi:UOp):
   op, axis = root.arg
   if multi.axis is not None and multi.axis in axis:
     # all-reduce on sharded axes
-    reduced_parts = [x.r(op, axis) for x in multi.src]
-    # if all partitions are real, do all_reduce
-    return UOp.multi(*all_reduce(op, reduced_parts), axis=root.axis)
+    return multi.src[0].r(op, axis).allreduce(op, multi.device)
   # reduce on non sharded axes, piecewise is fine. if axis is None this is also correct
-  return UOp.multi(*[x.r(op, axis) for x in multi.src], axis=root.axis)
+  return multi.src[0].r(op, axis).multi(axis=multi.axis)
 
 def _shape_to_single_shard(axis, shape:tuple[sint, ...], lb:UOp) -> tuple[sint, ...]:
   return tuple(lb.shape[axis] if a == axis else s for a,s in enumerate(shape))
 
 def reshape_multi(root:UOp, multi:UOp):
   arg = root.arg
-  if (new_axis:=root.axis) is None: return UOp.multi(*[x.reshape(arg) for x in multi.src], axis=new_axis)
+  if (new_axis:=root.axis) is None: return multi.src[0].reshape(arg).multi(new_axis)
   assert prod(multi.shape) == prod(arg), "reshape must maintain prod(shape)"
-  assert all(prod(lb.shape[multi.axis:])%prod(arg[new_axis+1:])==0 for lb in multi.src), \
-    f"reshape cannot move items between shards {multi.shape} -> {root.arg=}"
-  lbs = [x.reshape(tuple(s if a!=new_axis else prod(x.shape[multi.axis:])//prod(arg[new_axis+1:]) for a,s in enumerate(arg))) for x in multi.src]
-  return UOp.multi(*lbs, axis=new_axis)
+  assert prod(multi.src[0].shape[multi.axis:])%prod(arg[new_axis+1:]) == 0, f"reshape cannot move items between shards {multi.shape} -> {root.arg=}"
+  new_shape_axis = prod(multi.src[0].shape[multi.axis:]) // prod(arg[new_axis+1:])
+  return multi.src[0].reshape(tuple(s if a!=new_axis else new_shape_axis for a,s in enumerate(arg))).multi(new_axis)
 
 def expand_multi(root:UOp, multi:UOp):
   # NOTE: this assert isn't needed, sharded axis can have dim 1
   assert multi.axis is None or root.arg[multi.axis] == multi.shape[multi.axis], f"expand not supported on sharded axis {root.arg=}"
-  return UOp.multi(*[x.expand(_shape_to_single_shard(multi.axis, root.arg, x)) for x in multi.src], axis=multi.axis)
+  return multi.src[0].expand(_shape_to_single_shard(multi.axis, root.arg, multi.src[0])).multi(multi.axis)
 
 def pad_multi(root:UOp, multi:UOp):
   assert multi.axis is None or root.arg[multi.axis] == (0,0), f"padding not supported for {root.arg=}"
-  return UOp.multi(*[x.pad(root.arg) for x in multi.src], axis=multi.axis)
+  return multi.src[0].pad(root.arg).multi(multi.axis)
 
 def permute_multi(root:UOp, multi:UOp):
   # all permutes supported!
-  return UOp.multi(*[x.permute(root.arg) for x in multi.src], axis=root.axis)
+  return multi.src[0].permute(root.arg).multi(root.axis)
 
 def shrink_multi(root:UOp, multi:UOp):
   assert multi.axis is None or root.arg[multi.axis] == (0, multi.shape[multi.axis]) or root.arg[multi.axis] in multi.bounds, \
@@ -101,30 +106,25 @@ def shrink_multi(root:UOp, multi:UOp):
     assert all(root.arg[i] == (0, s) or i == multi.axis for i,s in enumerate(multi.shape)), \
       "cannot shrink sharded and non-sharded axis at the same time"
     # NOTE: shrink on the shard axis is only allowed when result is a single partition, denoted by the new real
-    idx = multi.bounds.index(root.arg[multi.axis])
-    return UOp.multi(*[multi.src[idx].copy_to_device(d) for d in root.device], axis=None)
-  return UOp.multi(*[x.shrink(tuple((0, x.shape[multi.axis]) if a == multi.axis else s for a,s in enumerate(root.arg))) for x in multi.src],
-                   axis=multi.axis)
+    # we just copy it to all the devices, no real. this will be optimized out later
+    return multi.src[0].copy_to_device(multi.device, arg=multi.bounds.index(root.arg[multi.axis]))
+  return multi.src[0].shrink(tuple((0, multi.src[0].shape[multi.axis]) if a == multi.axis else s for a,s in enumerate(root.arg))).multi(multi.axis)
 
 def flip_multi(root:UOp, multi:UOp):
   assert multi.axis is None or not root.arg[multi.axis], "flipping not supported on sharded axis"
-  return UOp.multi(*[x.flip(root.arg) for x in multi.src], axis=multi.axis)
+  return multi.src[0].flip(root.arg).multi(multi.axis)
 
+# from multiple devices -> one
 def copy_multi(multi:UOp, device:UOp):
-  # if we already have a copy on the device, return that
-  if multi.axis is None: return next((lb for lb in multi.src if lb.device == device.arg), multi.src[0].copy_to_device(device))
-  # copy lbs to device, pad to final shape, and sum
-  llbs:list[UOp] = []
-  for lb,(start,end) in zip(multi.src, multi.bounds):
-    pad_arg = tuple((0,0) if a != multi.axis else (start, multi.bounds[-1][1]-end) for a in range(len(lb.shape)))
-    llbs.append(lb.copy_to_device(device).pad(pad_arg))
-  return functools.reduce(operator.add, llbs)
+  assert multi.axis is not None, "all multi ops have axis"
+  return multi.src[0]._unshard(multi.axis).allreduce(Ops.ADD, device)
 
 def assign_multi(dest:UOp, src:UOp):
   if dest.axis != src.axis: raise RuntimeError(f"axis must match in assign {dest.axis} != {src.axis}")
-  return UOp.multi(*[x.assign(y) for x,y in zip(dest.src, src.src)], axis=src.axis)
+  return dest.src[0].assign(src.src[0]).multi(src.axis)
 
-def passthrough_multi(root:UOp, multi:UOp): return UOp.multi(*[root.replace(src=(m,)) for m in multi.src], axis=multi.axis)
+def passthrough_multi(root:UOp, multi:UOp):
+  return root.replace(src=(multi.src[0],)).multi(multi.axis)
 
 # NOTE: this is the same pattern as Ops.UNROLL
 multi_pm = PatternMatcher([
@@ -138,6 +138,8 @@ multi_pm = PatternMatcher([
   (UPat(Ops.FLIP, src=(UPat(Ops.MULTI, name="multi"), ), name="root"), flip_multi),
   (UPat(Ops.ASSIGN, src=(UPat(Ops.MULTI, name="dest"), UPat(Ops.MULTI, name="src"))), assign_multi),
   (UPat(Ops.COPY, src=(UPat(Ops.MULTI, name="multi"), UPat(Ops.DEVICE, name="device"))), copy_multi),
+  (UPat(Ops.ALLREDUCE, src=(UPat(Ops.MULTI, name="multi"), UPat(Ops.DEVICE, name="device")), name="red"),
+   lambda multi,device,red: multi.src[0].allreduce(red.arg, device).multi(axis=multi.axis)),
   (UPat((Ops.CAST, Ops.BITCAST, Ops.CONTIGUOUS, Ops.DETACH, Ops.CONTIGUOUS_BACKWARD, Ops.FUSE),
         src=(UPat(Ops.MULTI, name="multi"), ), name="root"), passthrough_multi),
 ])
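The ring path in `handle_allreduce` above picks chunk sizes so that every chunk is a multiple of `factor` and sizes differ by at most one `factor`, which keeps uneven element counts covered exactly once. The arithmetic, extracted into a runnable standalone sketch (Python 3.10+ for `itertools.pairwise`):

```python
import itertools

def ring_chunks(numel:int, n_lbs:int) -> list[tuple[int, int]]:
  factor = next((f for f in [32, 16, 8, 4, 2] if numel % f == 0), 1)
  base, left = (numel // factor) // n_lbs, (numel // factor) % n_lbs
  chunk_sizes = [(base + 1) * factor] * left + [base * factor] * (n_lbs - left)
  return list(itertools.pairwise(itertools.accumulate(chunk_sizes, initial=0)))

# 1000 elements over 4 devices: factor=8, so one 256-element chunk and three 248s
assert ring_chunks(1000, 4) == [(0, 256), (256, 504), (504, 752), (752, 1000)]
# each chunk then takes n_lbs-1 reduce hops around the ring plus one gather copy
```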
diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 615abf3980..fb650f87b3 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -1,7 +1,8 @@
+from typing import cast
 from dataclasses import dataclass, field
 from collections import deque, defaultdict
 from tinygrad.ops import UOp, Variable, Ops, UPat, PatternMatcher, graph_rewrite, buffers
-from tinygrad.device import Buffer
+from tinygrad.device import Buffer, MultiBuffer
 from tinygrad.helpers import Metadata, unwrap, merge_dicts
 
 # **** ScheduleItem return type
@@ -58,15 +59,37 @@ def create_schedule_with_vars(sched_sink:UOp) -> tuple[list[ScheduleItem], dict[
     ast = graph_rewrite(k.arg.ast, pm_unbind, ctx=local_var_vals, name="unbind vars")
     var_vals = merge_dicts([var_vals, *local_var_vals])
     # create subbuffers if needed
-    if ast.op is Ops.BUFFER_VIEW: buffers[k.src[0]] = (base:=k.src[1].buf_uop.buffer).view(k.size, ast.dtype, ast.arg[1]*base.dtype.itemsize)
-    schedule.append(ScheduleItem(ast, tuple(s.buf_uop.buffer for s in k.src), k.arg.metadata))
+    if ast.op is Ops.BUFFER_VIEW:
+      base = k.src[1].buf_uop.buffer
+      assert isinstance(base, Buffer), "base can't be MultiBuffer"
+      buffers[k.src[0]] = base.view(k.size, ast.dtype, ast.arg[1]*base.dtype.itemsize)
+    ubufs = tuple(s.buf_uop.buffer for s in k.src)
+    if any(isinstance(x, MultiBuffer) for x in ubufs):
+      if ast.op is Ops.COPY:
+        if isinstance(ubufs[1], MultiBuffer) and ast.arg is None:  # src is multiple buffers, none selected
+          if isinstance(ubufs[0], MultiBuffer):
+            # COPY ALL -> ALL
+            for b1,b2 in zip(ubufs[0].bufs, ubufs[1].bufs): schedule.append(ScheduleItem(ast, (b1, b2), k.arg.metadata))
+          else:
+            # COPY ANY -> ONE. Currently we just select the first
+            schedule.append(ScheduleItem(ast, (ubufs[0], ubufs[1].bufs[0]), k.arg.metadata))
+        else:
+          src_buf = ubufs[1].bufs[ast.arg] if isinstance(ubufs[1], MultiBuffer) else ubufs[1]
+          if isinstance(ubufs[0], MultiBuffer):
+            # COPY ONE -> ALL (BROADCAST)
+            for b in ubufs[0].bufs: schedule.append(ScheduleItem(ast, (b, src_buf), k.arg.metadata))
+          else: schedule.append(ScheduleItem(ast, (ubufs[0], src_buf), k.arg.metadata))  # COPY ONE -> ONE
+      else:
+        assert all(isinstance(x, MultiBuffer) for x in ubufs), "kernel must all be multibuffer"
+        dnums = [x for x in ast.variables() if x.arg[0] == '_device_num']
+        for i,bufs in enumerate(zip(*[x.bufs for x in cast(tuple[MultiBuffer, ...], ubufs)])):
+          schedule.append(ScheduleItem(ast, bufs, k.arg.metadata, {dnums[0]:i} if len(dnums) else {}))
+    else:
+      schedule.append(ScheduleItem(ast, cast(tuple[Buffer, ...], ubufs), k.arg.metadata))
     for x in children[k]:
       in_degree[x] -= 1
       if in_degree[x] == 0: queue.append(x)
 
-  # confirm everything was scheduled correctly
-  assert len(schedule) == len(in_degree), f"Schedule length mistmatch {len(schedule)} != {len(in_degree)}"
-
   # map ASSIGN to BUFFER after ScheduleItems are constructed
   becomes_map = {u:u.buf_uop for u in toposort if u.op is Ops.ASSIGN}
   assert all(u.op in {Ops.BUFFER, Ops.BUFFER_VIEW} for u in becomes_map.values()), f"Schedule didn't end with BUFFER {becomes_map.values()}"
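The scheduler change is where the O(1) graph fans back out to N devices: a kernel whose buffers are all `MultiBuffer`s is emitted once per device, with the same AST and only the `_device_num` variable bound differently. A toy sketch of that expansion, with strings standing in for the AST, buffers, and Variable:

```python
# per-device buffer tuples, like zip(*[x.bufs for x in ubufs])
per_device = [("out0", "in0"), ("out1", "in1")]
dnum = "_device_num"  # stands in for the bound Variable UOp
schedule = [("AST", bufs, {dnum: i}) for i, bufs in enumerate(per_device)]
assert schedule == [("AST", ("out0", "in0"), {"_device_num": 0}),
                    ("AST", ("out1", "in1"), {"_device_num": 1})]
```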
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index 0bd1ffccd3..9684233875 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -8,7 +8,7 @@ from tinygrad.helpers import ContextVar, all_int, prod, getenv, all_same, Contex
 from tinygrad.helpers import PICKLE_BUFFERS, dedup, cdiv, cmod
 if TYPE_CHECKING:
   from tinygrad.shape.shapetracker import ShapeTracker
-  from tinygrad.device import Buffer
+  from tinygrad.device import Buffer, MultiBuffer
 
 # wrapper around IntEnum that preserves Enum.__str__ and makes auto() unique across all FastEnum subclasses
 class FastEnum(IntEnum):
@@ -322,7 +322,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
     if not (src_sts := [x.st for x in self.src if x.st is not None]): return None
     assert all_same([x.shape for x in src_sts]), f"UOp sources must have the same shape {self} {[x.shape for x in src_sts]}"
     match self.op:
-      case Ops.MULTI: shape = tuple(sum(y.shape[a] for y in self.src) if a == self.axis else s for a,s in enumerate(self.src[0].shape))
+      case Ops.MULTI: shape = tuple(self.src[0].shape[a]*len(self.device) if a == self.axis else s for a,s in enumerate(self.src[0].shape))
       case Ops.BITCAST:
         shape = src_sts[0].shape
         if self.dtype.itemsize != (input_sz:=self.src[0].dtype.itemsize): shape = shape[:-1]+((shape[-1]*input_sz) // self.dtype.itemsize,)
@@ -382,7 +382,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
   def const_like(self, b:ConstLike):
     # constants can optionally have a DEVICE source
     if self._device is None: return UOp.const(self.dtype, b)
-    if isinstance(self.device, tuple): return UOp.multi(*[UOp.metaop(Ops.CONST, self.shape, self.dtype, d, b) for d in self.device], axis=None)
     return UOp.metaop(Ops.CONST, self.shape, self.dtype, self.device, b)
   def broadcast(self, count:int):
     assert self.dtype.count == 1
@@ -431,18 +430,21 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
   def contiguous(self): return self.alu(Ops.CONTIGUOUS)
   def contiguous_backward(self): return self.alu(Ops.CONTIGUOUS_BACKWARD)
   def fuse(self): return self.alu(Ops.FUSE)
+  def allreduce(self, op, device:str|tuple[str, ...]|UOp):
+    assert isinstance(self.device, tuple), f"allreduce must be on tuple {self.device} isn't"
+    return UOp(Ops.ALLREDUCE, self.dtype, (self, UOp(Ops.DEVICE, arg=device) if not isinstance(device, UOp) else device), op)
 
   # *** from MultiLazyBuffer ***
 
-  def multi(self, *more:UOp, axis:int|None):
-    parents = (self,)+more
-    assert all_same([x.dtype for x in parents]), "multi parents must have the same dtype"
-    return UOp(Ops.MULTI, self.dtype, parents, axis)
+  def multi(self, axis:int|None):
+    assert isinstance(self.device, tuple), f"multi device must be tuple, {self.device} isn't"
+    assert axis is not None, "multi None is no longer supported"
+    return UOp(Ops.MULTI, self.dtype, (self,), axis)
 
   @property
   def bounds(self):
     if self.axis is None: raise RuntimeError("bounds is not defined when axis is None")
-    return tuple(itertools.pairwise(itertools.accumulate([lb.shape[self.axis] for lb in self.src], initial=0)))
+    return tuple(itertools.pairwise(itertools.accumulate([self.src[0].shape[self.axis] for _ in self.device], initial=0)))
 
   @functools.cached_property
   def axis(self) -> Optional[int]:
@@ -461,21 +463,23 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
     if self.op is Ops.PERMUTE: return self.arg.index(src_axis) if src_axis is not None else None
     return src_axis
 
-  def shard(self, devices:tuple[str, ...], axis:Optional[int]=None) -> UOp:
-    lbs = [self.copy_to_device(d) if self.device != d else self for d in devices]
-    if axis is not None:
-      if self.shape[axis] % len(devices) != 0: raise RuntimeError(f"multi axis uneven: {self.shape[axis]=} {axis=} {len(devices)=}")
-      # NOTE: this works for both even shards and uneven shards
-      sz = self.shape[axis] // len(devices)
-      sizes = [max(0, min(sz, self.shape[axis] - sz*i)) for i in range(len(devices))]
-      lbs = [lb.shrink(tuple((0,s) if i != axis else (off,off+sz) for i,s in enumerate(self.shape)))
-             for lb,sz,off in zip(lbs, sizes, itertools.accumulate(sizes, initial=0))]
-    return UOp.multi(*lbs, axis=axis)
+  def _unshard(self, axis:int) -> UOp:
+    bsz, dcount = self.shape[axis], len(self.device)
+    dnum = UOp.variable("_device_num", 0, dcount-1)
+    return self.pad(tuple((0,0) if a != axis else (bsz*dnum, bsz*(dcount-1) - bsz*dnum) for a in range(len(self.shape))))
+
+  def _shard(self, axis:int) -> UOp:
+    dcount = len(self.device)
+    dnum = UOp.variable("_device_num", 0, dcount-1)
+    if self.shape[axis] % dcount != 0: raise RuntimeError(f"multi axis uneven: {self.shape[axis]=} {axis=} {dcount=}")
+    sz = self.shape[axis] // dcount
+    return self.shrink(tuple((0,s) if i != axis else (dnum*sz,dnum*sz+sz) for i,s in enumerate(self.shape)))
+  def shard(self, devices:tuple[str, ...], axis:int) -> UOp: return self.copy_to_device(devices)._shard(axis).multi(axis)
 
   # *** from LazyBuffer ***
 
   @staticmethod
-  def metaop(op:Ops, shape:tuple[sint, ...], dtype:DType, device:str, arg=None) -> UOp:
+  def metaop(op:Ops, shape:tuple[sint, ...], dtype:DType, device:str|tuple[str, ...], arg=None) -> UOp:
     from tinygrad.shape.shapetracker import ShapeTracker
     # Tensor const is CONST(VIEW(DEVICE)) -> RESHAPE -> EXPAND
     if op is Ops.CONST:
@@ -522,16 +526,13 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
   # *** uop Buffer stuff ***
 
   @staticmethod
-  def new_buffer(device:str|tuple[str, ...], size:int, dtype:DType):
-    if isinstance(device, tuple): return UOp(Ops.BUFFER, dtype, (UOp.unique(), *[UOp(Ops.DEVICE, arg=d) for d in device]), size)
-    return UOp(Ops.BUFFER, dtype, (UOp.unique(), UOp(Ops.DEVICE, arg=device)), size)
+  def new_buffer(device:str|tuple[str, ...], size:int, dtype:DType): return UOp(Ops.BUFFER, dtype, (UOp.unique(), UOp(Ops.DEVICE, arg=device)), size)
   @property
   def device(self) -> str|tuple[str, ...]: return cast(str|tuple[str, ...], unwrap(self._device))
   @functools.cached_property
   def _device(self) -> Optional[str|tuple[str, ...]]:
     if self.op is Ops.DEVICE: return self.arg
-    if self.op is Ops.MULTI: return tuple(cast(str, x.device) for x in self.src)
-    if self.op in {Ops.COPY, Ops.BUFFER}: return self.src[1].device
+    if self.op in {Ops.COPY, Ops.BUFFER, Ops.ALLREDUCE}: return self.src[1].device
     return dsrcs[0]._device if len(dsrcs:=[x for x in self.src if x._device is not None]) != 0 else None
   @property
   def buf_uop(self) -> UOp:
@@ -539,19 +540,20 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
     assert self.op is Ops.ASSIGN, f"must be ASSIGN {self.op}"
     return self.src[0].base
   @property
-  def buffer(self) -> Buffer:
+  def buffer(self) -> Buffer|MultiBuffer:
     if self is not self.base:
       assert unwrap(self.st).contiguous, "VIEW only works here if it's contiguous"
       return self.src[0].buffer
     assert self.op is Ops.BUFFER, f"must be BUFFER {self.op}"
     if (cret:=buffers.get(self)) is not None: return cret
-    from tinygrad.device import Buffer
-    assert isinstance(self.device, str), f"buffer not supported on multi {self.device}"
-    buffers[self] = ret = Buffer(self.device, self.size, self.dtype if isinstance(self.dtype, ImageDType) else self.dtype.base)
-    ret.ref(1)
+    from tinygrad.device import Buffer, MultiBuffer
+    rdtype = self.dtype if isinstance(self.dtype, ImageDType) else self.dtype.base
+    if isinstance(self.device, tuple): ret = MultiBuffer(self.device, self.size, rdtype).ref(1)
+    else: ret = Buffer(self.device, self.size, rdtype).ref(1)
+    buffers[self] = ret
     return ret
   @property
-  def realized(self) -> Optional[Buffer]: return self.buffer if self.op is Ops.BUFFER and self.buffer.is_allocated() else None
+  def realized(self) -> Optional[Buffer|MultiBuffer]: return self.buffer if self.op is Ops.BUFFER and self.buffer.is_allocated() else None
   @property
   def is_realized(self) -> bool: return all(x.base.realized is not None for x in self.base.src) if self.base.op is Ops.MULTI else self.base.realized is not None
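`_shard` and `_unshard` are the core of "multi is O(1)": instead of materializing N shrunk UOps, one symbolic shrink/pad indexed by `_device_num` describes every shard. Evaluating the bounds at concrete device numbers shows they tile the full axis exactly; a standalone check of that arithmetic:

```python
def shard_bounds(total:int, dcount:int, dnum:int) -> tuple[int, int]:
  sz = total // dcount
  return (dnum*sz, dnum*sz + sz)               # the shrink in _shard

def unshard_pad(sz:int, dcount:int, dnum:int) -> tuple[int, int]:
  return (sz*dnum, sz*(dcount-1) - sz*dnum)    # the pad in _unshard

total, dcount = 256, 4
for dnum in range(dcount):
  lo, hi = shard_bounds(total, dcount, dnum)
  before, after = unshard_pad(hi-lo, dcount, dnum)
  # the pad puts each shard back at its own offset in the full-size tensor
  assert before == lo and before + (hi-lo) + after == total
```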
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 5f72c92d6e..d04a9712e7 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -50,9 +50,7 @@ def _apply_map_to_tensors(applied_map:dict[UOp, UOp], name:str|None=None) -> Non
 # this tracks the tensor.py METADATA
 _METADATA: contextvars.ContextVar[Optional[Metadata]] = contextvars.ContextVar("_METADATA", default=None)
 
-def _metaop(op, shape:tuple[sint,...], dtype:DType, device:str|tuple[str, ...], arg=None) -> UOp:
-  if isinstance(device, str): return UOp.metaop(op, shape, dtype, device, arg)
-  return UOp.multi(*[UOp.metaop(op, shape, dtype, d, arg) for d in device], axis=None)
+def _metaop(op, shape:tuple[sint,...], dtype:DType, device:str|tuple[str, ...], arg=None) -> UOp: return UOp.metaop(op, shape, dtype, device, arg)
 
 def _fromnp(x: 'np.ndarray') -> UOp:  # type: ignore [name-defined] # noqa: F821
   ret = UOp.new_buffer("NPY", x.size, _from_np_dtype(x.dtype))
@@ -286,7 +284,7 @@ class Tensor(SimpleMathTrait):
     # TODO: this is a hack for writing to DISK. remove with working assign
     if isinstance(self.device, str) and self.device.startswith("DISK"):
       if x.__class__ is not Tensor: x = Tensor(x, device="CPU", dtype=self.dtype)
-      self.contiguous().realize().lazydata.base.buffer.ensure_allocated().copyin(x._data())
+      cast(Buffer, self.contiguous().realize().lazydata.base.buffer).ensure_allocated().copyin(x._data())
       return self
     if x.__class__ is not Tensor: x = Tensor(x, device=self.device, dtype=self.dtype)
     if self.lazydata is x.lazydata: return self  # a self assign is a NOOP
@@ -304,7 +302,7 @@ class Tensor(SimpleMathTrait):
     """
     return Tensor(self.lazydata.detach(), device=self.device, requires_grad=False)
 
-  def _buffer(self) -> Buffer: return self.cast(self.dtype.base).contiguous().to("CPU").realize().lazydata.base.buffer
+  def _buffer(self) -> Buffer: return cast(Buffer, self.cast(self.dtype.base).contiguous().to("CPU").realize().lazydata.base.buffer)
   def _data(self) -> memoryview: return self._buffer().as_buffer()
 
   def data(self) -> memoryview:
@@ -403,7 +401,7 @@ class Tensor(SimpleMathTrait):
     """
     assert isinstance(self.device, str), "can't shard a MultiLazyBuffer"
     devices = tuple(Device.canonicalize(x) for x in devices)
-    mlb = self.lazydata.shard(devices, self._resolve_dim(axis) if axis is not None else None)
+    mlb = self.lazydata.shard(devices, self._resolve_dim(axis)) if axis is not None else self.lazydata.copy_to_device(devices)
     return Tensor(mlb, device=devices, requires_grad=self.requires_grad)
 
   def shard_(self, devices:tuple[str, ...], axis:int|None=None) -> Tensor:
@@ -437,10 +435,9 @@ class Tensor(SimpleMathTrait):
     """
     dtype, shape = to_dtype(dtype) if dtype is not None else dtypes.default_float, argfix(*shape)
     if not isinstance(size:=prod([x.vmax if isinstance(x, UOp) else x for x in shape]), int): raise ValueError(f"size must be int {size}")
-    if isinstance(device, tuple):
-      return Tensor(UOp.multi(*[UOp.new_buffer(Device.canonicalize(d), size, dtype).reshape(shape) for d in device], axis=None),
-                    device, dtype, **kwargs)
-    return Tensor(UOp.new_buffer(Device.canonicalize(device), size, dtype), device, dtype, **kwargs).reshape(shape)
+    # TODO: add test for multidevice tensor
+    device = tuple(Device.canonicalize(d) for d in device) if isinstance(device, tuple) else Device.canonicalize(device)
+    return Tensor(UOp.new_buffer(device, size, dtype), device, dtype, **kwargs).reshape(shape)
 
   @staticmethod
   def from_blob(ptr:int, shape:tuple[int, ...], **kwargs) -> Tensor:
@@ -452,7 +449,8 @@ class Tensor(SimpleMathTrait):
     Additionally, all other keyword arguments are passed to the constructor of the tensor.
     """
     r = Tensor.empty(*shape, **kwargs)
-    r.lazydata.buffer.allocate(external_ptr=ptr)
+    assert isinstance(r.device, str)
+    cast(Buffer, r.lazydata.buffer).allocate(external_ptr=ptr)
    return r
 
   @staticmethod
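End-to-end, the user-facing entry point keeps its old shape: `Tensor.shard` with an axis splits via the symbolic `_shard`, and with `axis=None` it now just copies the whole tensor to every device. A usage sketch assuming two devices are available (the `DEV:*` names are placeholders, not a guaranteed backend):

```python
from tinygrad import Tensor

t = Tensor.arange(256).reshape(4, 64)
split = t.shard(("DEV:0", "DEV:1"), axis=0)  # one symbolic shard, 2 rows per device
replica = t.shard(("DEV:0", "DEV:1"))        # axis=None: full copy on every device
print(split.shape, replica.shape)            # both report the full (4, 64) shape
```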