From c2504357afbc3caf77e06249f190781df70d3e4e Mon Sep 17 00:00:00 2001
From: nimlgen <138685161+nimlgen@users.noreply.github.com>
Date: Mon, 13 Jan 2025 23:53:13 +0300
Subject: [PATCH 01/29] am: lock to access dev (#8594)

* am: lock to access dev

* wording

* just works

* disable

---
 tinygrad/runtime/support/am/amdev.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/tinygrad/runtime/support/am/amdev.py b/tinygrad/runtime/support/am/amdev.py
index a070ba0b99..aaf4a4ab74 100644
--- a/tinygrad/runtime/support/am/amdev.py
+++ b/tinygrad/runtime/support/am/amdev.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
-import ctypes, collections, time, dataclasses, pathlib
-from tinygrad.helpers import to_mv, mv_address, getenv, round_up, DEBUG
+import ctypes, collections, time, dataclasses, pathlib, fcntl, os, signal
+from tinygrad.helpers import to_mv, mv_address, getenv, round_up, DEBUG, temp
 from tinygrad.runtime.autogen.am import am, mp_11_0, mp_13_0_0, nbio_4_3_0, mmhub_3_0_0, gc_11_0_0, osssys_6_0_0
 from tinygrad.runtime.support.allocator import TLSFAllocator
 from tinygrad.runtime.support.am.ip import AM_SOC21, AM_GMC, AM_IH, AM_PSP, AM_SMU, AM_GFX, AM_SDMA
@@ -255,6 +255,15 @@ class AMDev:
     self.pcidev, self.devfmt = pcidev, devfmt
     self.vram, self.doorbell64, self.mmio = vram_bar, doorbell_bar, mmio_bar

+    os.umask(0) # Set umask to 0 to allow creating files with 0666 permissions
+
+    # Avoid O_CREAT because we don't want to re-create/replace an existing file (triggers extra perms checks) when opening as non-owner.
+    if os.path.exists(lock_name:=temp(f"am_{self.devfmt}.lock")): self.lock_fd = os.open(lock_name, os.O_RDWR)
+    else: self.lock_fd = os.open(lock_name, os.O_RDWR | os.O_CREAT, 0o666)
+
+    try: fcntl.flock(self.lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+    except OSError: raise RuntimeError(f"Failed to open AM device {self.devfmt}. It's already in use.")
+
     self._run_discovery()
     self._build_regs()

@@ -284,14 +293,17 @@ class AMDev:
     self.sdma:AM_SDMA = AM_SDMA(self)

     if self.partial_boot and (self.reg("regCP_MEC_RS64_CNTL").read() & gc_11_0_0.CP_MEC_RS64_CNTL__MEC_HALT_MASK == 0):
-      print(f"am {self.devfmt}: MEC is active. Someone might be using the GPU? Issue a full reset.")
+      if DEBUG >= 2: print(f"am {self.devfmt}: MEC is active. Issue a full reset.")
       self.partial_boot = False

     if not self.partial_boot:
-      if self.psp.is_sos_alive(): self.smu.mode1_reset()
-      for ip in [self.soc21, self.gmc, self.ih, self.psp, self.smu]:
-        ip.init()
-        if DEBUG >= 2: print(f"am {self.devfmt}: {ip.__class__.__name__} initialized")
+      try: # do not interrupt the boot process
+        signal.signal(signal.SIGINT, signal.SIG_IGN)
+        if self.psp.is_sos_alive(): self.smu.mode1_reset()
+        for ip in [self.soc21, self.gmc, self.ih, self.psp, self.smu]:
+          ip.init()
+          if DEBUG >= 2: print(f"am {self.devfmt}: {ip.__class__.__name__} initialized")
+      finally: signal.signal(signal.SIGINT, signal.default_int_handler)

     # Booting done
     self.is_booting = False
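The locking scheme this patch adds is self-contained enough to try outside the driver. A minimal sketch, assuming a Unix host; the lock path and the acquire_dev_lock name are placeholders invented here, not part of the patch:

```python
import fcntl, os

def acquire_dev_lock(lock_name="/tmp/am_test.lock"):
  os.umask(0)  # let the 0o666 mode below survive so other users can open the lock
  # avoid O_CREAT on an existing file: re-creating it as a non-owner trips extra permission checks
  if os.path.exists(lock_name): fd = os.open(lock_name, os.O_RDWR)
  else: fd = os.open(lock_name, os.O_RDWR | os.O_CREAT, 0o666)
  try: fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)  # non-blocking: raise instead of waiting
  except OSError: raise RuntimeError(f"device already in use (lock file {lock_name})")
  return fd  # hold the fd open; the lock lives as long as the process does

fd = acquire_dev_lock()
print("lock acquired")
```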
From ae2229d7275e6a52568a4e7909a322fbac1755f8 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Mon, 13 Jan 2025 16:32:07 -0500
Subject: [PATCH 02/29] assert kernel buffer limit at compile time [pr] (#8595)

* remove the BUF_LIMIT assert

* skip the base one

---
 test/external/external_model_benchmark.py | 5 +++--
 test/test_linearizer.py | 3 +--
 test/test_schedule.py | 5 +++--
 tinygrad/engine/schedule.py | 6 ------
 4 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/test/external/external_model_benchmark.py b/test/external/external_model_benchmark.py
index 1ebe97641e..3ed7b82746 100644
--- a/test/external/external_model_benchmark.py
+++ b/test/external/external_model_benchmark.py
@@ -9,6 +9,7 @@ from onnx2torch import convert
 from extra.onnx import get_run_onnx
 from tinygrad.helpers import OSX, DEBUG, fetch
 from tinygrad import Tensor, Device
+from tinygrad.device import CompileError

 MODELS = {
   "resnet50": "https://github.com/onnx/models/raw/main/validated/vision/classification/resnet/model/resnet50-caffe2-v1-9.onnx",
@@ -72,10 +73,10 @@ def benchmark_model(m, devices, validate_outs=False):
       for _ in range(3): {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}
       benchmark(m, f"tinygrad_{device.lower()}_jit", lambda: {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}) # noqa: F821
       del inputs, tinygrad_model, tinygrad_jitted_model
-    except RuntimeError as e:
+    except CompileError as e:
       # TODO: we don't run the dm model on METAL for now
       if Device.DEFAULT == "METAL":
-        assert "buffer count limit" in str(e)
+        assert "no 'buffer' resource location available" in str(e)
         return
       else: raise e

diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index a4d8da48ca..cd274b6aaf 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -12,7 +12,6 @@ from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
 # from tinygrad.ops import Variable
 from tinygrad.tensor import Tensor, _to_np_dtype
-from tinygrad.engine.schedule import BUF_LIMIT
 from tinygrad.engine.realize import run_schedule, lower_schedule, CompiledRunner
 from tinygrad.helpers import prod, Context, getenv, CI, flatten, dedup, AMX
 from tinygrad.dtype import DType, dtypes
@@ -1701,7 +1700,7 @@ class TestHandCodedOpts(unittest.TestCase):
     # float4/other hcopt shouldn't upcast last axis, since we already have 7 upcast, and the last axis is not very contiguous
     assert k.upcasted == 1 and k.full_shape[-1] == 7

-  @unittest.skipIf((buf_max:=BUF_LIMIT.get(Device.DEFAULT)) is not None and buf_max <= 37, "this test uses too many bufs")
+  @unittest.skipIf(Device.DEFAULT == "METAL", "METAL can only run kernels with up to 32 buffers")
   def test_masked_upcast_wino(self):
     monster = Tensor.stack(*[Tensor.stack(*[Tensor.rand(16) for _ in range(6)]) for _ in range(6)])

diff --git a/test/test_schedule.py b/test/test_schedule.py
index d69e9dbaa6..e83ae13c28 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -16,7 +16,7 @@ from tinygrad.shape.view import View
 from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, view_supported_devices, symbolic_simple, merge_views
 from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, GlobalCounters, getenv, SPLIT_REDUCEOP, unwrap, prod, Context
 from tinygrad.codegen.kernel import verify_ast
-from tinygrad.engine.schedule import BUF_LIMIT, ScheduleItem, create_schedule_with_vars, view_right, view_left, remove_movement_ops
+from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars, view_right, view_left, remove_movement_ops
 from tinygrad.engine.realize import CompiledRunner, run_schedule, lower_schedule
 from extra.models.llama import precompute_freqs_cis
@@ -1363,8 +1363,9 @@ class TestSchedule(unittest.TestCase):
   @unittest.expectedFailure
   def test_conv2d_fused_half(self): _test_conv2d(5, dtype=dtypes.half)

+  @unittest.skip("splitting kernels exceeding device buffer count is not yet supported")
   def _test_buf_cnt(self, cnt:int, allowed:int):
-    if (m:=BUF_LIMIT.get(Device.DEFAULT)) is None or m != 32: self.skipTest(f"test needs a buf_max of 32 {Device.DEFAULT}")
+    #if (m:=BUF_LIMIT.get(Device.DEFAULT)) is None or m != 32: self.skipTest(f"test needs a buf_max of 32 {Device.DEFAULT}")
     alu = functools.reduce(lambda x,y: x+y, [Tensor.ones((1, 1)).contiguous().realize() for _ in range(cnt-1)])
     s = alu.schedule()
     assert len(s) == allowed

diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 34cea54d8d..26c789aeb2 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -13,8 +13,6 @@ from tinygrad.device import Buffer
 # creation can recurse a lot
 sys.setrecursionlimit(10000)

-BUF_LIMIT = {"METAL":32}
-
 # **** big graph spec

 tensor_uop_spec = PatternMatcher([
@@ -236,10 +234,6 @@ def schedule_uop(pre:UOp, ctx:ScheduleContext) -> ScheduleItem:
   sink = graph_rewrite(graph_rewrite(sink, view_left), view_right)
   # convert to AST
   sink = graph_rewrite(graph_rewrite(sink, to_si+check_preload if len(si_ctx.assigns) != 0 else to_si, si_ctx), append_bufs, si_ctx)
-  # assert buffer count limit
-  if (limit:=BUF_LIMIT.get(device:=si_ctx.bufs[0].device)) is not None and len(si_ctx.bufs) >= limit:
-    if DEBUG >= 3: print(sink)
-    raise RuntimeError(f"Kernel for {si_ctx.metadata} exceeded the {limit} buffer count limit for {device} with {len(si_ctx.bufs)} buffers.")
   # we also allow masked views. if it has a single view and it's equal when you shrink a contig, it's fine
   for ubuf,ops in si_ctx.assign_adj.items():
     if si_ctx.sinked.get(ubuf) is not None and not all((s:=x.st_arg).contiguous or (len(s.views) == 1 and (m:=s.views[0].mask) is not None \

From c4e33048c6c3a6896e8ec04a69f055b00f64366b Mon Sep 17 00:00:00 2001
From: chenyu
Date: Mon, 13 Jan 2025 20:13:44 -0500
Subject: [PATCH 03/29] test Tensor.clone has a different lazydata [pr] (#8600)

---
 test/test_tensor.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/test_tensor.py b/test/test_tensor.py
index 610fa7c6af..a5152caa95 100644
--- a/test/test_tensor.py
+++ b/test/test_tensor.py
@@ -646,11 +646,20 @@ class TestZeroShapeTensor(unittest.TestCase):

   def test_clone(self):
     a = Tensor.rand(16, 16).realize()
+    self.assertIsNot(a.lazydata, a.clone().lazydata)
     np.testing.assert_allclose(a.numpy(), a.clone().numpy())

     a = Tensor.rand(16, 16).mul(5.0).add(5.0)
+    self.assertIsNot(a.lazydata, a.clone().lazydata)
     np.testing.assert_allclose(a.numpy(), a.clone().numpy())

+  def test_clone_with_shrink(self):
+    a = Tensor.empty(16, 16)
+    self.assertIsNot(a.lazydata, a.clone().lazydata)
+
+    b = a.shrink(((2, 10), None))
+    self.assertIsNot(b.lazydata, b.clone().lazydata)
+
   def test_clone_with_grad(self):
     a = Tensor.rand(16, 16, requires_grad=True)
     a.mul(5.0).add(5.0).mean().backward()

From 227d96d7a30734ba110efa61846e3a1ebcc00d9e Mon Sep 17 00:00:00 2001
From: chenyu
Date: Mon, 13 Jan 2025 20:28:14 -0500
Subject: [PATCH 04/29] remove unused src from metaop [pr] (#8601)

---
 tinygrad/ops.py | 2 +-
 tinygrad/tensor.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index ee810a4c57..6a039d54bf 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -440,7 +440,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
   # *** from LazyBuffer ***

   @staticmethod
-  def metaop(op:Ops, shape:tuple[sint, ...], dtype:DType, device:str, arg=None, src:tuple[UOp, ...]=()) -> UOp:
+  def metaop(op:Ops, shape:tuple[sint, ...], dtype:DType, device:str, arg=None) -> UOp:
     from tinygrad.shape.shapetracker import ShapeTracker
     # Tensor const is CONST(VIEW(DEVICE)) -> RESHAPE -> EXPAND
     if op is Ops.CONST:

diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 250f262102..70fcac9274 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -41,9 +41,9 @@ class Function:

 import tinygrad.function as F

-def _metaop(op, shape:tuple[sint,...], dtype:DType, device:Union[str, tuple[str, ...]], arg=None, src:tuple[UOp, ...]=()):
-  if isinstance(device, str): return UOp.metaop(op, shape, dtype, device, arg, src)
-  return MultiLazyBuffer([UOp.metaop(op, shape, dtype, d, arg, src) for d in device], None)
+def _metaop(op, shape:tuple[sint,...], dtype:DType, device:Union[str, tuple[str, ...]], arg=None):
+  if isinstance(device, str): return UOp.metaop(op, shape, dtype, device, arg)
+  return MultiLazyBuffer([UOp.metaop(op, shape, dtype, d, arg) for d in device], None)

 def _from_np_dtype(npdtype:'np.dtype') -> DType: # type: ignore [name-defined] # noqa: F821
   import numpy as np
From d443e91d82598a862dfc44af868e3af99e3782de Mon Sep 17 00:00:00 2001
From: chenyu
Date: Mon, 13 Jan 2025 21:29:13 -0500
Subject: [PATCH 05/29] remove custom splits in Tensor.shard [pr] (#8602)

towards even split only
---
 test/test_multitensor.py | 18 ++++++++----------
 tinygrad/tensor.py | 19 ++++++++-----------
 2 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index a80014c084..67cb9757e1 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -460,18 +460,15 @@ class TestMultiTensor(unittest.TestCase):

   def test_uneven_shard_with_empty(self):
     N = 4
-    X = Tensor.rand(16, 1, 17).contiguous().realize()
+    X = Tensor.rand(16, 1, 3).contiguous().realize()
     np_x = X.numpy()
     devices = tuple(f"{Device.DEFAULT}:{i}" for i in range(N))

     # test empty shard
-    np.testing.assert_equal(X.shard(devices, 0, (2, 2, 12, 0)).numpy(), np_x)
+    np.testing.assert_equal(X.shard(devices, 0).numpy(), np_x)

     # test reshape with empty shard
-    np.testing.assert_equal(X.shard(devices, 0, (2, 2, 12, 0)).reshape(8, 1, 34).numpy(), np_x.reshape(8, 1, 34))
-
-    # test elementwise with empty shard
-    np.testing.assert_equal((X.shard(devices, 0, (2, 2, 12, 0)) + X.shard(devices, 0, (0, 0, 1, 15))).numpy(), np_x + np_x)
+    np.testing.assert_equal(X.shard(devices, 0).reshape(8, 1, 6).numpy(), np_x.reshape(8, 1, 6))

   def test_multiple_uneven_shard(self):
     N = 4
@@ -479,8 +476,8 @@ class TestMultiTensor(unittest.TestCase):
     Y = Tensor.rand(4, 1, 257).contiguous().realize()
     np_x, np_y = X.numpy(), Y.numpy()
     devices = tuple(f"{Device.DEFAULT}:{i}" for i in range(N))
-    X.shard_(devices, 2, (2, 38, 47, 170))
-    Y.shard_(devices, 2, (34, 53, 51, 119))
+    X.shard_(devices, 2)
+    Y.shard_(devices, 2)
     np.testing.assert_equal(X.numpy(), np_x)
     np.testing.assert_equal(Y.numpy(), np_y)
     np.testing.assert_equal((X + Y).numpy(), np_x + np_y)
@@ -534,6 +531,7 @@ class TestMultiTensor(unittest.TestCase):
     with self.assertRaises((AssertionError, ValueError)):
       t0.reshape((26*15,7))

+  @unittest.skip("no longer supports splits")
   def test_reshape_on_axis_uneven(self):
     def reshape_helper(t0, t, t_axis):
       np.testing.assert_allclose(t0.reshape(t.shape).numpy(), t.numpy())
@@ -606,7 +604,7 @@ class TestMultiTensor(unittest.TestCase):
     self.assertEqual(t.lazydata.axis, t2.lazydata.axis)

   def test_rand_like_uneven_shard(self):
-    t = Tensor.empty((4, 42, 15)).shard(devices_3, axis=1, splits=(14, 7, 21))
+    t = Tensor.empty((4, 42, 15)).shard(devices_3, axis=1)
     t2 = Tensor.rand_like(t)
     self.assertEqual(t.shape, t2.shape)
     self.assertEqual(t.device, t2.device)
@@ -657,7 +655,7 @@ class TestMultiTensor(unittest.TestCase):

   def test_dropout_on_uneven_shard_axis(self):
     with Tensor.train():
-      X = Tensor.ones(256).shard(devices_3, axis=0, splits=(100, 50, 106))
+      X = Tensor.ones(256).shard(devices_3, axis=0)
       output = X.dropout(0.5).numpy()
       unique, counts = np.unique(output, return_counts=True)
       assert set(unique) == {0, 2}, unique

diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 70fcac9274..6ca7bcb652 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -394,33 +394,30 @@ class Tensor(SimpleMathTrait):
       if self.grad is not None and real.grad is not None: self.grad.replace(real.grad)
       return self.replace(real)

-  def shard(self, devices:tuple[str, ...], axis:Optional[int]=None, splits:Optional[tuple[int, ...]]=None) -> Tensor:
+  def shard(self, devices:tuple[str, ...], axis:Optional[int]=None) -> Tensor:
     """
-    Shards the tensor across the given devices. Optionally specify which axis to shard on, and how to split it across devices.
+    Shards the tensor across the given devices. Optionally specify which axis to shard on.

     ```python exec="true" source="above" session="tensor" result="python"
     t = Tensor.empty(2, 3)
-    print(t.shard((t.device, t.device), axis=1, splits=(2, 1)).lazydata)
+    print(t.shard((t.device, t.device), axis=1).lazydata)
     ```
-
     """
     assert isinstance(self.lazydata, UOp), "can't shard a MultiLazyBuffer"
     devices, bounds = tuple(Device.canonicalize(x) for x in devices), None
     if axis is not None:
       axis = self._resolve_dim(axis)
-      if splits is None:
-        if not isinstance(total:=self.shape[axis], int): raise RuntimeError(f"cannot shard symbolic shape {self.shape=}, {axis=}")
-        sz = ceildiv(total, len(devices))
-        splits = tuple([max(0, min(sz, total - sz*i)) for i in range(len(devices))])
-      assert sum(splits) == self.shape[axis], "specified splits do not sum up to axis shape"
+      if not isinstance(total:=self.shape[axis], int): raise RuntimeError(f"cannot shard symbolic shape {self.shape=}, {axis=}")
+      sz = ceildiv(total, len(devices))
+      splits = tuple([max(0, min(sz, total - sz*i)) for i in range(len(devices))])
       bounds = tuple(itertools.pairwise(itertools.accumulate(splits, initial=0)))
     return Tensor(MultiLazyBuffer.from_sharded(self.lazydata, devices, axis, bounds), device=devices, requires_grad=self.requires_grad)

-  def shard_(self, devices:tuple[str, ...], axis:Optional[int]=None, splits:Optional[tuple[int, ...]]=None):
+  def shard_(self, devices:tuple[str, ...], axis:Optional[int]=None):
     """
     Shards the tensor across the given devices in place.
     """
-    return self.replace(self.shard(devices, axis, splits))
+    return self.replace(self.shard(devices, axis))

   @staticmethod
   def from_uop(y:UOp, **kwargs) -> Tensor:
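The split computation this patch inlines is easy to sanity-check on its own. A small sketch of the even-split rule shard() now always applies; ceildiv here is a stand-in for tinygrad.helpers.ceildiv:

```python
import itertools

def ceildiv(a, b): return -(a // -b)  # ceiling division for positive ints

def even_splits(total, n):
  # ceildiv-sized chunks; trailing shards shrink, possibly to zero
  sz = ceildiv(total, n)
  return tuple(max(0, min(sz, total - sz*i)) for i in range(n))

assert even_splits(16, 4) == (4, 4, 4, 4)
assert even_splits(3, 4) == (1, 1, 1, 0)      # a trailing shard can be empty
assert even_splits(257, 4) == (65, 65, 65, 62)
```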
From 05e54f00d38c7a13df35359b7432488fb53bc9fb Mon Sep 17 00:00:00 2001
From: chenyu
Date: Mon, 13 Jan 2025 23:40:05 -0500
Subject: [PATCH 06/29] remove bounds from MultiLazyBuffer.from_sharded [pr] (#8603)

without a custom bound, the bound is uniquely determined by shape and axis
---
 tinygrad/multi.py | 10 +++++++---
 tinygrad/tensor.py | 13 ++++---------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/tinygrad/multi.py b/tinygrad/multi.py
index 20f49b3877..97a17fe714 100644
--- a/tinygrad/multi.py
+++ b/tinygrad/multi.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 import functools, itertools, operator
-from tinygrad.helpers import all_same, all_int, dedup, prod, DEBUG, RING, getenv
+from tinygrad.helpers import all_same, all_int, dedup, prod, DEBUG, RING, getenv, ceildiv
 from tinygrad.dtype import DType
 from tinygrad.ops import Ops, MathTrait, UOp, sint
@@ -62,8 +62,12 @@ class MultiLazyBuffer(MathTrait):
   def __repr__(self): return f""

   @staticmethod
-  def from_sharded(lb:UOp, devices:tuple[str, ...], axis:int|None, bounds:tuple[tuple[int, int], ...]|None):
-    assert (axis is None) == (bounds is None), "must specify bounds iff axis is specified"
+  def from_sharded(lb:UOp, devices:tuple[str, ...], axis:int|None):
+    if axis is not None:
+      if not isinstance(total:=lb.shape[axis], int): raise RuntimeError(f"cannot shard symbolic shape {lb.shape=}, {axis=}")
+      sz = ceildiv(total, len(devices))
+      splits = tuple([max(0, min(sz, total - sz*i)) for i in range(len(devices))])
+      bounds = tuple(itertools.pairwise(itertools.accumulate(splits, initial=0)))
     lbs = [lb] * len(devices)
     sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(to_sharded(lbs, axis, bounds) if axis is not None and bounds is not None else lbs, devices)]
     # NOTE: this contiguous is making it impossible for the scheduler to do late const folding

diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 6ca7bcb652..60b9735bd9 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -179,7 +179,7 @@ class Tensor(SimpleMathTrait):
       # data might be on a different device
       if isinstance(device, str): self.lazydata:Union[UOp, MultiLazyBuffer] = data if data.device == device else data.copy_to_device(device)
       # if device is a tuple, we should have/construct a MultiLazyBuffer
-      elif isinstance(data, UOp): self.lazydata = MultiLazyBuffer.from_sharded(data, device, None, None)
+      elif isinstance(data, UOp): self.lazydata = MultiLazyBuffer.from_sharded(data, device, None)
       else:
         assert data.device == device, f"MultiLazyBuffer device mismatch, {data.device} != {device}"
         self.lazydata = data
@@ -404,14 +404,9 @@ class Tensor(SimpleMathTrait):
     ```
     """
     assert isinstance(self.lazydata, UOp), "can't shard a MultiLazyBuffer"
-    devices, bounds = tuple(Device.canonicalize(x) for x in devices), None
-    if axis is not None:
-      axis = self._resolve_dim(axis)
-      if not isinstance(total:=self.shape[axis], int): raise RuntimeError(f"cannot shard symbolic shape {self.shape=}, {axis=}")
-      sz = ceildiv(total, len(devices))
-      splits = tuple([max(0, min(sz, total - sz*i)) for i in range(len(devices))])
-      bounds = tuple(itertools.pairwise(itertools.accumulate(splits, initial=0)))
-    return Tensor(MultiLazyBuffer.from_sharded(self.lazydata, devices, axis, bounds), device=devices, requires_grad=self.requires_grad)
+    devices = tuple(Device.canonicalize(x) for x in devices)
+    if axis is not None: axis = self._resolve_dim(axis)
+    return Tensor(MultiLazyBuffer.from_sharded(self.lazydata, devices, axis), device=devices, requires_grad=self.requires_grad)

   def shard_(self, devices:tuple[str, ...], axis:Optional[int]=None):
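The bounds that from_sharded now derives from those splits are just cumulative offsets paired into (start, end) ranges. A quick sketch, assuming Python 3.10+ for itertools.pairwise:

```python
import itertools

splits = (65, 65, 65, 62)  # e.g. a size-257 axis on 4 devices
bounds = tuple(itertools.pairwise(itertools.accumulate(splits, initial=0)))
assert bounds == ((0, 65), (65, 130), (130, 195), (195, 257))
```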
From 863abc71401c496fa34ca0ab49b0e68a9f4ffff2 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Tue, 14 Jan 2025 03:01:59 -0500
Subject: [PATCH 07/29] scheduling graph_rewrite prereqs for BLOCK in ASSIGN (#8598)

* remove the BUF_LIMIT assert

* skip the base one

* work

* work

* good error

* ok comment

* shorter check

---
 .../external/process_replay/process_replay.py | 6 +-
 tinygrad/engine/schedule.py | 57 +++++++++----------
 2 files changed, 29 insertions(+), 34 deletions(-)

diff --git a/test/external/process_replay/process_replay.py b/test/external/process_replay/process_replay.py
index b5432bb6a8..1bc597c6b9 100755
--- a/test/external/process_replay/process_replay.py
+++ b/test/external/process_replay/process_replay.py
@@ -2,7 +2,7 @@
 # compare kernels created by HEAD against master
 from collections import defaultdict
 import os, multiprocessing, logging, pickle, sqlite3, difflib, functools, warnings
-from typing import Callable, List, Set, Tuple, Union, cast
+from typing import Callable, List, Tuple, Union, cast
 from tinygrad.helpers import VERSION, Context, ContextVar, colored, db_connection, getenv, tqdm
 from tinygrad.engine.schedule import ScheduleContext, schedule_uop
 from tinygrad.codegen.kernel import Kernel, Opt
@@ -30,9 +30,9 @@ class ProcessReplayWarning(Warning): pass

 # *** recreators

-def recreate_sched(ast:UOp, assigns:Set[UOp]) -> UOp:
+def recreate_sched(ast:UOp) -> UOp:
   # NOTE: process replay isn't meant to actually schedule anything
-  return schedule_uop(ast, ScheduleContext(assigns=assigns, tensor_uops=defaultdict(list))).ast
+  return schedule_uop(ast, ScheduleContext(tensor_uops=defaultdict(list))).ast
 def recreate_kernel(ast:UOp, opts:Renderer, applied_opts:List[Opt], name:str) -> str:
   k = Kernel(ast, opts=opts)
   for opt in applied_opts: k.apply_opt(opt)

diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 26c789aeb2..5803e6c655 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -183,14 +183,9 @@ view_right = merge_views+PatternMatcher([

 @dataclass(frozen=True)
 class ScheduleItemContext:
-  ops_metadata: dict[UOp, Metadata]
-  assigns: set[UOp]
   var_vals: dict[Variable, int]
-  sinked: dict[UOp, UOp]
   sts: set[ShapeTracker] = field(default_factory=set)
   bufs: list[UOp] = field(default_factory=list)
-  metadata: set[Metadata] = field(default_factory=set)
-  assign_adj: dict[UOp, list[UOp]] = field(default_factory=dict)

 def _append_st_vars(ctx:ScheduleItemContext, x:UOp) -> UOp|None:
   if (st:=unwrap(x.st)) in ctx.sts: return None
@@ -204,47 +199,47 @@ def _append_buf(ctx:ScheduleItemContext, x:UOp) -> UOp:
   return UOp(Ops.DEFINE_GLOBAL, x.dtype.ptr(size=x.size), (), len(ctx.bufs)-1)
 append_bufs = PatternMatcher([(UPat(Ops.BUFFER, name="x"), _append_buf)])

-def _append_preload(ctx:ScheduleItemContext, x:UOp, b:UOp) -> UOp:
-  (adj_loads:=ctx.assign_adj.setdefault(b, [])).append(x)
-  if not all_same([x.op for x in adj_loads]): raise RuntimeError(f"Detected cycle when fusing {adj_loads}. Can only fuse PRELOAD or LOAD of {b}")
-  return x.replace(op=Ops.LOAD)
-check_preload = PatternMatcher([(UPat(Ops.PRELOAD, src=(UPat.var("b"), UPat()), name="x"), _append_preload),])
-
 to_si = PatternMatcher([
   (UPat(Ops.VIEW, name="x"), _append_st_vars),
   (UPat(Ops.SINK, src=(UPat.store(UPat.var("b"), UPat(), UPat(GroupOp.Meta, name="x")),)), lambda b,x: x.replace(src=(b, *x.src))),
   # don't need contiguous or assign anymore
   (UPat(Ops.CONTIGUOUS, src=(UPat.var("x"),)), lambda x: x),
   (UPat(Ops.ASSIGN, src=(UPat(), UPat.var("x"),)), lambda x: x),
+  # PRELOAD becomes LOAD
+  (UPat(Ops.PRELOAD, name="root"), lambda root:root.replace(op=Ops.LOAD)),
 ])

-add_metadata = PatternMatcher([(UPat(tuple(Ops), name="x"), lambda ctx,x: None if (m:=ctx.ops_metadata.get(x)) is None else ctx.metadata.add(m)),])
 add_assign_adjacents = PatternMatcher([(UPat.load(UPat.var("b"), UPat(), name="x"), lambda ctx,b,x: ctx.assign_adj.setdefault(b, []).append(x)
   if b in ctx.assigns else None)])

-# late folding for multi output kernels
-multioutput = PatternMatcher([(UPat.load(UPat.var("b"), UPat()), lambda ctx,b: ctx.sinked.get(b)),])
+# LOAD(BUFFER) -> the STORE value if we're doing the STORE in the same kernel
+multioutput = PatternMatcher([(UPat.load(UPat.var("b"), UPat()), lambda ctx,b: ctx.get(b)),])

 def schedule_uop(pre:UOp, ctx:ScheduleContext) -> ScheduleItem:
-  # create the ast context
-  si_ctx = ScheduleItemContext(ctx.ops_metadata, ctx.assigns, ctx.var_vals, {x.buf_uop:x.src[2] for x in pre.src})
-  create_ctx = add_metadata if len(si_ctx.assigns) == 0 else add_metadata+add_assign_adjacents
-  sink = graph_rewrite(pre, create_ctx if len(si_ctx.sinked) == 1 else multioutput+create_ctx, si_ctx)
-  # do movement ops
-  sink = graph_rewrite(graph_rewrite(sink, view_left), view_right)
-  # convert to AST
-  sink = graph_rewrite(graph_rewrite(sink, to_si+check_preload if len(si_ctx.assigns) != 0 else to_si, si_ctx), append_bufs, si_ctx)
-  # we also allow masked views. if it has a single view and it's equal when you shrink a contig, it's fine
-  for ubuf,ops in si_ctx.assign_adj.items():
-    if si_ctx.sinked.get(ubuf) is not None and not all((s:=x.st_arg).contiguous or (len(s.views) == 1 and (m:=s.views[0].mask) is not None \
-        and ShapeTracker.from_shape(s.shape).shrink(m) == s.shrink(m)) for x in ops):
-      raise RuntimeError("self operand of augmented assign must be contiguous.\nhelp: consider using .contiguous():\n"
-                         +colored(" - a += a.T\n", "red")+colored(" + a += a.T.contiguous()", "green"))
+  # remove movement ops + substitute LOAD of fused STORE with just the value
+  sink = graph_rewrite(graph_rewrite(pre, multioutput+view_left, store_bufs:={x.buf_uop:x.src[2] for x in pre.src}), view_right)
+  # remove extra uops from SINK + substitute BUFFER with DEFINE_GLOBAL
+  ast = graph_rewrite(sink, to_si+append_bufs, si_ctx:=ScheduleItemContext(ctx.var_vals))
   # capture process replay
   if CAPTURE_PROCESS_REPLAY:
-    with Context(PICKLE_BUFFERS=0): PROCESS_REPLAY_CAPTURE[str(pre.key)] = pickle.dumps((pre, si_ctx.assigns, ContextVar._cache, sink))
-  return ScheduleItem(sink, tuple(u.buffer for u in si_ctx.bufs if u.size != 0), tuple(si_ctx.metadata),
-                      tuple(ubuf for ubuf,ops in si_ctx.assign_adj.items() if any(x.op is Ops.PRELOAD for x in ops)))
+    with Context(PICKLE_BUFFERS=0): PROCESS_REPLAY_CAPTURE[str(pre.key)] = pickle.dumps((pre, ContextVar._cache, sink))
+  # deal with ASSIGN
+  assign_preloads: list[UOp] = []
+  if len(ctx.assigns) != 0:
+    for x in list(sink.toposort)[::-1]:
+      # we only allow a kernel to depend on either the before ASSIGN or after ASSIGN version of a BUFFER
+      if x.op is Ops.LOAD and x.buf_uop in assign_preloads: raise RuntimeError("cycle detected in graph")
+      # PRELOAD tells the toposort this kernel should run before ASSIGN
+      if x.op is Ops.PRELOAD:
+        assign_preloads.append(x.buf_uop)
+        # if this kernel also assigns to the buffer, we only allow either contiguous or masked views for the LOAD
+        if x.buf_uop in store_bufs and not (st:=x.st_arg).contiguous:
+          # if it has a single view and it's equal when you shrink a contig, it's fine
+          if len(st.views) != 1 or (mask:=st.views[0].mask) is None or ShapeTracker.from_shape(st.shape).shrink(mask) != st.shrink(mask):
+            raise RuntimeError("self operand of augmented assign must be contiguous.\nhelp: consider using .contiguous():\n"
+                               +colored(" - a += a.T\n", "red")+colored(" + a += a.T.contiguous()", "green"))
+  return ScheduleItem(ast, tuple(u.buffer for u in si_ctx.bufs if u.size != 0),
+                      tuple(dedup(m for x in pre.toposort if (m:=ctx.ops_metadata.get(x)) is not None)), tuple(dedup(assign_preloads)))

 PROCESS_REPLAY_CAPTURE: dict[str, bytes] = {}
 if CAPTURE_PROCESS_REPLAY:
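The new ASSIGN handling walks the kernel in reverse toposort order and treats a LOAD of an already-PRELOADed buffer as a cycle. A toy sketch of that scan, with (kind, buffer) tuples standing in for real UOps:

```python
def check_assign(ops_reverse_toposort):
  assign_preloads = []
  for kind, buf in ops_reverse_toposort:
    # a LOAD of a buffer we already PRELOADed would read both the pre- and post-assign version
    if kind == "LOAD" and buf in assign_preloads: raise RuntimeError("cycle detected in graph")
    if kind == "PRELOAD": assign_preloads.append(buf)
  return assign_preloads

assert check_assign([("PRELOAD", "b0"), ("LOAD", "b1")]) == ["b0"]
try: check_assign([("PRELOAD", "b0"), ("LOAD", "b0")])
except RuntimeError as e: assert "cycle" in str(e)
```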
From 5aab2806f0dab1f3f6087ab5f2ab08f38c06d465 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Tue, 14 Jan 2025 05:09:56 -0500
Subject: [PATCH 08/29] rename to test_tensor_uop + use upats for asserting [pr] (#8604)

* rename to test_tensor_uop + use upats for asserting [pr]

* fix pr

---
 test/{test_lazybuffer.py => test_tensor_uop.py} | 13 +++++++------
 tinygrad/engine/schedule.py | 6 +++---
 2 files changed, 10 insertions(+), 9 deletions(-)
 rename test/{test_lazybuffer.py => test_tensor_uop.py} (90%)

diff --git a/test/test_lazybuffer.py b/test/test_tensor_uop.py
similarity index 90%
rename from test/test_lazybuffer.py
rename to test/test_tensor_uop.py
index 7d702a82be..a3b01604e5 100644
--- a/test/test_lazybuffer.py
+++ b/test/test_tensor_uop.py
@@ -3,9 +3,9 @@
 import numpy as np
 import unittest
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.engine.realize import run_schedule
-from tinygrad.ops import Ops, UOp
+from tinygrad.ops import Ops, UOp, UPat

-class TestLazyBuffer(unittest.TestCase):
+class TestTensorUOp(unittest.TestCase):
   def test_fromcpu_shape_tracker(self):
     def helper(a: np.ndarray):
       print(a.shape, a.strides, a.flags.c_contiguous)
@@ -68,7 +68,7 @@ class TestTensorUOp(unittest.TestCase):
     assert lb.const_like(1).const_arg == 1.0
     assert type(lb.const_like(1).const_arg) is float

-  def test_forced_realized_alu(self):
+  def test_contiguous_alu(self):
     a = Tensor.randn(2, 2).realize()
     b = Tensor.randn(2, 2).realize()
     add = (a+b).contiguous()
@@ -84,13 +84,14 @@ class TestTensorUOp(unittest.TestCase):
     sched = empty.schedule()
     self.assertEqual(len(sched), 0)

+reduce_kernel = UPat(Ops.SINK, src=(UPat(Ops.STORE, src=(UPat(), UPat(), UPat(Ops.REDUCE_AXIS)))))
 class TestReduceOp(unittest.TestCase):
   def test_no_split_reduce_kernel(self):
     a = Tensor.rand(4, 4).realize()
     a = a.sum()
     sched = a.schedule()
     assert len(sched) == 1
-    self.assertIs(sched[0].ast.src[0].src[2].op, Ops.REDUCE_AXIS)
+    assert reduce_kernel.match(sched[0].ast, {})

   def test_split_reduce_kernel_dim0(self):
     a = Tensor.rand(256, 255).realize()
@@ -98,7 +99,7 @@ class TestReduceOp(unittest.TestCase):
     sched = a.schedule()
     assert len(sched) == 2
     for s in sched:
-      self.assertIs(s.ast.src[0].src[2].op, Ops.REDUCE_AXIS)
+      assert reduce_kernel.match(s.ast, {})

   def test_split_reduce_kernel_dim1(self):
     a = Tensor.rand(255, 256).realize()
@@ -106,7 +107,7 @@ class TestReduceOp(unittest.TestCase):
     sched = a.schedule()
     assert len(sched) == 2
     for s in sched:
-      self.assertIs(s.ast.src[0].src[2].op, Ops.REDUCE_AXIS)
+      assert reduce_kernel.match(s.ast, {})

 if __name__ == "__main__":
   unittest.main()

diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 5803e6c655..972a7a2b18 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -220,9 +220,6 @@ def schedule_uop(pre:UOp, ctx:ScheduleContext) -> ScheduleItem:
   sink = graph_rewrite(graph_rewrite(pre, multioutput+view_left, store_bufs:={x.buf_uop:x.src[2] for x in pre.src}), view_right)
   # remove extra uops from SINK + substitute BUFFER with DEFINE_GLOBAL
   ast = graph_rewrite(sink, to_si+append_bufs, si_ctx:=ScheduleItemContext(ctx.var_vals))
-  # capture process replay
-  if CAPTURE_PROCESS_REPLAY:
-    with Context(PICKLE_BUFFERS=0): PROCESS_REPLAY_CAPTURE[str(pre.key)] = pickle.dumps((pre, ContextVar._cache, sink))
   # deal with ASSIGN
   assign_preloads: list[UOp] = []
   if len(ctx.assigns) != 0:
@@ -238,6 +235,9 @@ def schedule_uop(pre:UOp, ctx:ScheduleContext) -> ScheduleItem:
           if len(st.views) != 1 or (mask:=st.views[0].mask) is None or ShapeTracker.from_shape(st.shape).shrink(mask) != st.shrink(mask):
             raise RuntimeError("self operand of augmented assign must be contiguous.\nhelp: consider using .contiguous():\n"
                                +colored(" - a += a.T\n", "red")+colored(" + a += a.T.contiguous()", "green"))
+  # capture process replay
+  if CAPTURE_PROCESS_REPLAY:
+    with Context(PICKLE_BUFFERS=0): PROCESS_REPLAY_CAPTURE[str(pre.key)] = pickle.dumps((pre, ContextVar._cache, ast))
   return ScheduleItem(ast, tuple(u.buffer for u in si_ctx.bufs if u.size != 0),
                       tuple(dedup(m for x in pre.toposort if (m:=ctx.ops_metadata.get(x)) is not None)), tuple(dedup(assign_preloads)))

From 97ec564b03ec5f7253c9f247b854de5d964d1ccc Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Tue, 14 Jan 2025 07:47:17 -0500
Subject: [PATCH 09/29] noop changes from the block_assign branch [pr] (#8606)

---
 tinygrad/engine/schedule.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 972a7a2b18..d4f12e3975 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -197,10 +197,13 @@ def _append_st_vars(ctx:ScheduleItemContext, x:UOp) -> UOp|None:
 def _append_buf(ctx:ScheduleItemContext, x:UOp) -> UOp:
   ctx.bufs.append(x)
   return UOp(Ops.DEFINE_GLOBAL, x.dtype.ptr(size=x.size), (), len(ctx.bufs)-1)
-append_bufs = PatternMatcher([(UPat(Ops.BUFFER, name="x"), _append_buf)])

 to_si = PatternMatcher([
+  # BUFFER -> DEFINE_GLOBAL
+  (UPat(Ops.BUFFER, name="x"), _append_buf),
+  # simplify and unbind the final VIEWs
   (UPat(Ops.VIEW, name="x"), _append_st_vars),
+  # don't need SINK on COPY or BUFFER_VIEW
   (UPat(Ops.SINK, src=(UPat.store(UPat.var("b"), UPat(), UPat(GroupOp.Meta, name="x")),)), lambda b,x: x.replace(src=(b, *x.src))),
   # don't need contiguous or assign anymore
   (UPat(Ops.CONTIGUOUS, src=(UPat.var("x"),)), lambda x: x),
@@ -209,9 +212,6 @@ to_si = PatternMatcher([
   (UPat(Ops.PRELOAD, name="root"), lambda root:root.replace(op=Ops.LOAD)),
 ])

-add_assign_adjacents = PatternMatcher([(UPat.load(UPat.var("b"), UPat(), name="x"), lambda ctx,b,x: ctx.assign_adj.setdefault(b, []).append(x)
-  if b in ctx.assigns else None)])
-
 # LOAD(BUFFER) -> the STORE value if we're doing the STORE in the same kernel
 multioutput = PatternMatcher([(UPat.load(UPat.var("b"), UPat()), lambda ctx,b: ctx.get(b)),])

 def schedule_uop(pre:UOp, ctx:ScheduleContext) -> ScheduleItem:
   # remove movement ops + substitute LOAD of fused STORE with just the value
   sink = graph_rewrite(graph_rewrite(pre, multioutput+view_left, store_bufs:={x.buf_uop:x.src[2] for x in pre.src}), view_right)
   # remove extra uops from SINK + substitute BUFFER with DEFINE_GLOBAL
-  ast = graph_rewrite(sink, to_si+append_bufs, si_ctx:=ScheduleItemContext(ctx.var_vals))
+  ast = graph_rewrite(sink, to_si, si_ctx:=ScheduleItemContext(ctx.var_vals))
   # deal with ASSIGN
   assign_preloads: list[UOp] = []
   if len(ctx.assigns) != 0:
@@ -540,12 +540,13 @@ def create_schedule_with_vars(outs:list[UOp], skip_check:bool=not __debug__) ->
   # preschedule realize groups
   prescheduled: list[ScheduleItem] = []
   for store_uops in store_groups:
-    if len(stores:=[ctx.realizes[u] for u in store_uops if ctx.realizes[u].op is Ops.STORE]) != 0:
-      prescheduled.append(schedule_uop(UOp.sink(*stores), ctx))
-      # can only schedule once
-      for buf_uop in store_uops:
-        for luop in ctx.tensor_uops[buf_uop]: ctx.becomes_map[luop] = buf_uop.view(unwrap(luop.st))
-  # do BFS
+    if len(stores:=[ctx.realizes[u] for u in store_uops if ctx.realizes[u].op is Ops.STORE]) == 0: continue
+    prescheduled.append(schedule_uop(UOp.sink(*stores), ctx))
+    # can only schedule once
+    for buf_uop in store_uops:
+      for luop in ctx.tensor_uops[buf_uop]: ctx.becomes_map[luop] = buf_uop.view(unwrap(luop.st))
+
+  # add kernel children
   schedule_targets = {out:si for si in prescheduled for out in si.outputs}
   graph: defaultdict[ScheduleItem, list[ScheduleItem]] = defaultdict(list)
   in_degree: defaultdict[ScheduleItem, int] = defaultdict(int)
@@ -560,6 +561,8 @@ def create_schedule_with_vars(outs:list[UOp], skip_check:bool=not __debug__) ->
     for x in scheduled_parents:
       graph[x].append(si)
       in_degree[si] += 1
+
+  # do BFS
   queue = deque(si for si in prescheduled if in_degree[si] == 0)
   schedule: list[ScheduleItem] = []
   while queue:
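The BFS this patch labels in create_schedule_with_vars is Kahn's algorithm over the kernel-dependency graph. A distilled sketch with strings in place of ScheduleItems:

```python
from collections import defaultdict, deque

def bfs_schedule(graph, in_degree, items):
  # repeatedly pop items whose in-degree has reached zero
  queue = deque(x for x in items if in_degree[x] == 0)
  order = []
  while queue:
    x = queue.popleft()
    order.append(x)
    for child in graph[x]:
      in_degree[child] -= 1
      if in_degree[child] == 0: queue.append(child)
  return order

g = defaultdict(list, {"a": ["b", "c"], "b": ["c"]})
deg = defaultdict(int, {"b": 1, "c": 2})
assert bfs_schedule(g, deg, ["a", "b", "c"]) == ["a", "b", "c"]
```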
From 1ff6862a3d1cc63bbc551bed513a8557a4f7f576 Mon Sep 17 00:00:00 2001
From: nimlgen <138685161+nimlgen@users.noreply.github.com>
Date: Tue, 14 Jan 2025 15:55:23 +0300
Subject: [PATCH 10/29] ci: sleep a bit to let the driver unload the prev pid (#8605)

---
 .github/workflows/benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index be6c4ace11..1d5236d835 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -372,7 +372,7 @@ jobs:
     #- name: Fuzz Padded Tensor Core GEMM
     #  run: HSA=1 M_START=12 M_STOP=20 M_STEP=1 N_START=12 N_STOP=20 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 DEBUG=2 python3 ./extra/gemm/fuzz_matmul.py
     - name: Remove amdgpu
-      run: sudo rmmod amdgpu
+      run: sleep 5 && sudo rmmod amdgpu # sleep a bit to let the driver unload the prev pid.
     - name: Run Stable Diffusion
       run: AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
     - name: Run SDXL

From 4057b98f7ff599310311add7057126940698a8b6 Mon Sep 17 00:00:00 2001
From: ignaciosica
Date: Tue, 14 Jan 2025 13:27:05 -0300
Subject: [PATCH 11/29] rename i and j into k and row/col (#8607)

---
 tinygrad/runtime/ops_python.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py
index 650f6a8e12..618a356af5 100644
--- a/tinygrad/runtime/ops_python.py
+++ b/tinygrad/runtime/ops_python.py
@@ -138,31 +138,31 @@ class PythonProgram:
           ul[i] = wmma_helper(32, 8, 2, 2, 2, a_b_elem, a_b_elem, c_map)
         elif arg[4] == "AMD":
           # A (16 elements on 32 threads): col major, lane 16-32 == lane 0-15
-          def a_elem(x, i, j, goff):
-            assert x[i][goff+j] == x[i][goff+j+16], "warp elements not duplicated properly across lanes"
-            return x[i][goff+j]
+          def a_elem(x, k, row, goff):
+            assert x[k][goff+row] == x[k][goff+row+16], "warp elements not duplicated properly across lanes"
+            return x[k][goff+row]
           # B (16 elements on 32 threads): row major, lane 16-32 == lane 0-15
-          def b_elem(x, i, j, goff): return a_elem(x, j, i, goff) # pylint: disable=arguments-out-of-order
+          def b_elem(x, col, k, goff): return a_elem(x, k, col, goff) # pylint: disable=arguments-out-of-order
           def c_map(lane, elem): return (lane%16, lane//16+elem*2)
          # (i, j), C, D (8 elements on 32 threads): row major
           ul[i] = wmma_helper(32, 16, 16, 16, 8, a_elem, b_elem, c_map)
         elif arg[4] == "CUDA":
           # A (8 elements on 32 threads)
-          def a_elem(x, i, j, goff): return x[(i%2)+(j//8)*2+(i//8)*4][goff+((i//2)%4)+(j%8)*4]
+          def a_elem(x, k, row, goff): return x[(k%2)+(row//8)*2+(k//8)*4][goff+((k//2)%4)+(row%8)*4]
           # B (4 elements on 32 threads)
-          def b_elem(x, i, j, goff): return x[(j%2)+(j//8)*2][goff+(j//2)%4+(i)*4]
+          def b_elem(x, col, k, goff): return x[(k%2)+(k//8)*2][goff+(k//2)%4+(col)*4]
           # (i, j), C, D (4 elements on 32 threads)
           def c_map(lane, elem): return ((elem%2)+(lane%4)*2, (lane//4)+(elem//2)*8)
           ul[i] = wmma_helper(32, 16, 8, 4, 4, a_elem, b_elem, c_map)
         elif arg[4] == "INTEL":
           # A (16 elements on 8 threads)
-          def a_elem(x, i, j, goff): return x[i%2+j*2][goff+i//2]
+          def a_elem(x, k, row, goff): return x[k%2+row*2][goff+k//2]
           # B (16 elements on 8 threads)
-          def b_elem(x, i, j, goff): return x[j][goff+i]
+          def b_elem(x, col, k, goff): return x[k][goff+col]
           # C, D (8 elements on 8 threads)
           def c_map(lane, elem): return (lane, elem)
           ul[i] = wmma_helper(8, 16, 16, 16, 8, a_elem, b_elem, c_map)
         elif arg[4] == "CLANG":
-          def elem(x, i, j, _): return x[i+j][0]
+          def elem(x, col, row, _): return x[col+row][0] # k is always 0
           def c_map(_, elem): return (elem%16, elem//16)
           ul[i] = wmma_helper(1, 1, 16, 16, 256, elem, elem, c_map)
         else: raise NotImplementedError(f"unimplemented tensor core {arg}")

From 76a03e950a0a930bedfe41ccf265ce3310dc3317 Mon Sep 17 00:00:00 2001
From: Francis Lata
Date: Tue, 14 Jan 2025 11:27:45 -0500
Subject: [PATCH 12/29] make kits19 dataset samples have small sizes (#8591)

---
 test/external/external_test_datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/external/external_test_datasets.py b/test/external/external_test_datasets.py
index 005336b68c..6abd5fe6cc 100644
--- a/test/external/external_test_datasets.py
+++ b/test/external/external_test_datasets.py
@@ -19,7 +19,7 @@ class ExternalTestDatasets(unittest.TestCase):
   def _create_samples(self, val, num_samples=2):
     self._set_seed()

-    img, lbl = np.random.rand(190, 392, 392).astype(np.float32), np.random.randint(0, 100, size=(190, 392, 392)).astype(np.uint8)
+    img, lbl = np.random.rand(8, 8, 8).astype(np.float32), np.random.randint(0, 100, size=(8, 8, 8)).astype(np.uint8)
     img, lbl = nib.Nifti1Image(img, np.eye(4)), nib.Nifti1Image(lbl, np.eye(4))
     dataset = "val" if val else "train"
     preproc_pth = Path(tempfile.gettempdir() + f"/{dataset}")

From 52e7003414633f4b813d1a340e5cbc618328f946 Mon Sep 17 00:00:00 2001
From: chenyu
Date: Tue, 14 Jan 2025 12:24:27 -0500
Subject: [PATCH 13/29] Revert "make kits19 dataset samples have small sizes (#8591)" (#8610)

This reverts commit 76a03e950a0a930bedfe41ccf265ce3310dc3317.
---
 test/external/external_test_datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/external/external_test_datasets.py b/test/external/external_test_datasets.py
index 6abd5fe6cc..005336b68c 100644
--- a/test/external/external_test_datasets.py
+++ b/test/external/external_test_datasets.py
@@ -19,7 +19,7 @@ class ExternalTestDatasets(unittest.TestCase):
   def _create_samples(self, val, num_samples=2):
     self._set_seed()

-    img, lbl = np.random.rand(8, 8, 8).astype(np.float32), np.random.randint(0, 100, size=(8, 8, 8)).astype(np.uint8)
+    img, lbl = np.random.rand(190, 392, 392).astype(np.float32), np.random.randint(0, 100, size=(190, 392, 392)).astype(np.uint8)
     img, lbl = nib.Nifti1Image(img, np.eye(4)), nib.Nifti1Image(lbl, np.eye(4))
     dataset = "val" if val else "train"
     preproc_pth = Path(tempfile.gettempdir() + f"/{dataset}")

From cbfd51f5a5030e1d10ebd31e802c45ec88653403 Mon Sep 17 00:00:00 2001
From: chenyu
Date: Tue, 14 Jan 2025 13:25:54 -0500
Subject: [PATCH 14/29] make MultiLazyBuffer.bounds a property [pr] (#8614)

determined by lbs shapes and axis
---
 tinygrad/multi.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tinygrad/multi.py b/tinygrad/multi.py
index 97a17fe714..07a3430f7b 100644
--- a/tinygrad/multi.py
+++ b/tinygrad/multi.py
@@ -46,9 +46,6 @@ class MultiLazyBuffer(MathTrait):
     assert all(isinstance(x, UOp) for x in lbs) and len(lbs), "all lbs must be LazyBuffers, and we need at least one of them"
     assert all_same([x.dtype for x in lbs]), f"all multilazybuffer needs same dtype, getting {[x.dtype for x in lbs]}"
     self.lbs, self.axis, self.dtype, self.device, self.real = lbs, axis, lbs[0].dtype, tuple(x.device for x in lbs), real or [True]*len(lbs)
-    if axis is not None:
-      splits = list(itertools.accumulate([lb.shape[axis] for lb in lbs], initial=0))
-      self.bounds = tuple(zip(splits, splits[1:]))

   @property
   def shape(self): return tuple(sum(y.shape[a] for y in self.real_lbs) if a == self.axis else s for a,s in enumerate(self.real_lbs[0].shape))
@@ -59,6 +56,11 @@ class MultiLazyBuffer(MathTrait):
   @property
   def real_lbs(self): return [lb for lb,r in zip(self.lbs, self.real) if r]

+  @property
+  def bounds(self):
+    if self.axis is None: raise RuntimeError("bounds is not defined when axis is None")
+    return tuple(itertools.pairwise(itertools.accumulate([lb.shape[self.axis] for lb in self.lbs], initial=0)))
+
   def __repr__(self): return f""

   @staticmethod

From d5a646d492071bedd0800542eacf6711d088a8e1 Mon Sep 17 00:00:00 2001
From: ignaciosica
Date: Tue, 14 Jan 2025 15:35:14 -0300
Subject: [PATCH 15/29] CUDA Turing TC (#8597)

* init turing tc

* reorder tc

* hotfix: remove some spaces

* revert var name to x

* consistent order of factors

* revert order of terms to match old stuff

---------

Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
---
 .github/workflows/test.yml | 1 +
 tinygrad/renderer/cstyle.py | 16 +++++++++++-----
 tinygrad/renderer/ptx.py | 5 +++--
 tinygrad/runtime/ops_python.py | 24 ++++++++++++++--------
 4 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 596f219b64..ef18f559f8 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -166,6 +166,7 @@ jobs:
     - name: Test emulated CUDA tensor cores
       run: |
         DEBUG=2 EMULATE_CUDA=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16
+        DEBUG=2 EMULATE_CUDA_SM75=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16
        PYTHONPATH="." DEBUG=2 EMULATE_CUDA=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
     - name: Test emulated INTEL OpenCL tensor cores
       run: DEBUG=2 EMULATE_INTEL=1 FORWARD_ONLY=1 PYTHON=1 HALF=1 N=64 python3 ./extra/gemm/simple_matmul.py

diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py
index 65855c5a34..5171839cef 100644
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -291,17 +291,23 @@ class MetalRenderer(CStyleLanguage):
     return super().render_kernel(function_name, kernel, bufs, uops, prefix)

 _nms = "xyzwabcdefghijkl"
+cuda_tc_opts = ("u0","l0","l0","l1","l1","l1","u1") # shared by all shapes with M=16 N=8

 class CUDARenderer(CStyleLanguage):
   device = "CUDA"
   global_max = (2147483647, 65535, 65535)
   local_max = (1024, 1024, 64)
   shared_max = 49152
-  # https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-fragment-mma-16816-float
-  tensor_cores = [TensorCore(dims=(8,16,16), threads=32, elements_per_thread=(8,4,4), dtype_in=di, dtype_out=do,
-    opts=("u0","l0","l0","l1","l1","l1","u1"), swizzle=(((6,7,2,3,4),(0,1,9,5,10,8)), ((6,7,9,0,1),(2,3,4,10,5,8))))
-    for di,do in ([(dtypes.half,dtypes.float),(dtypes.bfloat16,dtypes.float)])]
-  def __init__(self, arch:str): self.tensor_cores, self.arch = CUDARenderer.tensor_cores if int(arch[3:]) >= 80 else [], arch
+  # https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-multiply-accumulate-instructions
+  tc_81616 = [TensorCore(dims=(8,16,16), threads=32, elements_per_thread=(8,4,4), dtype_in=di,dtype_out=do, opts=cuda_tc_opts,
+    swizzle=(((6,7,2,3,4),(0,1,9,5,10,8)), ((6,7,9,0,1),(2,3,4,10,5,8)))) for di,do in [(dtypes.half,dtypes.float), (dtypes.bfloat16,dtypes.float)]]
+  tc_8168_f16 = [TensorCore(dims=(8,16,8), threads=32, elements_per_thread=(4,2,4), dtype_in=dtypes.half, dtype_out=dtypes.float, opts=cuda_tc_opts,
+    swizzle=(((6,7,2,3,4),(0,1,8,5,9)), ((6,7,8,0,1),(2,3,4,9,5))))]
+
+  tc_sm80 = tc_81616 + tc_8168_f16
+  tc_sm75 = tc_8168_f16
+  def __init__(self, arch:str):
+    self.tensor_cores, self.arch = CUDARenderer.tc_sm80 if int(arch[3:]) >= 80 else CUDARenderer.tc_sm75 if int(arch[3:]) >= 75 else [], arch
   def __reduce__(self): return self.__class__, (self.arch,)

   # language options

diff --git a/tinygrad/renderer/ptx.py b/tinygrad/renderer/ptx.py
index c7eaf6f32f..c7bb49fdfb 100644
--- a/tinygrad/renderer/ptx.py
+++ b/tinygrad/renderer/ptx.py
@@ -124,11 +124,12 @@ class PTXRenderer(Renderer):
   device = "CUDA"
   suffix = "PTX"
   global_max, local_max, shared_max = CUDARenderer.global_max, CUDARenderer.local_max, CUDARenderer.shared_max
-  tensor_cores = [tc for tc in CUDARenderer.tensor_cores if tc.dtype_in == dtypes.half]
+  tc_sm80 = [tc for tc in CUDARenderer.tc_sm80 if tc.dtype_in == dtypes.half]
   code_for_op = asm_for_op
   extra_matcher = ptx_matcher
   def __init__(self, arch:str, device="CUDA"):
-    self.device, self.tensor_cores, self.arch = device, PTXRenderer.tensor_cores if int(arch[3:]) >= 80 else [], arch
+    self.device, self.arch = device, arch
+    self.tensor_cores = PTXRenderer.tc_sm80 if int(arch[3:]) >= 80 else CUDARenderer.tc_sm75 if int(arch[3:]) >= 75 else []
   def __reduce__(self): return self.__class__, (self.arch, self.device)

   # language options

diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py
index 618a356af5..4961104492 100644
--- a/tinygrad/runtime/ops_python.py
+++ b/tinygrad/runtime/ops_python.py
@@ -146,13 +146,20 @@ class PythonProgram:
           def c_map(lane, elem): return (lane%16, lane//16+elem*2)
           # (i, j), C, D (8 elements on 32 threads): row major
           ul[i] = wmma_helper(32, 16, 16, 16, 8, a_elem, b_elem, c_map)
         elif arg[4] == "CUDA":
-          # A (8 elements on 32 threads)
-          def a_elem(x, k, row, goff): return x[(k%2)+(row//8)*2+(k//8)*4][goff+((k//2)%4)+(row%8)*4]
-          # B (4 elements on 32 threads)
-          def b_elem(x, col, k, goff): return x[(k%2)+(k//8)*2][goff+(k//2)%4+(col)*4]
-          # (i, j), C, D (4 elements on 32 threads)
-          def c_map(lane, elem): return ((elem%2)+(lane%4)*2, (lane//4)+(elem//2)*8)
-          ul[i] = wmma_helper(32, 16, 8, 4, 4, a_elem, b_elem, c_map)
+          # (col, row) given (lane, elem) for C & D (4 elements on 32 threads); shared by all tc shapes with M=16 N=8
+          def c_map(lane, elem): return (elem%2 + (lane%4)*2, lane//4 + (elem//2)*8)
+
+          if arg[1] == (8,16,16):
+            def a_elem(x, k, row, goff): return x[k%2 + (row//8)*2 + (k//8)*4][goff + (k//2)%4 + (row%8)*4]
+            def b_elem(x, col, k, goff): return x[k%2 + (k//8)*2][goff + (k//2)%4 + col*4]
+            ul[i] = wmma_helper(32, 16, 8, 4, 4, a_elem, b_elem, c_map)
+
+          elif arg[1] == (8,16,8):
+            def a_elem(x, k, row, goff): return x[k%2 + (row//8)*2][goff + k//2 + (row%8)*4]
+            def b_elem(x, col, k, goff): return x[k%2][goff + k//2 + col*4]
+            ul[i] = wmma_helper(32, 8, 4, 2, 4, a_elem, b_elem, c_map)
+
+          else: raise NotImplementedError(f"unimplemented tensor core {arg}")
         elif arg[4] == "INTEL":
           # A (16 elements on 8 threads)
@@ -179,7 +186,8 @@ class PythonRenderer(Renderer):
   def __init__(self):
     if getenv("EMULATE_METAL"): self.device, self.tensor_cores = "METAL", MetalRenderer.tensor_cores
     if getenv("EMULATE_AMD"): self.device, self.tensor_cores = "AMD", AMDRenderer.tensor_cores
-    if getenv("EMULATE_CUDA"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tensor_cores
+    if getenv("EMULATE_CUDA"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tc_sm80
+    if getenv("EMULATE_CUDA_SM75"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tc_sm75
     if getenv("EMULATE_INTEL"): self.device, self.suffix, self.tensor_cores = "INTEL", "INTEL", IntelRenderer.tensor_cores
     if getenv("EMULATE_AMX"): self.device, self.tensor_cores = "CLANG", ClangRenderer.tensor_cores
--count 10 --temperature 0 --timing | tee llama_2_70B.txt + # - name: Run LLaMA-2 70B + # run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt - name: Run Mixtral 8x7B run: time AMD=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt - name: Run GPT2 diff --git a/test/test_multitensor.py b/test/test_multitensor.py index 67cb9757e1..f53056c406 100644 --- a/test/test_multitensor.py +++ b/test/test_multitensor.py @@ -157,6 +157,7 @@ class TestMultiTensor(unittest.TestCase): strat.sampled_from((Ops.ADD, Ops.MUL, Ops.MAX)), strat.sampled_from((None, 0, 1)), strat.sampled_from((None, 0, 1)), strat.sampled_from((1, 0, -1))) def test_simple_reduce(self, N, devices, rop, shard_axis, reduce_axis, sign): + N = N * len(devices) X = Tensor.rand(N*N).reshape(N, N).mul(sign) n = X.numpy() X.shard_(devices, shard_axis) @@ -438,6 +439,7 @@ class TestMultiTensor(unittest.TestCase): assert isinstance(jf.jit_cache[4].prg, BufferCopy) assert isinstance(jf.jit_cache[5].prg, graph_d1) + @unittest.skip("no longer supports uneven shard") def test_uneven_shard(self): for N in range(1, 6): X = Tensor.rand(4, 1, 257).contiguous().realize() @@ -450,6 +452,7 @@ class TestMultiTensor(unittest.TestCase): np.testing.assert_equal(X.expand((4, 4, 257)).numpy(), np.tile(n, (1, 4, 1))) np.testing.assert_equal(X.permute((0, 2, 1)).numpy(), np.transpose(n, (0, 2, 1))) + @unittest.skip("no longer supports uneven shard") def test_uneven_multiple_zeros(self): for data in ([1, 2, 3, 4], [1, 2, 3], [1, 2], [1], []): for N in (1, 2, 3, 4): @@ -458,6 +461,7 @@ class TestMultiTensor(unittest.TestCase): X = ((Tensor(data).shard(devices, axis=0) + 1).realize() - 1).realize() np.testing.assert_equal(X.numpy(), data) + @unittest.skip("no longer supports uneven shard") def test_uneven_shard_with_empty(self): N = 4 X = Tensor.rand(16, 1, 3).contiguous().realize() @@ -470,6 +474,7 @@ class TestMultiTensor(unittest.TestCase): # test reshape with empty shard np.testing.assert_equal(X.shard(devices, 0).reshape(8, 1, 6).numpy(), np_x.reshape(8, 1, 6)) + @unittest.skip("no longer supports uneven shard") def test_multiple_uneven_shard(self): N = 4 X = Tensor.rand(4, 1, 257).contiguous().realize() @@ -531,7 +536,7 @@ class TestMultiTensor(unittest.TestCase): with self.assertRaises((AssertionError, ValueError)): t0.reshape((26*15,7)) - @unittest.skip("no longer supports splits") + @unittest.skip("no longer supports uneven shard") def test_reshape_on_axis_uneven(self): def reshape_helper(t0, t, t_axis): np.testing.assert_allclose(t0.reshape(t.shape).numpy(), t.numpy()) @@ -603,6 +608,7 @@ class TestMultiTensor(unittest.TestCase): self.assertEqual(t.dtype, t2.dtype) self.assertEqual(t.lazydata.axis, t2.lazydata.axis) + @unittest.skip("no longer supports uneven shard") def test_rand_like_uneven_shard(self): t = Tensor.empty((4, 42, 15)).shard(devices_3, axis=1) t2 = Tensor.rand_like(t) @@ -653,6 +659,7 @@ class TestMultiTensor(unittest.TestCase): assert set(unique) == {0, 2}, unique assert 200 < counts[0] < 312, counts[0] + @unittest.skip("no longer supports uneven shard") def test_dropout_on_uneven_shard_axis(self): with Tensor.train(): X = Tensor.ones(256).shard(devices_3, axis=0) @@ -814,6 +821,7 @@ class TestShrinkMultiTensorShardedAxis(unittest.TestCase): np.testing.assert_allclose(a.reshape((2, 1, 8)).expand((2, 5, 8)).numpy(), b.reshape((2, 1, 8)).expand((2, 5, 8)).numpy(), rtol=1e-7, atol=1e-3) 
np.testing.assert_allclose(a.flip(-1).numpy(), b.flip(-1).numpy(), rtol=1e-7, atol=1e-3) + @unittest.skip("no longer supports uneven shard") def test_uneven(self): t = Tensor.arange(24).reshape(3, 8).contiguous().realize() t.shard_([f"{Device.DEFAULT}:{i}" for i in range(2)], axis=0) diff --git a/test/test_nn.py b/test/test_nn.py index e36b805c48..738574a989 100755 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -551,7 +551,7 @@ class TestNN(unittest.TestCase): @unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA", "METAL"}, "no GPU CI") def test_load_state_dict_sharded_model(self): - devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2") + devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2", f"{Device.DEFAULT}:3") layer = Conv2d(3, 5, kernel_size=3) layer.weight.shard_(devices, 3) @@ -572,7 +572,7 @@ class TestNN(unittest.TestCase): @unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA", "METAL"}, "no GPU CI") def test_load_state_dict_sharded_dict(self): - devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2") + devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2", f"{Device.DEFAULT}:3") layer = Conv2d(3, 5, kernel_size=3) state_dict = { @@ -589,7 +589,7 @@ class TestNN(unittest.TestCase): @unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA", "METAL"}, "no GPU CI") def test_load_state_dict_sharded_model_dict_same_axis(self): - devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2") + devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2", f"{Device.DEFAULT}:3") layer = Conv2d(3, 5, kernel_size=3) layer.weight.shard_(devices, 3) @@ -610,7 +610,8 @@ class TestNN(unittest.TestCase): @unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA", "METAL"}, "no GPU CI") def test_load_state_dict_sharded_model_dict_different_axis(self): - devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2") + devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2", f"{Device.DEFAULT}:3") + devices5 = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2", f"{Device.DEFAULT}:3", f"{Device.DEFAULT}:4", f"{Device.DEFAULT}:5") layer = Conv2d(3, 5, kernel_size=3) layer.weight.shard_(devices, 3) @@ -619,14 +620,14 @@ class TestNN(unittest.TestCase): # different shard axis state_dict = { 'weight': Tensor.randn(5, 3, 3, 3).shard(devices, None), - 'bias': Tensor.randn(5).shard(devices, 0), + 'bias': Tensor.randn(5).shard(devices5, 0), } load_state_dict(layer, state_dict) # NOTE: model and state_dict shard differently, use the state_dict sharding # TODO: revisit this? 
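    # an illustration of the NOTE above: the model's weight was sharded on axis=3 across three
    # devices, while the incoming state_dict has weight replicated (axis=None) and bias sharded on
    # axis=0 across five devices; after load_state_dict the layer adopts the state_dict's placement,
    # which the asserts below check.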
self.assertEqual(layer.weight.device, devices) self.assertEqual(layer.weight.lazydata.axis, None) - self.assertEqual(layer.bias.device, devices) + self.assertEqual(layer.bias.device, devices5) self.assertEqual(layer.bias.lazydata.axis, 0) np.testing.assert_allclose(layer.weight.numpy(), state_dict['weight'].numpy()) np.testing.assert_allclose(layer.bias.numpy(), state_dict['bias'].numpy()) diff --git a/tinygrad/multi.py b/tinygrad/multi.py index 07a3430f7b..919d33a97f 100644 --- a/tinygrad/multi.py +++ b/tinygrad/multi.py @@ -38,7 +38,7 @@ def all_reduce(bop: Ops, lbs: list[UOp]) -> list[UOp]: return [functools.reduce(operator.add, [c.pad(pad) for pad,c in zip(pads,lb_c)]).reshape(shape) for lb_c in chunked] def to_sharded(lbs:list[UOp], axis:int, bounds: tuple[tuple[int, int], ...]) -> list[UOp]: - if DEBUG >= 3 and lbs[0].shape[axis] % len(lbs) != 0: print(f"multi axis uneven: {lbs[0].shape=} {axis=} {len(lbs)=}, bounds={bounds}") + if lbs[0].shape[axis] % len(lbs) != 0: raise RuntimeError(f"multi axis uneven: {lbs[0].shape=} {axis=} {len(lbs)=}, bounds={bounds}") return [lb.shrink(tuple((0,s) if a != axis else bound for a,s in enumerate(lb.shape))) for i, (bound, lb) in enumerate(zip(bounds, lbs))] class MultiLazyBuffer(MathTrait): diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 60b9735bd9..171c7afe31 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -399,7 +399,7 @@ class Tensor(SimpleMathTrait): Shards the tensor across the given devices. Optionally specify which axis to shard on. ```python exec="true" source="above" session="tensor" result="python" - t = Tensor.empty(2, 3) + t = Tensor.empty(2, 4) print(t.shard((t.device, t.device), axis=1).lazydata) ``` """ From bfbe81df717e2dfec8a128a28b26c73f5ec8088c Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Tue, 14 Jan 2025 12:04:58 -0800 Subject: [PATCH 17/29] remove cast before view (#8613) * remove cast before view * greener * indexing * that passes too * openpilot too * ack --------- Co-authored-by: qazal --- .github/workflows/benchmark.yml | 6 +++--- .github/workflows/test.yml | 2 +- test/test_arange.py | 3 ++- test/test_const_folding.py | 5 +++-- test/test_image_dtype.py | 2 +- test/test_schedule.py | 12 ++++++++---- test/test_tiny.py | 2 ++ tinygrad/engine/schedule.py | 2 +- tinygrad/ops.py | 13 +------------ 9 files changed, 22 insertions(+), 25 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 0596f22055..27b2764556 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -91,7 +91,7 @@ jobs: - name: Run GPT2 w HALF run: HALF=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt - name: Run GPT2 w HALF/BEAM - run: HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAST_BEFORE_VIEW=0 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt + run: HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt - name: Train MNIST run: time PYTHONPATH=. 
TARGET_EVAL_ACC_PCT=96.0 python3.11 examples/beautiful_mnist.py | tee beautiful_mnist.txt - name: Run 10 CIFAR training steps @@ -217,7 +217,7 @@ jobs: - name: Run GPT2 w HALF run: NV=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt - name: Run GPT2 w HALF/BEAM - run: NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt + run: NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt - uses: actions/upload-artifact@v4 with: name: Speed (NVIDIA) @@ -406,7 +406,7 @@ jobs: - name: Run GPT2 w HALF run: AMD=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt - name: Run GPT2 w HALF/BEAM - run: AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt + run: AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt - uses: actions/upload-artifact@v4 with: name: Speed (AMD) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ef18f559f8..417b222f50 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -297,7 +297,7 @@ jobs: - if: ${{ matrix.task == 'optimage' }} name: Test openpilot model kernel count and gate usage run: | - PYTHONPATH="." ALLOWED_KERNEL_COUNT=208 ALLOWED_READ_IMAGE=2105 ALLOWED_GATED_READ_IMAGE=29 FLOAT16=0 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx + PYTHONPATH="." ALLOWED_KERNEL_COUNT=208 ALLOWED_READ_IMAGE=2104 ALLOWED_GATED_READ_IMAGE=29 FLOAT16=0 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx - if: ${{ matrix.task == 'optimage' }} name: Test openpilot alt model correctness (float32) run: PYTHONPATH="." 
FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx diff --git a/test/test_arange.py b/test/test_arange.py index d8a215faab..a5c8b535bb 100644 --- a/test/test_arange.py +++ b/test/test_arange.py @@ -66,7 +66,8 @@ class TestArange(unittest.TestCase): return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)]) class TestIndexing(unittest.TestCase): - @unittest.expectedFailure + # update: passing after CAST_BEFORE_VIEW=1 deletion + # @unittest.expectedFailure def test_arange_2_reduce(self): needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous() needle[1337] = 1 diff --git a/test/test_const_folding.py b/test/test_const_folding.py index 2a3215935f..4ca2359912 100644 --- a/test/test_const_folding.py +++ b/test/test_const_folding.py @@ -132,11 +132,12 @@ class TestMovedConstFolding(unittest.TestCase): def test_cast_padded(self): # NOTE: this is folded due to CAST_BEFORE_VIEW + # update: CAST_BEFORE_VIEW=1 is no longer supported if is_dtype_supported(dtypes.int16): - _check_ast_count(0, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16)) + _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16)) np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16).numpy(), [0, 1, 1, 1, 1, 0]) if is_dtype_supported(dtypes.uint16): - _check_ast_count(0, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16)) + _check_ast_count(1, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16)) np.testing.assert_equal(Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16).numpy(), [0, 65535, 65535, 65535, 65535, 0]) # not folded if is_dtype_supported(dtypes.int64): diff --git a/test/test_image_dtype.py b/test/test_image_dtype.py index 2a8c719aff..686d60ca9b 100644 --- a/test/test_image_dtype.py +++ b/test/test_image_dtype.py @@ -120,7 +120,7 @@ class TestImageDType(unittest.TestCase): loss = x.image_dot(w1).image_dot(w2).float().max() loss.backward() sched = unwrap(w1.grad).schedule() - self.assertEqual(len(sched), 10) + self.assertEqual(len(sched), 9) for s,ei in zip(sched, lower_schedule(sched[:])): ei.run() if s.outputs[0].dtype == dtypes.float: diff --git a/test/test_schedule.py b/test/test_schedule.py index e83ae13c28..2448c9f8a1 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -1436,6 +1436,7 @@ class TestSchedule(unittest.TestCase): def test_late_fusion_post_expand(self): self._test_fusion([(32, 32)], lambda a:a-a.sum(1), 2) + @unittest.skip("CAST_BEFORE_VIEW=1 is not supported") def test_cast_padded_view(self): a = Tensor.arange(4).reshape(1, 4) casted_view = a.pad(((0, 1), (0, 0))).cast(dtypes.float) @@ -1446,6 +1447,7 @@ class TestSchedule(unittest.TestCase): self.assertListEqual(realized_view.tolist(), [[0.0, 1.0, 2.0, 3.0], [0.0, 0.0, 0.0, 0.0]]) # NOTE: we might want to reconsider pushing this cast before the shrink + @unittest.skip("CAST_BEFORE_VIEW=1 is not supported") def test_cast_after_shrink(self): a = Tensor.arange(4).reshape(1, 4) casted_view = a.shrink(((0, 1), (0, 2))).cast(dtypes.float) @@ -1455,6 +1457,7 @@ class TestSchedule(unittest.TestCase): self.assertEqual(realized_view.lazydata.base.realized.size, 2) self.assertListEqual(realized_view.tolist(), [[0, 1]]) + @unittest.skip("CAST_BEFORE_VIEW=1 is not supported") def test_cast_const_view(self): a = Tensor.ones((4, 4), dtype=dtypes.float32) casted_view = 
a.cast(dtypes.int32) @@ -1464,6 +1467,7 @@ class TestSchedule(unittest.TestCase): run_schedule(check_schedule(realized_const_view, 1)) self.assertListEqual(realized_const_view.tolist(), [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]) + @unittest.skip("CAST_BEFORE_VIEW=1 is not supported") def test_cast_padded_const(self): a = Tensor(1, dtype=dtypes.int32).reshape(1, 1).pad(((1, 1), None)) casted_view = a.cast(dtypes.float32) @@ -1566,7 +1570,7 @@ class TestIndexing(unittest.TestCase): x = Tensor.randn(5, 2).realize() a = Tensor.arange(10) out = (x + a[2]).sum() - self.check_schedule(out, 1) + self.check_schedule(out, 2) np.testing.assert_allclose(out.numpy(), (x.numpy()+np.arange(10)[2]).sum(), atol=1e-5, rtol=1e-6) def test_arange_index_contiguous(self): @@ -1574,7 +1578,7 @@ class TestIndexing(unittest.TestCase): x = Tensor.randn(5, 2).realize() a = Tensor.arange(10).contiguous() out = (x + a[2]).sum() - self.check_schedule(out, 2) + self.check_schedule(out, 3) np.testing.assert_allclose(out.numpy(), (x.numpy()+np.arange(10)[2]).sum(), atol=1e-5, rtol=1e-6) def test_arange_index_child(self): @@ -1582,7 +1586,7 @@ class TestIndexing(unittest.TestCase): x = Tensor.randn(5, 2).realize() a = Tensor.arange(10)+1 out = (x + a[2]).sum() - self.check_schedule(out, 1) + self.check_schedule(out, 2) np.testing.assert_allclose(out.numpy(), (x.numpy()+(np.arange(10)+1)[2]).sum(), atol=1e-5, rtol=1e-6) def test_arange_index_contiguous_child(self): @@ -1590,7 +1594,7 @@ class TestIndexing(unittest.TestCase): x = Tensor.randn(5, 2).realize() a = (Tensor.arange(10)+1).contiguous() out = (x + a[2]).sum() - self.check_schedule(out, 2) + self.check_schedule(out, 3) np.testing.assert_allclose(out.numpy(), (x.numpy()+(np.arange(10)+1)[2]).sum(), atol=1e-5, rtol=1e-6) def test_arange_childless_base(self): diff --git a/test/test_tiny.py b/test/test_tiny.py index e256d05e0e..6b3ff6ed06 100644 --- a/test/test_tiny.py +++ b/test/test_tiny.py @@ -81,6 +81,8 @@ class TestTiny(unittest.TestCase): # *** a model *** + # TODO: this is failing because of how swizzling rewrites the ShapeTracker of the final STORE + @unittest.skipIf(IMAGE>0, "failing because of make things that can't be images not images") def test_mnist_model(self): layers = [ nn.Conv2d(1, 32, 5), Tensor.relu, diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py index d4f12e3975..7dc36febb8 100644 --- a/tinygrad/engine/schedule.py +++ b/tinygrad/engine/schedule.py @@ -500,7 +500,7 @@ break_sched = PatternMatcher([ def append_uop(ctx:ScheduleContext, view:UOp, buf_uop:UOp) -> None: ctx.allbufs[buf_uop] = view if (op:=uval(view)).op is Ops.ASSIGN: ctx.assigns.add(buf_uop) - for x in op.src: + for x in op.base.src: if is_scheduled(x.base): ctx.children.setdefault(x.base.buf_uop, {})[buf_uop] = None # BUFFER_VIEW overrides the underlying buffer # TODO: this should be a shrink on the buffer diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 6a039d54bf..a818ecc985 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -361,17 +361,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass): def cast(self, dtype:DType, bitcast=False): if bitcast: return self.bitcast(dtype) if self._device is not None and self._device.startswith("DISK"): raise RuntimeError("CAST isn't supported on DISK") - if getenv("CAST_BEFORE_VIEW", 1) and dtype.itemsize <= self.dtype.itemsize and self is not self.base: - # NOTE: we have to apply the movementops here, we can't use VIEW (yet) - # TODO: move this to the scheduler - ret = self.base.cast(dtype, bitcast) - op_arg 
= [] - mop = self - while mop is not self.base: - op_arg.append((mop.op, mop.arg)) - mop = mop.src[0] - for op,arg in reversed(op_arg): ret = UOp(op, ret.dtype, (ret,), arg) - return ret return UOp(Ops.CAST, dtype, (self,)) def bitcast(self, dtype:DType): if self.st is not None and self.shape and ((self.shape[-1]*self.dtype.itemsize)%dtype.itemsize != 0): @@ -477,7 +466,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass): @property def base(self) -> UOp: if self.op in GroupOp.Movement: return self.src[0].base - return self.src[0] if self.op is Ops.VIEW and len(self.src) == 1 and self.src[0].op is not Ops.BUFFER else self + return self.src[0].base if self.op is Ops.VIEW and len(self.src) == 1 and self.src[0].op is not Ops.BUFFER else self def view(self, new_st:ShapeTracker) -> UOp: if self.st is None: return UOp(Ops.VIEW, self.dtype.base if not isinstance(self.dtype, ImageDType) else self.dtype, (self,), new_st) ret = UOp(Ops.VIEW, self.dtype, (self.base,), new_st) From c5782e85d2e8ad3d42ddabbfe60099f38af377cf Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Tue, 14 Jan 2025 23:48:07 +0300 Subject: [PATCH 18/29] tlsf: optimize alloc (#8608) --- tinygrad/runtime/support/allocator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tinygrad/runtime/support/allocator.py b/tinygrad/runtime/support/allocator.py index 4e6dbd8a41..8da3ec259a 100644 --- a/tinygrad/runtime/support/allocator.py +++ b/tinygrad/runtime/support/allocator.py @@ -14,6 +14,7 @@ class TLSFAllocator: def __init__(self, size:int, base:int=0, block_size:int=16, lv2_cnt:int=16): self.size, self.base, self.block_size, self.l2_cnt = size, base, block_size, lv2_cnt.bit_length() self.storage:list = [collections.defaultdict(list) for _ in range(size.bit_length() + 1)] + self.lv1_entries:list[int] = [0] * len(self.storage) # self.blocks is more like a linked list, where each entry is a contigous block. self.blocks:dict[int, tuple[int, int|None, int|None, bool]] = {0: (size, None, None, True)} # size, next, prev, is_free @@ -25,12 +26,14 @@ class TLSFAllocator: def _insert_block(self, start:int, size:int, prev:int|None=None): if prev is None: prev = self.blocks[start][2] self.storage[self.lv1(size)][self.lv2(size)].append(start) + self.lv1_entries[self.lv1(size)] += 1 self.blocks[start] = (size, start + size, prev, True) return self def _remove_block(self, start:int, size:int, prev:int|None=None): if prev is None: prev = self.blocks[start][2] self.storage[self.lv1(size)][self.lv2(size)].remove(start) + self.lv1_entries[self.lv1(size)] -= 1 self.blocks[start] = (size, start + size, prev, False) return self @@ -67,6 +70,7 @@ class TLSFAllocator: # Search for the smallest block that can fit the requested size. Start with the it's bucket and go up until any block is found. 
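    # storage[lv1][lv2] keeps the start offsets of free blocks: lv1 is the coarse power-of-two size
    # class, lv2 one of (1 << self.l2_cnt) finer buckets within it. The lv1_entries counters added by
    # this patch track how many free blocks each first-level class holds, so the loop below can skip
    # an empty class with a single integer check instead of probing every lv2 bucket in it.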
for l1 in range(self.lv1(size), len(self.storage)): + if self.lv1_entries[l1] == 0: continue for l2 in range(self.lv2(size) if l1 == size.bit_length() else 0, (1 << self.l2_cnt)): if len(self.storage[l1][l2]) > 0: nsize = self.blocks[self.storage[l1][l2][0]][0] From dddd4e5f9fc7b45e03de36707ddffd60773f0d0c Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Tue, 14 Jan 2025 16:03:17 -0500 Subject: [PATCH 19/29] hotfix: remove duplicate TestTensorMutates [pr] (#8619) * hotfix: remove duplicate TestTensorMutates [pr] * imports --- test/unit/test_rewrite_map.py | 35 +---------------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/test/unit/test_rewrite_map.py b/test/unit/test_rewrite_map.py index 0cafb9b9ab..f0b833dd19 100644 --- a/test/unit/test_rewrite_map.py +++ b/test/unit/test_rewrite_map.py @@ -1,39 +1,6 @@ import unittest -from tinygrad import dtypes, Tensor +from tinygrad import dtypes from tinygrad.ops import UOp, symbolic, graph_rewrite_map, _substitute -from test.unit.test_tensor_uop_representation import is_pattern, realized_pattern, is_pattern_uop - -class TestTensorMutates(unittest.TestCase): - def test_mutate_add(self): - a = Tensor([1,2,3]) - b = Tensor([4,5,6]) - ret = a+b - pa = a.lazydata - pb = b.lazydata - pr = ret.lazydata - ret.schedule() - self.assertIsNot(pa, a.lazydata) - self.assertIsNot(pb, b.lazydata) - self.assertIsNot(pr, ret.lazydata) - for t in [a,b,ret]: is_pattern(t, realized_pattern) - - def test_reshape_is_same_parent(self): - a = Tensor([1,2,3]) - b = Tensor([4,5,6]) - c = a+b - d = (a+b).reshape(3,1) - d.realize() - is_pattern_uop(d.lazydata.base, realized_pattern) - is_pattern_uop(c.lazydata.base, realized_pattern) - - def test_reshape_is_same_child(self): - a = Tensor([1,2,3]) - b = Tensor([4,5,6]) - c = a+b - d = (a+b).reshape(3,1) - c.realize() - is_pattern_uop(c.lazydata.base, realized_pattern) - is_pattern_uop(d.lazydata.base, realized_pattern) class TestRewriteMap(unittest.TestCase): def test_substitute(self): From fdd46c9f284969d10a9a865992991d2eea8904c8 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Tue, 14 Jan 2025 13:15:13 -0800 Subject: [PATCH 20/29] delete view instant rule (#8616) * remove cast before view * greener * indexing * delete view instant rule * that passes too * openpilot too * ack * base on cast_before_view * add it as a rewrite rule * VIEW(DEVICE) is also fine * test_shard_memory depends on forced_realize removal * put that back, will go soon * UOp representations change once we don't instantly fold things * do not duplicate tests --------- Co-authored-by: qazal Co-authored-by: qazal <77887910+Qazalin@users.noreply.github.com> --- test/test_multitensor.py | 1 + test/unit/test_tensor_uop_representation.py | 12 ++++++++---- tinygrad/ops.py | 12 +++++------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/test/test_multitensor.py b/test/test_multitensor.py index f53056c406..ff21906016 100644 --- a/test/test_multitensor.py +++ b/test/test_multitensor.py @@ -694,6 +694,7 @@ class TestMultiTensor(unittest.TestCase): assert ast.src[2].src[0].op is Ops.LOAD assert ast.src[2].src[1].src[1].op is Ops.CONST and ast.src[2].src[1].src[1].arg == 3 + @unittest.skip("TODO: this requires forced_realize to be deleted.") def test_shard_memory(self): devices = (d0, d1, d2, d3) t = Tensor.zeros(16, 16).contiguous() diff --git a/test/unit/test_tensor_uop_representation.py b/test/unit/test_tensor_uop_representation.py index 
867acc1a60..dc8d0b64aa 100644 --- a/test/unit/test_tensor_uop_representation.py +++ b/test/unit/test_tensor_uop_representation.py @@ -19,7 +19,10 @@ class TestTensorMutates(unittest.TestCase): self.assertIsNot(pa, a.lazydata) self.assertIsNot(pb, b.lazydata) self.assertIsNot(pr, ret.lazydata) - for t in [a,b,ret]: is_pattern(t, realized_pattern) + # NOTE: this becomes a VIEW(VIEW(BUFFER)) because UOp.view no longer instantly folds contiguous VIEW of the same shape + # this is fine because realized exists on the base. + # TODO: we can make this always be a VIEW(BUFFER) once BUFFER has a ShapeTracker of shape=(N,) + for t in [a,b,ret]: is_pattern_uop(t.lazydata.base, realized_pattern) def test_reshape_is_same_parent(self): a = Tensor([1,2,3]) @@ -43,14 +46,14 @@ class TestTensorUopRepresentation(unittest.TestCase): def test_realized(self): a = Tensor([1.,2,3]).realize() print(a.lazydata) - is_pattern(a, realized_pattern) + is_pattern_uop(a.lazydata.base, realized_pattern) def test_add_realized(self): a = Tensor([1.,2,3]).realize() b = Tensor([4.,5,6]).realize() c = a+b print(c.lazydata) - is_pattern(c, UPat(Ops.ADD, src=(realized_pattern, realized_pattern))) + is_pattern(c, UPat(Ops.ADD, src=(UPat(Ops.VIEW, src=(realized_pattern,)), UPat(Ops.VIEW, src=(realized_pattern,))))) def test_const_pattern(self): a = Tensor(1) @@ -107,7 +110,8 @@ class TestTensorUopRepresentation(unittest.TestCase): a = Tensor([1.,2,3]).realize() c = a.to("TEST") # NOTE: this isn't checked print(c.lazydata) - is_pattern(c, UPat(Ops.COPY, src=(UPat(Ops.DEVICE), realized_pattern,))) + # TODO: COPY on a Tensor becomes a VIEW(COPY), this should be done in the scheduler not in ops + is_pattern(c, UPat(Ops.VIEW, src=(UPat(Ops.COPY, src=(UPat(Ops.DEVICE), realized_pattern,)),))) if __name__ == '__main__': unittest.main() diff --git a/tinygrad/ops.py b/tinygrad/ops.py index a818ecc985..2c23541a64 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -467,12 +467,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass): def base(self) -> UOp: if self.op in GroupOp.Movement: return self.src[0].base return self.src[0].base if self.op is Ops.VIEW and len(self.src) == 1 and self.src[0].op is not Ops.BUFFER else self - def view(self, new_st:ShapeTracker) -> UOp: - if self.st is None: return UOp(Ops.VIEW, self.dtype.base if not isinstance(self.dtype, ImageDType) else self.dtype, (self,), new_st) - ret = UOp(Ops.VIEW, self.dtype, (self.base,), new_st) - # instant folding rules - if new_st.contiguous and self.base.shape == new_st.shape: return self.base - return ret + def view(self, new_st:ShapeTracker) -> UOp: return UOp(Ops.VIEW, self.dtype, (self.base,), new_st) def _mop(self, op:Ops, arg): ret = UOp(op, self.dtype, (self,), arg) @@ -1293,7 +1288,10 @@ ConstLike = Union[ConstType, Variable, tuple[ConstType, ...]] # *** uop swizzling *** -merge_views = PatternMatcher([(UPat(Ops.VIEW, name="s0").view(name="s1"), lambda s0,s1: s0.replace(arg=s0.st+s1.st))]) +merge_views = PatternMatcher([ + (UPat(Ops.VIEW, name="s0").view(name="s1"), lambda s0,s1: s0.replace(arg=s0.st+s1.st)), + (UPat(Ops.VIEW, name="mv", src=(UPat.var("x"),)), lambda mv,x: x if mv.st.contiguous and x.st is not None and x.shape == mv.shape else None), +]) # push VIEW to loads view_left = merge_views+PatternMatcher([ From c85737c200dc4fdf28cda3846a8e00e60eb8a6c3 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Tue, 14 Jan 2025 13:26:56 -0800 Subject: [PATCH 21/29] assert to prepare for grad uop [pr] (#8280) * assert to prepare for grad 
uop [pr]

* fix test_nn

* fix most of test_tensor

* few more tests

* fix multi

* uniform gradient

* acc_dtype

* any for multi

* fix typing

* fix assert, CAST_BEFORE_VIEW is still the issue

* explicit test for CAST_BEFORE_VIEW

---------

Co-authored-by: qazal <77887910+Qazalin@users.noreply.github.com>
---
 test/test_dtype.py         |  3 ++-
 test/test_tensor.py        |  2 +-
 test/unit/test_gradient.py | 12 ++++++++++++
 tinygrad/multi.py          |  2 ++
 tinygrad/tensor.py         |  3 +++
 5 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/test/test_dtype.py b/test/test_dtype.py
index a09ae0c7f9..7a1677bdd0 100644
--- a/test/test_dtype.py
+++ b/test/test_dtype.py
@@ -781,7 +781,8 @@ class TestAutoCastType(unittest.TestCase):
       if DEBUG >= 2: print(f"testing {default_dtype=}, {dtype=}")
       a = Tensor([1, 2, 3], dtype=dtype, requires_grad=True)
-      b = (a * 5).sum()
+      # NOTE: this is broken without default_dtype because of CAST_BEFORE_VIEW
+      b = (a * 5).sum(acc_dtype=default_dtype)
       b.backward()
       # if there is dtype mismatch, lazy should assert
       assert a.grad.dtype == a.dtype
       np.testing.assert_allclose(a.grad.numpy(), [5, 5, 5])
diff --git a/test/test_tensor.py b/test/test_tensor.py
index a5152caa95..82f4065593 100644
--- a/test/test_tensor.py
+++ b/test/test_tensor.py
@@ -464,7 +464,7 @@ class TestTinygrad(unittest.TestCase):
   def test_repr_with_grad(self):
     a = Tensor([1], requires_grad=True)
     b = Tensor([1])
-    c = (a + b).mean().backward()
+    c = (a + b).sum().backward()
     print(a)
     print(c)
 
diff --git a/test/unit/test_gradient.py b/test/unit/test_gradient.py
index 21182874f2..b36b81f243 100644
--- a/test/unit/test_gradient.py
+++ b/test/unit/test_gradient.py
@@ -93,6 +93,12 @@ class TestTensorGradient(unittest.TestCase):
     dx = z.gradient(x, gradient=dz)[0]
     self.assertListEqual(dx.tolist(), [2.0, 4.0, 6.0])
 
+  def test_cast_before_view(self):
+    x = Tensor([1.0, 1, 1, 1])
+    x_reshaped = x.reshape(2,2)
+    x_casted = x_reshaped.cast(dtypes.float16)
+    x_casted.mean().gradient(x_reshaped)
+
 class TestRealizeMeansRealize(unittest.TestCase):
   def test_randn_realizes(self):
     x = Tensor.randn(2, 3, 64, 64, requires_grad=True).realize()
@@ -104,5 +110,11 @@ class TestRealizeMeansRealize(unittest.TestCase):
     print(x.lazydata)
     self.assertEqual(x.lazydata.op, Ops.VIEW)
 
+  # NOTE: even though it doesn't realize, this seems fine
+  def test_uniform_gradient(self):
+    x = Tensor.uniform(16, 3, 3, 3, requires_grad=True).realize()
+    y = x * 2
+    y.sum().gradient(x)[0].realize()
+
 if __name__ == '__main__':
   unittest.main()
diff --git a/tinygrad/multi.py b/tinygrad/multi.py
index 919d33a97f..ab521e9978 100644
--- a/tinygrad/multi.py
+++ b/tinygrad/multi.py
@@ -97,6 +97,8 @@ class MultiLazyBuffer(MathTrait):
   def contiguous(self): return MultiLazyBuffer([x.contiguous() for x in self.lbs], self.axis, self.real)
   def clone(self) -> MultiLazyBuffer: return MultiLazyBuffer([lb.clone() for lb in self.lbs], self.axis, self.real)
   def detach(self) -> MultiLazyBuffer: return MultiLazyBuffer([lb.detach() for lb in self.lbs], self.axis, self.real)
+  @property
+  def toposort(self) -> dict[UOp, None]: return {l:None for x in self.lbs for l in x.toposort}
 
   # elementwise is simple
   def alu(self, op:Ops, *in_srcs:MultiLazyBuffer) -> MultiLazyBuffer:
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 171c7afe31..53ebb9b86c 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -946,6 +946,7 @@ class Tensor(SimpleMathTrait):
       # this is "implicit gradient creation"
       gradient = Tensor(1.0, dtype=self.dtype, device=self.device, requires_grad=False)
 
+    toposort_uop = 
self.lazydata.toposort
     assert self.shape == gradient.shape, f"grad shape must match tensor shape, {gradient.shape!r} != {self.shape!r}"
     self.grad = gradient
     for t0 in reversed(toposorted):
@@ -958,6 +959,8 @@ class Tensor(SimpleMathTrait):
       for t, g in zip(t0._ctx.parents, grads):
         if g is not None and t.requires_grad:
           assert g.shape == t.shape, f"grad shape must match tensor shape, {g.shape!r} != {t.shape!r}"
+          assert t.lazydata in toposort_uop or (isinstance(t.lazydata, MultiLazyBuffer) and any(x in toposort_uop for x in t.lazydata.lbs)), \
+            f"grad uop must have a path from self\ngrad uop: {t.lazydata}"
           t.grad = g if t.grad is None else (t.grad + g)
       if not retain_graph: del t0._ctx
     return self

From 0790d8059fb6157977b50b7752bc5742ed720650 Mon Sep 17 00:00:00 2001
From: chenyu
Date: Tue, 14 Jan 2025 18:00:49 -0500
Subject: [PATCH 22/29] remove MultiLazyBuffer.from_sharded [pr] (#8620)

it's equivalent to taking the lazydata from Tensor.split, then copying to devices
---
 test/test_multitensor.py |  2 +-
 tinygrad/multi.py        | 14 +-------------
 tinygrad/tensor.py       | 14 +++++++++++---
 3 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index ff21906016..2de7052e66 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -75,7 +75,7 @@ class TestMultiTensor(unittest.TestCase):
         ei.run()
     assert names[-2] == names[-1], "function was relinearized"
 
-  @unittest.skip("this doesn't fold because from_sharded calls contiguous on all lbs")
+  @unittest.skip("this doesn't fold because shard_ calls contiguous on all lbs")
   def test_sharded_memory(self):
     # Buffer may be stuck in track_cross_buffer
     for x in (d0, d1, d2, d3, d4): Device[x].synchronize()
diff --git a/tinygrad/multi.py b/tinygrad/multi.py
index ab521e9978..8e65f46ac9 100644
--- a/tinygrad/multi.py
+++ b/tinygrad/multi.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 import functools, itertools, operator
-from tinygrad.helpers import all_same, all_int, dedup, prod, DEBUG, RING, getenv, ceildiv
+from tinygrad.helpers import all_same, all_int, dedup, prod, DEBUG, RING, getenv
 from tinygrad.dtype import DType
 from tinygrad.ops import Ops, MathTrait, UOp, sint
 
@@ -63,18 +63,6 @@ class MultiLazyBuffer(MathTrait):
 
   def __repr__(self): return f""
 
-  @staticmethod
-  def from_sharded(lb:UOp, devices:tuple[str, ...], axis:int|None):
-    if axis is not None:
-      if not isinstance(total:=lb.shape[axis], int): raise RuntimeError(f"cannot shard symbolic shape {lb.shape=}, {axis=}")
-      sz = ceildiv(total, len(devices))
-      splits = tuple([max(0, min(sz, total - sz*i)) for i in range(len(devices))])
-      bounds = tuple(itertools.pairwise(itertools.accumulate(splits, initial=0)))
-    lbs = [lb] * len(devices)
-    sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(to_sharded(lbs, axis, bounds) if axis is not None and bounds is not None else lbs, devices)]
-    # NOTE: this contiguous is making it impossible for the scheduler to do late const folding
-    return MultiLazyBuffer([lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis)
-
   def copy_to_device(self, device:str) -> UOp:
     if self.axis is None:
       # if we already have a copy on the device, return that
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 53ebb9b86c..71fe8b5693 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -179,7 +179,7 @@ class Tensor(SimpleMathTrait):
         # data might be on a different device
         if isinstance(device, str): self.lazydata:Union[UOp, MultiLazyBuffer] = data if data.device == device else 
data.copy_to_device(device) # if device is a tuple, we should have/construct a MultiLazyBuffer - elif isinstance(data, UOp): self.lazydata = MultiLazyBuffer.from_sharded(data, device, None) + elif isinstance(data, UOp): self.lazydata = Tensor(data).shard(device).lazydata else: assert data.device == device, f"MultiLazyBuffer device mismatch, {data.device} != {device}" self.lazydata = data @@ -405,8 +405,16 @@ class Tensor(SimpleMathTrait): """ assert isinstance(self.lazydata, UOp), "can't shard a MultiLazyBuffer" devices = tuple(Device.canonicalize(x) for x in devices) - if axis is not None: axis = self._resolve_dim(axis) - return Tensor(MultiLazyBuffer.from_sharded(self.lazydata, devices, axis), device=devices, requires_grad=self.requires_grad) + if axis is None: lbs = [self.lazydata] * len(devices) + else: + axis = self._resolve_dim(axis) + sz = ceildiv(self.shape[axis], len(devices)) + sizes = [max(0, min(sz, self.shape[axis] - sz*i)) for i in range(len(devices))] + lbs = [cast(UOp, t.lazydata) for t in self.split(sizes, axis)] + sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(lbs, devices)] + # NOTE: this contiguous is making it impossible for the scheduler to do late const folding + mlb = MultiLazyBuffer([lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis) + return Tensor(mlb, device=devices, requires_grad=self.requires_grad) def shard_(self, devices:tuple[str, ...], axis:Optional[int]=None): """ From 930728c06997adee49c61b0db96ced7e51f45a88 Mon Sep 17 00:00:00 2001 From: chenyu Date: Tue, 14 Jan 2025 18:41:41 -0500 Subject: [PATCH 23/29] bert BS 72->66 [pr] (#8621) 72 does not fit now --- .../benchmarks/bert/implementations/tinybox_green/dev_beam.sh | 2 +- .../benchmarks/bert/implementations/tinybox_green/dev_run.sh | 2 +- .../bert/implementations/tinybox_green/run_and_time.sh | 2 +- .../benchmarks/bert/implementations/tinybox_red/dev_beam.sh | 2 +- .../benchmarks/bert/implementations/tinybox_red/dev_run.sh | 2 +- .../benchmarks/bert/implementations/tinybox_red/run_and_time.sh | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh index 7318b3bb29..99b99f7e89 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." export MODEL="bert" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=36 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36 export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=512 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh index 555ed80cb4..70a3b6a6cb 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." 
export MODEL="bert" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=36 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36 export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=512 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh index 4e732799ae..a213f4d682 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh @@ -3,7 +3,7 @@ export PYTHONPATH="." export MODEL="bert" export SUBMISSION_PLATFORM="tinybox_green" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=36 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36 export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=512 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh index 692e0b49a1..08ffd354b3 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." export MODEL="bert" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=36 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36 export BEAM=3 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh index 86221ca594..c42c1f65b6 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." export MODEL="bert" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=36 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36 export BEAM=3 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh index 272b0ddb12..d6ff9fd2cc 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh @@ -3,7 +3,7 @@ export PYTHONPATH="." 
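# context for the BS change (hedged): per the commit message, BS=72 "does not fit" after the earlier
# changes in this series; 66 keeps the batch evenly divisible across GPUS=6 (11 per device), which
# the uneven-shard removal in patch 16 now requires when sharding on the batch axis.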
export MODEL="bert" export SUBMISSION_PLATFORM="tinybox_red" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=36 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=36 export BEAM=3 export IGNORE_JIT_FIRST_BEAM=1 From 7860a808015568d4324985a481a97174be338ae5 Mon Sep 17 00:00:00 2001 From: chenyu Date: Tue, 14 Jan 2025 19:19:13 -0500 Subject: [PATCH 24/29] simpler MultiLazyBuffer alu [pr] (#8622) --- tinygrad/multi.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tinygrad/multi.py b/tinygrad/multi.py index 8e65f46ac9..c9990534ce 100644 --- a/tinygrad/multi.py +++ b/tinygrad/multi.py @@ -102,12 +102,10 @@ class MultiLazyBuffer(MathTrait): assert any(new_real), "output contains no real lb" for mlb in msrcs: if (mlb.axis == axis and (mlb.axis is None or mlb.bounds == bounds)) or not_all_real: srcs.append(mlb.lbs) - elif mlb.axis is None and axis is not None: - assert bounds is not None - srcs.append(to_sharded(mlb.lbs, axis, bounds)) else: assert axis is not None and bounds is not None - srcs.append(to_sharded([mlb.copy_to_device(lb.device) for lb in mlb.lbs], axis, bounds)) + if mlb.axis is None: srcs.append(to_sharded(mlb.lbs, axis, bounds)) + else: srcs.append(to_sharded([mlb.copy_to_device(lb.device) for lb in mlb.lbs], axis, bounds)) new_real_lbs:dict[int,UOp] = {i:lsrcs[0].alu(op, *lsrcs[1:]) for i,(lsrcs,r) in enumerate(zip(zip(*srcs), new_real)) if r} # NOTE: const dtype should match real new_dtype = next(iter(new_real_lbs.values())).dtype From 4ee3243c93cd43982683ed341da9a19848bfd9c3 Mon Sep 17 00:00:00 2001 From: chenyu Date: Tue, 14 Jan 2025 19:52:38 -0500 Subject: [PATCH 25/29] JITBEAM=2 for LLaMA-3 8B on 4 GPUs [pr] (#8623) is it fast? --- .github/workflows/benchmark.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 27b2764556..cccf231dee 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -202,8 +202,8 @@ jobs: # run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt - name: Run LLaMA-3 8B BEAM run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt - - name: Run LLaMA-3 8B on 4 GPUs - run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt + - name: Run LLaMA-3 8B on 4 GPUs with BEAM + run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt # - name: Run LLaMA-3 8B on 6 GPUs # run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt # - name: Run LLaMA-2 70B @@ -389,8 +389,8 @@ jobs: # run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." 
--count 10 --temperature 0 --timing | tee llama_six_gpu.txt - name: Run LLaMA-3 8B BEAM run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt - - name: Run LLaMA-3 8B on 4 GPUs - run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt + - name: Run LLaMA-3 8B on 4 GPUs with BEAM + run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt # - name: Run LLaMA-3 8B on 6 GPUs # run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt - name: Restore amdgpu From f29d6f54b874c3579b56d66638bd3e90128c702a Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Tue, 14 Jan 2025 18:33:33 -0800 Subject: [PATCH 26/29] support multilb gradient [pr] (#8624) --- test/models/test_real_world.py | 2 +- test/test_multitensor.py | 6 ++++++ tinygrad/gradient.py | 2 ++ tinygrad/tensor.py | 22 ++++++++++++++-------- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py index c22ed5bb73..a213e5ec88 100644 --- a/test/models/test_real_world.py +++ b/test/models/test_real_world.py @@ -33,7 +33,7 @@ def helper_test(nm, gen, model, max_memory_allowed, max_kernels_allowed, all_jit kernels_used = len(model.jit_cache) if hasattr(model, "jit_cache") else None print(f"{nm}: used {mem_used/1e9:.2f} GB and {kernels_used} kernels in {min(tms)/1e6:.2f} ms") assert mem_used/1e9 < max_memory_allowed, f"{nm} used more than {max_memory_allowed:.2f} GB - {mem_used/1e9:.2} GB used" - assert not kernels_used or kernels_used <= max_kernels_allowed, f"{nm} used more than {max_kernels_allowed} kernels" + assert not kernels_used or kernels_used <= max_kernels_allowed, f"{nm} used more than {max_kernels_allowed} kernels, it used {kernels_used}" if all_jitted: assert kernels_used > 0 and kernels_used == GlobalCounters.kernel_count or (kernels_used <= GlobalCounters.kernel_count and getattr(Device[Device.DEFAULT], "graph", None)), f"only {kernels_used} out of {GlobalCounters.kernel_count} were jitted" # noqa: E501 diff --git a/test/test_multitensor.py b/test/test_multitensor.py index 2de7052e66..c8325137d7 100644 --- a/test/test_multitensor.py +++ b/test/test_multitensor.py @@ -43,6 +43,12 @@ class TestMultiTensor(unittest.TestCase): assert lb.shape == (256,) (X + X).realize() + def test_gradient(self): + X = Tensor.ones(256).contiguous().realize() + X.to_(devices_2) + grad = X.sum().gradient(X)[0] + grad.realize() + def test_shard(self): X = Tensor.ones(256).contiguous().realize() X.shard_(devices_2, 0) diff --git a/tinygrad/gradient.py b/tinygrad/gradient.py index c9ea1d8028..1b93e9374b 100644 --- a/tinygrad/gradient.py +++ b/tinygrad/gradient.py @@ -39,6 +39,8 @@ pm_gradient = PatternMatcher([ # there's no gradient for...is this ASSIGN? 
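  # a None entry in the returned tuple means "this input gets no gradient"; the BITCAST rule added
  # below gets the same treatment, since reinterpreting raw bits as another dtype is not differentiable.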
(UPat(Ops.VIEW, src=(UPat(Ops.BUFFER), UPat(Ops.BUFFER_VIEW))), lambda: (None, None)), + # also no gradient for bitcast + (UPat(Ops.BITCAST), lambda ctx: (None,)), ]) # copied from tensor.py, get relevant toposort of gradients diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 71fe8b5693..7d00e441d4 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -915,15 +915,21 @@ class Tensor(SimpleMathTrait): print(dy.tolist()) # dz/dy ``` """ - assert isinstance(self.lazydata, UOp), "multi isn't supported yet" - target_uops: list[UOp] = [x.lazydata for x in targets if isinstance(x.lazydata, UOp)] assert gradient is not None or self.shape == tuple(), "when no gradient is provided, backward must be called on a scalar tensor" - grads = compute_gradient(self.lazydata, self.lazydata.const_like(1) if gradient is None else cast(UOp, gradient.lazydata), target_uops) - ret = [] - for x in target_uops: - if (y:=grads.get(x)) is None: raise RuntimeError(f"{x}\n\nnot found in\n\n{self.lazydata}") - ret.append(Tensor(y, device=x.device)) - return ret + if gradient is None: gradient = Tensor(1.0, dtype=self.dtype, device=self.device, requires_grad=False) + rets = [] + for i,(uop,grad) in enumerate(zip(self.lazydata.lbs, gradient.lazydata.lbs)): + target_uops = [x.lazydata.lbs[i] for x in targets] + grads = compute_gradient(uop, grad, target_uops) + ret = [] + for x in target_uops: + if (y:=grads.get(x)) is None: raise RuntimeError(f"{x}\n\nnot found in\n\n{uop}") + ret.append(y) + rets.append(ret) + # create returned Tensors + if isinstance(self.lazydata, UOp): return [Tensor(u, device=t.device) for t,u in zip(targets, rets[0])] + return [Tensor(MultiLazyBuffer(list(u), cast(MultiLazyBuffer, t.lazydata).axis, cast(MultiLazyBuffer, t.lazydata).real), + device=t.device) for t,u in zip(targets, zip(*rets))] def _deepwalk(self): def _walk(node, visited): From 504ad08e736636fd8581e9473602836eb029ac16 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Tue, 14 Jan 2025 19:03:17 -0800 Subject: [PATCH 27/29] hotfix: add test_example_matmul_same --- test/test_schedule.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/test_schedule.py b/test/test_schedule.py index 2448c9f8a1..2d531f0170 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -585,6 +585,15 @@ class TestSchedule(unittest.TestCase): run_schedule(check_schedule(out, 2)) np.testing.assert_allclose(out.numpy(), np.ones((64,64))) + def test_example_matmul_same(self): + x = Tensor.eye(64, requires_grad=True) + z = x.matmul(x).sum() + z.backward() + out = x.grad.contiguous() + run_schedule(check_schedule(out, 2)) + # NOTE: the gradient flows twice + np.testing.assert_allclose(out.numpy(), 2*np.ones((64,64))) + def test_contiguous_add(self): x = Tensor.empty(32) y = Tensor.empty(32) From 7fb1c7af6101f757880e83bc5c007b85afb29c06 Mon Sep 17 00:00:00 2001 From: chenyu Date: Tue, 14 Jan 2025 22:25:23 -0500 Subject: [PATCH 28/29] minor multi cleanups [pr] (#8625) --- tinygrad/multi.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tinygrad/multi.py b/tinygrad/multi.py index c9990534ce..0dbb3520cc 100644 --- a/tinygrad/multi.py +++ b/tinygrad/multi.py @@ -14,11 +14,10 @@ def all_reduce(bop: Ops, lbs: list[UOp]) -> list[UOp]: if DEBUG >= 2: print(f"{'RING ALLREDUCE' if use_ring else 'NAIVE ALLREDUCE'} {n_lbs}x{numel} | {lbs[0].dtype}") if not use_ring: return [functools.reduce(lambda x,y: x.alu(bop, y), [x.copy_to_device(lb.device) for x in lbs]) for lb in lbs] - factor = next(f for f in [32, 16, 8, 
4, 2, 1] if numel % f == 0) + factor = next((f for f in [32, 16, 8, 4, 2] if numel % f == 0), 1) base, left = (numel // factor) // n_lbs, (numel // factor) % n_lbs chunk_sizes = [(base + 1) * factor] * left + [base * factor] * (n_lbs - left) - acc = 0 - chunks = [(acc, (acc := acc + i)) for i in chunk_sizes if i > 0] + chunks = list(itertools.pairwise(itertools.accumulate(chunk_sizes, initial=0))) chunked = [[lb.reshape((numel,)).shrink(((s,e),)) for s,e in chunks] for lb in lbs] # scatter-reduce @@ -64,9 +63,8 @@ class MultiLazyBuffer(MathTrait): def __repr__(self): return f"" def copy_to_device(self, device:str) -> UOp: - if self.axis is None: - # if we already have a copy on the device, return that - return next((lb for lb in self.real_lbs if lb.device == device), self.real_lbs[0].copy_to_device(device)) + # if we already have a copy on the device, return that + if self.axis is None: return next((lb for lb in self.real_lbs if lb.device == device), self.real_lbs[0].copy_to_device(device)) # copy lbs to device, pad to final shape, and sum llbs:list[UOp] = [] for lb,real,(start,end) in zip(self.lbs, self.real, self.bounds): @@ -78,8 +76,7 @@ class MultiLazyBuffer(MathTrait): # passthroughs @property def is_realized(self) -> bool: return all(lb.base.realized is not None for lb in self.real_lbs) - def cast(self, dtype:DType, bitcast:bool=False): - return MultiLazyBuffer([x.cast(dtype, bitcast) for x in self.lbs], self.axis, self.real) + def cast(self, dtype:DType, bitcast:bool=False): return MultiLazyBuffer([x.cast(dtype, bitcast) for x in self.lbs], self.axis, self.real) def const_like(self, b) -> MultiLazyBuffer: return MultiLazyBuffer([x.const_like(b) for x in self.lbs], self.axis, self.real) def assign(self, x:MultiLazyBuffer): return MultiLazyBuffer([s.assign(d) for s,d in zip(self.lbs, x.lbs)], self.axis, self.real) def contiguous(self): return MultiLazyBuffer([x.contiguous() for x in self.lbs], self.axis, self.real) From e1f7c90459200fd93ef142f899f4d9a4167625c4 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Tue, 14 Jan 2025 20:48:23 -0800 Subject: [PATCH 29/29] gradient is a set [pr] (#8626) * gradient is a set [pr] * typing for deepwalk --- tinygrad/gradient.py | 8 ++++---- tinygrad/tensor.py | 17 +++++++++-------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/tinygrad/gradient.py b/tinygrad/gradient.py index 1b93e9374b..a2fa71a98d 100644 --- a/tinygrad/gradient.py +++ b/tinygrad/gradient.py @@ -1,4 +1,4 @@ -from typing import cast +from typing import cast, Iterator import math, functools from tinygrad.dtype import dtypes, sum_acc_dtype from tinygrad.ops import UOp, PatternMatcher, UPat, Ops @@ -44,10 +44,10 @@ pm_gradient = PatternMatcher([ ]) # copied from tensor.py, get relevant toposort of gradients -def _deepwalk(root:UOp, targets:list[UOp]): +def _deepwalk(root:UOp, targets:set[UOp]) -> list[UOp]: @functools.lru_cache(None) def is_in_target_path(x:UOp) -> bool: return any(u in targets or is_in_target_path(u) for u in x.src) - def _walk(node:UOp, visited:set[UOp]): + def _walk(node:UOp, visited:set[UOp]) -> Iterator[UOp]: visited.add(node) if node.op is Ops.DETACH: return if is_in_target_path(node): @@ -56,7 +56,7 @@ def _deepwalk(root:UOp, targets:list[UOp]): yield node return list(_walk(root, set())) -def compute_gradient(root:UOp, root_grad:UOp, targets:list[UOp]) -> dict[UOp, UOp]: +def compute_gradient(root:UOp, root_grad:UOp, targets:set[UOp]) -> dict[UOp, UOp]: grads = {root: root_grad} for t0 in 
reversed(_deepwalk(root, targets)): if t0 not in grads: continue diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 7d00e441d4..e3d87f51eb 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -920,7 +920,7 @@ class Tensor(SimpleMathTrait): rets = [] for i,(uop,grad) in enumerate(zip(self.lazydata.lbs, gradient.lazydata.lbs)): target_uops = [x.lazydata.lbs[i] for x in targets] - grads = compute_gradient(uop, grad, target_uops) + grads = compute_gradient(uop, grad, set(target_uops)) ret = [] for x in target_uops: if (y:=grads.get(x)) is None: raise RuntimeError(f"{x}\n\nnot found in\n\n{uop}") @@ -931,13 +931,13 @@ class Tensor(SimpleMathTrait): return [Tensor(MultiLazyBuffer(list(u), cast(MultiLazyBuffer, t.lazydata).axis, cast(MultiLazyBuffer, t.lazydata).real), device=t.device) for t,u in zip(targets, zip(*rets))] - def _deepwalk(self): - def _walk(node, visited): + def _deepwalk(self) -> list[Tensor]: + def _walk(node:Tensor, visited:set[Tensor]): visited.add(node) # if tensor is not leaf, reset grad if (ctx := getattr(node, "_ctx", None)) is not None and len(ctx.parents) != 0: node.grad = None if ctx: - for i in node._ctx.parents: + for i in cast(Function, node._ctx).parents: if i not in visited: yield from _walk(i, visited) yield node return list(_walk(self, set())) @@ -965,12 +965,13 @@ class Tensor(SimpleMathTrait): self.grad = gradient for t0 in reversed(toposorted): if t0.grad is None: raise RuntimeError(f"tensor {t0} has no grad") - token = _METADATA.set(dataclasses.replace(md, backward=True) if (md := t0._ctx.metadata) is not None else None) - grads = t0._ctx.backward(t0.grad.lazydata) + ctx = cast(Function, t0._ctx) + token = _METADATA.set(dataclasses.replace(md, backward=True) if (md := ctx.metadata) is not None else None) + grads = ctx.backward(t0.grad.lazydata) _METADATA.reset(token) grads = [Tensor(g, device=self.device, requires_grad=False) if g is not None else None - for g in ([grads] if len(t0._ctx.parents) == 1 else grads)] - for t, g in zip(t0._ctx.parents, grads): + for g in ([grads] if len(ctx.parents) == 1 else grads)] + for t, g in zip(ctx.parents, grads): if g is not None and t.requires_grad: assert g.shape == t.shape, f"grad shape must match tensor shape, {g.shape!r} != {t.shape!r}" assert t.lazydata in toposort_uop or (isinstance(t.lazydata, MultiLazyBuffer) and any(x in toposort_uop for x in t.lazydata.lbs)), \