From e701106a64e235bd36c9ac92b52eb47f8679e244 Mon Sep 17 00:00:00 2001 From: chenyu Date: Wed, 8 Oct 2025 16:54:07 +0800 Subject: [PATCH] remove FUSE_ARANGE (#12511) it was the default already --- .github/workflows/test.yml | 4 ++-- examples/beautiful_cifar.py | 2 +- examples/hlb_cifar10.py | 1 - examples/mlperf/model_train.py | 2 +- extra/hcqfuzz/tests/bert.py | 1 - extra/torch_backend/test.py | 19 +++++++++---------- test/test_arange.py | 30 +++++++----------------------- test/test_nn.py | 11 +++++------ test/test_schedule.py | 34 +++++++++------------------------- test/test_stunning.py | 2 +- tinygrad/helpers.py | 2 +- tinygrad/schedule/kernelize.py | 4 ++-- 12 files changed, 38 insertions(+), 74 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5b3fe62e16..2740f62b7e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -144,7 +144,7 @@ jobs: sudo apt update || true sudo apt install -y --no-install-recommends ninja-build - name: Test beautiful_mnist in torch with TINY_BACKEND - run: SPLIT_REDUCEOP=0 FUSE_ARANGE=1 CPU=1 CPU_LLVM=1 TARGET_EVAL_ACC_PCT=96.0 TINY_BACKEND=1 python3 examples/other_mnist/beautiful_mnist_torch.py + run: CPU=1 CPU_LLVM=1 TARGET_EVAL_ACC_PCT=96.0 TINY_BACKEND=1 python3 examples/other_mnist/beautiful_mnist_torch.py - name: Test some torch tests (expect failure) run: python3 -m pytest extra/torch_backend/torch_tests.py -v --tb=no || true @@ -533,7 +533,7 @@ jobs: - name: Test LLVM=1 DEVECTORIZE=0 for model run: CPU=1 CPU_LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py - name: Test CPU=1 DEVECTORIZE=0 - run: CPU=1 CPU_LLVM=0 DEVECTORIZE=0 FUSE_ARANGE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure" + run: CPU=1 CPU_LLVM=0 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure" testdsp: name: Linux (DSP) diff --git a/examples/beautiful_cifar.py b/examples/beautiful_cifar.py index cea8262f17..5bc2fc87c3 100644 --- a/examples/beautiful_cifar.py +++ b/examples/beautiful_cifar.py @@ -10,7 +10,7 @@ GPUS = [f'{Device.DEFAULT}:{i}' for i in range(getenv("GPUS", 1))] # override tinygrad defaults dtypes.default_float = dtypes.half -Context(FUSE_ARANGE=1, FUSE_OPTIM=1).__enter__() +Context(FUSE_OPTIM=1).__enter__() # from https://github.com/tysam-code/hlb-CIFAR10/blob/main/main.py batchsize = getenv("BS", 1024) diff --git a/examples/hlb_cifar10.py b/examples/hlb_cifar10.py index 27fecf02d8..35ca8d352a 100644 --- a/examples/hlb_cifar10.py +++ b/examples/hlb_cifar10.py @@ -145,7 +145,6 @@ hyp = { }, } -@Context(FUSE_ARANGE=getenv("FUSE_ARANGE", 1)) def train_cifar(): def set_seed(seed): diff --git a/examples/mlperf/model_train.py b/examples/mlperf/model_train.py index 4b333918e3..db3767edd3 100644 --- a/examples/mlperf/model_train.py +++ b/examples/mlperf/model_train.py @@ -1309,7 +1309,7 @@ def train_llama3(): EVAL_BS = config["EVAL_BS"] = getenv("EVAL_BS", 16) EVAL_TARGET = config["EVAL_TARGET"] = getenv("EVAL_TARGET", 5.6) - # LR=1e-4 TRAIN_ON_VAL=1 DEFAULT_FLOAT=bfloat16 FUSE_ARANGE=1 JITBEAM=2 OPTIM_DTYPE=bfloat16 LLAMA3_SIZE=1B WARMUP_STEPS=36 DECAY_STEPS=360 SEQLEN=512 PYTHONPATH=. AMD=1 AMD_LLVM=0 MODEL=llama3 python3 examples/mlperf/model_train.py + # LR=1e-4 TRAIN_ON_VAL=1 DEFAULT_FLOAT=bfloat16 JITBEAM=2 OPTIM_DTYPE=bfloat16 LLAMA3_SIZE=1B WARMUP_STEPS=36 DECAY_STEPS=360 SEQLEN=512 PYTHONPATH=. AMD=1 AMD_LLVM=0 MODEL=llama3 python3 examples/mlperf/model_train.py # trains to 7 opt_adamw_beta_1 = 0.9 diff --git a/extra/hcqfuzz/tests/bert.py b/extra/hcqfuzz/tests/bert.py index 4514b74556..1ac72ac7c6 100644 --- a/extra/hcqfuzz/tests/bert.py +++ b/extra/hcqfuzz/tests/bert.py @@ -7,7 +7,6 @@ bert_train_params = { "GPUS": 6, "BS": 96, "EVAL_BS": 96, - "FUSE_ARANGE": 1, "BASEDIR": "/raid/datasets/wiki", } diff --git a/extra/torch_backend/test.py b/extra/torch_backend/test.py index 463eed6f2c..3348344f7f 100644 --- a/extra/torch_backend/test.py +++ b/extra/torch_backend/test.py @@ -227,16 +227,15 @@ class TestTorchBackend(unittest.TestCase): np.testing.assert_equal(result.cpu().numpy(), [3., 3., 2.]) def test_mnist_index(self): - with Context(FUSE_ARANGE=1, SPLIT_REDUCEOP=0): - GlobalCounters.reset() - from tinygrad.nn.datasets import mnist - X_train, Y_train, _, _ = mnist() - X_train = torch.tensor(X_train.float().numpy(), device=device) - Y_train = torch.tensor(Y_train.cast('int64').numpy(), device=device) - samples = torch.randint(0, X_train.shape[0], (32,)) - X,Y = X_train[samples], Y_train[samples] - X.cpu(), Y.cpu() - self.assertLessEqual(GlobalCounters.global_ops, 10_000_000) + GlobalCounters.reset() + from tinygrad.nn.datasets import mnist + X_train, Y_train, _, _ = mnist() + X_train = torch.tensor(X_train.float().numpy(), device=device) + Y_train = torch.tensor(Y_train.cast('int64').numpy(), device=device) + samples = torch.randint(0, X_train.shape[0], (32,)) + X,Y = X_train[samples], Y_train[samples] + X.cpu(), Y.cpu() + self.assertLessEqual(GlobalCounters.global_ops, 10_000_000) def _test_diagonal(self, *shape): a = torch.randn(*shape, dtype=torch.float32, device=device) diff --git a/test/test_arange.py b/test/test_arange.py index a46b38a087..3f31b71303 100644 --- a/test/test_arange.py +++ b/test/test_arange.py @@ -25,22 +25,6 @@ class TestArange(unittest.TestCase): t = Tensor.arange(2, dtype=dtypes.int)+Tensor([3]) self.assertEqual(t.cat(t).tolist(), [3, 4, 3, 4]) -class TestRand(unittest.TestCase): - def test_fused_rand_less_ops(self, noopt=1): - GlobalCounters.reset() - with Context(FUSE_ARANGE=0, NOOPT=noopt): - out = Tensor.rand(16384) - out.realize() - unfused_ops = GlobalCounters.global_ops - - GlobalCounters.reset() - with Context(FUSE_ARANGE=1, NOOPT=noopt): - out = Tensor.rand(16384) - out.realize() - print(f"fused {GlobalCounters.global_ops} unfused {unfused_ops}") - self.assertLessEqual(GlobalCounters.global_ops, unfused_ops*2) - def test_fused_rand_less_ops_opt(self): self.test_fused_rand_less_ops(0) - DSET, DDIM = 2048, 32 class TestIndexing(unittest.TestCase): @@ -48,7 +32,7 @@ class TestIndexing(unittest.TestCase): needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous() needle[1337] = 1 needle.realize() - with Context(NOOPT=1, FUSE_ARANGE=1): + with Context(NOOPT=1): GlobalCounters.reset() out = ((Tensor.arange(1,16385)-1)*needle).sum() sched = out.schedule() @@ -61,7 +45,7 @@ class TestIndexing(unittest.TestCase): idxs = Tensor([0,3,5,6]).realize() real_index = dataset.numpy()[idxs.numpy()] print("*** indexing ***") - with Context(NOOPT=1, FUSE_ARANGE=1): + with Context(NOOPT=1): GlobalCounters.reset() rng = Tensor.ones(4, DDIM, DSET, dtype=dtypes.int)._cumalu(axis=-1, op=Ops.ADD, _include_initial=True).reshape(4, DDIM, DSET, 1) idxs = idxs.reshape(4,1,1,1).expand(4, DDIM, DSET, 1) @@ -77,7 +61,7 @@ class TestIndexing(unittest.TestCase): def test_index_variable(self): dataset = Tensor.rand(DSET, DDIM).realize() v = Variable("v", 0, DDIM-1) - with Context(NOOPT=1, FUSE_ARANGE=1, SPLIT_REDUCEOP=0): + with Context(NOOPT=1): GlobalCounters.reset() vb = Tensor(v.bind(12)) comp = dataset[vb].numpy() @@ -106,7 +90,7 @@ class TestIndexing(unittest.TestCase): idxs = Tensor([0,3,5,6]).realize() real_index = dataset.numpy()[idxs.numpy()] print("*** indexing ***") - with Context(NOOPT=noopt, FUSE_ARANGE=1): + with Context(NOOPT=noopt): GlobalCounters.reset() X = dataset[idxs] assert X.shape == (4,DDIM) @@ -121,7 +105,7 @@ class TestIndexing(unittest.TestCase): def test_index_fused_out_of_bounds(self): dataset = Tensor.rand(256, 256).realize() idxs = Tensor([-19238, -257, 256, 495, 10982377]).realize() - with Context(NOOPT=1, FUSE_ARANGE=1): + with Context(NOOPT=1): X = dataset[idxs] np.testing.assert_equal(X.numpy(), 0) @@ -130,7 +114,7 @@ class TestIndexing(unittest.TestCase): if Device.DEFAULT == "WEBGPU": op_limit *= 15 from tinygrad.nn.datasets import mnist X_train, Y_train, _, _ = mnist() - with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=split_reduceop): + with Context(NOOPT=noopt, SPLIT_REDUCEOP=split_reduceop): samples = Tensor.randint(getenv("BS", 512), high=X_train.shape[0]).realize() GlobalCounters.reset() x = X_train[samples].numpy() @@ -150,7 +134,7 @@ class TestIndexing(unittest.TestCase): # TODO: why is a new realize needed here emb_w = emb.weight.realize().numpy() x = Tensor([1,2,3,4]) - with Context(NOOPT=noopt, FUSE_ARANGE=1): + with Context(NOOPT=noopt): GlobalCounters.reset() z = emb(x).realize() self.assertLessEqual(GlobalCounters.global_ops, op_limit) diff --git a/test/test_nn.py b/test/test_nn.py index ddada1eccb..00fcf70291 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -447,11 +447,11 @@ class TestNN(unittest.TestCase): # TODO: fused with opts uses more ops def test_embedding_one_kernel_fused(self): - with Context(FUSE_ARANGE=1, NOOPT=0): + with Context(NOOPT=0): self.test_embedding_one_kernel(ops=612_000, kcount=2) def test_embedding_one_kernel_fused_noopt(self): - with Context(FUSE_ARANGE=1, NOOPT=1): + with Context(NOOPT=1): self.test_embedding_one_kernel(ops=0, kcount=2) def test_embedding_shape(self): @@ -465,10 +465,9 @@ class TestNN(unittest.TestCase): def test_embedding_regression(self): # used to fail bounds check - with Context(FUSE_ARANGE=1): - embedding = Embedding(100, 1024) - input_ids = Tensor.empty(16, 16, dtype=dtypes.int) - embedding(input_ids).realize() + embedding = Embedding(100, 1024) + input_ids = Tensor.empty(16, 16, dtype=dtypes.int) + embedding(input_ids).realize() def test_load_state_dict(self): layer = Conv2d(3, 5, kernel_size=3) diff --git a/test/test_schedule.py b/test/test_schedule.py index 83e312a691..0a0531c28f 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -83,33 +83,30 @@ class TestSchedule(unittest.TestCase): np.testing.assert_allclose(t.numpy(), torch_out) def test_arange_avgpool2d_fused_noopt(self): - with Context(FUSE_ARANGE=1, NOOPT=1): self.test_arange_avgpool2d(kcount=1) + with Context(NOOPT=1): self.test_arange_avgpool2d(kcount=1) # linearizer error @unittest.skip("recursion error no longer raised") @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "needs supports_float4 to fail") def test_arange_avgpool2d_fused(self): with self.assertRaises(RecursionError): - with Context(FUSE_ARANGE=1, NOOPT=0): self.test_arange_avgpool2d(kcount=1) + with Context(NOOPT=0): self.test_arange_avgpool2d(kcount=1) # when we're fusing a reduce, all ReduceOps must have the same N in the dimensions # all permutes, reshapes, expands and shrinks push through the reduce def test_arange_sum(self): a = Tensor.arange(6).reshape(3, 2).sum(axis=1) - with Context(FUSE_ARANGE=1): - run_schedule(check_schedule(a, 1)) + run_schedule(check_schedule(a, 1)) self.assertListEqual(a.tolist(), [1, 5, 9]) def test_arange_sum_alt(self): a = (Tensor.arange(5).reshape(1,5).expand(6,5)*Tensor(2)).reshape(1,6,5).sum(axis=2) - with Context(FUSE_ARANGE=1): - run_schedule(check_schedule(a, 1)) + run_schedule(check_schedule(a, 1)) np.testing.assert_equal(a.numpy(), 20) def test_permute_arange(self): a = Tensor.arange(6).reshape(6, 1, 1).permute(2, 0, 1).sum(axis=1) - with Context(FUSE_ARANGE=1): - run_schedule(check_schedule(a, 1)) + run_schedule(check_schedule(a, 1)) self.assertListEqual(a.tolist(), [[15]]) @unittest.skipIf(Device.DEFAULT == "CPU", "devices must mismatch") @@ -137,8 +134,7 @@ class TestSchedule(unittest.TestCase): def test_indexing_scalars_simple(self): X = Tensor.randn(2, 2).realize() xt = X[Tensor(1)][Tensor(0)] - with Context(FUSE_ARANGE=1): - run_schedule(check_schedule(xt, 2)) + run_schedule(check_schedule(xt, 2)) np.testing.assert_equal(xt.numpy(), X.numpy()[1][0]) @unittest.skipIf(CI and Device.DEFAULT == "NV", "crashes on NV CI") @@ -158,8 +154,7 @@ class TestSchedule(unittest.TestCase): assume(a