remove trivial use of RANGEIFY flag (#12550)
some tests need update still
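The cleanup is mechanical: with rangeify treated as the default scheduling path, the explicit Context opt-in and the per-mode branches collapse. A rough sketch of the pattern, for orientation only (names such as c and allowed are taken from the hunks below, not new code in the tree):

  # before: opt in through the context variable and branch on the flag
  from tinygrad.helpers import Context, RANGEIFY
  with Context(RANGEIFY=1):
    sink = c.schedule()[-1].ast
  allowed = 1 if RANGEIFY else 2  # expected kernel count differed per mode

  # after: rangeify behaviour is assumed, so no flag and no branch
  sink = c.schedule()[-1].ast
  allowed = 1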
@@ -80,7 +80,6 @@ print("******** third, the UOp ***********")
from tinygrad.engine.realize import run_schedule
from tinygrad.engine.schedule import create_schedule_with_vars
-from tinygrad.helpers import RANGEIFY
from tinygrad.schedule.rangeify import get_rangeify_map

# allocate some values + load in values
@@ -49,8 +49,7 @@ def rangeify_kernel3():
b = Tensor.empty(N,N)
c = a@b
#c = c.reshape((32,2,16,4,32,2,16,4)).contiguous()
-with Context(RANGEIFY=1):
-  sink = c.schedule()[-1].ast
+sink = c.schedule()[-1].ast
#print(sink)

opts = [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.UPCAST, 0, 2)]
@@ -329,7 +328,7 @@ if __name__ == "__main__":
elif HL == 1: hprg = hl_spec_kernel3()
else: hprg = hand_spec_kernel3()
if HL == 3:
-with Context(RANGEIFY=1, BLOCK_REORDER=0):
+with Context(BLOCK_REORDER=0):
prg = get_program(hprg, Device.default.renderer)
else:
prg = get_program(hprg, Device.default.renderer)
test/external/external_test_opt.py
@@ -4,7 +4,7 @@ import numpy as np
import torch

from tinygrad import GlobalCounters, Tensor, Device
-from tinygrad.helpers import getenv, RANGEIFY
+from tinygrad.helpers import getenv
from tinygrad.nn.state import get_parameters
from tinygrad.engine.realize import capturing
from tinygrad.tensor import _to_np_dtype
@@ -164,7 +164,7 @@ class TestOpt(unittest.TestCase):

def test_permute_was_pushed(self):
a = Tensor.randn(16, 16, 16)
-with CLCache(1 if RANGEIFY else 2):
+with CLCache(1):
c = a.sum(2)
d = c.permute(1,0).contiguous()
d.realize()
@@ -172,7 +172,7 @@ class TestOpt(unittest.TestCase):

def test_permute_was_pushed_through_contract_reshape(self):
a = Tensor.randn(4, 4, 4, 4, 4)
-with CLCache(1 if RANGEIFY else 2):
+with CLCache(1):
c = a.sum(-1)
d = c.reshape(16,16).permute(1,0).contiguous()
d.realize()
@@ -180,7 +180,7 @@ class TestOpt(unittest.TestCase):

def test_permute_was_pushed_through_contractw1s_reshape(self):
a = Tensor.randn(4, 4, 4, 4, 4)
-with CLCache(1 if RANGEIFY else 2):
+with CLCache(1):
c = a.sum(-1)
d = c.reshape(16,1,16).permute(2,1,0).contiguous()
d.realize()
@@ -188,7 +188,7 @@ class TestOpt(unittest.TestCase):

def test_permute_was_pushed_through_expand_reshape(self):
a = Tensor.randn(16, 16, 16)
-with CLCache(1 if RANGEIFY else 2):
+with CLCache(1):
c = a.sum(2)
d = c.reshape(4,4,4,4).permute(2,3,0,1).contiguous()
d.realize()
@@ -220,7 +220,7 @@ class TestOpt(unittest.TestCase):
for axis in [0, 1]:
for n in [4, 8, 16]:
b = torch.ones(n, n).sum(axis).reshape(n, 1).expand(n, n).sum(axis)
-with CLCache(allowed=3 if RANGEIFY else 2):
+with CLCache(allowed=3):
a = Tensor.ones(n, n).contiguous().sum(axis).reshape(n, 1).expand(n, n).sum(axis)
a.realize()
np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)
@@ -229,7 +229,7 @@ class TestOpt(unittest.TestCase):
axis1, axis2 = 0, 1
for n in [4, 8, 16]:
b = torch.ones(n, n).sum(axis1).reshape(n, 1).expand(n, n).sum(axis2)
-with CLCache(allowed=3 if RANGEIFY else 2):
+with CLCache(allowed=3):
a = Tensor.ones(n, n).contiguous().sum(axis1).reshape(n, 1).expand(n, n).sum(axis2)
a.realize()
np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)
@@ -1,4 +1,4 @@
-import time, struct, unittest
+import time, struct
from typing import Any, Callable
import numpy as np
from tinygrad import Tensor, dtypes, Device
@@ -7,7 +7,7 @@ from tinygrad.tensor import _to_np_dtype
from tinygrad.engine.realize import Runner
from tinygrad.dtype import DType
from tinygrad.nn.state import get_parameters
-from tinygrad.helpers import T, CI, RANGEIFY
+from tinygrad.helpers import T, CI
from tinygrad.codegen import full_rewrite
from tinygrad.runtime.ops_python import PythonProgram, PythonRenderer, PythonCompiler
@@ -62,6 +62,3 @@ def not_support_multi_device():

# NOTE: This will open REMOTE if it's the default device
REAL_DEV = (Device.DEFAULT if Device.DEFAULT != "REMOTE" else Device['REMOTE'].properties.real_device)
-
-def expect_rangeify_fails(fxn): return (unittest.expectedFailure if RANGEIFY else (lambda f:f))(fxn)
-def expect_nonrangeify_fails(fxn): return (unittest.expectedFailure if not RANGEIFY else (lambda f:f))(fxn)
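With RANGEIFY assumed truthy, the two helpers removed above lose their purpose: expect_rangeify_fails reduces to unittest.expectedFailure and expect_nonrangeify_fails to a no-op, which is how the later hunks rewrite their call sites. A minimal sketch of that equivalence (RANGEIFY stubbed here for illustration):

  import unittest
  RANGEIFY = 1  # stand-in for tinygrad.helpers.RANGEIFY, assumed truthy
  def expect_rangeify_fails(fxn): return (unittest.expectedFailure if RANGEIFY else (lambda f:f))(fxn)
  def expect_nonrangeify_fails(fxn): return (unittest.expectedFailure if not RANGEIFY else (lambda f:f))(fxn)
  # with RANGEIFY truthy these behave as:
  #   expect_rangeify_fails    -> unittest.expectedFailure (kept as @unittest.expectedFailure)
  #   expect_nonrangeify_fails -> identity decorator (simply dropped at call sites)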
@@ -1,6 +1,6 @@
import unittest
from tinygrad import Device, Tensor, dtypes
-from tinygrad.helpers import CI, RANGEIFY
+from tinygrad.helpers import CI
from tinygrad.codegen.opt import Opt, OptOps, KernelOptError

# TODO: write a clean version of this
@@ -351,7 +351,6 @@ class TestKernelOpts(unittest.TestCase):
] + [[Opt(OptOps.THREAD, 0, 4)] if Device[Device.DEFAULT].renderer.global_max[0] >= 4 else []]
+ [[Opt(OptOps.THREAD, 0, 8)] if Device[Device.DEFAULT].renderer.global_max[0] >= 8 else []])

-@unittest.skipUnless(RANGEIFY>=1, "Kernel only fuses with rangeify")
def test_double_sum_group(self):
a = Tensor.rand(4, 4, 4)
r = a.sum((1, 2)).sum()
@@ -1,7 +1,7 @@
import unittest
import numpy as np
from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device, Variable
-from tinygrad.helpers import CI, Context, getenv, RANGEIFY
+from tinygrad.helpers import CI, Context, getenv
from tinygrad.engine.realize import run_schedule
from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program
from tinygrad.uop.ops import Ops
@@ -95,7 +95,7 @@ class TestIndexing(unittest.TestCase):
X = dataset[idxs]
assert X.shape == (4,DDIM)
sched = X.schedule()
-self.assertEqual(len(sched), 1 if RANGEIFY else 2)
+self.assertEqual(len(sched), 1)
run_schedule(sched)
assert GlobalCounters.global_ops < 4*DSET, f"too many ops {GlobalCounters.global_ops} != {4*DSET}"
np.testing.assert_allclose(real_index, X.numpy())
@@ -1,6 +1,5 @@
#!/usr/bin/env python
import unittest
-import contextlib
import numpy as np
from tinygrad import dtypes, Tensor, TinyJit, GlobalCounters, Variable
from tinygrad.device import is_dtype_supported
@@ -271,8 +270,6 @@ class TestAssign(unittest.TestCase):
b.assign(a.contiguous()).realize()
assert GlobalCounters.kernel_count - kc == 2

-# passing in RANGEIFY=1, RANGEIFY=0 asserts permuted assigns it can't fuse
-def assert_permuted_assign(self): return self.assertRaisesRegex(RuntimeError, "contiguous") if not RANGEIFY else contextlib.nullcontext()
def test_permuted_assignment(self):
a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
@@ -280,14 +277,13 @@
b.realize()
ba1 = a.uop.base.realized
bb1 = b.uop.base.realized
-with self.assert_permuted_assign():
-  a = a.permute(1,0)
-  a += b
-  a.realize()
-  ba2 = a.uop.base.realized
-  np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))
-  # permute and base are the same buffer
-  assert ba1 == ba2 and ba1 != bb1
+a = a.permute(1,0)
+a += b
+a.realize()
+ba2 = a.uop.base.realized
+np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))
+# permute and base are the same buffer
+assert ba1 == ba2 and ba1 != bb1

def test_post_permuted_assignment(self):
a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
@@ -297,13 +293,12 @@
#GlobalCounters.cache = []
ba1 = a.uop.base.realized # noqa: F841
bb1 = b.uop.base.realized # noqa: F841
-with self.assert_permuted_assign():
-  a.assign(a.permute(1,0) + b) # this should not work!
-  a.realize()
-  ba2 = a.uop.base.realized # noqa: F841
-  # NOTE: don't test that it's assigned
-  #assert ba1 == ba2 and ba1 != bb1
-  np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))
+a.assign(a.permute(1,0) + b) # this should not work!
+a.realize()
+ba2 = a.uop.base.realized # noqa: F841
+# NOTE: don't test that it's assigned
+#assert ba1 == ba2 and ba1 != bb1
+np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))

@unittest.skipUnless(RANGEIFY, "only correct in rangeify")
def test_post_permuted_assignment_alt(self):
@@ -345,21 +340,18 @@
def test_permuted_assignment_correct(self):
a = Tensor.arange(4 * 4).reshape(4, 4).contiguous().realize()
b = Tensor.arange(4 * 4).reshape(4, 4).contiguous().realize()
-# TODO: swizzler.py limitation, should NOT raise AssertionError from numpy.
-with self.assert_permuted_assign():
-  a = a.permute(1, 0)
-  new_val = a + b
-  a.assign(new_val)
-  np.testing.assert_equal(a.numpy(), np.arange(4 * 4).reshape(4, 4).transpose(1, 0) + np.arange(4 * 4).reshape(4, 4))
+a = a.permute(1, 0)
+new_val = a + b
+a.assign(new_val)
+np.testing.assert_equal(a.numpy(), np.arange(4 * 4).reshape(4, 4).transpose(1, 0) + np.arange(4 * 4).reshape(4, 4))

def test_permuted_reduceop_child_dual_use(self):
a = Tensor.randn(32, 32, 32).realize()
b = Tensor.full((32, 32), 1.).contiguous().realize()
-with self.assert_permuted_assign():
-  r = a.sum(axis=1)
-  b.assign(r + b.permute(1, 0))
-  b.realize()
-  np.testing.assert_allclose(b.numpy(), a.numpy().sum(axis=1)+np.ones((32, 32)).transpose(1, 0), atol=1e-6, rtol=1e-3)
+r = a.sum(axis=1)
+b.assign(r + b.permute(1, 0))
+b.realize()
+np.testing.assert_allclose(b.numpy(), a.numpy().sum(axis=1)+np.ones((32, 32)).transpose(1, 0), atol=1e-6, rtol=1e-3)

@unittest.skip("multi output not supported anymore")
def test_permuted_reduceop_multioutput_dual_use(self):
@@ -401,11 +393,10 @@

def test_permuted_assignment_masked_view_not_contiguous(self):
a = Tensor.ones(4, 4).contiguous().realize()
-with self.assert_permuted_assign():
-  b = a.shrink((None, (0, 2))).pad((None, (0, 2)), value=2).permute(1, 0)
-  a.assign(a + b)
-  a.realize()
-  self.assertListEqual(a.tolist(), [[2.,2.,2.,2.],[2.,2.,2.,2.],[3.,3.,3.,3.], [3.,3.,3.,3.]])
+b = a.shrink((None, (0, 2))).pad((None, (0, 2)), value=2).permute(1, 0)
+a.assign(a + b)
+a.realize()
+self.assertListEqual(a.tolist(), [[2.,2.,2.,2.],[2.,2.,2.,2.],[3.,3.,3.,3.], [3.,3.,3.,3.]])

# TODO: is there a way to sneak in a permute such that it returns the wrong answer?
@@ -3164,8 +3164,8 @@ class TestOps(unittest.TestCase):
helper_test_op([(32,10)], lambda x: x.masked_fill((x>0.1).detach(), -math.inf))
helper_test_op([(32,10)], lambda x: x.masked_fill((x<0.1).detach(), -math.inf))

-@unittest.skipIf(RANGEIFY and (getenv("MOCKGPU") or Device.DEFAULT == "PYTHON"), "very slow on MOCKGPU because reduce does not fold")
-@unittest.skipIf(RANGEIFY and Device.DEFAULT == "WEBGPU", "webgpu runtime issue")
+@unittest.skipIf((getenv("MOCKGPU") or Device.DEFAULT == "PYTHON"), "very slow on MOCKGPU because reduce does not fold")
+@unittest.skipIf(Device.DEFAULT == "WEBGPU", "webgpu runtime issue")
def test_masked_select(self):
helper_test_op([(32, 10)], lambda x: x.masked_select(x>0.5), lambda x: x.masked_select(x>0.5), forward_only=True)
helper_test_op([(32, 10)], lambda x: x.masked_select(torch.tensor(True)), lambda x: x.masked_select(Tensor(True)), forward_only=True)
@@ -1,9 +1,8 @@
import unittest
from tinygrad import Tensor, nn
-from tinygrad.helpers import RANGEIFY, Context, GlobalCounters
+from tinygrad.helpers import Context, GlobalCounters
from tinygrad.uop.ops import UOp, graph_rewrite, PatternMatcher, UPat, Ops

-@unittest.skipIf(RANGEIFY<1, "tests only for RANGEIFY")
class TestRangeifyAssign(unittest.TestCase):
def test_assign_permuted(self):
A = Tensor.empty(4, 4, dtype='int')
@@ -55,7 +54,6 @@ class TestRangeifyOpt(unittest.TestCase):
A = Tensor.empty(8,8,8,8).permute(1,0,3,2).flatten()
A.sum().realize()

-@unittest.skipIf(RANGEIFY<1, "tests only for RANGEIFY")
class TestRangeify(unittest.TestCase):
def test_groupnorm(self):
# ranges 1 and 3 are merging
@@ -230,7 +228,6 @@ class TestRangeify(unittest.TestCase):
# contiguous + reduce can support ranges?

@unittest.skip("okay to disable this for now")
-@unittest.skipIf(RANGEIFY<1, "tests only for RANGEIFY")
class TestOuterworld(unittest.TestCase):
def test_passthrough_range(self):
t = Tensor.rand(10, 10).realize()
@@ -17,7 +17,6 @@ from tinygrad.helpers import CI, DEBUG, SPLIT_REDUCEOP, GlobalCounters, Context,
from tinygrad.schedule.rangeify import get_rangeify_map, Kernel
from tinygrad.engine.schedule import create_schedule_with_vars
from tinygrad.engine.realize import CompiledRunner, run_schedule, lower_schedule
-from test.helpers import expect_rangeify_fails, expect_nonrangeify_fails

class KernelCountException(Exception): pass
def check_schedule(t:Tensor|list[Tensor]|UOp, allowed:int, to_prerealize:list[Tensor]|None=None, filter_sink=True):
@@ -117,7 +116,7 @@ class TestSchedule(unittest.TestCase):
a = Tensor.empty(10)
b = Tensor.empty((1,), device="CPU").expand(10).contiguous()
c = a+b
-with self.assertRaisesRegex(RuntimeError, "all buffers must be on the same device"): check_schedule(c, 2 if RANGEIFY else 1)
+with self.assertRaisesRegex(RuntimeError, "all buffers must be on the same device"): check_schedule(c, 2)

@unittest.skipUnless(is_dtype_supported(dtypes.half) and getenv("CAST_AFTER_EXPAND"), "need half and CAST_AFTER_EXPAND=1")
@unittest.skip("CAST_AFTER_EXPAND is not supported")
@@ -343,7 +342,7 @@ class TestSchedule(unittest.TestCase):
r1 = (x - r0).sum(axis=0).div(2)
out0 = r0 + y
out1 = r1 + y
-schedule = check_schedule([out0, out1], 2 if RANGEIFY else 4)
+schedule = check_schedule([out0, out1], 2)
reduceops = [x for si in schedule for x in si.ast.toposort() if x.op in {Ops.REDUCE_AXIS, Ops.REDUCE}]
assert len(reduceops) in [2,3] # why is RANGEIFY different?
@@ -712,7 +711,7 @@ class TestSchedule(unittest.TestCase):
check_schedule(b, 0)
self.assertEqual(b.item(), 1)

-@expect_rangeify_fails
+@unittest.expectedFailure
def test_multioutput_ast(self):
a = Tensor.zeros(1, dtype=dtypes.int).contiguous().realize().uop
b = Tensor.zeros(1, dtype=dtypes.int).contiguous().realize().uop
@@ -919,7 +918,7 @@ class TestSchedule(unittest.TestCase):
out0 = a.sum() + 2
out1 = a.sum() + 4
out2 = out0 * out1
-run_schedule(check_schedule([out0, out1, out2], 1 if RANGEIFY else 4))
+run_schedule(check_schedule([out0, out1, out2], 1))
np.testing.assert_allclose(out0.numpy(), out0_np:=a.numpy().sum()+2, atol=1e-4, rtol=1e-6)
np.testing.assert_allclose(out1.numpy(), out1_np:=a.numpy().sum()+4, atol=1e-4, rtol=1e-6)
np.testing.assert_allclose(out2.numpy(), out0_np*out1_np, atol=1e-4, rtol=1e-6)
@@ -930,7 +929,7 @@ class TestSchedule(unittest.TestCase):
out0 = a.sum().exp2()
# out1 has two paths to a.sum()
out1 = a.sum() + out0
-run_schedule(check_schedule([out0, out1], 1 if RANGEIFY else 3))
+run_schedule(check_schedule([out0, out1], 1))
np.testing.assert_allclose(out0.numpy(), out0_np:=np.exp2(a.numpy().sum()), atol=1e-4, rtol=1e-4)
np.testing.assert_allclose(out1.numpy(), a.numpy().sum()+out0_np, atol=1e-4, rtol=1e-6)
@@ -1022,7 +1021,7 @@ class TestSchedule(unittest.TestCase):
b = Tensor.empty(10,)
c = a.sum() + b[0]
d = a.sum() + 2
-check_schedule([c, d], 1 if RANGEIFY else 3)
+check_schedule([c, d], 1)

def test_reduce_multiple_paths_midshrink(self):
a = Tensor.empty(4, 4)
@@ -1186,14 +1185,14 @@ class TestSchedule(unittest.TestCase):
np.testing.assert_allclose(out.numpy(), expected, atol=1e-4, rtol=1e-4)

@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
-@expect_rangeify_fails
+@unittest.expectedFailure
def test_softmax_upcast(self):
# input half, softmax in float
Tensor.manual_seed(0)
x = Tensor.randn(4, 12, 64, 64, dtype=dtypes.half).realize()
out = x.softmax(dtype=dtypes.float)
sched = out.schedule()
-self.assertEqual(len(sched), 2 if RANGEIFY else 3)
+self.assertEqual(len(sched), 2)
self.assertEqual(sched[0].bufs[0].dtype, dtypes.half)

# input float, softmax in float
@@ -1323,7 +1322,7 @@ class TestSchedule(unittest.TestCase):
check_schedule(opt.schedule_step(), 14)

@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
-@expect_rangeify_fails
+@unittest.expectedFailure
def test_prefer_half_buffer(self):
x = Tensor.ones(4).contiguous().realize()
# y = Tensor.ones(4).contiguous().realize()
@@ -1475,7 +1474,7 @@ class TestSchedule(unittest.TestCase):
e = c * d
f = b.sum() - e
# run_schedule(check_schedule([c, d, e, f], 1))
-run_schedule(check_schedule([c, d, e, f], 2 if RANGEIFY else 5))
+run_schedule(check_schedule([c, d, e, f], 2))
np.testing.assert_allclose(c.numpy(), c_np:=a.numpy().sum()+2, atol=1e-4, rtol=1e-4)
np.testing.assert_allclose(d.numpy(), d_np:=a.numpy().sum()*2, atol=1e-4, rtol=1e-4)
np.testing.assert_allclose(e.numpy(), e_np:=c_np*d_np, atol=1e-4, rtol=1e-4)
@@ -1690,7 +1689,7 @@ class TestSchedule(unittest.TestCase):
def test_late_fusion_post_expand(self):
self._test_fusion([(32, 32)], lambda a:a-a.sum(1), 2)

-@expect_rangeify_fails
+@unittest.expectedFailure
def test_cast_padded_view(self):
a = Tensor.arange(4).reshape(1, 4)
casted_view = a.pad(((0, 1), (0, 0))).cast(dtypes.float)
@@ -1720,7 +1719,7 @@ class TestSchedule(unittest.TestCase):
self.assertListEqual(realized_const_view.tolist(), [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]])

@given(strat.sampled_from(dtypes.all), strat.sampled_from(dtypes.all))
-@expect_rangeify_fails
+@unittest.expectedFailure
def test_cast_padded_const(self, dt1, dt2):
assume(is_dtype_supported(dt1) and is_dtype_supported(dt2))
a = Tensor(1, dtype=dt1).reshape(1, 1).pad(((1, 1), None))
@@ -1891,9 +1890,7 @@ class TestSchedule(unittest.TestCase):
tst = x.shrink((None, (0, 2))).assign(a).realize()
xref[:, :2] = np.arange(8).reshape(4, 2)+y.numpy()
np.testing.assert_equal(x.numpy(), xref)
-if RANGEIFY > 0:
-  # NOTE: this is a bug on non rangeify
-  np.testing.assert_equal(tst.numpy(), a.numpy())
+np.testing.assert_equal(tst.numpy(), a.numpy())

def test_setitem_sched(self, mop=lambda x:x, expected_kcount=1):
a = Tensor.arange(16, device="CPU").reshape(4, 4).contiguous().realize()
@@ -1904,7 +1901,6 @@ class TestSchedule(unittest.TestCase):
run_schedule(sched)
self.assertListEqual(a.tolist(), expected)
self.assertEqual(kcount, expected_kcount)
-@unittest.skipUnless(RANGEIFY>0, "this asserts on non rangeify")
def test_setitem_permuted_sched(self): self.test_setitem_sched(lambda x: x.T, 2)
def test_setitem_paddded_sched(self): self.test_setitem_sched(lambda x: x.shrink_to(4, 1).pad_to(4, 4), 1)
@@ -1943,7 +1939,7 @@ class TestSchedule(unittest.TestCase):
r = (X+Tensor.arange(16).reshape(4, 4)).sum()
out0 = r+2
out1 = r+3
-run_schedule(check_schedule([out0, out1], 1 if RANGEIFY else 3))
+run_schedule(check_schedule([out0, out1], 1))
r_ref = (X.numpy()+np.arange(16).reshape(4, 4)).sum()
np.testing.assert_allclose(out0.numpy(), r_ref+2, rtol=2e-7)
np.testing.assert_allclose(out1.numpy(), r_ref+3, rtol=2e-7)
@@ -2088,7 +2084,7 @@ class TestView(unittest.TestCase):
run_schedule(sched)
np.testing.assert_equal(b.numpy(), 0)

-@expect_rangeify_fails
+@unittest.expectedFailure
def test_mask_dim_1(self):
# mask out dim = 1 works too
a = Tensor.rand(10, 10).realize()
@@ -2236,7 +2232,6 @@ class TestCopyFolding(unittest.TestCase):
b.realize()
self.assertListEqual(b.tolist(), [[0, 2], [1, 3]])

-@expect_nonrangeify_fails
def test_permute_on_disk_contiguous(self):
with open(temp('dt_arange_4_permute'), "wb") as f: f.write(Tensor.arange(4).realize().uop.base.buffer.as_buffer())
a = Tensor.empty(4, dtype=dtypes.int32, device=f"disk:{temp('dt_arange_4_permute')}")
@@ -2251,8 +2246,6 @@ class TestCopyFolding(unittest.TestCase):
self.assertListEqual(b.tolist(), [[0, 2], [1, 3]])

# NOTE: disk permute must come after COPY
-# TODO: this is wrong because of the permute
-@expect_nonrangeify_fails
def test_permute_after_shrink_on_disk(self):
with open(temp('dt_arange_5_permute'), "wb") as f: f.write(Tensor.arange(5).realize().uop.base.buffer.as_buffer())
a = Tensor.empty(5, dtype=dtypes.int32, device=f"disk:{temp('dt_arange_5_permute')}")
@@ -2396,12 +2389,8 @@ class TestUOpBecome(unittest.TestCase):
a = Tensor.empty(4, 1)
b = a.expand(4, 4).reciprocal()
check_schedule(b, 1)
-if RANGEIFY:
-  self.assertEqual(b.uop.base.buffer.size, 4)
-  self.assertEqual(b.uop.shape, (4, 4))
-  return
-self.assertEqual(b.uop.base.buffer.size, 16)
-self.assertEqual(b.uop.st, ShapeTracker.from_shape((4, 4)))
+self.assertEqual(b.uop.base.buffer.size, 4)
+self.assertEqual(b.uop.shape, (4, 4))

def test_reorder_expand_alt(self):
x = Tensor.empty(4, 1)
@@ -2410,7 +2399,7 @@ class TestUOpBecome(unittest.TestCase):
z = (img*x) / y
check_schedule(z, 1)

-@expect_rangeify_fails
+@unittest.expectedFailure
def test_become_existing_buffer(self):
a = Tensor.empty(4, 4)
b = a*1
@@ -2444,7 +2433,7 @@ class TestUOpBecome(unittest.TestCase):
assert UPat(Ops.CONST, arg=3).match(const_add.uop.base, {})

# tensors can become another realized tensor source
-@expect_rangeify_fails
+@unittest.expectedFailure
def test_become_existing_buf_simple(self):
a = Tensor.empty(4, 4)
b = a+0
@@ -2453,14 +2442,14 @@ class TestUOpBecome(unittest.TestCase):
self.assertIs(a.uop, b.uop)

# they can also chain other movement ops on top of the tensor source
-@expect_rangeify_fails
+@unittest.expectedFailure
def test_become_existing_buf_view(self):
a = Tensor.empty(4, 4)
b = a.permute((1, 0))+0
check_schedule(b, 0)
self.assertEqual(b.uop.st, a.uop.permute((1, 0)).st)

-@expect_rangeify_fails
+@unittest.expectedFailure
def test_become_existing_buf_view_alt(self):
a = Tensor.empty(4, 4)
b = a.permute((1, 0)).reshape((8, 2))+0
@@ -2468,7 +2457,7 @@ class TestUOpBecome(unittest.TestCase):
self.assertEqual(b.uop.st, a.uop.permute((1, 0)).reshape((8, 2)).st)

# they can also have other base parents that simplified, in that case we just backtrack to the chained mops
-@expect_rangeify_fails
+@unittest.expectedFailure
def test_become_existing_buf_complex(self):
a = Tensor.empty(4, 4)
b = (a.permute((1, 0))+0).reshape((8, 2))+0
@@ -2476,7 +2465,7 @@ class TestUOpBecome(unittest.TestCase):
self.assertEqual(b.uop.st, a.uop.permute((1, 0)).reshape((8, 2)).st)
assert b.uop.base.op is Ops.BUFFER

-@expect_rangeify_fails
+@unittest.expectedFailure
def test_become_multiple_choices(self):
a = Tensor.empty(16)
b = (a.reshape(1, 1, 4, 1, 4)+0).reshape(1, 1, 4, 4).shrink(((0, 1), (0, 1), (0, 3), (0, 3)))+0
@@ -2494,13 +2483,8 @@ class TestUOpBecome(unittest.TestCase):
b.realize()
assert a.uop.is_realized
assert a.uop.buffer._base is None
-# b is a subbuffer of a (buffer_view in non rangeify, rangeify just makes a shrink)
-if RANGEIFY:
-  assert b.uop.op_in_backward_slice_with_self(Ops.SHRINK)
-  assert b.uop.base is a.uop.base
-  return
-assert b.uop.op is Ops.BUFFER_VIEW
-assert b.uop.src[0] is a.uop
+assert b.uop.op_in_backward_slice_with_self(Ops.SHRINK)
+assert b.uop.base is a.uop.base

def test_setitem_offset(self):
a = Tensor.full((16,), 0.).contiguous().realize()
@@ -2,7 +2,6 @@ import unittest

from test.helpers import assert_jit_cache_len
from tinygrad import Variable, Tensor, TinyJit
-from tinygrad.helpers import RANGEIFY
import numpy as np

class TestSymbolicJit(unittest.TestCase):
@@ -27,7 +26,7 @@ class TestSymbolicJit(unittest.TestCase):
symbolic = jf(a[:, :vi]).numpy()
expected = f(a[:, :i]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
-assert_jit_cache_len(jf, 1 if RANGEIFY else 2) # one add and one pad, can be one kernel?
+assert_jit_cache_len(jf, 1)

def test_add(self):
def f(a, b): return (a+b).realize()
@@ -80,7 +79,7 @@ class TestSymbolicJit(unittest.TestCase):
symbolic = jf(q, k[:, :vi], v[:, :vi])[:2, :4, :1, :8].numpy()
expected = f(q, k[:, :i], v[:, :i]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
-assert_jit_cache_len(jf, 4 if RANGEIFY else 5)
+assert_jit_cache_len(jf, 4)

def test_cat_dim0(self):
def f(a, b): return a.cat(b, dim=0).realize()
@@ -4,7 +4,7 @@ import torch
import unittest, copy, mmap, random, math, array
from tinygrad import Tensor, Device, dtypes
from tinygrad.tensor import _METADATA
-from tinygrad.helpers import getenv, temp, mv_address, RANGEIFY
+from tinygrad.helpers import getenv, temp, mv_address
from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
from hypothesis import given, settings, strategies as strat
from tinygrad.device import is_dtype_supported
@@ -872,18 +872,11 @@ class TestTensorMetadata(unittest.TestCase):
self.assertEqual(y.grad.uop.metadata[0].name, "sigmoid")
self.assertTrue(y.grad.uop.metadata[0].backward)
si = Tensor.schedule(out, x.grad, y.grad)[-1]
-if not RANGEIFY:
-  self.assertEqual(len(si.metadata), 4, f"failed with {si.metadata}")
-  self.assertSetEqual(set(m.name for m in si.metadata), {"sigmoid", "__mul__", "relu"})
-  bw = [m for m in si.metadata if m.backward]
-  self.assertEqual(len(bw), 2)
-  self.assertEqual(bw[0].name, "sigmoid")
-else:
-  self.assertEqual(len(si.metadata), 3, f"failed with {si.metadata}")
-  self.assertSetEqual(set(m.name for m in si.metadata), {"sigmoid", "relu"})
-  bw = [m for m in si.metadata if m.backward]
-  self.assertEqual(len(bw), 1)
-  self.assertEqual(bw[0].name, "sigmoid")
+self.assertEqual(len(si.metadata), 3, f"failed with {si.metadata}")
+self.assertSetEqual(set(m.name for m in si.metadata), {"sigmoid", "relu"})
+bw = [m for m in si.metadata if m.backward]
+self.assertEqual(len(bw), 1)
+self.assertEqual(bw[0].name, "sigmoid")

class TestIdxUpcast(unittest.TestCase):
def _find_op(self, ast: UOp, op: Ops):
@@ -1,6 +1,6 @@
import unittest
from tinygrad import Tensor
-from tinygrad.helpers import getenv, GlobalCounters, EMULATE, RANGEIFY
+from tinygrad.helpers import getenv, GlobalCounters, EMULATE
from tinygrad.engine.realize import lower_schedule_item, ProgramSpec, get_program
from tinygrad.renderer import Estimates
from tinygrad.codegen import full_rewrite
@@ -51,11 +51,8 @@ class TestMemoryCount(unittest.TestCase):
a = Tensor.empty(1024, 1, dtype=dtypes.uint8).expand(1024, 1024)
b = Tensor.empty(1024, 1, dtype=dtypes.uint8).expand(1024, 1024)
_, mem = get_stats(a+b)
-if RANGEIFY:
-  # rangeify is smart!
-  self.assertEqual(mem, 1024 + 2*1024) # 2 lil reads + 1 lil write
-else:
-  self.assertEqual(mem, 1024*1024 + 2*1024) # 2 lil reads + 1 write
+# rangeify is smart!
+self.assertEqual(mem, 1024 + 2*1024) # 2 lil reads + 1 lil write

def test_self_add(self):
a = Tensor.empty(1024, 1024, dtype=dtypes.uint8)
@@ -1,7 +1,6 @@
import unittest
from tinygrad import Tensor
from tinygrad.uop import Ops
-from tinygrad.helpers import RANGEIFY

class TestKernelize(unittest.TestCase):
def test_add_reshaped(self):
@@ -18,8 +17,8 @@ class TestKernelize(unittest.TestCase):
a1 = a.sum(axis=1)
a0 = a1.sum(axis=0)
a0.kernelize()
-self.assertEqual(len([s for s in a0.uop.toposort() if s.op is Ops.KERNEL]), 2 if RANGEIFY else 3)
-self.assertIs(a1.uop.base.op, Ops.REDUCE_AXIS if RANGEIFY else Ops.ASSIGN)
+self.assertEqual(len([s for s in a0.uop.toposort() if s.op is Ops.KERNEL]), 2)
+self.assertIs(a1.uop.base.op, Ops.REDUCE_AXIS)
# input Tensor and user contiguous kernelize
self.assertIs(a0.uop.base.op, Ops.ASSIGN)
self.assertIs(a.uop.base.op, Ops.ASSIGN)
@@ -1,11 +1,11 @@
import unittest
import multiprocessing.shared_memory as shared_memory
-from tinygrad.helpers import CI, WIN, RANGEIFY
+from tinygrad.helpers import CI, WIN
from tinygrad.tensor import Tensor, Device
import numpy as np

class TestRawShmBuffer(unittest.TestCase):
-@unittest.skipIf(WIN and CI and RANGEIFY, "only fails with RANGEIFY on CI windows instance")
+@unittest.skipIf(WIN and CI, "only fails on CI windows instance")
def test_e2e(self):
t = Tensor.randn(2, 2, 2).realize()
@@ -35,14 +35,14 @@ class TestWinograd(unittest.TestCase):
def test_forward_kernels(self):
x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
out = Tensor.conv2d(x,w)
-self.assertEqual(len(out.schedule()), 2 if RANGEIFY else 4)
+self.assertEqual(len(out.schedule()), 2)

def test_backward_kernels(self):
x,w = Tensor.empty(1,4,9,9,requires_grad=True).realize(), Tensor.empty(4,4,3,3,requires_grad=True).realize()
out = Tensor.conv2d(x,w, padding=1)
out.mean().backward()
backward_schedule = Tensor.schedule(x.grad, w.grad)
-self.assertEqual(len(backward_schedule), 4 if RANGEIFY else 9)
+self.assertEqual(len(backward_schedule), 4)

def test_counters(self):
IC, OC, X, Y = 4,4,9,9
@@ -6,7 +6,7 @@ from typing import Callable, ClassVar, Sequence, cast, get_args, Literal, Suppor
from tinygrad.dtype import DType, DTypeLike, dtypes, ImageDType, ConstType, least_upper_float, least_upper_dtype, sum_acc_dtype, to_dtype, truncate
from tinygrad.dtype import _from_np_dtype, _to_np_dtype
from tinygrad.helpers import argfix, make_tuple, flatten, prod, all_int, round_up, merge_dicts, argsort, getenv, all_same, fully_flatten, dedup
-from tinygrad.helpers import IMAGE, WINO, Metadata, TRACEMETA, ceildiv, fetch, polyN, unwrap, DEBUG, is_numpy_ndarray, RANGEIFY, FUSE_ATTENTION
+from tinygrad.helpers import IMAGE, WINO, Metadata, TRACEMETA, ceildiv, fetch, polyN, unwrap, DEBUG, is_numpy_ndarray, FUSE_ATTENTION
from tinygrad.helpers import suppress_finalizing
from tinygrad.gradient import compute_gradient
from tinygrad.uop.ops import smax, smin, resolve, UOp, Ops, sint, MathTrait, identity_element, all_metadata, _index_to_concrete_int, sint_to_uop, \
@@ -227,7 +227,7 @@ class Tensor(MathTrait):
# verify Tensors match the spec
if __debug__: type_verify(list(big_sink.toposort()), tensor_uop_spec)

-if RANGEIFY and any(isinstance(x._device, tuple) for x in big_sink.toposort()):
+if any(isinstance(x._device, tuple) for x in big_sink.toposort()):
_apply_map_to_tensors(get_multi_map(big_sink), "Apply Multi Map")
big_sink = UOp.sink(*flatten([x.uop.src if x.uop.op is Ops.MULTI else [x.uop] for x in (self,)+lst]))