remove trivial use of RANGEIFY flag (#12550)
some tests need update still
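The cleanup is mechanical: with rangeify treated as the default scheduling path, the explicit Context opt-in and the per-mode branches collapse. A rough sketch of the pattern, for orientation only (names such as c and allowed are taken from the hunks below, not new code in the tree):

  # before: opt in through the context variable and branch on the flag
  from tinygrad.helpers import Context, RANGEIFY
  with Context(RANGEIFY=1):
    sink = c.schedule()[-1].ast
  allowed = 1 if RANGEIFY else 2  # expected kernel count differed per mode

  # after: rangeify behaviour is assumed, so no flag and no branch
  sink = c.schedule()[-1].ast
  allowed = 1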
@@ -80,7 +80,6 @@ print("******** third, the UOp ***********")
from tinygrad.engine.realize import run_schedule
from tinygrad.engine.schedule import create_schedule_with_vars
-from tinygrad.helpers import RANGEIFY
from tinygrad.schedule.rangeify import get_rangeify_map

# allocate some values + load in values
@@ -49,8 +49,7 @@ def rangeify_kernel3():
b = Tensor.empty(N,N)
c = a@b
#c = c.reshape((32,2,16,4,32,2,16,4)).contiguous()
-with Context(RANGEIFY=1):
-  sink = c.schedule()[-1].ast
+sink = c.schedule()[-1].ast
#print(sink)

opts = [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.UPCAST, 0, 2)]
@@ -329,7 +328,7 @@ if __name__ == "__main__":
elif HL == 1: hprg = hl_spec_kernel3()
else: hprg = hand_spec_kernel3()
if HL == 3:
-with Context(RANGEIFY=1, BLOCK_REORDER=0):
+with Context(BLOCK_REORDER=0):
prg = get_program(hprg, Device.default.renderer)
else:
prg = get_program(hprg, Device.default.renderer)
test/external/external_test_opt.py
@@ -4,7 +4,7 @@ import numpy as np
import torch

from tinygrad import GlobalCounters, Tensor, Device
-from tinygrad.helpers import getenv, RANGEIFY
+from tinygrad.helpers import getenv
from tinygrad.nn.state import get_parameters
from tinygrad.engine.realize import capturing
from tinygrad.tensor import _to_np_dtype
@@ -164,7 +164,7 @@ class TestOpt(unittest.TestCase):

def test_permute_was_pushed(self):
a = Tensor.randn(16, 16, 16)
-with CLCache(1 if RANGEIFY else 2):
+with CLCache(1):
c = a.sum(2)
d = c.permute(1,0).contiguous()
d.realize()
@@ -172,7 +172,7 @@ class TestOpt(unittest.TestCase):

def test_permute_was_pushed_through_contract_reshape(self):
a = Tensor.randn(4, 4, 4, 4, 4)
-with CLCache(1 if RANGEIFY else 2):
+with CLCache(1):
c = a.sum(-1)
d = c.reshape(16,16).permute(1,0).contiguous()
d.realize()
@@ -180,7 +180,7 @@ class TestOpt(unittest.TestCase):

def test_permute_was_pushed_through_contractw1s_reshape(self):
a = Tensor.randn(4, 4, 4, 4, 4)
-with CLCache(1 if RANGEIFY else 2):
+with CLCache(1):
c = a.sum(-1)
d = c.reshape(16,1,16).permute(2,1,0).contiguous()
d.realize()
@@ -188,7 +188,7 @@ class TestOpt(unittest.TestCase):

def test_permute_was_pushed_through_expand_reshape(self):
a = Tensor.randn(16, 16, 16)
-with CLCache(1 if RANGEIFY else 2):
+with CLCache(1):
c = a.sum(2)
d = c.reshape(4,4,4,4).permute(2,3,0,1).contiguous()
d.realize()
@@ -220,7 +220,7 @@ class TestOpt(unittest.TestCase):
for axis in [0, 1]:
for n in [4, 8, 16]:
b = torch.ones(n, n).sum(axis).reshape(n, 1).expand(n, n).sum(axis)
-with CLCache(allowed=3 if RANGEIFY else 2):
+with CLCache(allowed=3):
a = Tensor.ones(n, n).contiguous().sum(axis).reshape(n, 1).expand(n, n).sum(axis)
a.realize()
np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)
@@ -229,7 +229,7 @@ class TestOpt(unittest.TestCase):
axis1, axis2 = 0, 1
for n in [4, 8, 16]:
b = torch.ones(n, n).sum(axis1).reshape(n, 1).expand(n, n).sum(axis2)
-with CLCache(allowed=3 if RANGEIFY else 2):
+with CLCache(allowed=3):
a = Tensor.ones(n, n).contiguous().sum(axis1).reshape(n, 1).expand(n, n).sum(axis2)
a.realize()
np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)
@@ -1,4 +1,4 @@
-import time, struct, unittest
+import time, struct
from typing import Any, Callable
import numpy as np
from tinygrad import Tensor, dtypes, Device
@@ -7,7 +7,7 @@ from tinygrad.tensor import _to_np_dtype
from tinygrad.engine.realize import Runner
from tinygrad.dtype import DType
from tinygrad.nn.state import get_parameters
-from tinygrad.helpers import T, CI, RANGEIFY
+from tinygrad.helpers import T, CI
from tinygrad.codegen import full_rewrite
from tinygrad.runtime.ops_python import PythonProgram, PythonRenderer, PythonCompiler
@@ -62,6 +62,3 @@ def not_support_multi_device():

# NOTE: This will open REMOTE if it's the default device
REAL_DEV = (Device.DEFAULT if Device.DEFAULT != "REMOTE" else Device['REMOTE'].properties.real_device)
-
-def expect_rangeify_fails(fxn): return (unittest.expectedFailure if RANGEIFY else (lambda f:f))(fxn)
-def expect_nonrangeify_fails(fxn): return (unittest.expectedFailure if not RANGEIFY else (lambda f:f))(fxn)
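With RANGEIFY assumed truthy, the two helpers removed above lose their purpose: expect_rangeify_fails reduces to unittest.expectedFailure and expect_nonrangeify_fails to a no-op, which is how the later hunks rewrite their call sites. A minimal sketch of that equivalence (RANGEIFY stubbed here for illustration):

  import unittest
  RANGEIFY = 1  # stand-in for tinygrad.helpers.RANGEIFY, assumed truthy
  def expect_rangeify_fails(fxn): return (unittest.expectedFailure if RANGEIFY else (lambda f:f))(fxn)
  def expect_nonrangeify_fails(fxn): return (unittest.expectedFailure if not RANGEIFY else (lambda f:f))(fxn)
  # with RANGEIFY truthy these behave as:
  #   expect_rangeify_fails    -> unittest.expectedFailure (kept as @unittest.expectedFailure)
  #   expect_nonrangeify_fails -> identity decorator (simply dropped at call sites)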
@@ -1,6 +1,6 @@
import unittest
from tinygrad import Device, Tensor, dtypes
-from tinygrad.helpers import CI, RANGEIFY
+from tinygrad.helpers import CI
from tinygrad.codegen.opt import Opt, OptOps, KernelOptError

# TODO: write a clean version of this
@@ -351,7 +351,6 @@ class TestKernelOpts(unittest.TestCase):
] + [[Opt(OptOps.THREAD, 0, 4)] if Device[Device.DEFAULT].renderer.global_max[0] >= 4 else []]
+ [[Opt(OptOps.THREAD, 0, 8)] if Device[Device.DEFAULT].renderer.global_max[0] >= 8 else []])

-@unittest.skipUnless(RANGEIFY>=1, "Kernel only fuses with rangeify")
def test_double_sum_group(self):
a = Tensor.rand(4, 4, 4)
r = a.sum((1, 2)).sum()
@@ -1,7 +1,7 @@
import unittest
import numpy as np
from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device, Variable
-from tinygrad.helpers import CI, Context, getenv, RANGEIFY
+from tinygrad.helpers import CI, Context, getenv
from tinygrad.engine.realize import run_schedule
from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program
from tinygrad.uop.ops import Ops
@@ -95,7 +95,7 @@ class TestIndexing(unittest.TestCase):
X = dataset[idxs]
assert X.shape == (4,DDIM)
sched = X.schedule()
-self.assertEqual(len(sched), 1 if RANGEIFY else 2)
+self.assertEqual(len(sched), 1)
run_schedule(sched)
assert GlobalCounters.global_ops < 4*DSET, f"too many ops {GlobalCounters.global_ops} != {4*DSET}"
np.testing.assert_allclose(real_index, X.numpy())
@@ -1,6 +1,5 @@
#!/usr/bin/env python
import unittest
-import contextlib
import numpy as np
from tinygrad import dtypes, Tensor, TinyJit, GlobalCounters, Variable
from tinygrad.device import is_dtype_supported
@@ -271,8 +270,6 @@ class TestAssign(unittest.TestCase):
b.assign(a.contiguous()).realize()
assert GlobalCounters.kernel_count - kc == 2

-# passing in RANGEIFY=1, RANGEIFY=0 asserts permuted assigns it can't fuse
-def assert_permuted_assign(self): return self.assertRaisesRegex(RuntimeError, "contiguous") if not RANGEIFY else contextlib.nullcontext()
def test_permuted_assignment(self):
a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
@@ -280,14 +277,13 @@
b.realize()
ba1 = a.uop.base.realized
bb1 = b.uop.base.realized
-with self.assert_permuted_assign():
-  a = a.permute(1,0)
-  a += b
-  a.realize()
-  ba2 = a.uop.base.realized
-  np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))
-  # permute and base are the same buffer
-  assert ba1 == ba2 and ba1 != bb1
+a = a.permute(1,0)
+a += b
+a.realize()
+ba2 = a.uop.base.realized
+np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))
+# permute and base are the same buffer
+assert ba1 == ba2 and ba1 != bb1

def test_post_permuted_assignment(self):
a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
@@ -297,13 +293,12 @@
#GlobalCounters.cache = []
ba1 = a.uop.base.realized # noqa: F841
bb1 = b.uop.base.realized # noqa: F841
-with self.assert_permuted_assign():
-  a.assign(a.permute(1,0) + b) # this should not work!
-  a.realize()
-  ba2 = a.uop.base.realized # noqa: F841
-  # NOTE: don't test that it's assigned
-  #assert ba1 == ba2 and ba1 != bb1
-  np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))
+a.assign(a.permute(1,0) + b) # this should not work!
+a.realize()
+ba2 = a.uop.base.realized # noqa: F841
+# NOTE: don't test that it's assigned
+#assert ba1 == ba2 and ba1 != bb1
+np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))

@unittest.skipUnless(RANGEIFY, "only correct in rangeify")
def test_post_permuted_assignment_alt(self):
@@ -345,21 +340,18 @@
def test_permuted_assignment_correct(self):
a = Tensor.arange(4 * 4).reshape(4, 4).contiguous().realize()
b = Tensor.arange(4 * 4).reshape(4, 4).contiguous().realize()
-# TODO: swizzler.py limitation, should NOT raise AssertionError from numpy.
-with self.assert_permuted_assign():
-  a = a.permute(1, 0)
-  new_val = a + b
-  a.assign(new_val)
-  np.testing.assert_equal(a.numpy(), np.arange(4 * 4).reshape(4, 4).transpose(1, 0) + np.arange(4 * 4).reshape(4, 4))
+a = a.permute(1, 0)
+new_val = a + b
+a.assign(new_val)
+np.testing.assert_equal(a.numpy(), np.arange(4 * 4).reshape(4, 4).transpose(1, 0) + np.arange(4 * 4).reshape(4, 4))

def test_permuted_reduceop_child_dual_use(self):
a = Tensor.randn(32, 32, 32).realize()
b = Tensor.full((32, 32), 1.).contiguous().realize()
-with self.assert_permuted_assign():
-  r = a.sum(axis=1)
-  b.assign(r + b.permute(1, 0))
-  b.realize()
-  np.testing.assert_allclose(b.numpy(), a.numpy().sum(axis=1)+np.ones((32, 32)).transpose(1, 0), atol=1e-6, rtol=1e-3)
+r = a.sum(axis=1)
+b.assign(r + b.permute(1, 0))
+b.realize()
+np.testing.assert_allclose(b.numpy(), a.numpy().sum(axis=1)+np.ones((32, 32)).transpose(1, 0), atol=1e-6, rtol=1e-3)

@unittest.skip("multi output not supported anymore")
def test_permuted_reduceop_multioutput_dual_use(self):
@@ -401,11 +393,10 @@

def test_permuted_assignment_masked_view_not_contiguous(self):
a = Tensor.ones(4, 4).contiguous().realize()
-with self.assert_permuted_assign():
-  b = a.shrink((None, (0, 2))).pad((None, (0, 2)), value=2).permute(1, 0)
-  a.assign(a + b)
-  a.realize()
-  self.assertListEqual(a.tolist(), [[2.,2.,2.,2.],[2.,2.,2.,2.],[3.,3.,3.,3.], [3.,3.,3.,3.]])
+b = a.shrink((None, (0, 2))).pad((None, (0, 2)), value=2).permute(1, 0)
+a.assign(a + b)
+a.realize()
+self.assertListEqual(a.tolist(), [[2.,2.,2.,2.],[2.,2.,2.,2.],[3.,3.,3.,3.], [3.,3.,3.,3.]])

# TODO: is there a way to sneak in a permute such that it returns the wrong answer?
@@ -3164,8 +3164,8 @@ class TestOps(unittest.TestCase):
helper_test_op([(32,10)], lambda x: x.masked_fill((x>0.1).detach(), -math.inf))
helper_test_op([(32,10)], lambda x: x.masked_fill((x<0.1).detach(), -math.inf))

-@unittest.skipIf(RANGEIFY and (getenv("MOCKGPU") or Device.DEFAULT == "PYTHON"), "very slow on MOCKGPU because reduce does not fold")
-@unittest.skipIf(RANGEIFY and Device.DEFAULT == "WEBGPU", "webgpu runtime issue")
+@unittest.skipIf((getenv("MOCKGPU") or Device.DEFAULT == "PYTHON"), "very slow on MOCKGPU because reduce does not fold")
+@unittest.skipIf(Device.DEFAULT == "WEBGPU", "webgpu runtime issue")
def test_masked_select(self):
helper_test_op([(32, 10)], lambda x: x.masked_select(x>0.5), lambda x: x.masked_select(x>0.5), forward_only=True)
helper_test_op([(32, 10)], lambda x: x.masked_select(torch.tensor(True)), lambda x: x.masked_select(Tensor(True)), forward_only=True)
@@ -1,9 +1,8 @@
import unittest
from tinygrad import Tensor, nn
-from tinygrad.helpers import RANGEIFY, Context, GlobalCounters
+from tinygrad.helpers import Context, GlobalCounters
from tinygrad.uop.ops import UOp, graph_rewrite, PatternMatcher, UPat, Ops

-@unittest.skipIf(RANGEIFY<1, "tests only for RANGEIFY")
class TestRangeifyAssign(unittest.TestCase):
def test_assign_permuted(self):
A = Tensor.empty(4, 4, dtype='int')
@@ -55,7 +54,6 @@ class TestRangeifyOpt(unittest.TestCase):
A = Tensor.empty(8,8,8,8).permute(1,0,3,2).flatten()
A.sum().realize()

-@unittest.skipIf(RANGEIFY<1, "tests only for RANGEIFY")
class TestRangeify(unittest.TestCase):
def test_groupnorm(self):
# ranges 1 and 3 are merging
@@ -230,7 +228,6 @@ class TestRangeify(unittest.TestCase):
# contiguous + reduce can support ranges?

@unittest.skip("okay to disable this for now")
-@unittest.skipIf(RANGEIFY<1, "tests only for RANGEIFY")
class TestOuterworld(unittest.TestCase):
def test_passthrough_range(self):
t = Tensor.rand(10, 10).realize()
@@ -17,7 +17,6 @@ from tinygrad.helpers import CI, DEBUG, SPLIT_REDUCEOP, GlobalCounters, Context,
from tinygrad.schedule.rangeify import get_rangeify_map, Kernel
from tinygrad.engine.schedule import create_schedule_with_vars
from tinygrad.engine.realize import CompiledRunner, run_schedule, lower_schedule
-from test.helpers import expect_rangeify_fails, expect_nonrangeify_fails

class KernelCountException(Exception): pass
def check_schedule(t:Tensor|list[Tensor]|UOp, allowed:int, to_prerealize:list[Tensor]|None=None, filter_sink=True):
@@ -117,7 +116,7 @@ class TestSchedule(unittest.TestCase):
a = Tensor.empty(10)
b = Tensor.empty((1,), device="CPU").expand(10).contiguous()
c = a+b
-with self.assertRaisesRegex(RuntimeError, "all buffers must be on the same device"): check_schedule(c, 2 if RANGEIFY else 1)
+with self.assertRaisesRegex(RuntimeError, "all buffers must be on the same device"): check_schedule(c, 2)

@unittest.skipUnless(is_dtype_supported(dtypes.half) and getenv("CAST_AFTER_EXPAND"), "need half and CAST_AFTER_EXPAND=1")
@unittest.skip("CAST_AFTER_EXPAND is not supported")
@@ -343,7 +342,7 @@ class TestSchedule(unittest.TestCase):
r1 = (x - r0).sum(axis=0).div(2)
out0 = r0 + y
out1 = r1 + y
-schedule = check_schedule([out0, out1], 2 if RANGEIFY else 4)
+schedule = check_schedule([out0, out1], 2)
reduceops = [x for si in schedule for x in si.ast.toposort() if x.op in {Ops.REDUCE_AXIS, Ops.REDUCE}]
assert len(reduceops) in [2,3] # why is RANGEIFY different?
@@ -712,7 +711,7 @@ class TestSchedule(unittest.TestCase):
check_schedule(b, 0)
self.assertEqual(b.item(), 1)

-@expect_rangeify_fails
+@unittest.expectedFailure
def test_multioutput_ast(self):
a = Tensor.zeros(1, dtype=dtypes.int).contiguous().realize().uop
b = Tensor.zeros(1, dtype=dtypes.int).contiguous().realize().uop
@@ -919,7 +918,7 @@ class TestSchedule(unittest.TestCase):
out0 = a.sum() + 2
out1 = a.sum() + 4
out2 = out0 * out1
-run_schedule(check_schedule([out0, out1, out2], 1 if RANGEIFY else 4))
+run_schedule(check_schedule([out0, out1, out2], 1))
np.testing.assert_allclose(out0.numpy(), out0_np:=a.numpy().sum()+2, atol=1e-4, rtol=1e-6)
np.testing.assert_allclose(out1.numpy(), out1_np:=a.numpy().sum()+4, atol=1e-4, rtol=1e-6)
np.testing.assert_allclose(out2.numpy(), out0_np*out1_np, atol=1e-4, rtol=1e-6)
@@ -930,7 +929,7 @@ class TestSchedule(unittest.TestCase):
out0 = a.sum().exp2()
# out1 has two paths to a.sum()
out1 = a.sum() + out0
-run_schedule(check_schedule([out0, out1], 1 if RANGEIFY else 3))
+run_schedule(check_schedule([out0, out1], 1))
np.testing.assert_allclose(out0.numpy(), out0_np:=np.exp2(a.numpy().sum()), atol=1e-4, rtol=1e-4)
np.testing.assert_allclose(out1.numpy(), a.numpy().sum()+out0_np, atol=1e-4, rtol=1e-6)
@@ -1022,7 +1021,7 @@ class TestSchedule(unittest.TestCase):
b = Tensor.empty(10,)
c = a.sum() + b[0]
d = a.sum() + 2
-check_schedule([c, d], 1 if RANGEIFY else 3)
+check_schedule([c, d], 1)

def test_reduce_multiple_paths_midshrink(self):
a = Tensor.empty(4, 4)
@@ -1186,14 +1185,14 @@ class TestSchedule(unittest.TestCase):
np.testing.assert_allclose(out.numpy(), expected, atol=1e-4, rtol=1e-4)

@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
-@expect_rangeify_fails
+@unittest.expectedFailure
def test_softmax_upcast(self):
# input half, softmax in float
Tensor.manual_seed(0)
x = Tensor.randn(4, 12, 64, 64, dtype=dtypes.half).realize()
out = x.softmax(dtype=dtypes.float)
sched = out.schedule()
-self.assertEqual(len(sched), 2 if RANGEIFY else 3)
+self.assertEqual(len(sched), 2)
self.assertEqual(sched[0].bufs[0].dtype, dtypes.half)

# input float, softmax in float
@@ -1323,7 +1322,7 @@ class TestSchedule(unittest.TestCase):
check_schedule(opt.schedule_step(), 14)

@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
-@expect_rangeify_fails
+@unittest.expectedFailure
def test_prefer_half_buffer(self):
x = Tensor.ones(4).contiguous().realize()
# y = Tensor.ones(4).contiguous().realize()
@@ -1475,7 +1474,7 @@ class TestSchedule(unittest.TestCase):
e = c * d
f = b.sum() - e
# run_schedule(check_schedule([c, d, e, f], 1))
-run_schedule(check_schedule([c, d, e, f], 2 if RANGEIFY else 5))
+run_schedule(check_schedule([c, d, e, f], 2))
np.testing.assert_allclose(c.numpy(), c_np:=a.numpy().sum()+2, atol=1e-4, rtol=1e-4)
np.testing.assert_allclose(d.numpy(), d_np:=a.numpy().sum()*2, atol=1e-4, rtol=1e-4)
np.testing.assert_allclose(e.numpy(), e_np:=c_np*d_np, atol=1e-4, rtol=1e-4)
@@ -1690,7 +1689,7 @@ class TestSchedule(unittest.TestCase):
def test_late_fusion_post_expand(self):
self._test_fusion([(32, 32)], lambda a:a-a.sum(1), 2)

-@expect_rangeify_fails
+@unittest.expectedFailure
def test_cast_padded_view(self):
a = Tensor.arange(4).reshape(1, 4)
casted_view = a.pad(((0, 1), (0, 0))).cast(dtypes.float)
@@ -1720,7 +1719,7 @@ class TestSchedule(unittest.TestCase):
self.assertListEqual(realized_const_view.tolist(), [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]])

@given(strat.sampled_from(dtypes.all), strat.sampled_from(dtypes.all))
-@expect_rangeify_fails
+@unittest.expectedFailure
def test_cast_padded_const(self, dt1, dt2):
assume(is_dtype_supported(dt1) and is_dtype_supported(dt2))
a = Tensor(1, dtype=dt1).reshape(1, 1).pad(((1, 1), None))
@@ -1891,9 +1890,7 @@ class TestSchedule(unittest.TestCase):
tst = x.shrink((None, (0, 2))).assign(a).realize()
xref[:, :2] = np.arange(8).reshape(4, 2)+y.numpy()
np.testing.assert_equal(x.numpy(), xref)
-if RANGEIFY > 0:
-  # NOTE: this is a bug on non rangeify
-  np.testing.assert_equal(tst.numpy(), a.numpy())
+np.testing.assert_equal(tst.numpy(), a.numpy())

def test_setitem_sched(self, mop=lambda x:x, expected_kcount=1):
a = Tensor.arange(16, device="CPU").reshape(4, 4).contiguous().realize()
@@ -1904,7 +1901,6 @@ class TestSchedule(unittest.TestCase):
run_schedule(sched)
self.assertListEqual(a.tolist(), expected)
self.assertEqual(kcount, expected_kcount)
-@unittest.skipUnless(RANGEIFY>0, "this asserts on non rangeify")
def test_setitem_permuted_sched(self): self.test_setitem_sched(lambda x: x.T, 2)
def test_setitem_paddded_sched(self): self.test_setitem_sched(lambda x: x.shrink_to(4, 1).pad_to(4, 4), 1)
@@ -1943,7 +1939,7 @@ class TestSchedule(unittest.TestCase):
r = (X+Tensor.arange(16).reshape(4, 4)).sum()
out0 = r+2
out1 = r+3
-run_schedule(check_schedule([out0, out1], 1 if RANGEIFY else 3))
+run_schedule(check_schedule([out0, out1], 1))
r_ref = (X.numpy()+np.arange(16).reshape(4, 4)).sum()
np.testing.assert_allclose(out0.numpy(), r_ref+2, rtol=2e-7)
np.testing.assert_allclose(out1.numpy(), r_ref+3, rtol=2e-7)
@@ -2088,7 +2084,7 @@ class TestView(unittest.TestCase):
run_schedule(sched)
np.testing.assert_equal(b.numpy(), 0)

-@expect_rangeify_fails
+@unittest.expectedFailure
def test_mask_dim_1(self):
# mask out dim = 1 works too
a = Tensor.rand(10, 10).realize()
@@ -2236,7 +2232,6 @@ class TestCopyFolding(unittest.TestCase):
b.realize()
self.assertListEqual(b.tolist(), [[0, 2], [1, 3]])

-@expect_nonrangeify_fails
def test_permute_on_disk_contiguous(self):
with open(temp('dt_arange_4_permute'), "wb") as f: f.write(Tensor.arange(4).realize().uop.base.buffer.as_buffer())
a = Tensor.empty(4, dtype=dtypes.int32, device=f"disk:{temp('dt_arange_4_permute')}")
@@ -2251,8 +2246,6 @@ class TestCopyFolding(unittest.TestCase):
self.assertListEqual(b.tolist(), [[0, 2], [1, 3]])

# NOTE: disk permute must come after COPY
-# TODO: this is wrong because of the permute
-@expect_nonrangeify_fails
def test_permute_after_shrink_on_disk(self):
with open(temp('dt_arange_5_permute'), "wb") as f: f.write(Tensor.arange(5).realize().uop.base.buffer.as_buffer())
a = Tensor.empty(5, dtype=dtypes.int32, device=f"disk:{temp('dt_arange_5_permute')}")
@@ -2396,12 +2389,8 @@ class TestUOpBecome(unittest.TestCase):
a = Tensor.empty(4, 1)
b = a.expand(4, 4).reciprocal()
check_schedule(b, 1)
-if RANGEIFY:
-  self.assertEqual(b.uop.base.buffer.size, 4)
-  self.assertEqual(b.uop.shape, (4, 4))
-  return
-self.assertEqual(b.uop.base.buffer.size, 16)
-self.assertEqual(b.uop.st, ShapeTracker.from_shape((4, 4)))
+self.assertEqual(b.uop.base.buffer.size, 4)
+self.assertEqual(b.uop.shape, (4, 4))

def test_reorder_expand_alt(self):
x = Tensor.empty(4, 1)
@@ -2410,7 +2399,7 @@ class TestUOpBecome(unittest.TestCase):
z = (img*x) / y
check_schedule(z, 1)

-@expect_rangeify_fails
+@unittest.expectedFailure
def test_become_existing_buffer(self):
a = Tensor.empty(4, 4)
b = a*1
@@ -2444,7 +2433,7 @@ class TestUOpBecome(unittest.TestCase):
assert UPat(Ops.CONST, arg=3).match(const_add.uop.base, {})

# tensors can become another realized tensor source
-@expect_rangeify_fails
+@unittest.expectedFailure
def test_become_existing_buf_simple(self):
a = Tensor.empty(4, 4)
b = a+0
@@ -2453,14 +2442,14 @@ class TestUOpBecome(unittest.TestCase):
self.assertIs(a.uop, b.uop)

# they can also chain other movement ops on top of the tensor source
-@expect_rangeify_fails
+@unittest.expectedFailure
def test_become_existing_buf_view(self):
a = Tensor.empty(4, 4)
b = a.permute((1, 0))+0
check_schedule(b, 0)
self.assertEqual(b.uop.st, a.uop.permute((1, 0)).st)

-@expect_rangeify_fails
+@unittest.expectedFailure
def test_become_existing_buf_view_alt(self):
a = Tensor.empty(4, 4)
b = a.permute((1, 0)).reshape((8, 2))+0
@@ -2468,7 +2457,7 @@ class TestUOpBecome(unittest.TestCase):
self.assertEqual(b.uop.st, a.uop.permute((1, 0)).reshape((8, 2)).st)

# they can also have other base parents that simplified, in that case we just backtrack to the chained mops
-@expect_rangeify_fails
+@unittest.expectedFailure
def test_become_existing_buf_complex(self):
a = Tensor.empty(4, 4)
b = (a.permute((1, 0))+0).reshape((8, 2))+0
@@ -2476,7 +2465,7 @@ class TestUOpBecome(unittest.TestCase):
self.assertEqual(b.uop.st, a.uop.permute((1, 0)).reshape((8, 2)).st)
assert b.uop.base.op is Ops.BUFFER

-@expect_rangeify_fails
+@unittest.expectedFailure
def test_become_multiple_choices(self):
a = Tensor.empty(16)
b = (a.reshape(1, 1, 4, 1, 4)+0).reshape(1, 1, 4, 4).shrink(((0, 1), (0, 1), (0, 3), (0, 3)))+0
@@ -2494,13 +2483,8 @@ class TestUOpBecome(unittest.TestCase):
b.realize()
assert a.uop.is_realized
assert a.uop.buffer._base is None
-# b is a subbuffer of a (buffer_view in non rangeify, rangeify just makes a shrink)
-if RANGEIFY:
-  assert b.uop.op_in_backward_slice_with_self(Ops.SHRINK)
-  assert b.uop.base is a.uop.base
-  return
-assert b.uop.op is Ops.BUFFER_VIEW
-assert b.uop.src[0] is a.uop
+assert b.uop.op_in_backward_slice_with_self(Ops.SHRINK)
+assert b.uop.base is a.uop.base

def test_setitem_offset(self):
a = Tensor.full((16,), 0.).contiguous().realize()
@@ -2,7 +2,6 @@ import unittest

from test.helpers import assert_jit_cache_len
from tinygrad import Variable, Tensor, TinyJit
-from tinygrad.helpers import RANGEIFY
import numpy as np

class TestSymbolicJit(unittest.TestCase):
@@ -27,7 +26,7 @@ class TestSymbolicJit(unittest.TestCase):
symbolic = jf(a[:, :vi]).numpy()
expected = f(a[:, :i]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
-assert_jit_cache_len(jf, 1 if RANGEIFY else 2) # one add and one pad, can be one kernel?
+assert_jit_cache_len(jf, 1)

def test_add(self):
def f(a, b): return (a+b).realize()
@@ -80,7 +79,7 @@ class TestSymbolicJit(unittest.TestCase):
symbolic = jf(q, k[:, :vi], v[:, :vi])[:2, :4, :1, :8].numpy()
expected = f(q, k[:, :i], v[:, :i]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
-assert_jit_cache_len(jf, 4 if RANGEIFY else 5)
+assert_jit_cache_len(jf, 4)

def test_cat_dim0(self):
def f(a, b): return a.cat(b, dim=0).realize()
@@ -4,7 +4,7 @@ import torch
import unittest, copy, mmap, random, math, array
from tinygrad import Tensor, Device, dtypes
from tinygrad.tensor import _METADATA
-from tinygrad.helpers import getenv, temp, mv_address, RANGEIFY
+from tinygrad.helpers import getenv, temp, mv_address
from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
from hypothesis import given, settings, strategies as strat
from tinygrad.device import is_dtype_supported
@@ -872,18 +872,11 @@ class TestTensorMetadata(unittest.TestCase):
self.assertEqual(y.grad.uop.metadata[0].name, "sigmoid")
self.assertTrue(y.grad.uop.metadata[0].backward)
si = Tensor.schedule(out, x.grad, y.grad)[-1]
-if not RANGEIFY:
-  self.assertEqual(len(si.metadata), 4, f"failed with {si.metadata}")
-  self.assertSetEqual(set(m.name for m in si.metadata), {"sigmoid", "__mul__", "relu"})
-  bw = [m for m in si.metadata if m.backward]
-  self.assertEqual(len(bw), 2)
-  self.assertEqual(bw[0].name, "sigmoid")
-else:
-  self.assertEqual(len(si.metadata), 3, f"failed with {si.metadata}")
-  self.assertSetEqual(set(m.name for m in si.metadata), {"sigmoid", "relu"})
-  bw = [m for m in si.metadata if m.backward]
-  self.assertEqual(len(bw), 1)
-  self.assertEqual(bw[0].name, "sigmoid")
+self.assertEqual(len(si.metadata), 3, f"failed with {si.metadata}")
+self.assertSetEqual(set(m.name for m in si.metadata), {"sigmoid", "relu"})
+bw = [m for m in si.metadata if m.backward]
+self.assertEqual(len(bw), 1)
+self.assertEqual(bw[0].name, "sigmoid")

class TestIdxUpcast(unittest.TestCase):
def _find_op(self, ast: UOp, op: Ops):
@@ -1,6 +1,6 @@
import unittest
from tinygrad import Tensor
-from tinygrad.helpers import getenv, GlobalCounters, EMULATE, RANGEIFY
+from tinygrad.helpers import getenv, GlobalCounters, EMULATE
from tinygrad.engine.realize import lower_schedule_item, ProgramSpec, get_program
from tinygrad.renderer import Estimates
from tinygrad.codegen import full_rewrite
@@ -51,11 +51,8 @@ class TestMemoryCount(unittest.TestCase):
a = Tensor.empty(1024, 1, dtype=dtypes.uint8).expand(1024, 1024)
b = Tensor.empty(1024, 1, dtype=dtypes.uint8).expand(1024, 1024)
_, mem = get_stats(a+b)
-if RANGEIFY:
-  # rangeify is smart!
-  self.assertEqual(mem, 1024 + 2*1024) # 2 lil reads + 1 lil write
-else:
-  self.assertEqual(mem, 1024*1024 + 2*1024) # 2 lil reads + 1 write
+# rangeify is smart!
+self.assertEqual(mem, 1024 + 2*1024) # 2 lil reads + 1 lil write

def test_self_add(self):
a = Tensor.empty(1024, 1024, dtype=dtypes.uint8)
@@ -1,7 +1,6 @@
import unittest
from tinygrad import Tensor
from tinygrad.uop import Ops
-from tinygrad.helpers import RANGEIFY

class TestKernelize(unittest.TestCase):
def test_add_reshaped(self):
@@ -18,8 +17,8 @@ class TestKernelize(unittest.TestCase):
a1 = a.sum(axis=1)
a0 = a1.sum(axis=0)
a0.kernelize()
-self.assertEqual(len([s for s in a0.uop.toposort() if s.op is Ops.KERNEL]), 2 if RANGEIFY else 3)
-self.assertIs(a1.uop.base.op, Ops.REDUCE_AXIS if RANGEIFY else Ops.ASSIGN)
+self.assertEqual(len([s for s in a0.uop.toposort() if s.op is Ops.KERNEL]), 2)
+self.assertIs(a1.uop.base.op, Ops.REDUCE_AXIS)
# input Tensor and user contiguous kernelize
self.assertIs(a0.uop.base.op, Ops.ASSIGN)
self.assertIs(a.uop.base.op, Ops.ASSIGN)
@@ -1,11 +1,11 @@
import unittest
import multiprocessing.shared_memory as shared_memory
-from tinygrad.helpers import CI, WIN, RANGEIFY
+from tinygrad.helpers import CI, WIN
from tinygrad.tensor import Tensor, Device
import numpy as np

class TestRawShmBuffer(unittest.TestCase):
-@unittest.skipIf(WIN and CI and RANGEIFY, "only fails with RANGEIFY on CI windows instance")
+@unittest.skipIf(WIN and CI, "only fails on CI windows instance")
def test_e2e(self):
t = Tensor.randn(2, 2, 2).realize()
@@ -35,14 +35,14 @@ class TestWinograd(unittest.TestCase):
def test_forward_kernels(self):
x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
out = Tensor.conv2d(x,w)
-self.assertEqual(len(out.schedule()), 2 if RANGEIFY else 4)
+self.assertEqual(len(out.schedule()), 2)

def test_backward_kernels(self):
x,w = Tensor.empty(1,4,9,9,requires_grad=True).realize(), Tensor.empty(4,4,3,3,requires_grad=True).realize()
out = Tensor.conv2d(x,w, padding=1)
out.mean().backward()
backward_schedule = Tensor.schedule(x.grad, w.grad)
-self.assertEqual(len(backward_schedule), 4 if RANGEIFY else 9)
+self.assertEqual(len(backward_schedule), 4)

def test_counters(self):
IC, OC, X, Y = 4,4,9,9
@@ -6,7 +6,7 @@ from typing import Callable, ClassVar, Sequence, cast, get_args, Literal, Suppor
from tinygrad.dtype import DType, DTypeLike, dtypes, ImageDType, ConstType, least_upper_float, least_upper_dtype, sum_acc_dtype, to_dtype, truncate
from tinygrad.dtype import _from_np_dtype, _to_np_dtype
from tinygrad.helpers import argfix, make_tuple, flatten, prod, all_int, round_up, merge_dicts, argsort, getenv, all_same, fully_flatten, dedup
-from tinygrad.helpers import IMAGE, WINO, Metadata, TRACEMETA, ceildiv, fetch, polyN, unwrap, DEBUG, is_numpy_ndarray, RANGEIFY, FUSE_ATTENTION
+from tinygrad.helpers import IMAGE, WINO, Metadata, TRACEMETA, ceildiv, fetch, polyN, unwrap, DEBUG, is_numpy_ndarray, FUSE_ATTENTION
from tinygrad.helpers import suppress_finalizing
from tinygrad.gradient import compute_gradient
from tinygrad.uop.ops import smax, smin, resolve, UOp, Ops, sint, MathTrait, identity_element, all_metadata, _index_to_concrete_int, sint_to_uop, \
@@ -227,7 +227,7 @@ class Tensor(MathTrait):
# verify Tensors match the spec
if __debug__: type_verify(list(big_sink.toposort()), tensor_uop_spec)

-if RANGEIFY and any(isinstance(x._device, tuple) for x in big_sink.toposort()):
+if any(isinstance(x._device, tuple) for x in big_sink.toposort()):
_apply_map_to_tensors(get_multi_map(big_sink), "Apply Multi Map")
big_sink = UOp.sink(*flatten([x.uop.src if x.uop.op is Ops.MULTI else [x.uop] for x in (self,)+lst]))