diff --git a/examples/hlb_cifar10.py b/examples/hlb_cifar10.py index bde89c4c8b..27fecf02d8 100644 --- a/examples/hlb_cifar10.py +++ b/examples/hlb_cifar10.py @@ -229,7 +229,8 @@ def train_cifar(): if getenv("RANDOM_CROP", 1): X = random_crop(X, crop_size=32) if getenv("RANDOM_FLIP", 1): - X = (Tensor.rand(X.shape[0],1,1,1) < 0.5).where(X.flip(-1), X) # flip LR + # NOTE: RANGEIFY=1 needs this contiguous or the X[perms] is very slow + X = (Tensor.rand(X.shape[0],1,1,1) < 0.5).where(X.flip(-1), X).contiguous() # flip LR X, Y = X[perms], Y[perms] return X, Y, *cutmix(X, Y, perms, mask_size=hyp['net']['cutmix_size']) diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py index aa7345820e..eb74d2931c 100644 --- a/test/models/test_real_world.py +++ b/test/models/test_real_world.py @@ -114,6 +114,16 @@ class TestRealWorld(unittest.TestCase): helper_test("train_mnist", lambda: (Tensor.randn(BS, 1, 28, 28),), train, 0.07, 93) + @unittest.skipIf(CI and Device.DEFAULT in {"CPU", "CL"}, "slow") + def test_forward_cifar(self): + BS = 32 + # with training batchnorm still though + with Tensor.train(): + model = SpeedyResNet(Tensor.ones((12,3,2,2))) + @TinyJit + def run(X): return model(X) + helper_test("forward_cifar", lambda: (Tensor.randn(BS, 3, 32, 32),), run, (1.0/48)*BS, 126) + @unittest.skipIf(CI and Device.DEFAULT in {"CPU", "CL"}, "slow") def test_train_cifar(self): with Tensor.train(): diff --git a/test/test_rangeify.py b/test/test_rangeify.py index af89cd6214..e27bacdc4e 100644 --- a/test/test_rangeify.py +++ b/test/test_rangeify.py @@ -20,6 +20,24 @@ class TestRangeifyAssign(unittest.TestCase): N = 256 +class TestRangeifyOpt(unittest.TestCase): + def test_randperm(self): + Tensor.randperm(10000).realize() + + def test_one_getitem(self): + X = Tensor.empty(10000) + sel = Tensor.arange(1000).contiguous().realize() + Xsel = X[sel] + Tensor.realize(Xsel) + + def test_two_getitem(self): + # this is splitting on the child even when it really shouldn't + X = Tensor.empty(10000) + Y = Tensor.empty(10000) + sel = Tensor.arange(1000).contiguous().realize() + Xsel, Ysel = X[sel], Y[sel] + Tensor.realize(Xsel, Ysel) + @unittest.skipIf(RANGEIFY<1, "tests only for RANGEIFY") class TestRangeify(unittest.TestCase): def test_expand_children(self): @@ -59,6 +77,14 @@ class TestRangeify(unittest.TestCase): C = Tensor.empty(N, N) (((A@B).exp()@C).exp()).realize() + def test_double_gemm_exp_child(self): + A = Tensor.empty(N, N) + B = Tensor.empty(N, N) + C = Tensor.empty(N, N) + # A@B is used with exp, and also on the sum. this is two kernels now, is this right? + ret = A@B + ((ret.exp()@C)+ret).realize() + def test_double_gemm_relu(self): A = Tensor.empty(N, N) B = Tensor.empty(N, N) diff --git a/test/unit/test_winograd.py b/test/unit/test_winograd.py index 0a7855ff06..5c81b95aad 100644 --- a/test/unit/test_winograd.py +++ b/test/unit/test_winograd.py @@ -42,7 +42,7 @@ class TestWinograd(unittest.TestCase): out = Tensor.conv2d(x,w, padding=1) out.mean().backward() backward_schedule = Tensor.schedule(x.grad, w.grad) - self.assertEqual(len(backward_schedule), 4 if RANGEIFY else 9) + self.assertEqual(len(backward_schedule), 3 if RANGEIFY else 9) def test_counters(self): IC, OC, X, Y = 4,4,9,9 diff --git a/tinygrad/schedule/rangeify.py b/tinygrad/schedule/rangeify.py index a93cd2a8e2..99d15aa914 100644 --- a/tinygrad/schedule/rangeify.py +++ b/tinygrad/schedule/rangeify.py @@ -390,14 +390,14 @@ def remove_bufferize(src:UOp, buf:UOp, idx:UOp): # if it's user contiguous, we never remove it if src.op is Ops.CONTIGUOUS: return None + # const reduce is okay + def okay_reduce(x:UOp): return all(y.op not in {Ops.BUFFER, Ops.COPY} for y in x.sparents) + # here is where we compute the cost # for now just no REDUCE, COPY, or ASSIGN ran = src.toposort(gate=lambda x: x.op not in {Ops.INDEX}) # we don't want to bufferize threefry, also causes problems because not all platforms support long - if any(x.op in {Ops.REDUCE, Ops.COPY, Ops.BUFFER_VIEW, Ops.ASSIGN} for x in ran) and src.op is not Ops.THREEFRY: return None - - # simple, matching old behavior - #if src.op is not Ops.INDEX: return None + if any(x.op in {Ops.REDUCE, Ops.COPY, Ops.BUFFER_VIEW, Ops.ASSIGN} and not okay_reduce(x) for x in ran) and src.op is not Ops.THREEFRY: return None # this is the ranges replaced return src.substitute(dict(zip(buf.src[1:], idx.src[1:])))