diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index bb5e7ac3c9..7d61653ce8 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -234,10 +234,7 @@ class TestLinearizer(unittest.TestCase):
     x, w = Tensor.randn((1,1,3)).realize(), Tensor.randn((1,1,2)).realize()
     r = Tensor.conv2d(x,w,padding=1).relu()
-    k = Kernel(r.schedule()[-1].ast)
-    k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=0))
-    k.apply_opt(Opt(op=OptOps.UNROLL, axis=0, arg=0))
-    uops = get_program(k.get_optimized_ast(), k.opts).uops
+    uops = get_program(r.schedule()[-1].ast, opts=[Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.UNROLL, axis=0, arg=0)]).uops
     accs = [u for u in uops if u.op is Ops.DEFINE_REG]
     stores = [u for u in uops if u.op is Ops.STORE]
     assert len(accs) == 0 # it's removed now
@@ -249,9 +246,7 @@ class TestLinearizer(unittest.TestCase):
   @unittest.skipUnless(Device.DEFAULT == "CPU", "test only for CPU")
   def test_upcast_with_locals_cpu(self):
     out = Tensor.ones(64,64).contiguous() @ Tensor.ones(64,64).contiguous()
-    k = Kernel(out.schedule()[-1].ast)
-    k.apply_opt(Opt(OptOps.LOCAL, axis=0, arg=4))
-    prg = get_program(k.get_optimized_ast(), k.opts)
+    prg = get_program(out.schedule()[-1].ast, opts=[Opt(OptOps.LOCAL, axis=0, arg=4)])
     self.assertEqual(len(prg.src.split("for")), 5)

   @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
@@ -261,10 +256,8 @@ class TestLinearizer(unittest.TestCase):
   def test_upcast_with_locals(self):
     x, y = Tensor.rand(1,128), Tensor.rand(128, 128)
     r = (x@y).relu()
-    realized_ast = r.schedule()[-1].ast
     opts_to_apply = [Opt(op=OptOps.GROUP, axis=0, arg=8), Opt(op=OptOps.LOCAL, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=4)]
-    realized_ast = realized_ast.replace(arg=KernelInfo(opts_to_apply=tuple(opts_to_apply)))
-    program = get_program(realized_ast, Device[Device.DEFAULT].renderer)
+    program = get_program(r.schedule()[-1].ast, opts=opts_to_apply)

     stores = [u for u in program.uops if u.op is Ops.STORE and u.src[0].dtype.addrspace != AddrSpace.REG]
@@ -278,10 +271,7 @@ class TestLinearizer(unittest.TestCase):
   def test_zero_fold(self):
     a, b = Tensor.randn(1).realize(), Tensor.randn(1).realize()
     r = Tensor.stack(a, b)
-
-    k = Kernel(r.schedule()[-1].ast)
-    k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=0))
-    uops = get_program(k.get_optimized_ast(), k.opts).uops
+    uops = get_program(r.schedule()[-1].ast, opts=[Opt(op=OptOps.UPCAST, axis=0, arg=0)]).uops
     num_ops = len([uop for uop in uops if uop.op in GroupOp.ALU])
     assert num_ops == 0, "more alu uops than needed"
@@ -291,16 +281,14 @@ class TestLinearizer(unittest.TestCase):
       if is_dtype_supported(tensor_dtype) and is_dtype_supported(acc_dtype):
         a = Tensor([1, 2, 3], dtype=tensor_dtype).sum()
         realized_ast = a.schedule()[-1].ast
-        realized_ast = realized_ast.replace(arg=KernelInfo(opts_to_apply=tuple()))
-        program = get_program(realized_ast, Device[Device.DEFAULT].renderer)
+        program = get_program(realized_ast, opts=[])
         local = [uop for uop in program.uops if uop.op is Ops.DEFINE_REG]
         assert local[0].dtype.base == acc_dtype

   def test_arg_acc_dtype(self):
     def helper_arg_acc_dtype(c: Tensor, expected_dtype:DType):
       realized_ast = c.schedule()[-1].ast
-      realized_ast = realized_ast.replace(arg=KernelInfo(opts_to_apply=tuple()))
-      program = get_program(realized_ast, Device[Device.DEFAULT].renderer)
+      program = get_program(realized_ast, opts=[])
       local = [uop for uop in program.uops if uop.op is Ops.DEFINE_REG]
       self.assertEqual(local[0].dtype.base, expected_dtype)
@@ -758,11 +746,7 @@ class TestFloat4(unittest.TestCase):
     c = a + b

     s = c.schedule()[0]
-    k = Kernel(s.ast)
-    k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=4))
-    k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=2))
-    uops = get_program(k.get_optimized_ast(), k.opts).uops
-
+    uops = get_program(s.ast, opts=[Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=2)]).uops
     assert TestFloat4.count_float4(uops) == (4, 2)

   @unittest.skipUnless(Device.DEFAULT in {"CPU", "LLVM"} and AMX, "Only CPU with AMX upcasts float up to size 16")
@@ -773,10 +757,7 @@ class TestFloat4(unittest.TestCase):
       c = a + b

       s = c.schedule()[0]
-      k = Kernel(s.ast)
-      k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=4))
-      k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=shift))
-      return get_program(k.get_optimized_ast(), k.opts).uops
+      return get_program(s.ast, opts=[Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=shift)]).uops

     sizes = [12, 8, 16]
     shifts = [3, 2, 4]
@@ -806,10 +787,7 @@ class TestFloat4(unittest.TestCase):
     c = a + b

     s = c.schedule()[0]
-    k = Kernel(s.ast)
-    k.apply_opt(Opt(op=OptOps.UPCAST, axis=1, arg=4))
-    k.apply_opt(Opt(op=OptOps.UPCAST, axis=1, arg=2))
-    uops = get_program(k.get_optimized_ast(), k.opts).uops
+    uops = get_program(s.ast, opts=[Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=1, arg=2)]).uops

     assert TestFloat4.count_float4(uops) == (0, 2)
@@ -842,9 +820,7 @@ class TestFloat4(unittest.TestCase):
     # float4 should be emitted (the reduce axis of size 4 is the float4 axis here)

     s = c.schedule()[0]
-    k = Kernel(s.ast)
-    k.apply_opt(Opt(op=OptOps.UNROLL, axis=0, arg=4))
-    uops = get_program(k.get_optimized_ast(), k.opts).uops
+    uops = get_program(s.ast, opts=[Opt(op=OptOps.UNROLL, axis=0, arg=4)]).uops

     assert TestFloat4.count_float4(uops) == (0, 0)
@@ -858,10 +834,7 @@ class TestFloat4(unittest.TestCase):
     # UPDATE: now we do this fusion

     s = c.schedule()[0]
-    k = Kernel(s.ast)
-    k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=0))
-    k.apply_opt(Opt(op=OptOps.UNROLL, axis=0, arg=0))
-    uops = get_program(k.get_optimized_ast(), k.opts).uops
+    uops = get_program(s.ast, opts=[Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.UNROLL, axis=0, arg=0)]).uops

     assert TestFloat4.count_float4(uops) in {(0,1), (1,1)}
@@ -874,9 +847,7 @@ class TestFloat4(unittest.TestCase):
     # since the top axis is not contiguous.
     s = c.schedule()[0]
-    k = Kernel(s.ast)
-    k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=4))
-    uops = get_program(k.get_optimized_ast(), k.opts).uops
+    uops = get_program(s.ast, opts=[Opt(op=OptOps.UPCAST, axis=0, arg=4)]).uops

     assert TestFloat4.count_float4(uops) == (0, 1)
@@ -888,9 +859,7 @@ class TestFloat4(unittest.TestCase):
     # should float4 b but not a

     s = c.schedule()[0]
-    k = Kernel(s.ast)
-    k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=4))
-    uops = get_program(k.get_optimized_ast(), k.opts).uops
+    uops = get_program(s.ast, opts=[Opt(op=OptOps.UPCAST, axis=0, arg=4)]).uops

     assert TestFloat4.count_float4(uops) == (1, 1)
diff --git a/test/test_schedule.py b/test/test_schedule.py
index 41963c7a88..4538fe395a 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -6,7 +6,7 @@ import unittest
 import numpy as np
 import functools
 from typing import cast
-from hypothesis import assume, given, strategies as strat
+from hypothesis import assume, given, settings, strategies as strat

 from tinygrad import nn, dtypes, Device, Tensor
 from tinygrad.device import is_dtype_supported
@@ -16,7 +16,7 @@ from tinygrad.uop.ops import PatternMatcher, UOp, Ops, GroupOp, UPat, graph_rewr
 from tinygrad.uop.symbolic import symbolic_simple
 from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, SPLIT_REDUCEOP, GlobalCounters, Context, getenv, all_same, temp
 from tinygrad.schedule.kernelize import merge_views, get_kernelize_map, Kernel
-from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars
+from tinygrad.engine.schedule import create_schedule_with_vars
 from tinygrad.engine.realize import CompiledRunner, run_schedule, lower_schedule

 class KernelCountException(Exception): pass
@@ -151,6 +151,7 @@ class TestSchedule(unittest.TestCase):
     self.assertEqual(root.item(), sum(range(N)))

   @given(strat.sampled_from(range(2,4)), strat.sampled_from(range(2,4)), strat.sampled_from(range(0,4)), strat.sampled_from(range(0,4)))
+  @settings(deadline=None)
   def test_indexing_scalars(self, x, y, a, b):
     assume(a
 int: return len([x for x in u.toposort() if x.op is Ops.VIEW and len(x.src) != 0 and x.src[0].op not in {Ops.BUFFER, Ops.DEFINE_GLOBAL, Ops.ASSIGN}])
@@ -2108,7 +2082,6 @@ class TestSwizzle(unittest.TestCase):
     run_schedule(check_schedule(t, 3))
     np.testing.assert_equal(t.numpy(), [[0.5, 0.5], [0.5, 0.5], [0., 0.]])

-def store_val(si:ScheduleItem): return si.ast.src[0].src[1]
 zero_pm = UPat(Ops.CONST, arg=0)
 class TestView(unittest.TestCase):
   def test_all_masked_out(self):
@@ -2117,7 +2090,6 @@ class TestView(unittest.TestCase):
     # all masked out, degrades to const 0
     b = a.pad(((0, 10), None))[10:]
     sched = check_schedule(b.contiguous(), 1)
-    assert zero_pm.match(store_val(sched[-1]), {})
     run_schedule(sched)
     np.testing.assert_equal(b.numpy(), 0)
@@ -2128,7 +2100,6 @@ class TestView(unittest.TestCase):
     assert b.shape == (10, 10)
     sched = check_schedule(b.contiguous(), 1)
     self.assertEqual(sched[-1].ast.full_shape, (10, 10))
-    assert zero_pm.match(store_val(sched[-1]), {})
     run_schedule(sched)
     np.testing.assert_equal(b.numpy(), 0)
@@ -2143,8 +2114,6 @@ class TestView(unittest.TestCase):
     b = a.pad(((0, 5), None))[5:]
     assert b.shape == (10, 10)
     sched = check_schedule(b.contiguous(), 1)
-    self.assertEqual(store_val(sched[-1]).op, Ops.LOAD)
-    self.assertEqual(store_val(sched[-1]).st_arg, b.uop.st)
     run_schedule(sched)
     np.testing.assert_allclose(b.numpy(), np.pad(a.numpy(), ((0, 5), (0, 0)))[5:])
@@ -2260,24 +2229,6 @@ class TestConst(unittest.TestCase):
     sched = a.schedule()
     self.assertEqual(len(sched), 1)
-  def test_const_ast(self):
-    a = Tensor.ones((4,)).pad((1, 1)).contiguous()
-    sched = a.schedule()
-    print(sched[0].ast)
-    const_ast_pattern = UPat(Ops.SINK, src=(UPat.store(UPat(), UPat.where(UPat(Ops.VALID), UPat.cvar("x"), UPat(Ops.CONST, arg=0))),))
-    self.assertEqual(len(const_ast_pattern.match(sched[0].ast, {})), 1)
-    run_schedule(sched)
-    self.assertListEqual(a.tolist(), [0, 1, 1, 1, 1, 0])
-
-  def test_unmasked_const_ast(self):
-    a = Tensor.ones((4,)).contiguous()
-    sched = a.schedule()
-    print(sched[0].ast)
-    const_ast_pattern = UPat(Ops.SINK, src=(UPat.store(UPat(), UPat(Ops.CONST)),))
-    self.assertEqual(len(const_ast_pattern.match(sched[0].ast, {})), 1)
-    run_schedule(sched)
-    self.assertListEqual(a.tolist(), [1, 1, 1, 1])
-
   # ** part 2: scheduler behavior when const folding happens later
   def test_const_folding_no_realize(self):
diff --git a/test/test_uops.py b/test/test_uops.py
index b2a2e4f200..c59af6874d 100644
--- a/test/test_uops.py
+++ b/test/test_uops.py
@@ -508,19 +508,6 @@ class TestShapeSpec(unittest.TestCase):
     a = Tensor.ones((4, 4)).uop
     self.assertEqual(a.st, ShapeTracker.from_shape(()).reshape((1,1)).expand((4,4)))

-  def test_padded_const(self):
-    a = Tensor.ones((1, 1)).pad(((1, 1), (1, 1)))
-    ast = a.contiguous().schedule()[0].ast
-    valid_pattern = UPat(Ops.WHERE, src=(UPat(Ops.VALID), UPat.cvar(), UPat.cvar()))
-    valid_ternary = [x for x in ast.toposort() if valid_pattern.match(x, {})][0]
-    # the WHERE outputs a contiguous (3, 3)
-    self.assertEqual(valid_ternary.st, ShapeTracker.from_shape((3, 3)))
-    valid, x, y = valid_ternary.src
-    # very notably, only the first source is padded
-    self.assertIsNotNone(valid.st.views[-1].mask)
-    assert x.st.views[-1].mask is y.st.views[-1].mask is None
-    assert all(s.shape == (3, 3) for s in valid_ternary.src)
-
   # NOTE: CONST ShapeTracker comes from its source
   def test_scalar_const(self):
     a = Tensor(0).uop
diff --git a/test/test_uops_stats.py b/test/test_uops_stats.py
index a1d42ed8c9..21323af265 100644
--- a/test/test_uops_stats.py
+++ b/test/test_uops_stats.py
@@ -6,7 +6,7 @@ from tinygrad.renderer import Estimates
 from tinygrad.codegen import full_rewrite
 from tinygrad.uop.ops import Ops, UOp
 from tinygrad.dtype import dtypes
-from tinygrad.opt.kernel import Kernel, Opt, OptOps, KernelOptError
+from tinygrad.opt.kernel import Opt, OptOps, KernelOptError
 from tinygrad.device import Device

 def flops_mem(uops, ignore_indexing=False):
@@ -178,10 +178,10 @@ class TestStatsOptimized(unittest.TestCase):
     self.assertEqual(p.estimates.lds, 2*N*N*N*4 + 4*N*N)

   def test_gemm_tc_unroll(self):
-    k = Kernel(self.ast_gemm)
-    if not k.apply_tensor_cores(): self.skipTest("no tensor cores")
-    k.apply_opt(Opt(OptOps.UNROLL, 0, 2))
-    p = get_program(k.get_optimized_ast(), k.opts)
+    try:
+      p = get_program(self.ast_gemm, opts=[Opt(OptOps.TC, 0, (-1, 0, 1)), Opt(OptOps.UNROLL, 0, 2)])
+    except KernelOptError:
+      raise unittest.SkipTest("no tensor cores")
     print(p.src)
     self.check_gemm(p)
@@ -217,19 +217,16 @@ class TestStatsOptimized(unittest.TestCase):
     self.assertEqual(p.estimates.lds, 2*N*N*N*4 + SZ*4 + (SZ*4 + 4*N*N)*4)

   def test_reduce(self):
-    k = Kernel(self.ast_reduce)
-    p = get_program(k.get_optimized_ast(), k.opts)
+    p = get_program(self.ast_reduce, opts=[])
     print(p.name, p.estimates.ops, p.estimates.mem, p.estimates.lds)
     self.assertEqual(p.estimates.ops, N*N)
     self.assertEqual(p.estimates.mem, N*N*4 + 4)

   def test_reduce_group(self):
-    k = Kernel(self.ast_reduce)
     try:
-      k.apply_opt(Opt(OptOps.GROUP, 0, 50))
+      p = get_program(self.ast_reduce, opts=[Opt(OptOps.GROUP, 0, 50)])
     except KernelOptError: raise unittest.SkipTest("no locals")
-    p = get_program(k.get_optimized_ast(), k.opts)
     # NOTE: these are wrong, they don't respect the if statement
     print(p.name, p.estimates.ops, p.estimates.mem, p.estimates.lds)
diff --git a/test/test_winograd.py b/test/test_winograd.py
index 515694c6c2..453e910f60 100644
--- a/test/test_winograd.py
+++ b/test/test_winograd.py
@@ -1,10 +1,7 @@
 import unittest
 import numpy as np
 from tinygrad import Tensor, GlobalCounters, dtypes, Context, nn
-from tinygrad.uop.ops import Ops
-from tinygrad.helpers import Timing, CI, Profiling, WINO, DEBUG, getenv
-from tinygrad.opt.kernel import Kernel
-from tinygrad.opt.heuristic import hand_coded_optimizations
+from tinygrad.helpers import CI, Profiling, WINO, getenv

 class TestWinogradClose(unittest.TestCase):
   def test_close(self):
@@ -28,30 +25,6 @@ class TestWinograd(unittest.TestCase):
   def tearDown(self): WINO.value = self.old

-  def test_speed(self):
-    x = Tensor.empty(1,4,9,9)
-    w = Tensor.empty(4,4,3,3)
-
-    with Timing("running conv: "):
-      out = Tensor.conv2d(x, w)
-
-    with Timing("scheduling: "):
-      sched = out.schedule()
-
-    for i,s in enumerate(sched):
-      if s.ast.op is not Ops.SINK: continue
-      ops = s.ast.toposort()
-      with Timing(f"linearize {i} with {len(ops):4d} ops: "):
-        l = Kernel(s.ast)
-        l.apply_opts(hand_coded_optimizations(l))
-      assert len(l.sts) <= 256 # just the current value to prevent regression
-      if DEBUG >= 2: print(f"{len(l.sts):4d} shapetrackers with max {max(len(x.views) for x in l.sts)} views")
-      for st in l.sts:
-        assert len(st.views) <= 2, "too many views in winograd"
-        if DEBUG >= 3:
-          print(f"{len(st.views):3d} views")
-          for v in st.views: print(v)
-
   def test_profile(self):
     x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
     with Profiling(enabled=not CI, sort='time'):