diff --git a/extra/optimization/get_action_space.py b/extra/optimization/get_action_space.py
index ca3eaeb992..46c34cc1b1 100644
--- a/extra/optimization/get_action_space.py
+++ b/extra/optimization/get_action_space.py
@@ -1,15 +1,27 @@
 from tqdm import tqdm
 from extra.optimization.helpers import load_worlds, ast_str_to_lin
 from tinygrad.codegen.search import actions
+from tinygrad.codegen.linearizer import Linearizer
 
 if __name__ == "__main__":
   ast_strs = load_worlds(False, False, False)
   tactions = set()
   for ast_str in tqdm(ast_strs):
     lin = ast_str_to_lin(ast_str)
-    lin.hand_coded_optimizations()
+    if True or not lin.apply_tensor_cores(): lin.hand_coded_optimizations()
+    linr = Linearizer(lin.ast)
     for o in lin.applied_opts:
       assert o in actions
       tactions.add(o)
-  print(len(tactions))
+      linr.apply_opt(o)
+
+    assert len(lin.sts) == len(linr.sts)
+    for st1,st2 in zip(lin.sts, linr.sts):
+      assert st1 == st2
+
+    #lin.linearize()
+    #linr.linearize()
+    +assert lin.uops == linr.uops
+
+  print(len(tactions), len(actions))
   print(sorted(list(tactions)))
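The interesting change here is the round-trip check: after `hand_coded_optimizations()` records its transformations in `applied_opts`, the script replays them through `apply_opt` on a fresh `Linearizer` and asserts the shapetrackers match. A minimal sketch of that pattern, assuming tinygrad at this revision (`ast_str_to_lin`, `applied_opts`, and `sts` as used in the hunk above):

```python
from extra.optimization.helpers import load_worlds, ast_str_to_lin
from tinygrad.codegen.linearizer import Linearizer

ast_strs = load_worlds(False, False, False)
lin = ast_str_to_lin(ast_strs[0])
lin.hand_coded_optimizations()                 # records every Opt in lin.applied_opts

linr = Linearizer(lin.ast)                     # fresh, unoptimized kernel on the same AST
for o in lin.applied_opts: linr.apply_opt(o)   # replay the recorded history

# if apply_opt is a faithful replay, the shapetrackers must agree
assert len(lin.sts) == len(linr.sts)
assert all(st1 == st2 for st1, st2 in zip(lin.sts, linr.sts))
```

The commented-out `uops` comparison suggests a stricter check (identical generated code, not just identical shapetrackers) was left for later.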
diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py
index 34da256635..28cac58fb6 100644
--- a/test/models/test_real_world.py
+++ b/test/models/test_real_world.py
@@ -19,7 +19,7 @@ def kopt_search_hook(k, create_k, to_prg, baseline, bufs, var_vals):
   def check_opt(x):
     try:
       k = create_k()
-      k.apply_auto_opt(x)
+      for o in x: k.apply_opt(o)
       prg = to_prg(k)
       first_tm = prg.exec(bufs, var_vals, force_wait=True, optimizing=True)
       np.testing.assert_allclose(wanna_output, bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index fabe4be95e..f492dda9db 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -109,111 +109,6 @@ def helper_realized_ast(r:Tensor):
   output_buffer = Device[s[-1].out.device].buffer(prod((s if isinstance(s, int) else s.max for s in s[-1].out.shape)), s[-1].out.dtype, **s[-1].out._device_extra_args()) # allocate an output buffer
   return s[-1].ast, [output_buffer] + [l.realized for l in s[-1].inputs]
 
-def helper_linearizer_opt(r:Tensor, opts=[]):
-  wanna_output = None
-  realized_ast, real_bufs = helper_realized_ast(r)
-
-  def check_opt(x, create_k, to_prg):
-    k = create_k()
-    k.apply_auto_opt(x)
-    prg = to_prg(k)
-    real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
-    prg.exec(real_bufs, force_wait=True)
-    np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
-
-  # Get baseline, which is not optimized at all.
-  k = Linearizer(realized_ast)
-  prg = Device[Device.DEFAULT].to_program(k)
-  prg.exec(real_bufs, force_wait=True)
-  wanna_output = real_bufs[0].toCPU().copy()
-
-  # Check correctness of handcoded optimiztions.
-  k = Linearizer(realized_ast)
-  k.hand_coded_optimizations()
-  prg = Device[Device.DEFAULT].to_program(k)
-  real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
-  prg.exec(real_bufs, force_wait=True)
-  np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
-  for x in opts: # Check custom transformations if any.
-    check_opt(x, lambda: Linearizer(realized_ast), Device[Device.DEFAULT].to_program)
-
-class TestLinearizerOpts(unittest.TestCase):
-  def test_local_and_grouped_reduce(self):
-    if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local:
-      self.skipTest("Only Compiled uses linearizer with locals")
-
-    N = 128
-    Tensor.manual_seed(1882)
-    a = Tensor.rand(4, 4, N, N)
-    b = Tensor.rand(4, 4, N)
-    r = (b.sqrt() + ((a+1).sum(axis=3).exp()))
-    helper_linearizer_opt(r, [
-      [(0, 2, 'L')], [(0, 8, 'L')], [(0, 16, 'L')], # Checking how it works with locals
-      [(0, 2, 'G')], [(0, 32, 'G')], [(0, 64, 'G')], # Checking how it works with grouped reduce
-      [(0, 2, 'L'), (0, 2, 'G')], [(0, 16, 'L'), (0, 16, 'G')], [(0, 32, 'L'), (0, 2, 'G')], [(0, 2, 'L'), (0, 64, 'G')], # Checking how it works with locals + grouped reduce
-      [(0, 2, 'L'), (0, 2, 'G'), (0, 8, 'U'), (0, 4, 'R')], # Checking how it works with locals + grouped reduce + upcasts
-    ])
-
-  def test_upcasts(self):
-    if not isinstance(Device[Device.DEFAULT], Compiled):
-      self.skipTest("Only Compiled uses linearizer")
-
-    N = 16
-    Tensor.manual_seed(1772)
-    a = Tensor.rand(N, N)
-    b = Tensor.rand(N, N)
-    r = (a+b).sqrt() * ((a+1).exp())
-    helper_linearizer_opt(r, [
-      [(0, 2, 'U')], [(0, 4, 'U')], [(0, 8, 'U')], # Checking how it works with upcasts
-    ])
-
-  def test_full_upcast(self):
-    if not isinstance(Device[Device.DEFAULT], Compiled):
-      self.skipTest("Only Compiled uses linearizer")
-
-    Tensor.manual_seed(1772)
-    a = Tensor.rand(4)
-    b = Tensor.rand(4)
-    r = (a+b).sqrt() * ((a+1).exp())
-    helper_linearizer_opt(r, [
-      [(0, 4, 'U')], # Checking how it works with upcasts
-    ])
-
-  def test_matmul(self):
-    if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local:
-      self.skipTest("Only Compiled uses linearizer with locals")
-
-    N = 128
-    Tensor.manual_seed(1552)
-    a = Tensor.rand(N, N)
-    b = Tensor.rand(N, N)
-    r = a@b
-    helper_linearizer_opt(r, [
-      [(0, 2, 'U')], [(0, 4, 'U'), (1, 4, 'U')], # Checking how it works with upcasts
-      [(0, 2, 'L')], [(1, 32, 'L')], [(0, 4, 'L'), (1, 4, 'L')], [(0, 4, 'L'), (1, 32, 'L')], [(0, 16, 'L'), (1, 8, 'L')], # Checking how it works with locals
-      [(0, 2, 'G')], [(0, 32, 'G')], [(0, 32, 'G'), (0, 4, 'R')], # Checking how it works with grouped_reduce
-      [(0, 2, 'L'), (1, 2, 'L'), (0, 32, 'G')], [(0, 16, 'L'), (0, 32, 'G')], [(0, 16, 'L'), (0, 8, 'L'), (0, 4, 'G')], # Checking how it works with local+grouped_reduce
-      [(0, 4, 'L'), (0, 4, 'L'), (0, 16, 'G'), (0, 4, 'R'), (0, 4, 'U'), (1, 2, 'U')], # Checking all together
-      [(0, 4, 'L'), (0, 4, 'L'), (0, 16, 'G'), (0, 4, 'R'), (0, 8, 'U')], # Full global upcast + local
-    ])
-
-  def test_double_reduce(self):
-    if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local:
-      self.skipTest("Only Compiled uses linearizer with locals")
-
-    N = 128
-    Tensor.manual_seed(1552)
-    a = Tensor.rand(8, N, 8, N)
-    r = a.sum(axis=(1,3))
-    helper_linearizer_opt(r, [
-      [(0, 2, 'G')], [(0, 32, 'G')], [(1, 2, 'G')], [(1, 32, 'G')], # Checking how it works with 1 grouped_reduce.
-      [(0, 2, 'G'), (1, 2, 'G')], [(0, 16, 'G'), (1, 2, 'G')], [(0, 4, 'G'), (1, 64, 'G')], # Checking how it works with 2 grouped_reduces.
-      [(0, 16, 'G'), (1, 2, 'G'), (1, 4, 'R')], [(0, 2, 'G'), (1, 32, 'G'), (1, 4, 'R')], # Checking how it works with 2 grouped_reduces + upcasts.
-      [(0, 4, 'L'), (1, 4, 'L'), (0, 8, 'G'), (1, 4, 'G')], [(0, 4, 'L'), (1, 4, 'L'), (0, 2, 'G'), (1, 32, 'G'), (1, 4, 'R')], # Checking how it works with 2 grouped_reduces + upcasts + locals.
-      [(0, 2, 'L'), (1, 2, 'L'), (0, 8, 'G'), (1, 4, 'G'), (0, 2, 'U')], [(0, 2, 'L'), (1, 2, 'L'), (0, 8, 'G'), (1, 4, 'G'), (0, 2, 'U'), (0, 4, 'R'), (1, 4, 'R')], # Checking how it works with 2 grouped_reduces + upcasts + locals.
-      [(0, 4, 'L'), (1, 4, 'L'), (0, 8, 'G'), (1, 4, 'G'), (0, 2, 'U'), (1, 2, 'U')], # No globals
-    ])
-
 class TestFloat4(unittest.TestCase):
   def setUp(self):
     if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.supports_float4:
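Both here and in `kopt.py` below, the removed `apply_auto_opt(x)` is replaced by applying a sequence of `Opt` objects one at a time, and the `TestLinearizerOpts` suite is deleted wholesale because its cases are written in the old `(axis, amt, typ)` tuple format. A sketch of the replacement pattern, with the tuple-to-`Opt` mapping inferred from the `kopt.py` hunk below (`apply_choice` is a hypothetical helper; `k` is any kernel exposing `apply_opt`):

```python
from tinygrad.codegen.optimizer import Opt, OptOps

def apply_choice(k, choice):
  # drop-in for the removed k.apply_auto_opt(choice): a candidate is now a
  # sequence of Opt objects, applied (and recorded in k.applied_opts) one at a time
  for o in choice: k.apply_opt(o)

# e.g. the old [(0, 16, 'L'), (0, 4, 'R')] becomes:
# apply_choice(k, [Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.UNROLL, 0, 4)])
```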
diff --git a/tinygrad/codegen/optimizer.py b/tinygrad/codegen/optimizer.py
index 7afe173a84..ec18bcd1ba 100644
--- a/tinygrad/codegen/optimizer.py
+++ b/tinygrad/codegen/optimizer.py
@@ -300,26 +300,6 @@ class OptimizedKernel(Kernel):
         return True
     return False
 
-  def apply_auto_opt(self, x):
-    for axis, amt, typ in x:
-      if axis is None or amt == 1: continue
-      if typ == "G" and self.opts.has_shared:
-        assert self.full_shape[self.first_reduce+axis+len(self.group_for_reduce)] % amt == 0, "no longer valid shift"
-        self.shift_to(self.first_reduce+axis+len(self.group_for_reduce), amt, top=True, insert_before=self.first_reduce+len(self.group_for_reduce))
-        self.group_for_reduce.append(amt)
-      if typ == "R":
-        typ = "U"
-        axis += self.first_reduce + len(self.group_for_reduce)
-      if typ == "U":
-        assert self.full_shape[axis] % amt == 0, "no longer valid shift"
-        self.shift_to(axis, amt)
-        self.upcast()
-      elif typ == "L":
-        assert self.full_shape[axis] % amt == 0, "no longer valid shift"
-        self.shift_to(axis, amt, insert_before=self.first_reduce)
-        self.local_dims += 1
-    self.simplify_ones()
-
   def apply_opt(self, opt:Opt):
     self.applied_opts.append(opt)
     axis = opt.axis + (self.first_reduce if opt.op == OptOps.UNROLL else 0)
@@ -350,8 +330,10 @@ class OptimizedKernel(Kernel):
       if (not early_only or buf in self.earlybufs) and self.bufs[buf_index].dtype.__class__ is ImageDType:
         assert len(unit_stride_axes_mul_4) >= 1, f"needs a unit stride axis in {self.bufs[buf_index]}"
         if all(x < (self.shape_len-self.upcasted) for x in unit_stride_axes_mul_4) and unit_stride_axes_mul_4[0] not in self.upcast_in_mid_reduce_axes:
-          if unit_stride_axes_mul_4[0] < self.first_reduce: self.apply_opt(Opt(OptOps.UPCAST, unit_stride_axes_mul_4[0], 4))
-          else: self.apply_opt(Opt(OptOps.UNROLL, unit_stride_axes_mul_4[0]-self.first_reduce, 4))
+          if unit_stride_axes_mul_4[0] < self.first_reduce:
+            self.apply_opt(Opt(OptOps.UPCAST, unit_stride_axes_mul_4[0], 4))
+          else:
+            self.apply_opt(Opt(OptOps.UNROLL, unit_stride_axes_mul_4[0]-self.first_reduce, 4))
 
   def hand_coded_optimizations(self):
     # if there's images in the earlybufs, we have to make an axis the 4 loading one
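With `apply_auto_opt` gone, `apply_opt` is the single entry point for kernel transformations, and since it appends to `applied_opts`, every optimized kernel carries a replayable history (which `get_action_space.py` above now exercises). The correspondence between the removed tuple codes and `Opt`, inferred from the removed body and the `kopt.py` hunk below:

```python
from tinygrad.codegen.optimizer import Opt, OptOps

# inferred mapping; the 'G' (grouped reduce) code has no Opt equivalent in this diff:
#   (axis, amt, 'U')  ->  Opt(OptOps.UPCAST, axis, amt)   # upcast a global axis
#   (axis, amt, 'L')  ->  Opt(OptOps.LOCAL,  axis, amt)   # split off a local axis
#   (axis, amt, 'R')  ->  Opt(OptOps.UNROLL, axis, amt)   # unroll a reduce axis
# apply_opt offsets opt.axis by first_reduce for UNROLL, so UNROLL axes are
# numbered relative to the first reduce axis, much as 'R' axes were
example = [Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.UNROLL, 0, 4)]
```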
diff --git a/tinygrad/features/kopt.py b/tinygrad/features/kopt.py
index e6eb4f3ebd..b63bacc08a 100644
--- a/tinygrad/features/kopt.py
+++ b/tinygrad/features/kopt.py
@@ -1,6 +1,7 @@
 from typing import Callable
 import time
 from tinygrad.codegen.linearizer import Linearizer
+from tinygrad.codegen.optimizer import Opt, OptOps
 from tinygrad.helpers import DEBUG, prod, getenv
 from tinygrad.lazy import vars_from_ast
 
@@ -15,11 +16,10 @@ def kernel_optimize_opts(k:Linearizer):
   for i in range(k.first_reduce):
     # TODO: the upcast always happen first, you might want to reverse this?
     # TODO: the order of the locals might improve things too
-    opts.append(ng.p.TransitionChoice([(i,s,"U") for s in get_divisors(k.full_shape[i], max_div=8)]))
-    opts.append(ng.p.TransitionChoice([(i,s,"L") for s in get_divisors(k.full_shape[i], min_div=4)]))
+    opts.append(ng.p.TransitionChoice([Opt(OptOps.UPCAST,i,s) for s in get_divisors(k.full_shape[i], max_div=8)]))
+    opts.append(ng.p.TransitionChoice([Opt(OptOps.LOCAL,i,s) for s in get_divisors(k.full_shape[i], min_div=4)]))
   for i in range(k.shape_len-k.first_reduce):
-    opts.append(ng.p.TransitionChoice([(i,s,"R") for s in get_divisors(k.full_shape[k.first_reduce+i], max_div=8)]))
-    opts.append(ng.p.TransitionChoice([(i,s,"G") for s in get_divisors(k.full_shape[k.first_reduce+i], min_div=4) if all(st.shape[k.first_reduce+i] % s == 0 or st.shape[k.first_reduce+i] == 1 for st in k.sts)]))
+    opts.append(ng.p.TransitionChoice([Opt(OptOps.UNROLL,i,s) for s in get_divisors(k.full_shape[k.first_reduce+i], max_div=8)]))
   return opts
 
 def kernel_optimize_search(k:Linearizer, create_k:Callable[[], Linearizer], to_prg, baseline, bufs, var_vals):
@@ -27,7 +27,7 @@ def kernel_optimize_search(k:Linearizer, create_k:Callable[[], Linearizer], to_p
   def opt(x):
     try:
       k = create_k()
-      k.apply_auto_opt(x)
+      for o in x: k.apply_opt(o)
       prg = to_prg(k)
       first_tm = prg.exec(bufs, var_vals, force_wait=True, optimizing=True)
       if baseline*5 < first_tm*1000: return first_tm*1000 # very slow
@@ -78,5 +78,7 @@ def kernel_optimize(k:Linearizer, create_k:Callable[[], Linearizer], to_prg, buf
         global_db[skey] = choice
         global_db.sync()
 
-  if choice == "BASELINE": k.hand_coded_optimizations()
-  else: k.apply_auto_opt(choice)
\ No newline at end of file
+  if choice == "BASELINE":
+    k.hand_coded_optimizations()
+  else:
+    for o in choice: k.apply_opt(o)
\ No newline at end of file
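The nevergrad search space now samples `Opt` objects directly, so a candidate comes back in applyable form. Note that the grouped-reduce (`'G'`) choices are dropped from the reduce-axis loop, narrowing the search to `UPCAST`/`LOCAL` on global axes and `UNROLL` on reduce axes. A minimal sketch of how a candidate is consumed, mirroring the `opt(x)` closure above (`try_candidate` is a hypothetical name):

```python
from typing import Callable, Sequence
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.codegen.optimizer import Opt

def try_candidate(create_k: Callable[[], Linearizer], to_prg, x: Sequence[Opt]):
  # build a fresh kernel, replay the candidate's Opts, and compile; apply_opt
  # asserts on an invalid shift, which kernel_optimize_search's try/except absorbs
  k = create_k()
  for o in x: k.apply_opt(o)
  return to_prg(k)
```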