diff --git a/examples/handcode_opt.py b/examples/handcode_opt.py
index 6e3daf6c66..f6d834a92b 100644
--- a/examples/handcode_opt.py
+++ b/examples/handcode_opt.py
@@ -84,7 +84,7 @@ if __name__ == "__main__":
 
     # always try hand coded opt
     lin = Kernel(si.ast, opts=device.renderer)
-    lin = hand_coded_optimizations(lin)
+    lin.apply_opts(hand_coded_optimizations(lin))
     lins.append((lin, "HC"))
 
     # maybe try tensor cores
diff --git a/extra/optimization/get_action_space.py b/extra/optimization/get_action_space.py
index b5b8435042..6ff2a81a75 100644
--- a/extra/optimization/get_action_space.py
+++ b/extra/optimization/get_action_space.py
@@ -24,7 +24,7 @@ if __name__ == "__main__":
   for ast_str in tqdm(ast_strs):
     lin = ast_str_to_lin(ast_str)
     #if not lin.apply_tensor_cores():
-    lin = hand_coded_optimizations(lin)
+    lin.apply_opts(hand_coded_optimizations(lin))
     test_rebuild(lin)
     # confirm linearize can be called twice
     uops1 = lin.linearize().uops
diff --git a/extra/optimization/test_net.py b/extra/optimization/test_net.py
index 4258d94ca9..68a023f0ee 100644
--- a/extra/optimization/test_net.py
+++ b/extra/optimization/test_net.py
@@ -35,7 +35,7 @@ if __name__ == "__main__":
     rawbufs = bufs_from_lin(lin)
 
     linhc = deepcopy(lin)
-    linhc = hand_coded_optimizations(linhc)
+    linhc.apply_opts(hand_coded_optimizations(linhc))
     tmhc = time_linearizer(linhc, rawbufs)
     print(f"{tmhc*1e6:10.2f} HC ", linhc.colored_shape())
diff --git a/extra/replay_pkl.py b/extra/replay_pkl.py
index 91718654e5..9f02492d6a 100644
--- a/extra/replay_pkl.py
+++ b/extra/replay_pkl.py
@@ -57,7 +57,7 @@ if __name__ == "__main__":
       ei.bufs[0].copyin(memoryview(bytearray(b'\x00'*ei.bufs[0].nbytes)))
       GlobalCounters.kernel_count -= 1
 
-      if not getenv("NOOPT"): k = hand_coded_optimizations(k)
+      if not getenv("NOOPT"): k.apply_opts(hand_coded_optimizations(k))
       p2 = k.to_program()
       new_ei = replace(ei, prg=CompiledRunner(p2))
       new_ei.run()
diff --git a/test/external/external_benchmark_hcopt.py b/test/external/external_benchmark_hcopt.py
index 80800d79de..c2ed276e2e 100644
--- a/test/external/external_benchmark_hcopt.py
+++ b/test/external/external_benchmark_hcopt.py
@@ -20,7 +20,8 @@ if __name__ == '__main__':
     k = ast_str_to_lin(world)
     rawbufs = bufs_from_lin(k)
 
-    k_hcopt = optimize_kernel(k.copy())
+    k_hcopt = k.copy()
+    k_hcopt.apply_opts(optimize_kernel(k_hcopt))
     k_beam = beam_search(k.copy(), rawbufs, getenv("BEAM", 2))
 
     disable_cache = bool(getenv("NOCACHE", 0))
diff --git a/test/external/external_benchmark_schedule.py b/test/external/external_benchmark_schedule.py
index 1f1c9923b2..1b0be431f8 100644
--- a/test/external/external_benchmark_schedule.py
+++ b/test/external/external_benchmark_schedule.py
@@ -38,7 +38,7 @@ if __name__ == "__main__":
     if BEAM:
       with Context(DEBUG=max(2, DEBUG.value)): k = beam_search(k, bufs_from_lin(k), BEAM.value)
     elif NOOPT: pass
-    else: k = hand_coded_optimizations(k)
+    else: k.apply_opts(hand_coded_optimizations(k))
     kernels.append(k)
   with Timing("***** model lower in "):
     uops = [rewrite_shapetracker_with_index(k.get_optimized_ast(), k.opts) for k in kernels]
diff --git a/test/external/speed_beam_v_hcopt.py b/test/external/speed_beam_v_hcopt.py
index f6d2b8e5e1..1bd107f6d7 100644
--- a/test/external/speed_beam_v_hcopt.py
+++ b/test/external/speed_beam_v_hcopt.py
@@ -20,14 +20,14 @@ if __name__ == "__main__":
 
   k = new_lin()
-  if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k = hand_coded_optimizations(k)
+  if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.apply_opts(hand_coded_optimizations(k))
 
   assert BEAM > 0
   lins = [(("tc" if used_tensor_cores else "hc"), k)]
   if used_tensor_cores:
     lins.append(("hc", new_lin()))
-    lins[-1][1] = hand_coded_optimizations(lins[-1][1])
+    lins[-1][1].apply_opts(hand_coded_optimizations(lins[-1][1]))
 
   kb = new_lin()
   test_rawbuffers = bufs_from_lin(kb) # allocate scratch buffers for optimization
   lins.append((f"beam{BEAM.value}", beam_search(kb, test_rawbuffers, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))))
diff --git a/test/external/speed_compare_amd_am.py b/test/external/speed_compare_amd_am.py
index 1080f295e6..760833d48c 100644
--- a/test/external/speed_compare_amd_am.py
+++ b/test/external/speed_compare_amd_am.py
@@ -76,7 +76,7 @@ if __name__ == "__main__":
   for num,ast in enumerate(ast_strs):
     with run_amd():
       amdlin = ast_str_to_lin(ast, opts=amddev.renderer)
-      amdlin = hand_coded_optimizations(amdlin)
+      amdlin.apply_opts(hand_coded_optimizations(amdlin))
       has_bf16 = any(b.dtype == dtypes.bfloat16 for b in amdlin.membufs)
 
       amd_prg = CompiledRunner(amdlin.to_program())
@@ -88,7 +88,7 @@ if __name__ == "__main__":
       rdr = amdev.renderer
       rdr.device = "AMD:1"
       amlin = ast_str_to_lin(ast, opts=amdev.renderer)
-      amlin = hand_coded_optimizations(amlin)
+      amlin.apply_opts(hand_coded_optimizations(amlin))
       am_prg = CompiledRunner(amlin.to_program())
       ambufs = bufs_from_lin(amlin)
       test_ambufs = get_fuzz_rawbufs(amlin) if not has_bf16 else ambufs
@@ -99,7 +99,7 @@ if __name__ == "__main__":
       cpu_rdr = cpudev.renderer
       cpu_rdr.device = "CPU"
       cpulin = ast_str_to_lin(ast, opts=cpu_rdr)
-      cpulin = hand_coded_optimizations(cpulin)
+      cpulin.apply_opts(hand_coded_optimizations(cpulin))
       cpu_prg = CompiledRunner(cpulin.to_program())
       cpubufs = bufs_from_lin(cpulin)
       test_cpubufs = get_fuzz_rawbufs(cpulin) if not has_bf16 else ambufs
diff --git a/test/external/speed_compare_cuda_nv.py b/test/external/speed_compare_cuda_nv.py
index 3d15d433a1..3ce1a74a31 100644
--- a/test/external/speed_compare_cuda_nv.py
+++ b/test/external/speed_compare_cuda_nv.py
@@ -22,7 +22,7 @@ if __name__ == "__main__":
   for num,ast in enumerate(ast_strs):
     # cuda compile
    culin = ast_str_to_lin(ast, opts=cudev.renderer)
-    culin = hand_coded_optimizations(culin)
+    culin.apply_opts(hand_coded_optimizations(culin))
     has_bf16 = any(b.dtype == dtypes.bfloat16 for b in culin.membufs)
 
     cuda_prg = CompiledRunner(culin.to_program())
@@ -32,7 +32,7 @@ if __name__ == "__main__":
     rdr = nvdev.renderer
     rdr.device = "NV"
     nvlin = ast_str_to_lin(ast, opts=rdr)
-    nvlin = hand_coded_optimizations(nvlin)
+    nvlin.apply_opts(hand_coded_optimizations(nvlin))
     nv_prg = CompiledRunner(nvlin.to_program())
     nvbufs = bufs_from_lin(nvlin)
     test_nvbufs = get_fuzz_rawbufs(nvlin) if not has_bf16 else nvbufs
diff --git a/test/external/speed_compare_cuda_ptx.py b/test/external/speed_compare_cuda_ptx.py
index 2671f1f53d..32b586a1d5 100644
--- a/test/external/speed_compare_cuda_ptx.py
+++ b/test/external/speed_compare_cuda_ptx.py
@@ -24,7 +24,7 @@ if __name__ == "__main__":
     # cuda compile
     dev.compiler = CUDACompiler(dev.arch)
     lin = ast_str_to_lin(ast, opts=dev.renderer)
-    lin = hand_coded_optimizations(lin)
+    lin.apply_opts(hand_coded_optimizations(lin))
     cuda_prg = CompiledRunner(lin.to_program())
     bufs = bufs_from_lin(lin)
 
@@ -32,7 +32,7 @@ if __name__ == "__main__":
     # ptx compile
     dev.compiler = PTXCompiler(dev.arch)
     lin = ast_str_to_lin(ast, opts=ptx)
-    lin = hand_coded_optimizations(lin)
+    lin.apply_opts(hand_coded_optimizations(lin))
     lin.linearize()
     ptx_prg = CompiledRunner(lin.to_program())
diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index 8d44a02901..92f98adfc2 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -1739,7 +1739,7 @@ class TestHandCodedOpts(unittest.TestCase):
 
     s = layer_2.schedule()[-1]
     k = Kernel(s.ast)
-    k = hand_coded_optimizations(k)
+    k.apply_opts(hand_coded_optimizations(k))
     assert len(k.bufs) == 6 # make sure all ops are done in one kernel
     # masked upcast should upcast masked axis of size 7
     # masked upcast should not upcast large (20) last axis
@@ -1752,7 +1752,7 @@ class TestHandCodedOpts(unittest.TestCase):
 
     s = monster.schedule()[-1]
     k = Kernel(s.ast)
-    k = hand_coded_optimizations(k)
+    k.apply_opts(hand_coded_optimizations(k))
     assert len(k.bufs) == 37 # make sure all ops are done in one kernel
     # should upcast the two Tensor.stacks
     assert k.upcasted >= 2 and k.full_shape[k.shape_len-k.upcasted:k.shape_len].count(6) == 2
@@ -1768,7 +1768,7 @@
     # collect upcasts of tile transform kernels
     for i, si in enumerate(wino_schedule):
       k = Kernel(si.ast)
-      k = hand_coded_optimizations(k)
+      k.apply_opts(hand_coded_optimizations(k))
       if k.reduceop is not None: continue # not a tile transform kernel (there is a gemm reduce kernel)
       if len(k.bufs) < 22: continue # not a tile transform kernel (there's a permute kernel at the end)
       upcasts.append(tuple(k.full_shape[k.shape_len - k.upcasted:k.shape_len]))
@@ -1780,7 +1780,7 @@
     backward_schedule = Tensor.schedule(x.grad, w.grad)
     for si in backward_schedule:
       k = Kernel(si.ast)
-      k = hand_coded_optimizations(k)
+      k.apply_opts(hand_coded_optimizations(k))
       k.linearize()
       if len(k.bufs) < 20: continue # not a tile transform kernel
       # heuristic number to make sure that at least some upcasts but not too many upcasts are being done
@@ -1866,7 +1866,7 @@ def _helper_linearizer_opt_ast(realized_ast:UOp, real_bufs:list[Buffer], opts=[]
   # Check correctness of handcoded optimiztions.
   k = Kernel(realized_ast)
-  k = hand_coded_optimizations(k)
+  k.apply_opts(hand_coded_optimizations(k))
   lins.append(k)
   prg = get_prg(k)
   reset_bufs(outbufs)
diff --git a/test/test_winograd.py b/test/test_winograd.py
index d0871f0ed2..a125f9c0a0 100644
--- a/test/test_winograd.py
+++ b/test/test_winograd.py
@@ -27,7 +27,7 @@ class TestWinograd(unittest.TestCase):
       ops = s.ast.toposort
       with Timing(f"linearize {i} with {len(ops):4d} ops: "):
         l = Kernel(s.ast)
-        l = hand_coded_optimizations(l)
+        l.apply_opts(hand_coded_optimizations(l))
         l.linearize()
       assert len(l.sts) <= 256 # just the current value to prevent regression
       if DEBUG >= 2: print(f"{len(l.sts):4d} shapetrackers with max {max(len(x.views) for x in l.sts)} views")
diff --git a/tinygrad/codegen/heuristic.py b/tinygrad/codegen/heuristic.py
index 8ac6722f0c..c1c7bc2015 100644
--- a/tinygrad/codegen/heuristic.py
+++ b/tinygrad/codegen/heuristic.py
@@ -4,7 +4,7 @@
 from tinygrad.helpers import getenv, DEBUG, all_int, prod
 from tinygrad.dtype import ImageDType
 from tinygrad.ops import Ops, resolve
 
-def hand_coded_optimizations(k:Kernel) -> Kernel:
+def hand_coded_optimizations(k:Kernel) -> list[Opt]:
   # make a copy so it does not mutate the input
   k = k.copy()
@@ -24,7 +24,7 @@ def hand_coded_optimizations(k:Kernel) -> Kernel:
       if MV_THREADS_PER_ROW > 1: k.apply_opt(Opt(OptOps.GROUP, 0, MV_THREADS_PER_ROW))
       if MV_BLOCKSIZE > 1: k.apply_opt(Opt(OptOps.LOCAL, global_idx, MV_BLOCKSIZE))
       if MV_ROWS_PER_THREAD > 1: k.apply_opt(Opt(OptOps.UPCAST, global_idx, MV_ROWS_PER_THREAD))
-      return k
+      return k.applied_opts
 
   if k.opts.has_local and k.opts.has_shared and all_int(k.sts[0].shape[:k.first_reduce]):
     # are we grouping? (requires local shape support)
@@ -50,7 +50,7 @@ def hand_coded_optimizations(k:Kernel) -> Kernel:
         k.apply_opt(Opt(OptOps.UNROLL, unit_stride_axes_mul_4[0]-k.first_reduce, 4))
 
   # no more opt if we are grouping
-  if k.group_for_reduces: return k
+  if k.group_for_reduces: return k.applied_opts
 
   # **** below this line need to be optional and benchmarked ****
@@ -129,4 +129,4 @@
         k.apply_opt(Opt(OptOps.LOCAL, axis, local_sz))
       if will_delete_shape: deleted_shape += 1
-  return k
\ No newline at end of file
+  return k.applied_opts
\ No newline at end of file
diff --git a/tinygrad/engine/realize.py b/tinygrad/engine/realize.py
index b1279f0054..fa9a838365 100644
--- a/tinygrad/engine/realize.py
+++ b/tinygrad/engine/realize.py
@@ -16,7 +16,7 @@ logkerns, logkerns_level = open(getenv("LOGKERNS", ""), "a") if getenv("LOGKERNS
 def get_kernel(renderer:Renderer, ast:UOp) -> Kernel:
   k = Kernel(ast, opts=renderer)
   if not NOOPT:
-    if not k.apply_tensor_cores(getenv("TC", 1)): k = hand_coded_optimizations(k)
+    if not k.apply_tensor_cores(getenv("TC", 1)): k.apply_opts(hand_coded_optimizations(k))
     if BEAM >= 1:
       from tinygrad.engine.search import beam_search, bufs_from_lin
       kb = Kernel(ast, opts=renderer)
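
The net effect of this patch: hand_coded_optimizations no longer returns an optimized copy of the Kernel; it returns the list[Opt] of actions it chose (still computed on an internal copy, per the comment in heuristic.py), and every call site applies them to its own kernel with Kernel.apply_opts. A minimal sketch of the new calling convention, assuming a tinygrad checkout with this patch applied; the 16x16 matmul and the schedule()[-1] indexing are illustrative choices, not part of the patch:

    from tinygrad import Tensor, Device
    from tinygrad.codegen.kernel import Kernel
    from tinygrad.codegen.heuristic import hand_coded_optimizations

    # build an arbitrary kernel AST to optimize (the last scheduled kernel of a matmul)
    si = (Tensor.rand(16, 16) @ Tensor.rand(16, 16)).schedule()[-1]
    k = Kernel(si.ast, opts=Device[Device.DEFAULT].renderer)

    # old contract: k = hand_coded_optimizations(k) returned an optimized Kernel copy
    # new contract: the heuristic returns the opts it picked; the caller applies them
    opts = hand_coded_optimizations(k)  # list[Opt]; k itself is left untouched
    k.apply_opts(opts)                  # applies each Opt to k in place
    assert k.applied_opts == opts       # the applied actions are recorded on the kernel

One consequence, visible in external_benchmark_hcopt.py above: callers that want to keep an unoptimized kernel around must now copy() it themselves before applying, since apply_opts mutates in place.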