diff --git a/examples/handcode_opt.py b/examples/handcode_opt.py index f48d8a3812..23fe143938 100644 --- a/examples/handcode_opt.py +++ b/examples/handcode_opt.py @@ -9,6 +9,7 @@ from tinygrad.device import Compiled from tinygrad.opt.search import beam_search, bufs_from_lin from tinygrad.helpers import DEBUG, ansilen, getenv, colored, TRACEMETA from extra.optimization.helpers import time_linearizer +from tinygrad.engine.realize import get_program def get_sched_resnet(): mdl = ResNet50() @@ -108,7 +109,7 @@ if __name__ == "__main__": choices = [] for lin, nm in lins: tm = time_linearizer(lin, rawbufs, allow_test_size=False, cnt=10, disable_cache=True) - ops = (prg:=lin.to_program()).estimates.ops + ops = (prg:=get_program(lin.get_optimized_ast(), lin.opts)).estimates.ops gflops = sym_infer(ops, {k:k.min for k in lin.ast.variables()})*1e-9/tm choices.append((tm, gflops, lin, prg, nm)) @@ -121,7 +122,7 @@ if __name__ == "__main__": if getenv("SRC"): print(si.ast) print(lin.applied_opts) - print(lin.to_program().src) + print(get_program(lin.get_optimized_ast(), lin.opts).src) total_tm += tm running_gflops += gflops * tm if (key := str([str(m) for m in si.metadata])) not in usage: usage[key] = (0, 0) diff --git a/extra/gemm/max_matmul.py b/extra/gemm/max_matmul.py index 9d093c2fbe..9ee2140508 100644 --- a/extra/gemm/max_matmul.py +++ b/extra/gemm/max_matmul.py @@ -2,6 +2,7 @@ import numpy as np, os from tinygrad.helpers import getenv, flat_mv from tinygrad import dtypes from typing import Optional, List, Tuple, cast, Dict, Final, DefaultDict, Self +from tinygrad.engine.realize import get_program # for copied uops from tinygrad.opt.kernel import Kernel, KernelOptError @@ -55,7 +56,7 @@ def randoms(): def ast_to_cuda_prog(compiler, ast, opts): k = Kernel(ast) k.apply_opts(opts) - p = k.to_program() + p = get_program(k.get_optimized_ast(), k.opts) return CUDAProgram(device, p.function_name, compiler.compile(p.src)) if __name__ == "__main__": diff --git 
a/extra/gemm/tinygrad_nv_matmul.py b/extra/gemm/tinygrad_nv_matmul.py index f81877a41a..b20528ff4e 100644 --- a/extra/gemm/tinygrad_nv_matmul.py +++ b/extra/gemm/tinygrad_nv_matmul.py @@ -1,7 +1,7 @@ from tinygrad import Tensor, dtypes, Device from tinygrad.helpers import getenv, DEBUG from tinygrad.opt.kernel import Kernel, Opt, OptOps -from tinygrad.engine.realize import CompiledRunner, ExecItem +from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program from dataclasses import replace N = 4096 @@ -29,7 +29,7 @@ if __name__ == "__main__": Opt(op=OptOps.LOCAL, axis=0, amt=2), ] k.apply_opts(opts) - prg = k.to_program() + prg = get_program(k.get_optimized_ast(), k.opts) new_src = prg.src # can mod source here prg = replace(prg, src=new_src) diff --git a/extra/mcts_search.py b/extra/mcts_search.py index 562d11bc18..fdbb1ffe97 100644 --- a/extra/mcts_search.py +++ b/extra/mcts_search.py @@ -7,6 +7,7 @@ from tinygrad.helpers import DEBUG, getenv, CACHELEVEL, diskcache_get, diskcache from tinygrad.opt.kernel import Kernel from tinygrad.device import Buffer, Device, CompileError from tinygrad.opt.search import _ensure_buffer_alloc, get_kernel_actions, _time_program +from tinygrad.engine.realize import get_program class MCTSNode: def __init__(self, kernel:Kernel, parent=None): @@ -110,7 +111,7 @@ def mcts_search(lin:Kernel, rawbufs:List[Buffer], amt:int) -> Kernel: seen_asts[opt_ast.key] = node # lowering (50% of the time) - p = node.kernel.to_program(name_override="test") + p = get_program(node.kernel.get_optimized_ast(name_override="test"), node.kernel.opts) # rollout tm1 = time.perf_counter() diff --git a/extra/optimization/helpers.py b/extra/optimization/helpers.py index 89ae8eccdd..baeb49385b 100644 --- a/extra/optimization/helpers.py +++ b/extra/optimization/helpers.py @@ -6,6 +6,7 @@ from tinygrad.dtype import dtypes, PtrDType from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import View from tinygrad.helpers import 
getenv +from tinygrad.engine.realize import get_program inf, nan = float('inf'), float('nan') UOps = Ops @@ -115,7 +116,7 @@ def time_linearizer(lin:Kernel, rawbufs:list[Buffer], allow_test_size=True, max_ rawbufs = _ensure_buffer_alloc(rawbufs) var_vals: dict[Variable, int] = {k:int(k.vmax+k.vmin)//2 for k in lin.ast.variables()} - p = lin.to_program() + p = get_program(lin.get_optimized_ast(), lin.opts) tms = _time_program(p, dev.compiler.compile(p.src), var_vals, rawbufs, max_global_size=max_global_size if allow_test_size else None, clear_l2=clear_l2, cnt=cnt, name=to_function_name(lin.name)) diff --git a/extra/replay_pkl.py b/extra/replay_pkl.py index 62f991132a..0bfd7a36bd 100644 --- a/extra/replay_pkl.py +++ b/extra/replay_pkl.py @@ -4,7 +4,7 @@ from tinygrad import Device, Context, Tensor, GlobalCounters from tinygrad.device import Buffer from tinygrad.helpers import getenv, BEAM from tinygrad.engine.jit import TinyJit -from tinygrad.engine.realize import CompiledRunner, ExecItem, ScheduleItem, lower_schedule_item +from tinygrad.engine.realize import CompiledRunner, ExecItem, ScheduleItem, lower_schedule_item, get_program from tinygrad.renderer import ProgramSpec from tinygrad.opt.kernel import Kernel, Opt, OptOps from tinygrad.opt.heuristic import hand_coded_optimizations @@ -58,7 +58,7 @@ if __name__ == "__main__": GlobalCounters.kernel_count -= 1 if not getenv("NOOPT"): k.apply_opts(hand_coded_optimizations(k)) - p2 = k.to_program() + p2 = get_program(k.get_optimized_ast(), k.opts) new_ei = replace(ei, prg=CompiledRunner(p2)) new_ei.run() new_jit.append(new_ei) diff --git a/test/external/external_benchmark_sdxl_softmax.py b/test/external/external_benchmark_sdxl_softmax.py index 639d59e415..8da8cfb4ee 100644 --- a/test/external/external_benchmark_sdxl_softmax.py +++ b/test/external/external_benchmark_sdxl_softmax.py @@ -1,4 +1,5 @@ from tinygrad import Tensor, dtypes, GlobalCounters +from tinygrad.engine.realize import get_program if __name__ == 
"__main__": t = Tensor.empty(81920, 4096, dtype=dtypes.half) @@ -23,5 +24,5 @@ if __name__ == "__main__": #k.apply_opt(Opt(OptOps.GROUP, 1, 32)) #k.apply_opt(Opt(OptOps.GROUP, 0, 32)) from tinygrad.engine.realize import CompiledRunner, ExecItem - run = CompiledRunner(prg:=k.to_program()) + run = CompiledRunner(prg:=get_program(k.get_optimized_ast(), k.opts)) ExecItem(run, si.bufs).run() diff --git a/test/external/external_debug_metal_sd_conv.py b/test/external/external_debug_metal_sd_conv.py index 6c2d24b5a3..bb65140fbf 100644 --- a/test/external/external_debug_metal_sd_conv.py +++ b/test/external/external_debug_metal_sd_conv.py @@ -1,7 +1,7 @@ # ruff: noqa: E501 from tinygrad.opt.kernel import Kernel, Opt, OptOps from tinygrad.dtype import dtypes -from tinygrad.engine.realize import CompiledRunner +from tinygrad.engine.realize import CompiledRunner, get_program from tinygrad.opt.search import bufs_from_lin from tinygrad.uop.ops import UOp, Ops from tinygrad.shape.shapetracker import ShapeTracker @@ -35,7 +35,7 @@ k = Kernel(ast) k.apply_opts(opts) bufs = bufs_from_lin(k) -prg = CompiledRunner(k.to_program()) +prg = CompiledRunner(get_program(k.get_optimized_ast(), k.opts)) for i in range(10): speed = prg(bufs, var_vals={}, wait=True) diff --git a/test/external/external_test_nv.py b/test/external/external_test_nv.py index 1e8aaa377d..cd0ca9a2b8 100644 --- a/test/external/external_test_nv.py +++ b/test/external/external_test_nv.py @@ -3,8 +3,7 @@ from tinygrad import Device, dtypes, Tensor from tinygrad.helpers import to_mv from tinygrad.runtime.ops_nv import NVDevice, HWQueue from tinygrad.opt.search import Opt, OptOps -from test.test_linearizer_failures import helper_test_lin -from tinygrad.engine.realize import get_runner, CompiledRunner +from tinygrad.engine.realize import get_runner, CompiledRunner, get_program from test.external.fuzz_linearizer import get_fuzz_rawbufs from tinygrad.opt.kernel import Kernel @@ -24,11 +23,6 @@ class TestNV(unittest.TestCase): 
TestNV.b.uop.buffer.allocate() TestNV.addr = struct.pack("QQ", TestNV.b.uop.buffer._buf.va_addr, TestNV.a.uop.buffer._buf.va_addr) - def test_oor_kernels(self): - ast = LazyOp(op=BufferOps.STORE, src=(LazyOp(op=Ops.CAST, src=(LazyOp(op=ReduceOps.SUM, src=(LazyOp(op=Ops.CAST, src=(LazyOp(op=Ops.MUL, src=(LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=1, dtype=dtypes.half, st=ShapeTracker(views=(View(shape=(1, 256, 1, 512, 4, 16, 4, 16), strides=(0, 100352, 0, 196, 0, 14, 0, 1), offset=-15, mask=((0, 1), (0, 256), (0, 1), (0, 512), (0, 4), (1, 15), (0, 4), (1, 15)), contiguous=False), View(shape=(256, 1, 512, 7, 7, 512, 3, 3), strides=(2097152, 0, 0, 128, 2, 4096, 1088, 17), offset=0, mask=None, contiguous=False))))), LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=2, dtype=dtypes.half, st=ShapeTracker(views=(View(shape=(256, 1, 512, 7, 7, 512, 3, 3), strides=(25088, 0, 49, 7, 1, 0, 0, 0), offset=0, mask=None, contiguous=False),))))), arg=None),), arg=(dtypes.float, False)),), arg=((0, 3, 4), dtypes.float)),), arg=(dtypes.half, False)),), arg=MemBuffer(idx=0, dtype=dtypes.half, st=ShapeTracker(views=(View(shape=(1, 1, 512, 1, 1, 512, 3, 3), strides=(0, 0, 4608, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=True),)))) # noqa: E501 - opts = [Opt(op=OptOps.TC, axis=6, arg=(-1, 2, 1)), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=3, arg=0), Opt(op=OptOps.LOCAL, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=2, arg=3), Opt(op=OptOps.UPCAST, axis=1, arg=2)] # noqa: E501 - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=["NV"]) - def test_error_on_huge_dims(self): ast = LazyOp(op=BufferOps.STORE, src=(LazyOp(op=ReduceOps.SUM, src=(LazyOp(op=Ops.CAST, src=(LazyOp(op=Ops.MUL, src=(LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=1, dtype=dtypes.half, st=ShapeTracker(views=(View(shape=(1, 1, 1024, 683), strides=(0, 0, 0, 1), offset=0, mask=None, contiguous=False),)))), LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=2, 
dtype=dtypes.half, st=ShapeTracker(views=(View(shape=(1, 1, 1024, 683), strides=(0, 0, 683, 1), offset=0, mask=None, contiguous=True),))))), arg=None),), arg=dtypes.float),), arg=(3,)),), arg=MemBuffer(idx=0, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(1, 1, 1024, 1), strides=(0, 0, 1, 0), offset=0, mask=None, contiguous=True),)))) # noqa: E501 opts = [Opt(op=OptOps.GROUP, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=1, arg=32), Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=2), Opt(op=OptOps.LOCAL, axis=0, arg=2)] # noqa: E501 @@ -36,7 +30,7 @@ class TestNV(unittest.TestCase): lin = Kernel(ast) lin.apply_opts(opts) rawbufs = get_fuzz_rawbufs(lin) - prg = CompiledRunner(lin.to_program()) + prg = CompiledRunner(get_program(lin.get_optimized_ast(), lin.opts)) prg(rawbufs, {}, wait=True) self.assertEqual(str(cm.exception), "This is a runtime error message") diff --git a/test/external/external_test_valid_remove.py b/test/external/external_test_valid_remove.py deleted file mode 100644 index 9734b1e39b..0000000000 --- a/test/external/external_test_valid_remove.py +++ /dev/null @@ -1,121 +0,0 @@ -# ruff: noqa: E501 -import unittest - -from tinygrad import Device -from tinygrad.uop.ops import UOp, Ops -from tinygrad.opt.search import Opt, OptOps -from tinygrad.dtype import dtypes -from tinygrad.shape.shapetracker import ShapeTracker -from tinygrad.shape.view import View -from tinygrad.opt.kernel import Kernel - -class TestOpenpilotValidhack(unittest.TestCase): - def test_valid_removal(self): - Device.DEFAULT = "GPU" - - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.imagef((64, 1024, 4)), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 64, 128, 1, 1, 8, 4, 1, 1, 1, 1), strides=(0, 4096, 32, 0, 0, 4, 1, 0, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.MAX, 
dtypes.float, arg=None, src=( - x5:=UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 8, 9, 10)), src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.imagef((128, 768, 4)), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 1, 1, 1, 3, 1, 4, 4, 130, 4, 258), strides=(0, 0, 0, 0, 0, 4, 0, 1, 0, 3072, 0, 12), offset=-3084, mask=((0, 1), (0, 1), (0, 1), (0, 1), (0, 1), (0, 3), (0, 1), (0, 4), (0, 4), (1, 129), (0, 4), (1, 257)), contiguous=False), View(shape=(1, 64, 128, 1, 1, 8, 4, 3, 4, 3, 3), strides=(0, 2064, 2, 0, 0, 0, 0, 2146560, 536640, 135192, 259), offset=0, mask=None, contiguous=False))), src=()),)), - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.imagef((8, 108, 4)), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 64, 128, 1, 1, 8, 4, 3, 4, 3, 3), strides=(0, 0, 0, 0, 0, 432, 1, 48, 4, 144, 16), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 64, 128, 1, 1, 8, 4, 1, 1, 1, 1), strides=(0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - x19:=UOp(Ops.CONST, dtypes.float, arg=0.0, src=( - x20:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 64, 128, 1, 1, 8, 4, 1, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.MAX, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.CONST, dtypes.float, arg=1.0, src=( - x20,)), - UOp(Ops.MUL, dtypes.float, 
arg=None, src=( - UOp(Ops.EXP2, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - x5, - UOp(Ops.CONST, dtypes.float, arg=1.4426950408889634, src=( - x20,)),)),)), - x29:=UOp(Ops.CONST, dtypes.float, arg=-1.0, src=( - x20,)),)),)), - x19,)), - x29,)),)),)),)) - - opts = [Opt(op=OptOps.UPCAST, axis=3, arg=4), Opt(op=OptOps.UNROLL, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.NOLOCALS, axis=None, arg=None)] - kernel = Kernel(ast) - - kernel.apply_opts(opts) - - p = kernel.to_program() - print(p.src) - - def test_const_idx(self): - Device.DEFAULT = "GPU" - - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.imagef((10, 128, 4)), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 10, 512), strides=(0, 512, 1), offset=0, mask=None, contiguous=True),)), src=()), - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.imagef((1, 128, 4)), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 10, 512), strides=(0, 0, 1), offset=0, mask=((0, 1), (0, 1), (0, 512)), contiguous=False),)), src=()),)), - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - x18:=UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 10, 512), strides=(0, 0, 1), offset=48128, mask=((0, 1), (1, 2), (0, 512)), contiguous=False),)), 
src=()),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - x18, - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 10, 512), strides=(0, 0, 1), offset=45568, mask=((0, 1), (2, 3), (0, 512)), contiguous=False),)), src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - x18, - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 10, 512), strides=(0, 0, 1), offset=43008, mask=((0, 1), (3, 4), (0, 512)), contiguous=False),)), src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - x18, - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 10, 512), strides=(0, 0, 1), offset=40448, mask=((0, 1), (4, 5), (0, 512)), contiguous=False),)), src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - x18, - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 10, 512), strides=(0, 0, 1), offset=37888, mask=((0, 1), (5, 6), (0, 512)), contiguous=False),)), src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - x18, - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 10, 512), strides=(0, 0, 1), offset=35328, mask=((0, 1), (6, 7), (0, 512)), contiguous=False),)), src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - x18, - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 10, 512), strides=(0, 0, 1), offset=32768, mask=((0, 1), (7, 8), (0, 512)), contiguous=False),)), src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - x18, - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 10, 512), strides=(0, 0, 1), offset=30208, mask=((0, 1), (8, 9), (0, 512)), contiguous=False),)), src=()),)),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.imagef((1, 128, 4)), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 10, 512), strides=(0, 0, 1), offset=0, mask=((0, 1), (9, 10), (0, 512)), contiguous=False),)), src=()),)),)),)),)),)) - - opts = [Opt(op=OptOps.UPCAST, axis=1, 
arg=4), Opt(op=OptOps.NOLOCALS, axis=None, arg=None)] - kernel = Kernel(ast) - - kernel.apply_opts(opts) - - p = kernel.to_program() - # ((idx1<1)?read_imagef(data1, smp, (int2)(idx0,0)):(float4)(0.0f,0.0f,0.0f,0.0f)) - print(p.src) - -if __name__ == '__main__': - unittest.main() diff --git a/test/external/fuzz_linearizer.py b/test/external/fuzz_linearizer.py index 77da3c8449..5c76dde1f9 100644 --- a/test/external/fuzz_linearizer.py +++ b/test/external/fuzz_linearizer.py @@ -3,6 +3,7 @@ from typing import Any import numpy as np from collections import defaultdict from extra.optimization.helpers import load_worlds, ast_str_to_lin, kern_str_to_lin +from tinygrad.engine.realize import get_program # We need to insert ioctl before opening devices. if os.getenv("VALIDATE_HCQ", 0) != 0: @@ -93,7 +94,7 @@ def run_linearizer(lin: Kernel, rawbufs=None, var_vals=None) -> tuple[str, Any]: # TODO: images needs required_optimization try: - prg = CompiledRunner(lin.to_program()) + prg = CompiledRunner(get_program(lin.get_optimized_ast(), lin.opts)) except KeyboardInterrupt: raise except Exception: traceback.print_exc() @@ -206,7 +207,7 @@ def fuzz_linearizer(lin: Kernel, rtol=1e-2, atol=1e-2, opts_list=None): if not FUZZ_ALL_ACTIONS and test_lin.applied_opts: print(f"applied opts: {test_lin.applied_opts}") # stop if kernel uops repeat - try: tuops = tuplize_uops(test_lin.to_program().uops) + try: tuops = tuplize_uops(get_program(test_lin.get_optimized_ast(), test_lin.opts).uops) except KeyboardInterrupt: raise except BaseException as e: print(test_lin.ast) diff --git a/test/external/speed_compare_amd_am.py b/test/external/speed_compare_amd_am.py index 2821d2388d..fb67f5eb61 100644 --- a/test/external/speed_compare_amd_am.py +++ b/test/external/speed_compare_amd_am.py @@ -4,7 +4,7 @@ from extra.optimization.helpers import load_worlds, ast_str_to_lin from test.external.fuzz_linearizer import get_fuzz_rawbufs from tinygrad.opt.heuristic import hand_coded_optimizations from 
tinygrad.opt.search import bufs_from_lin -from tinygrad.engine.realize import CompiledRunner +from tinygrad.engine.realize import CompiledRunner, get_program from tinygrad.tensor import _to_np_dtype from tinygrad.runtime.ops_amd import AMDDevice from contextlib import contextmanager @@ -79,7 +79,7 @@ if __name__ == "__main__": amdlin.apply_opts(hand_coded_optimizations(amdlin)) has_bf16 = any(b.dtype == dtypes.bfloat16 for b in amdlin.bufs) - amd_prg = CompiledRunner(amdlin.to_program()) + amd_prg = CompiledRunner(get_program(amdlin.get_optimized_ast(), amdlin.opts)) amdbufs = bufs_from_lin(amdlin) test_amdbufs = get_fuzz_rawbufs(amdlin) if not has_bf16 else amdbufs if not has_bf16: contents = [buf.as_buffer() for buf in test_amdbufs] @@ -89,7 +89,7 @@ if __name__ == "__main__": rdr.device = "AMD:1" amlin = ast_str_to_lin(ast, opts=amdev.renderer) amlin.apply_opts(hand_coded_optimizations(amlin)) - am_prg = CompiledRunner(amlin.to_program()) + am_prg = CompiledRunner(get_program(amlin.get_optimized_ast(), amlin.opts)) ambufs = bufs_from_lin(amlin) test_ambufs = get_fuzz_rawbufs(amlin) if not has_bf16 else ambufs if not has_bf16: @@ -100,7 +100,7 @@ if __name__ == "__main__": cpu_rdr.device = "CPU" cpulin = ast_str_to_lin(ast, opts=cpu_rdr) cpulin.apply_opts(hand_coded_optimizations(cpulin)) - cpu_prg = CompiledRunner(cpulin.to_program()) + cpu_prg = CompiledRunner(get_program(cpulin.get_optimized_ast(), cpulin.opts)) cpubufs = bufs_from_lin(cpulin) test_cpubufs = get_fuzz_rawbufs(cpulin) if not has_bf16 else ambufs if not has_bf16: diff --git a/test/external/speed_compare_cuda_nv.py b/test/external/speed_compare_cuda_nv.py index 6fc557bc63..8cf6d1cef0 100644 --- a/test/external/speed_compare_cuda_nv.py +++ b/test/external/speed_compare_cuda_nv.py @@ -4,7 +4,7 @@ from extra.optimization.helpers import load_worlds, ast_str_to_lin from test.external.fuzz_linearizer import get_fuzz_rawbufs from tinygrad.opt.heuristic import hand_coded_optimizations from 
tinygrad.opt.search import bufs_from_lin -from tinygrad.engine.realize import CompiledRunner +from tinygrad.engine.realize import CompiledRunner, get_program from tinygrad.tensor import _to_np_dtype import numpy as np @@ -25,7 +25,7 @@ if __name__ == "__main__": culin.apply_opts(hand_coded_optimizations(culin)) has_bf16 = any(b.dtype == dtypes.bfloat16 for b in culin.bufs) - cuda_prg = CompiledRunner(culin.to_program()) + cuda_prg = CompiledRunner(get_program(culin.get_optimized_ast(), culin.opts)) cubufs = bufs_from_lin(culin) test_cubufs = get_fuzz_rawbufs(culin) if not has_bf16 else cubufs @@ -33,7 +33,7 @@ if __name__ == "__main__": rdr.device = "NV" nvlin = ast_str_to_lin(ast, opts=rdr) nvlin.apply_opts(hand_coded_optimizations(nvlin)) - nv_prg = CompiledRunner(nvlin.to_program()) + nv_prg = CompiledRunner(get_program(nvlin.get_optimized_ast(), nvlin.opts)) nvbufs = bufs_from_lin(nvlin) test_nvbufs = get_fuzz_rawbufs(nvlin) if not has_bf16 else nvbufs if not has_bf16: diff --git a/test/external/speed_compare_cuda_ptx.py b/test/external/speed_compare_cuda_ptx.py index f5ed26f755..7f3fdfa5b4 100644 --- a/test/external/speed_compare_cuda_ptx.py +++ b/test/external/speed_compare_cuda_ptx.py @@ -1,6 +1,6 @@ import itertools from tinygrad import Device -from tinygrad.engine.realize import CompiledRunner +from tinygrad.engine.realize import CompiledRunner, get_program from tinygrad.opt.heuristic import hand_coded_optimizations from tinygrad.helpers import getenv, colorize_float from extra.optimization.helpers import load_worlds, ast_str_to_lin @@ -25,7 +25,7 @@ if __name__ == "__main__": dev.compiler = CUDACompiler(dev.arch) lin = ast_str_to_lin(ast, opts=dev.renderer) lin.apply_opts(hand_coded_optimizations(lin)) - cuda_prg = CompiledRunner(lin.to_program()) + cuda_prg = CompiledRunner(get_program(lin.get_optimized_ast(), lin.opts)) bufs = bufs_from_lin(lin) @@ -33,7 +33,7 @@ if __name__ == "__main__": dev.compiler = PTXCompiler(dev.arch) lin = ast_str_to_lin(ast, 
opts=ptx) lin.apply_opts(hand_coded_optimizations(lin)) - ptx_prg = CompiledRunner(lin.to_program()) + ptx_prg = CompiledRunner(get_program(lin.get_optimized_ast(), lin.opts)) # warmup try: diff --git a/test/test_arange.py b/test/test_arange.py index 308c741e52..bb645c77c3 100644 --- a/test/test_arange.py +++ b/test/test_arange.py @@ -4,7 +4,7 @@ from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device, Variable from tinygrad.helpers import CI, Context, getenv from tinygrad.engine.realize import run_schedule from tinygrad.opt.kernel import Opt, OptOps, Kernel, KernelOptError -from tinygrad.engine.realize import CompiledRunner, ExecItem +from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program from tinygrad.opt.search import get_kernel_actions from tinygrad.uop.ops import Ops @@ -17,7 +17,7 @@ class TestArange(unittest.TestCase): k = Kernel(sched[-1].ast) if opts is not None: for o in opts: k.apply_opt(o) - p = k.to_program() + p = get_program(k.get_optimized_ast(), k.opts) print(p.name) #print(p.src) ExecItem(CompiledRunner(p), [tt.uop.buffer]).run() diff --git a/test/test_hcq.py b/test/test_hcq.py index 607e90647e..1674a7ac45 100644 --- a/test/test_hcq.py +++ b/test/test_hcq.py @@ -5,7 +5,7 @@ from tinygrad.device import Buffer, BufferSpec from tinygrad.runtime.support.hcq import HCQCompiled, HCQBuffer from tinygrad.runtime.autogen import libc from tinygrad.runtime.support.system import PCIIfaceBase -from tinygrad.engine.realize import get_runner, CompiledRunner +from tinygrad.engine.realize import get_runner, CompiledRunner, get_program from tinygrad.opt.kernel import Kernel, Opt, OptOps from tinygrad import Variable @@ -164,7 +164,7 @@ class TestHCQ(unittest.TestCase): k = Kernel(si.ast, opts=TestHCQ.d0.renderer) for i in range(3): k.apply_opt(Opt(op=OptOps.LOCAL, axis=0, arg=3)) - runner = CompiledRunner(k.to_program()) + runner = CompiledRunner(get_program(k.get_optimized_ast(), k.opts)) zb = Buffer(Device.DEFAULT, 3 * 3 * 3, 
dtypes.int, options=BufferSpec(cpu_access=True, nolru=True)).ensure_allocated() zt = Buffer(Device.DEFAULT, 3 * 3 * 3, dtypes.int, options=BufferSpec(cpu_access=True, nolru=True)).ensure_allocated() diff --git a/test/test_linearizer.py b/test/test_linearizer.py index d883f878b7..14823ed89a 100644 --- a/test/test_linearizer.py +++ b/test/test_linearizer.py @@ -33,8 +33,8 @@ def helper_tc_allclose(N:int, M:int, K:int, dtype_in:DType, dtype_out:DType, axi realized_ast, bufs = helper_realized_ast(r) k = Kernel(realized_ast) k.apply_tensor_cores(use_tensor_cores, axis=axis, tc_select=tc_select, tc_opt=tc_opt) - prg = CompiledRunner(replace(k.to_program(), device=Device.DEFAULT)) - if use_tensor_cores == 1: assert len([uop for uop in k.uops if uop.op is Ops.WMMA]) > 0, "wmma not triggered" + prg = CompiledRunner(replace(get_program(k.get_optimized_ast(), k.opts), device=Device.DEFAULT)) + if use_tensor_cores == 1: assert len([uop for uop in prg.p.uops if uop.op is Ops.WMMA]) > 0, "wmma not triggered" assert len([x for x in k.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included" prg.exec(bufs) if dtype_in == dtypes.half: tc_atol, tc_rtol = 1e-2, 1e-3 @@ -100,8 +100,9 @@ class TestLinearizer(unittest.TestCase): a_t = Tensor.full(st.shape, 2).contiguous().realize() b_t = Tensor.full(st.shape, 3).contiguous().realize() lin = helper_linearizer_ast(sink, [a_t, b_t], wanna_output=[a_t.numpy()+b_t.numpy(), a_t.numpy()*b_t.numpy()])[0] + uops = get_program(lin.get_optimized_ast(), lin.opts).uops - stores = [u for u in lin.uops if u.op is Ops.STORE] + stores = [u for u in uops if u.op is Ops.STORE] mutable_bufs = dedup(flatten([[x for x in u.src[0].toposort() if x.op is Ops.DEFINE_GLOBAL] for u in stores])) assert len(mutable_bufs) == len(stores) == 2 self.assertSetEqual(set([u.arg for u in mutable_bufs]), set([0,1])) @@ -148,76 +149,83 @@ class TestLinearizer(unittest.TestCase): a = Tensor.randn(2, ).realize() out = a.reshape(2, 1).expand(2, 3).sum() lin = 
helper_linearizer_opt(out, wanna_output=[np.broadcast_to(a.numpy().reshape(2, 1), (2, 3)).sum()])[0] - ranges = [i for i,u in enumerate(lin.uops) if u.op is Ops.RANGE] + uops = get_program(lin.get_optimized_ast(), lin.opts).uops + ranges = [i for i,u in enumerate(uops) if u.op is Ops.RANGE] assert len(ranges) == 1 # NOTE: it collapses now # RANGE -> LOAD -> RANGE -> ASSIGN - #assert any(x.op is Ops.LOAD for x in lin.uops[ranges[0]:ranges[1]]) + #assert any(x.op is Ops.LOAD for x in uops[ranges[0]:ranges[1]]) def test_three_nested_range(self): a = Tensor.randn(2, ).realize() out = a.reshape(2, 1).expand(2, 3).expand(2, 2, 3).sum() lin = helper_linearizer_opt(out, wanna_output=[np.broadcast_to(np.broadcast_to(a.numpy().reshape(2, 1), (2, 3)), (2, 2, 3)).sum()])[0] - ranges = [i for i,u in enumerate(lin.uops) if u.op is Ops.RANGE] + uops = get_program(lin.get_optimized_ast(), lin.opts).uops + ranges = [i for i,u in enumerate(uops) if u.op is Ops.RANGE] assert len(ranges) == 1 # NOTE: it collapses now # RANGE -> RANGE -> LOAD -> RANGE -> ASSIGN # NOTE: nothing should toposort between the first two ranges #assert ranges[0]+1 == ranges[1] - #assert any(x.op is Ops.LOAD for x in lin.uops[ranges[1]:ranges[2]]) + #assert any(x.op is Ops.LOAD for x in uops[ranges[1]:ranges[2]]) def test_two_nested_range_alt_indexing(self): a = Tensor([2, 2]).realize() out = a.reshape(2, 1).pad(((1, 1), (1, 1)), value=2).sum() lin = helper_linearizer_opt(out, wanna_output=[24])[0] - ranges = [i for i,u in enumerate(lin.uops) if u.op is Ops.RANGE] + uops = get_program(lin.get_optimized_ast(), lin.opts).uops + ranges = [i for i,u in enumerate(uops) if u.op is Ops.RANGE] # RANGE -> ALU -> RANGE -> ALU + LOAD -> ASSIGN - assert any(x.op in GroupOp.ALU for x in lin.uops[ranges[0]:ranges[1]]) - assert not any(x.op is Ops.LOAD for x in lin.uops[ranges[0]:ranges[1]]) - assert any(x.op in {*GroupOp.ALU, Ops.LOAD} for x in lin.uops[ranges[1]:]) + assert any(x.op in GroupOp.ALU for x in 
uops[ranges[0]:ranges[1]]) + assert not any(x.op is Ops.LOAD for x in uops[ranges[0]:ranges[1]]) + assert any(x.op in {*GroupOp.ALU, Ops.LOAD} for x in uops[ranges[1]:]) def test_range_outer_op_before_phi(self): a = Tensor.randn(4, 1).realize() b = Tensor.randn(1, 1).realize() out = (a + b[0]).sum() + b[0] lin = helper_linearizer_opt(out, wanna_output=[(a.numpy()+b.numpy()[0]).sum()+b.numpy()])[0] - ranges = [i for i,u in enumerate(lin.uops) if u.op is Ops.RANGE] + uops = get_program(lin.get_optimized_ast(), lin.opts).uops + ranges = [i for i,u in enumerate(uops) if u.op is Ops.RANGE] # LOAD -> RANGE -> LOAD -> ASSIGN - assert len([x for x in lin.uops[:ranges[0]] if x.op is Ops.LOAD]) == 1 + assert len([x for x in uops[:ranges[0]] if x.op is Ops.LOAD]) == 1 def test_range_outer_op_before_phi_nested_range(self): a = Tensor.randn(2, ).realize() b = Tensor.randn(1, 1).realize() out = (a.reshape(2, 1).expand(2, 3) + b[0]).sum() + b[0] lin = helper_linearizer_opt(out, wanna_output=[(np.broadcast_to(a.numpy().reshape(2, 1), (2, 3)) + b.numpy()[0]).sum() + b.numpy()])[0] - ranges = [i for i,u in enumerate(lin.uops) if u.op is Ops.RANGE] + uops = get_program(lin.get_optimized_ast(), lin.opts).uops + ranges = [i for i,u in enumerate(uops) if u.op is Ops.RANGE] assert len(ranges) == 1 # NOTE: it collapses now #if getenv("PTX"): # LOAD -> RANGE -> CAST -> ALU -> ALU -> LOAD -> ALU -> RANGE -> ALU -> ASSIGN - # assert lin.uops[ranges[0]-2].op is Ops.LOAD + # assert uops[ranges[0]-2].op is Ops.LOAD # assert ranges[1] == ranges[0]+6 - # assert [x.op for x in lin.uops[ranges[1]-2:ranges[1]]] == [Ops.LOAD, Ops.ALU] + # assert [x.op for x in uops[ranges[1]-2:ranges[1]]] == [Ops.LOAD, Ops.ALU] # LOAD -> RANGE -> LOAD -> ALU -> RANGE -> ASSIGN #else: - # assert lin.uops[ranges[0]-2].op is Ops.LOAD + # assert uops[ranges[0]-2].op is Ops.LOAD # assert ranges[1] == ranges[0]+3 - # assert [x.op for x in lin.uops[ranges[1]-2:ranges[1]]] == [Ops.LOAD, Ops.ALU] + # assert [x.op for x in 
uops[ranges[1]-2:ranges[1]]] == [Ops.LOAD, Ops.ALU] def test_range_outer_op_after_phi(self): a = Tensor.randn(4, 1).realize() out = a.sum() * a.sum() lin = helper_linearizer_opt(out, wanna_output=[a.numpy().sum()*a.numpy().sum()])[0] + uops = get_program(lin.get_optimized_ast(), lin.opts).uops # RANGE -> LOAD -> ASSIGN -> ALU - end = max(i for i,u in enumerate(lin.uops) if u.op is Ops.ENDRANGE) + end = max(i for i,u in enumerate(uops) if u.op is Ops.ENDRANGE) # the INDEX can be first - assert lin.uops[end+1].op in GroupOp.ALU or lin.uops[end+2].op in GroupOp.ALU + assert uops[end+1].op in GroupOp.ALU or uops[end+2].op in GroupOp.ALU def test_range_outer_op_after_phi_nested_range(self): a = Tensor.randn(2, ).realize() out = a.reshape(2, 1).expand(2, 3).sum() + a.reshape(2, 1).expand(2, 3).sum() lin = helper_linearizer_opt(out, wanna_output=[(np.broadcast_to(a.numpy().reshape(2, 1), (2, 3))).sum()*2])[0] + uops = get_program(lin.get_optimized_ast(), lin.opts).uops # RANGE -> LOAD -> ASSIGN -> ALU - end = max(i for i,u in enumerate(lin.uops) if u.op is Ops.ENDRANGE) + end = max(i for i,u in enumerate(uops) if u.op is Ops.ENDRANGE) # the INDEX can be first - assert lin.uops[end+1].op in GroupOp.ALU or lin.uops[end+2].op in GroupOp.ALU + assert uops[end+1].op in GroupOp.ALU or uops[end+2].op in GroupOp.ALU def test_load_dedup(self): # for different leaves in the AST, the same loads may occur. @@ -228,8 +236,8 @@ class TestLinearizer(unittest.TestCase): k = Kernel(r.schedule()[-1].ast) k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=0)) - k.to_program() - num_loads = len([uop for uop in k.uops if uop.op is Ops.LOAD]) + uops = get_program(k.get_optimized_ast(), k.opts).uops + num_loads = len([uop for uop in uops if uop.op is Ops.LOAD]) assert num_loads <= 4, "more load uops than needed" assert num_loads >= 4, "unexpected number of uops, maybe this test needs updating?" 
@@ -241,8 +249,8 @@ class TestLinearizer(unittest.TestCase): k = Kernel(r.schedule()[-1].ast) k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=0)) - k.to_program() - num_ops = len([uop for uop in k.uops if uop.op in GroupOp.ALU]) + uops = get_program(k.get_optimized_ast(), k.opts).uops + num_ops = len([uop for uop in uops if uop.op in GroupOp.ALU]) assert num_ops <= 1, "more alu uops than needed" @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "test requires float4") @@ -253,9 +261,9 @@ class TestLinearizer(unittest.TestCase): k = Kernel(r.schedule()[-1].ast) k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=0)) k.apply_opt(Opt(op=OptOps.UNROLL, axis=0, arg=0)) - k.to_program() - accs = [u for u in k.uops if u.op is Ops.DEFINE_REG] - stores = [u for u in k.uops if u.op is Ops.STORE] + uops = get_program(k.get_optimized_ast(), k.opts).uops + accs = [u for u in uops if u.op is Ops.DEFINE_REG] + stores = [u for u in uops if u.op is Ops.STORE] assert len(accs) == 0 # it's removed now assert len(stores) == 1 assert stores[0].src[-1].dtype == dtypes.float.vec(4) @@ -267,7 +275,7 @@ class TestLinearizer(unittest.TestCase): out = Tensor.ones(64,64).contiguous() @ Tensor.ones(64,64).contiguous() k = Kernel(out.schedule()[-1].ast) k.apply_opt(Opt(OptOps.LOCAL, axis=0, arg=4)) - prg = k.to_program() + prg = get_program(k.get_optimized_ast(), k.opts) self.assertEqual(len(prg.src.split("for")), 5) @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals") @@ -297,8 +305,8 @@ class TestLinearizer(unittest.TestCase): k = Kernel(r.schedule()[-1].ast) k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=0)) - k.to_program() - num_ops = len([uop for uop in k.uops if uop.op in GroupOp.ALU]) + uops = get_program(k.get_optimized_ast(), k.opts).uops + num_ops = len([uop for uop in uops if uop.op in GroupOp.ALU]) assert num_ops == 0, "more alu uops than needed" def test_sum_acc_dtype(self): @@ -356,7 +364,7 @@ class TestLinearizer(unittest.TestCase): 
realized_ast = sched[-1].ast kernel = Kernel(realized_ast) kernel.apply_tensor_cores(1, axis=0, tc_select=-1, tc_opt=2) - prg = kernel.to_program() + prg = get_program(kernel.get_optimized_ast(), kernel.opts) if Device.DEFAULT == "LLVM": assert "0x201000" in prg.src elif Device.DEFAULT == "AMD" and AMD_LLVM: @@ -447,7 +455,7 @@ class TestLinearizer(unittest.TestCase): x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in) r = x.matmul(y, dtype=tc.dtype_out) k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1] - for u in k.uops: + for u in get_program(k.get_optimized_ast(), k.opts).uops: if u.op is Ops.WMMA: assert u.src[-1].src[0].op != Ops.ASSIGN @@ -458,7 +466,7 @@ class TestLinearizer(unittest.TestCase): x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in) r = x.matmul(y, dtype=tc.dtype_out) k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1] - for u in k.uops: + for u in get_program(k.get_optimized_ast(), k.opts).uops: if u.op is Ops.WMMA: #assert u.src[-1].dtype == dtypes.float.vec(prod(tc.thread_local_sizes[2])) assert u.src[-1].src[0].op != Ops.ASSIGN @@ -471,7 +479,7 @@ class TestLinearizer(unittest.TestCase): x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in) r = x.matmul(y, dtype=tc.dtype_out).relu() k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1] - for u in k.uops: + for u in get_program(k.get_optimized_ast(), k.opts).uops: if u.op is Ops.WMMA: #assert u.src[-1].dtype == dtypes.float.vec(prod(tc.thread_local_sizes[2])) assert u.src[-1].src[0].op != Ops.ASSIGN @@ -482,13 +490,14 @@ class TestLinearizer(unittest.TestCase): r = (x@y).relu() k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 4)]])[-1] # the uops graph is RANGE -> DEFINE_ACC -> 4x ALU -> 4x ASSIGN -> ENDRANGE - 
for u in k.uops: + uops = get_program(k.get_optimized_ast(), k.opts).uops + for u in uops: if u.op is Ops.ASSIGN: assert u.src[1].op in GroupOp.ALU # children of ASSIGN are placed after ENDRANGE if any(x.op is Ops.ASSIGN for x in u.src): - end_range = [i for i, x in enumerate(k.uops) if x.op is Ops.ENDRANGE][0] - assert end_range < k.uops.index(u) + end_range = [i for i, x in enumerate(uops) if x.op is Ops.ENDRANGE][0] + assert end_range < uops.index(u) def test_grouped_dims(self): def _assert_grouped_dims(prefix, dims, max_sizes, reverse_dims, expected_sizes, assert_same_length = True): @@ -566,7 +575,8 @@ class TestLinearizer(unittest.TestCase): # shrink so that the dims do not collapse t = Tensor.ones(5, 6, 7).contiguous().realize().shrink(((0, 4), (0, 5), (0, 6))) k = helper_linearizer_opt(t+1)[0] - idxs = dedup([uop for uop in k.uops if uop.op is Ops.SPECIAL]) + uops = get_program(k.get_optimized_ast(), k.opts).uops + idxs = dedup([uop for uop in uops if uop.op is Ops.SPECIAL]) idxs = sorted(idxs, key=lambda uop: uop.arg[0]) assert idxs[0].arg == ('gidx0', 6), idxs[0].arg assert idxs[1].arg == ('gidx1', 5), idxs[1].arg @@ -605,7 +615,7 @@ class TestLinearizer(unittest.TestCase): def test_phi_simplification(self): def helper(t, max_ops=0): k = helper_linearizer_opt(t)[-1] - uops = list(k.to_program().uops) + uops = get_program(k.get_optimized_ast(), k.opts).uops # ignore kernel optimized IF statements for now if if_op:=next((u for u in uops if u.op is Ops.IF), None): uops = uops[:uops.index(if_op)] @@ -635,8 +645,9 @@ class TestLinearizer(unittest.TestCase): x, y = Tensor.randn(64,64), Tensor.randn(64,64) out = x.matmul(y) k = helper_linearizer_opt(out)[-1] + uops = get_program(k.get_optimized_ast(), k.opts).uops # check that the float4 cast collapses - store_vals = [u.src[-1] for u in k.uops if u.op is Ops.STORE] + store_vals = [u.src[-1] for u in uops if u.op is Ops.STORE] for val in store_vals: assert val.dtype == dtypes.float.vec(4) # and val.op is not 
Ops.VECTORIZE @@ -659,7 +670,7 @@ class TestLinearizer(unittest.TestCase): x = Tensor.randn((4,3,6,6)).realize() out = x.flip((0,1)).contiguous() k = helper_linearizer_opt(out)[-1] - store_val = [u.src[-1] for u in k.uops if u.op is Ops.STORE][0] + store_val = [u.src[-1] for u in get_program(k.get_optimized_ast(), k.opts).uops if u.op is Ops.STORE][0] assert store_val.dtype == dtypes.float.vec(4) and store_val.op is not Ops.VECTORIZE @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals") @@ -672,16 +683,17 @@ class TestLinearizer(unittest.TestCase): Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 2)] # upcast accs in both reduces k = helper_linearizer_opt(out, opts=[opt])[-1] def get_recursive(uop): return set.union(set(uop.src), [uop], *[get_recursive(v) for v in uop.src]) - local_stores = [u for u in k.uops if u.op is Ops.STORE and any(x.op is Ops.DEFINE_LOCAL for x in get_recursive(u.src[0]))] - global_stores = [u for u in k.uops if u.op is Ops.STORE and any(x.op is Ops.DEFINE_GLOBAL for x in get_recursive(u.src[0]))] - barrier = [u for u in k.uops if u.op is Ops.BARRIER][0] + uops = get_program(k.get_optimized_ast(), k.opts).uops + local_stores = [u for u in uops if u.op is Ops.STORE and any(x.op is Ops.DEFINE_LOCAL for x in get_recursive(u.src[0]))] + global_stores = [u for u in uops if u.op is Ops.STORE and any(x.op is Ops.DEFINE_GLOBAL for x in get_recursive(u.src[0]))] + barrier = [u for u in uops if u.op is Ops.BARRIER][0] # check that the float4 cast collapses for all stores for store in local_stores+global_stores: assert store.src[-1].dtype.count > 1 # and store.src[2].op is not Ops.VECTORIZE # # check the children's vins # TODO: src ALU are not the same, should it? 
# assert barrier.src == tuple(local_stores) - assert len([u for u in k.uops if u.op is Ops.IF and u.src[-1] == barrier]) == 1 + assert len([u for u in uops if u.op is Ops.IF and u.src[-1] == barrier]) == 1 @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals") @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared") @@ -690,7 +702,8 @@ class TestLinearizer(unittest.TestCase): x, y = Tensor.rand(1,128), Tensor.rand(128, 128) r = (x@y).relu() k = helper_linearizer_opt(r)[-1] - stores = [u for u in k.uops if u.op is Ops.STORE] + uops = get_program(k.get_optimized_ast(), k.opts).uops + stores = [u for u in uops if u.op is Ops.STORE] # the float4 value stores directly in lds and we skip upcast self.assertEqual(stores[0].src[-1].dtype, dtypes.float.vec(4)) @@ -715,7 +728,7 @@ class TestLinearizer(unittest.TestCase): Opt(op=OptOps.LOCAL, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=3, arg=2) ] k = helper_linearizer_ast(ast, [Tensor.randn(240*40).realize()], opts=[opt])[-1] - out = [u for u in k.uops if u.op is Ops.STORE][0] + out = [u for u in get_program(k.get_optimized_ast(), k.opts).uops if u.op is Ops.STORE][0] assert out.src[-1].op is Ops.VECTORIZE and out.src[-1].dtype == dtypes.float.vec(4) @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals") @@ -733,7 +746,7 @@ class TestLinearizer(unittest.TestCase): Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=0, arg=2)] k = helper_linearizer_ast(ast, [Tensor.randn(8*32).realize()], opts=[opt])[-1] - out = [u for u in k.uops if u.op is Ops.STORE][0] + out = [u for u in get_program(k.get_optimized_ast(), k.opts).uops if u.op is Ops.STORE][0] assert out.src[-1].op is Ops.VECTORIZE and out.src[-1].dtype.count != 1 @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "need backends that 
support float4") @@ -770,9 +783,9 @@ class TestFloat4(unittest.TestCase): k = Kernel(s.ast) k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=4)) k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=2)) - k.to_program() + uops = get_program(k.get_optimized_ast(), k.opts).uops - assert TestFloat4.count_float4(k.uops) == (4, 2) + assert TestFloat4.count_float4(uops) == (4, 2) @unittest.skipUnless(Device.DEFAULT in {"CPU", "LLVM"} and AMX, "Only CPU with AMX upcasts float up to size 16") def test_float4_multidim_amx(self): @@ -785,6 +798,7 @@ class TestFloat4(unittest.TestCase): k = Kernel(s.ast) k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=4)) k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=2)) + # TODO: broken and not tested k.upcast() k.upcast() k.to_program() @@ -821,9 +835,9 @@ class TestFloat4(unittest.TestCase): k = Kernel(s.ast) k.apply_opt(Opt(op=OptOps.UPCAST, axis=1, arg=4)) k.apply_opt(Opt(op=OptOps.UPCAST, axis=1, arg=2)) - k.to_program() + uops = get_program(k.get_optimized_ast(), k.opts).uops - assert TestFloat4.count_float4(k.uops) == (0, 2) + assert TestFloat4.count_float4(uops) == (0, 2) @unittest.skipUnless(Device.DEFAULT in {"CPU", "LLVM"} and AMX, "Only CPU with AMX upcasts float up to size 16") def test_float4_multidim_unaligned_load_amx(self): @@ -835,6 +849,7 @@ class TestFloat4(unittest.TestCase): s = c.schedule()[0] k = Kernel(s.ast) k.shift_to(len(k.full_unupcasted_shape)-1, 4) # manual trigger float4 dim + # TODO: broken and not tested k.upcast() k.shift_to(len(k.full_unupcasted_shape)-1, shift, insert_before=k.shape_len-1) k.upcast() @@ -859,9 +874,9 @@ class TestFloat4(unittest.TestCase): s = c.schedule()[0] k = Kernel(s.ast) k.apply_opt(Opt(op=OptOps.UNROLL, axis=0, arg=4)) - k.to_program() + uops = get_program(k.get_optimized_ast(), k.opts).uops - assert TestFloat4.count_float4(k.uops) == (0, 0) + assert TestFloat4.count_float4(uops) == (0, 0) def test_float4_multidim_sometimes_unaligned(self): a = Tensor.empty(1, 1, 7).realize() @@ -876,26 +891,9 
@@ class TestFloat4(unittest.TestCase): k = Kernel(s.ast) k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=0)) k.apply_opt(Opt(op=OptOps.UNROLL, axis=0, arg=0)) - k.to_program() + uops = get_program(k.get_optimized_ast(), k.opts).uops - assert TestFloat4.count_float4(k.uops) in {(0,1), (1,1)} - - @unittest.skip("no longer supported") - def test_float4_noncontiguous(self): - a = Tensor.empty(4, 2).realize() - b = Tensor.empty(4, 2).realize() - c = a + b - - # we will upcast the top axis of sz 4. they should not be coalesced into float4, - # since the top axis is not contiguous. - - s = c.schedule()[0] - k = Kernel(s.ast) - k.shift_to(0, 4, top=True) # top axes are float4 axes - k.upcast() - k.to_program() - - assert TestFloat4.count_float4(k.uops) == (0, 0) + assert TestFloat4.count_float4(uops) in {(0,1), (1,1)} def test_float4_expand(self): a = Tensor.empty(9).realize().shrink(((1, 9),)) @@ -908,9 +906,9 @@ class TestFloat4(unittest.TestCase): s = c.schedule()[0] k = Kernel(s.ast) k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=4)) - k.to_program() + uops = get_program(k.get_optimized_ast(), k.opts).uops - assert TestFloat4.count_float4(k.uops) == (0, 1) + assert TestFloat4.count_float4(uops) == (0, 1) def test_float4_heterogeneous(self): a = Tensor.empty(8).realize() @@ -922,9 +920,9 @@ class TestFloat4(unittest.TestCase): s = c.schedule()[0] k = Kernel(s.ast) k.apply_opt(Opt(op=OptOps.UPCAST, axis=0, arg=4)) - k.to_program() + uops = get_program(k.get_optimized_ast(), k.opts).uops - assert TestFloat4.count_float4(k.uops) == (1, 1) + assert TestFloat4.count_float4(uops) == (1, 1) def test_half4_load_unrolled(self): # from llama 7B shard 4 gpus @@ -1109,7 +1107,7 @@ def _helper_linearizer_opt_ast(realized_ast:UOp, real_bufs:list[Buffer], opts=[] outbufs = [real_bufs[x.src[0].base.arg] for x in realized_ast.src] device = real_bufs[0].device - def get_prg(k:Kernel): return CompiledRunner(replace(k.to_program(), device=device)) + def get_prg(k:Kernel): return 
CompiledRunner(replace(get_program(k.get_optimized_ast(), k.opts), device=device)) def check_opt(opts, create_k, expected_color_size): k = create_k() diff --git a/test/test_linearizer_dumb.py b/test/test_linearizer_dumb.py index 669d0a5d02..a5dffaf11f 100644 --- a/test/test_linearizer_dumb.py +++ b/test/test_linearizer_dumb.py @@ -10,6 +10,7 @@ from tinygrad.helpers import getenv from tinygrad.shape.shapetracker import ShapeTracker, View from tinygrad.opt.search import Opt, OptOps from tinygrad.opt.kernel import Kernel +from tinygrad.engine.realize import get_program class TestLinearizerDumb(unittest.TestCase): @unittest.skipUnless(Device.DEFAULT == "METAL", "only tested on METAL") @@ -37,12 +38,12 @@ class TestLinearizerDumb(unittest.TestCase): opts = [Opt(op=OptOps.TC, axis=2, arg=(-1, 2, 1)), Opt(op=OptOps.UPCAST, axis=2, arg=0), Opt(op=OptOps.UNROLL, axis=1, arg=0)] k = Kernel(ast, opts=Device["METAL"].renderer) k.apply_opts(opts) - prg = k.to_program() + prg = get_program(k.get_optimized_ast(), k.opts) print(prg.src) Device[Device.DEFAULT].compiler.compile_cached(prg.src) gate_count = len([x for x in prg.src.splitlines() if "if" in x]) assert gate_count == 1, f"must have only one gate {gate_count} != 1" - assert len([u for u in k.uops if u.op is Ops.IF]) == 1, "must have a single IF" + assert len([u for u in prg.uops if u.op is Ops.IF]) == 1, "must have a single IF" @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "need local") def test_max_simplify_and_cancel(self): @@ -76,7 +77,7 @@ class TestLinearizerDumb(unittest.TestCase): opts = [Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8)] k = Kernel(ast, opts=Device[Device.DEFAULT].renderer) k.apply_opts(opts) - prg = k.to_program() + prg = get_program(k.get_optimized_ast(), k.opts) print(prg.src) assert prg.uops is not None and not any(uop.op is Ops.MAX for uop in prg.uops), "leftover MAX" @@ -93,9 +94,9 @@ class TestLinearizerDumb(unittest.TestCase): opts = 
[Opt(op=OptOps.GROUP, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.LOCAL, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=0)] k = Kernel(ast, opts=Device[Device.DEFAULT].renderer) k.apply_opts(opts) - prg = k.to_program() + prg = get_program(k.get_optimized_ast(), k.opts) print(prg.src) - if_uops = [u for u in k.uops if u.op is Ops.IF] + if_uops = [u for u in prg.uops if u.op is Ops.IF] self.assertIn(len(if_uops), {1,2,3}) conditions = if_uops[0].src[0].toposort() self.assertLessEqual(len(conditions), 9) @@ -134,7 +135,7 @@ class TestLinearizerDumb(unittest.TestCase): UOp(Ops.VIEW, dtypes.half.ptr(131072000), arg=ShapeTracker(views=(View(shape=(4096, 32000, 1), strides=(1, 4096, 0), offset=0, mask=None, contiguous=False),)), src=( UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(131072000), arg=2, src=()),)),)),)),)),)),)),)),)) k = Kernel(ast, opts=Device[Device.DEFAULT].renderer) - prg = k.to_program() + prg = get_program(k.get_optimized_ast(), k.opts) print(prg.src) @unittest.expectedFailure @@ -163,7 +164,7 @@ class TestLinearizerDumb(unittest.TestCase): opts = [Opt(op=OptOps.UNROLL, axis=0, arg=0)] k = Kernel(ast, opts=Device[Device.DEFAULT].renderer) k.apply_opts(opts) - prg = k.to_program() + prg = get_program(k.get_optimized_ast(), k.opts) print(prg.src) - load_idxs = [x.src[1] for x in k.uops if x.op is Ops.LOAD and x.src[0].arg == 2] + load_idxs = [x.src[1] for x in prg.uops if x.op is Ops.LOAD and x.src[0].arg == 2] assert load_idxs[0] < load_idxs[1], f"first loaded idx {load_idxs[0].arg} then {load_idxs[1].arg}!"
@@ -187,7 +188,7 @@ class TestLinearizerDumb(unittest.TestCase): opts = [Opt(op=OptOps.UPCAST, axis=3, arg=0), Opt(op=OptOps.UPCAST, axis=2, arg=0)] k = Kernel(ast, opts=Device[Device.DEFAULT].renderer) k.apply_opts(opts) - prg = k.to_program() + prg = get_program(k.get_optimized_ast(), k.opts) print(prg.src) - store_idxs = [x.src[1] for x in k.uops if x.op is Ops.STORE] + store_idxs = [x.src[1] for x in prg.uops if x.op is Ops.STORE] for i in range(len(store_idxs) - 1): diff --git a/test/test_linearizer_overflows.py b/test/test_linearizer_overflows.py index baeb0d533b..76d608bc5a 100644 --- a/test/test_linearizer_overflows.py +++ b/test/test_linearizer_overflows.py @@ -14,7 +14,6 @@ from tinygrad.shape.view import View def _test_overflow(ast, opts): lin = Kernel(ast) lin.apply_opts(opts) - lin.to_program() bufs = bufs_from_lin(lin) print(bufs) time_linearizer(lin, bufs) diff --git a/test/test_opt_gemm.py b/test/test_opt_gemm.py index f38caf2f33..3b31d0a2ad 100644 --- a/test/test_opt_gemm.py +++ b/test/test_opt_gemm.py @@ -3,7 +3,7 @@ import unittest from tinygrad import Tensor from tinygrad.helpers import get_single_element from tinygrad.opt.kernel import Kernel, Opt, OptOps -from tinygrad.engine.realize import CompiledRunner, ExecItem +from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program class TestOptGemm(unittest.TestCase): @classmethod @@ -19,7 +19,7 @@ class TestOptGemm(unittest.TestCase): si = get_single_element(t.schedule()) k = Kernel(si.ast) k.apply_opts(opts) - run = CompiledRunner(k.to_program()) + run = CompiledRunner(get_program(k.get_optimized_ast(), k.opts)) ExecItem(run, si.bufs).run() test = si.bufs[0].numpy().reshape(self.res.shape) np.testing.assert_allclose(self.res, test, atol=1e-4) diff --git a/test/test_quantize_onnx.py b/test/test_quantize_onnx.py index 20b3062c2d..e2d0b984ac 100644 --- a/test/test_quantize_onnx.py +++ b/test/test_quantize_onnx.py @@ -5,7 +5,7 @@ from dataclasses import replace from tinygrad import Tensor, Context, Device, dtypes from tinygrad.uop.ops import Ops, UOp #
noqa: F401 # pylint: disable=unused-import from tinygrad.opt.kernel import Kernel, Opt, OptOps -from tinygrad.engine.realize import CompiledRunner, ExecItem, lower_schedule_item +from tinygrad.engine.realize import CompiledRunner, ExecItem, lower_schedule_item, get_program from tinygrad.opt.search import bufs_from_lin from tinygrad.shape.shapetracker import ShapeTracker, View # noqa: F401 # pylint: disable=unused-import @@ -41,7 +41,7 @@ def sexec(out:Tensor, opts:list[Opt], replace_src=None, run_count=3): k = Kernel(si.ast, opts=Device[Device.DEFAULT].renderer) #opts = [Opt(op=OptOps.UPCAST, axis=0, arg=128)] #, Opt(op=OptOps.UNROLL, axis=0, arg=4)] k.apply_opts(opts) - prg = k.to_program() + prg = get_program(k.get_optimized_ast(), k.opts) if replace_src is not None: old_name = prg.src.split("__attribute__((noinline)) void ")[1].split("(")[0] prg = replace(prg, src=replace_src + "/* DSP boilerplate */" + prg.src.split("/* DSP boilerplate */")[1].replace(old_name, "fxn")) @@ -296,7 +296,7 @@ class TestDSPCache(unittest.TestCase): with Context(DEVECTORIZE=0, QUANTIZE=1): k = Kernel(ast, opts=Device[Device.DEFAULT].renderer) k.apply_opts(opts) - prg = k.to_program() + prg = get_program(k.get_optimized_ast(), k.opts) #print(prg.src) new_src = """ diff --git a/test/test_uops_stats.py b/test/test_uops_stats.py index b534c0b0e1..657b837b93 100644 --- a/test/test_uops_stats.py +++ b/test/test_uops_stats.py @@ -1,7 +1,7 @@ import unittest from tinygrad import Tensor from tinygrad.helpers import getenv, GlobalCounters -from tinygrad.engine.realize import lower_schedule_item, ProgramSpec +from tinygrad.engine.realize import lower_schedule_item, ProgramSpec, get_program from tinygrad.renderer import Estimates from tinygrad.codegen import full_rewrite from tinygrad.uop.ops import Ops, UOp @@ -173,7 +173,8 @@ class TestStatsOptimized(unittest.TestCase): self.assertEqual(p.estimates.mem, 3*N*N*4) # 3 NxN mats with floats def test_gemm(self): - p = 
Kernel(self.ast_gemm).to_program() + k = Kernel(self.ast_gemm) + p = get_program(k.get_optimized_ast(), k.opts) self.check_gemm(p) self.assertEqual(p.estimates.lds, 2*N*N*N*4 + 4*N*N) @@ -181,7 +182,7 @@ class TestStatsOptimized(unittest.TestCase): k = Kernel(self.ast_gemm) if not k.apply_tensor_cores(): self.skipTest("no tensor cores") k.apply_opt(Opt(OptOps.UNROLL, 0, 2)) - p = k.to_program() + p = get_program(k.get_optimized_ast(), k.opts) print(p.src) self.check_gemm(p) @@ -190,7 +191,7 @@ class TestStatsOptimized(unittest.TestCase): def test_gemm_one_upcasted(self): k = Kernel(self.ast_gemm) k.apply_opt(Opt(OptOps.UPCAST, 0, 4)) - p = k.to_program() + p = get_program(k.get_optimized_ast(), k.opts) self.check_gemm(p) self.assertEqual(p.estimates.lds, N*N*N*4 + N*N*N*4//4 + 4*N*N) @@ -199,7 +200,7 @@ class TestStatsOptimized(unittest.TestCase): k.apply_opt(Opt(OptOps.UPCAST, 0, 4)) k.apply_opt(Opt(OptOps.UPCAST, 1, 4)) k.apply_opt(Opt(OptOps.UNROLL, 0, 4)) - p = k.to_program() + p = get_program(k.get_optimized_ast(), k.opts) self.check_gemm(p) self.assertEqual(p.estimates.lds, 2*N*N*N*4//4 + 4*N*N) @@ -212,7 +213,7 @@ class TestStatsOptimized(unittest.TestCase): k.apply_opt(Opt(OptOps.LOCAL, 1, 5)) except KernelOptError: raise unittest.SkipTest("no locals") - p = k.to_program() + p = get_program(k.get_optimized_ast(), k.opts) self.check_gemm(p) self.assertEqual(p.estimates.lds, 2*N*N*N*4//4 + 4*N*N) @@ -223,14 +224,14 @@ class TestStatsOptimized(unittest.TestCase): except KernelOptError: raise unittest.SkipTest("no locals") SZ = N*N*4 - p = k.to_program() + p = get_program(k.get_optimized_ast(), k.opts) # NOTE: these are sort of wrong. 
they aren't honoring the IF statement self.check_gemm(p, extra_flops=SZ*4) self.assertEqual(p.estimates.lds, 2*N*N*N*4 + SZ*4 + (SZ*4 + 4*N*N)*4) def test_reduce(self): k = Kernel(self.ast_reduce) - p = k.to_program() + p = get_program(k.get_optimized_ast(), k.opts) print(p.name, p.estimates.ops, p.estimates.mem, p.estimates.lds) self.assertEqual(p.estimates.ops, N*N) self.assertEqual(p.estimates.mem, N*N*4 + 4) @@ -241,7 +242,7 @@ class TestStatsOptimized(unittest.TestCase): k.apply_opt(Opt(OptOps.GROUP, 0, 50)) except KernelOptError: raise unittest.SkipTest("no locals") - p = k.to_program() + p = get_program(k.get_optimized_ast(), k.opts) # NOTE: these are wrong, they don't respect the if statement print(p.name, p.estimates.ops, p.estimates.mem, p.estimates.lds) diff --git a/tinygrad/opt/kernel.py b/tinygrad/opt/kernel.py index a067c4c0e5..2b1f8ec233 100644 --- a/tinygrad/opt/kernel.py +++ b/tinygrad/opt/kernel.py @@ -9,7 +9,7 @@ from tinygrad.uop.ops import GroupOp, KernelInfo, UOp, Ops, can_pad, resolve, Va from tinygrad.uop.spec import type_verify, ast_spec from tinygrad.device import Device from tinygrad.opt.tc import TensorCore -from tinygrad.renderer import Renderer, ProgramSpec +from tinygrad.renderer import Renderer from tinygrad.dtype import ImageDType from tinygrad.helpers import all_same, colored, ansilen, dedup, prod, round_up, to_function_name, unwrap, DEBUG, TC_SELECT, TC_OPT, AMX from tinygrad.shape.shapetracker import ShapeTracker @@ -503,11 +503,3 @@ class Kernel: fixed_ast = fixup_ast(self.ast) del fixup_ast return graph_rewrite(fixed_ast, view_left, name="fixup optimized AST") - - # TODO: update the tests and delete these methods - - def to_program(self, name_override:Optional[str]=None) -> ProgramSpec: - from tinygrad.engine.realize import get_program - ret = get_program(self.get_optimized_ast(name_override), self.opts) - self.uops = ret.uops - return ret diff --git a/tinygrad/opt/search.py b/tinygrad/opt/search.py index db0591b852..d3bb084471 
100644 --- a/tinygrad/opt/search.py +++ b/tinygrad/opt/search.py @@ -9,7 +9,7 @@ from tinygrad.helpers import IGNORE_BEAM_CACHE, TC_SEARCH_OVER_SHAPE from tinygrad.dtype import ImageDType, PtrDType from tinygrad.opt.kernel import Kernel, Opt, OptOps, KernelOptError from tinygrad.tensor import Tensor -from tinygrad.engine.realize import CompiledRunner +from tinygrad.engine.realize import CompiledRunner, get_program from tinygrad.renderer import ProgramSpec actions = [Opt(op=OptOps.UPCAST, axis=axis, arg=amt) for amt in [0,2,3,4,5,7] for axis in range(8)] @@ -64,7 +64,7 @@ def _try_compile_linearized_w_idx(x:tuple[int,Kernel], compiler:Compiler) -> tup signal.alarm(getenv("BEAM_TIMEOUT_SEC", 10)) ret = None try: - p = x[1].copy().to_program(name_override="test") + p = get_program(x[1].copy().get_optimized_ast(name_override="test"), x[1].opts) assert p.uops is not None, "uop list wasn't generated?" if len(p.uops) >= (uops_max:=getenv("BEAM_UOPS_MAX", 3000)) > 0: if getenv("BEAM_LOG_SURPASS_MAX"): print(f"too many uops. {len(p.uops)=}, {uops_max=}")