diff --git a/extra/gemm/max_matmul.py b/extra/gemm/max_matmul.py
index 37c346bf4c..5cf78fc0ef 100644
--- a/extra/gemm/max_matmul.py
+++ b/extra/gemm/max_matmul.py
@@ -54,7 +54,6 @@ def randoms():
 
 def ast_to_cuda_prog(compiler, ast, opts):
   k = Kernel(ast)
-  k.required_optimizations()
   k.apply_opts(opts)
   p = k.to_program()
   return CUDAProgram(device, p.function_name, compiler.compile(p.src))
diff --git a/test/external/fuzz_linearizer.py b/test/external/fuzz_linearizer.py
index 8af927237f..308fd008be 100644
--- a/test/external/fuzz_linearizer.py
+++ b/test/external/fuzz_linearizer.py
@@ -132,7 +132,6 @@ def compare_linearizer(lin: Kernel, rawbufs=None, var_vals=None, ground_truth=No
   if ground_truth is None and not has_bf16:
     unoptimized = Kernel(lin.ast)
-    unoptimized.required_optimizations()
     if run_linearizer(unoptimized, rawbufs, var_vals)[0] != "PASS":
       return ("BASELINE_ERROR", rawbufs, var_vals, ground_truth, None)
     ground_truth = np.frombuffer(rawbufs[0].as_buffer(), _to_np_dtype(rawbufs[0].dtype)).copy()
diff --git a/test/external/speed_beam_v_hcopt.py b/test/external/speed_beam_v_hcopt.py
index f2cadb530c..f6d2b8e5e1 100644
--- a/test/external/speed_beam_v_hcopt.py
+++ b/test/external/speed_beam_v_hcopt.py
@@ -19,7 +19,6 @@ if __name__ == "__main__":
     def new_lin(): return ast_str_to_lin(ast, opts=dev.renderer)
     k = new_lin()
-    # k.required_optimizations()
     if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))):
       k = hand_coded_optimizations(k)
@@ -30,7 +29,6 @@ if __name__ == "__main__":
     lins.append(("hc", new_lin()))
     lins[-1][1] = hand_coded_optimizations(lins[-1][1])
     kb = new_lin()
-    # kb.required_optimizations()
     test_rawbuffers = bufs_from_lin(kb)  # allocate scratch buffers for optimization
     lins.append((f"beam{BEAM.value}", beam_search(kb, test_rawbuffers, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))))
     timed = sorted([(nm, tk, time_linearizer(tk, test_rawbuffers, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])
diff --git a/test/external/verify_kernel.py b/test/external/verify_kernel.py
index a56b3d7fe6..55ebb6849b 100644
--- a/test/external/verify_kernel.py
+++ b/test/external/verify_kernel.py
@@ -51,7 +51,6 @@ if __name__ == "__main__":
     print(test_lin.ast)
     print(test_lin.applied_opts)
     unoptimized_lin = Kernel(test_lin.ast)
-    unoptimized_lin.required_optimizations()
     print(f"{unoptimized_lin.colored_shape()} -> {test_lin.colored_shape()}")
     (msg,rb,vv,gt) = compare_linearizer(test_lin, None, None, None, rtol=args.rtol, atol=args.atol)
     if msg != "PASS":
diff --git a/test/test_linearizer_dumb.py b/test/test_linearizer_dumb.py
index 11216155ca..003273d2db 100644
--- a/test/test_linearizer_dumb.py
+++ b/test/test_linearizer_dumb.py
@@ -37,7 +37,6 @@ class TestLinearizerDumb(unittest.TestCase):
               UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(64, 1, 512, 7, 7, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),))
     opts = [Opt(op=OptOps.TC, axis=2, arg=(-1, 2)), Opt(op=OptOps.UPCAST, axis=2, arg=0), Opt(op=OptOps.UNROLL, axis=1, arg=0)]
     k = Kernel(ast, opts=Device["METAL"].renderer)
-    k.required_optimizations()
     k.apply_opts(opts)
     prg = k.to_program()
     print(prg.src)
@@ -72,7 +71,6 @@ class TestLinearizerDumb(unittest.TestCase):
               UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1000, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),))
     opts = [Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8)]
     k = Kernel(ast, opts=Device[Device.DEFAULT].renderer)
-    k.required_optimizations()
     k.apply_opts(opts)
     prg = k.to_program()
     print(prg.src)
@@ -90,7 +88,6 @@ class TestLinearizerDumb(unittest.TestCase):
              UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(26, 49), strides=(0, -1), offset=48, mask=((0, 26), (24, 49)), contiguous=False), View(shape=(25, 25), strides=(1, 50), offset=0, mask=None, contiguous=False))), src=()),)),)),)),))
     opts = [Opt(op=OptOps.GROUP, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.LOCAL, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=0)]
     k = Kernel(ast, opts=Device[Device.DEFAULT].renderer)
-    k.required_optimizations()
     k.apply_opts(opts)
     prg = k.to_program()
     print(prg.src)
diff --git a/tinygrad/codegen/devectorizer.py b/tinygrad/codegen/devectorizer.py
index f15f11cdb1..897bed406f 100644
--- a/tinygrad/codegen/devectorizer.py
+++ b/tinygrad/codegen/devectorizer.py
@@ -22,7 +22,8 @@ def simplify_valid_load(buf:UOp, start_idx:UOp, valid:UOp) -> UOp|None:
   # can drop valid if idx is out of bound when valid is False
   drop_stmt = []
   for stmt in split_uop(valid, Ops.AND):
-    X, is_upper_bound, c = parse_valid(stmt)
+    try: X, is_upper_bound, c = parse_valid(stmt)
+    except ValueError: return None
 
     # for X0 + X1 + ... >= 1, check if it's out of bound when Xi = 0 for all i
     if not is_upper_bound and c == 1 and all(u.op in GroupOp.Irreducible and u.vmin == 0 for u in split_uop(X, Ops.ADD)):
diff --git a/tinygrad/codegen/heuristic.py b/tinygrad/codegen/heuristic.py
index 9068fc68e2..8ac6722f0c 100644
--- a/tinygrad/codegen/heuristic.py
+++ b/tinygrad/codegen/heuristic.py
@@ -6,7 +6,7 @@ from tinygrad.ops import Ops, resolve
 
 def hand_coded_optimizations(k:Kernel) -> Kernel:
   # make a copy so it does not mutate the input
-  k = k.copy().required_optimizations()
+  k = k.copy()
 
   # should use matvec - TODO: adjust/tune based on the wide vs tall/large vs small mat
   MV_BLOCKSIZE, MV_THREADS_PER_ROW, MV_ROWS_PER_THREAD = getenv("MV_BLOCKSIZE", 4), getenv("MV_THREADS_PER_ROW", 8), getenv("MV_ROWS_PER_THREAD", 4)
diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py
index 1600bba956..7b87f3c565 100644
--- a/tinygrad/codegen/kernel.py
+++ b/tinygrad/codegen/kernel.py
@@ -432,13 +432,6 @@ class Kernel:
   def apply_opts(self, opts:Sequence[Opt]):
     for opt in opts: self.apply_opt(opt)
 
-  def required_optimizations(self) -> Kernel:
-    if isinstance(self.membufs[0].dtype, ImageDType):
-      unit_stride_axes_mul_4 = [i for i in self.sts[0].unit_stride_axes(ignore_valid=True) if self.sts[0].shape[i]%4 == 0]
-      assert unit_stride_axes_mul_4, f"needs a unit stride axis in {self.bufs[0]}"
-      if all(x < self.first_upcast for x in unit_stride_axes_mul_4): self.apply_opt(Opt(OptOps.UPCAST, unit_stride_axes_mul_4[0], 4))
-    return self
-
   # **** kernel outputs ****
 
   kernel_cnt: Final[defaultdict[str, int]] = defaultdict(int)
diff --git a/tinygrad/engine/realize.py b/tinygrad/engine/realize.py
index dd5c109383..b1279f0054 100644
--- a/tinygrad/engine/realize.py
+++ b/tinygrad/engine/realize.py
@@ -14,12 +14,12 @@ from tinygrad.engine.schedule import ScheduleItem
 
 logkerns, logkerns_level = open(getenv("LOGKERNS", ""), "a") if getenv("LOGKERNS", "") else None, getenv("LOGKERNS_LEVEL", 1)
 def get_kernel(renderer:Renderer, ast:UOp) -> Kernel:
-  k = Kernel(ast, opts=renderer).required_optimizations()
+  k = Kernel(ast, opts=renderer)
   if not NOOPT:
     if not k.apply_tensor_cores(getenv("TC", 1)): k = hand_coded_optimizations(k)
     if BEAM >= 1:
       from tinygrad.engine.search import beam_search, bufs_from_lin
-      kb = Kernel(ast, opts=renderer).required_optimizations()
+      kb = Kernel(ast, opts=renderer)
       rawbufs = bufs_from_lin(kb, allocate=False)
       k = beam_search(kb, rawbufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))
       if logkerns is not None: logkerns.writelines([f"{(k.ast, k.applied_opts)}\n"])
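
Not part of the patch: a minimal usage sketch of the new flow, assuming a tinygrad checkout with this diff applied. Since Kernel.required_optimizations() is removed, callers start from a bare Kernel(ast, opts=renderer) and every optimization comes from apply_opts, apply_tensor_cores, hand_coded_optimizations, or beam_search, as get_kernel() above now does. The Tensor/schedule calls below are assumptions about the surrounding tinygrad API, not something this diff touches.

# hypothetical sketch: drive get_kernel() from a small matmul
from tinygrad import Tensor, Device
from tinygrad.engine.realize import get_kernel

if __name__ == "__main__":
  out = Tensor.rand(64, 64) @ Tensor.rand(64, 64)
  si = out.schedule()[-1]  # last ScheduleItem holds the matmul ast
  k = get_kernel(Device[Device.DEFAULT].renderer, si.ast)
  print(k.applied_opts)    # opts now come only from heuristics / tensor cores / BEAM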