remove required_optimizations (#9848)

This commit is contained in:
chenyu
2025-04-19 16:51:16 -04:00
committed by GitHub
parent 218e01833d
commit 720f20865b
9 changed files with 5 additions and 19 deletions

View File

@@ -54,7 +54,6 @@ def randoms():
 def ast_to_cuda_prog(compiler, ast, opts):
   k = Kernel(ast)
-  k.required_optimizations()
   k.apply_opts(opts)
   p = k.to_program()
   return CUDAProgram(device, p.function_name, compiler.compile(p.src))

View File

@@ -132,7 +132,6 @@ def compare_linearizer(lin: Kernel, rawbufs=None, var_vals=None, ground_truth=No
   if ground_truth is None and not has_bf16:
     unoptimized = Kernel(lin.ast)
-    unoptimized.required_optimizations()
     if run_linearizer(unoptimized, rawbufs, var_vals)[0] != "PASS":
       return ("BASELINE_ERROR", rawbufs, var_vals, ground_truth, None)
     ground_truth = np.frombuffer(rawbufs[0].as_buffer(), _to_np_dtype(rawbufs[0].dtype)).copy()

View File

@@ -19,7 +19,6 @@ if __name__ == "__main__":
   def new_lin(): return ast_str_to_lin(ast, opts=dev.renderer)
   k = new_lin()
-  # k.required_optimizations()
   if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k = hand_coded_optimizations(k)
@@ -30,7 +29,6 @@ if __name__ == "__main__":
     lins.append(("hc", new_lin()))
     lins[-1][1] = hand_coded_optimizations(lins[-1][1])
     kb = new_lin()
-    # kb.required_optimizations()
     test_rawbuffers = bufs_from_lin(kb) # allocate scratch buffers for optimization
     lins.append((f"beam{BEAM.value}", beam_search(kb, test_rawbuffers, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))))
   timed = sorted([(nm, tk, time_linearizer(tk, test_rawbuffers, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])

View File

@@ -51,7 +51,6 @@ if __name__ == "__main__":
   print(test_lin.ast)
   print(test_lin.applied_opts)
   unoptimized_lin = Kernel(test_lin.ast)
-  unoptimized_lin.required_optimizations()
   print(f"{unoptimized_lin.colored_shape()} -> {test_lin.colored_shape()}")
   (msg,rb,vv,gt) = compare_linearizer(test_lin, None, None, None, rtol=args.rtol, atol=args.atol)
   if msg != "PASS":

View File

@@ -37,7 +37,6 @@ class TestLinearizerDumb(unittest.TestCase):
             UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(64, 1, 512, 7, 7, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),))
     opts = [Opt(op=OptOps.TC, axis=2, arg=(-1, 2)), Opt(op=OptOps.UPCAST, axis=2, arg=0), Opt(op=OptOps.UNROLL, axis=1, arg=0)]
     k = Kernel(ast, opts=Device["METAL"].renderer)
-    k.required_optimizations()
     k.apply_opts(opts)
     prg = k.to_program()
     print(prg.src)
@@ -72,7 +71,6 @@ class TestLinearizerDumb(unittest.TestCase):
             UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1000, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),))
     opts = [Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8)]
     k = Kernel(ast, opts=Device[Device.DEFAULT].renderer)
-    k.required_optimizations()
     k.apply_opts(opts)
     prg = k.to_program()
     print(prg.src)
@@ -90,7 +88,6 @@ class TestLinearizerDumb(unittest.TestCase):
             UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(26, 49), strides=(0, -1), offset=48, mask=((0, 26), (24, 49)), contiguous=False), View(shape=(25, 25), strides=(1, 50), offset=0, mask=None, contiguous=False))), src=()),)),)),)),))
     opts = [Opt(op=OptOps.GROUP, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.LOCAL, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=0)]
     k = Kernel(ast, opts=Device[Device.DEFAULT].renderer)
-    k.required_optimizations()
     k.apply_opts(opts)
     prg = k.to_program()
     print(prg.src)

View File

@@ -22,7 +22,8 @@ def simplify_valid_load(buf:UOp, start_idx:UOp, valid:UOp) -> UOp|None:
   # can drop valid if idx is out of bound when valid is False
   drop_stmt = []
   for stmt in split_uop(valid, Ops.AND):
-    X, is_upper_bound, c = parse_valid(stmt)
+    try: X, is_upper_bound, c = parse_valid(stmt)
+    except ValueError: return None
     # for X0 + X1 + ... >= 1, check if it's out of bound when Xi = 0 for all i
     if not is_upper_bound and c == 1 and all(u.op in GroupOp.Irreducible and u.vmin == 0 for u in split_uop(X, Ops.ADD)):

View File

@@ -6,7 +6,7 @@ from tinygrad.ops import Ops, resolve
 def hand_coded_optimizations(k:Kernel) -> Kernel:
   # make a copy so it does not mutate the input
-  k = k.copy().required_optimizations()
+  k = k.copy()
   # should use matvec - TODO: adjust/tune based on the wide vs tall/large vs small mat
   MV_BLOCKSIZE, MV_THREADS_PER_ROW, MV_ROWS_PER_THREAD = getenv("MV_BLOCKSIZE", 4), getenv("MV_THREADS_PER_ROW", 8), getenv("MV_ROWS_PER_THREAD", 4)

View File

@@ -432,13 +432,6 @@ class Kernel:
   def apply_opts(self, opts:Sequence[Opt]):
     for opt in opts: self.apply_opt(opt)
-  def required_optimizations(self) -> Kernel:
-    if isinstance(self.membufs[0].dtype, ImageDType):
-      unit_stride_axes_mul_4 = [i for i in self.sts[0].unit_stride_axes(ignore_valid=True) if self.sts[0].shape[i]%4 == 0]
-      assert unit_stride_axes_mul_4, f"needs a unit stride axis in {self.bufs[0]}"
-      if all(x < self.first_upcast for x in unit_stride_axes_mul_4): self.apply_opt(Opt(OptOps.UPCAST, unit_stride_axes_mul_4[0], 4))
-    return self
-
   # **** kernel outputs ****
   kernel_cnt: Final[defaultdict[str, int]] = defaultdict(int)

View File

@@ -14,12 +14,12 @@ from tinygrad.engine.schedule import ScheduleItem
 logkerns, logkerns_level = open(getenv("LOGKERNS", ""), "a") if getenv("LOGKERNS", "") else None, getenv("LOGKERNS_LEVEL", 1)
 def get_kernel(renderer:Renderer, ast:UOp) -> Kernel:
-  k = Kernel(ast, opts=renderer).required_optimizations()
+  k = Kernel(ast, opts=renderer)
   if not NOOPT:
     if not k.apply_tensor_cores(getenv("TC", 1)): k = hand_coded_optimizations(k)
     if BEAM >= 1:
       from tinygrad.engine.search import beam_search, bufs_from_lin
-      kb = Kernel(ast, opts=renderer).required_optimizations()
+      kb = Kernel(ast, opts=renderer)
       rawbufs = bufs_from_lin(kb, allocate=False)
       k = beam_search(kb, rawbufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))
   if logkerns is not None: logkerns.writelines([f"{(k.ast, k.applied_opts)}\n"])