diff --git a/test/external/speed_beam_v_hcopt.py b/test/external/speed_beam_v_hcopt.py
deleted file mode 100644
index b241eb1b67..0000000000
--- a/test/external/speed_beam_v_hcopt.py
+++ /dev/null
@@ -1,41 +0,0 @@
-from tinygrad import Device
-from tinygrad.helpers import getenv, DEBUG, BEAM
-from tinygrad.codegen.opt.search import beam_search, bufs_from_lin
-from tinygrad.codegen.opt.heuristic import hand_coded_optimizations
-from extra.optimization.helpers import load_worlds, ast_str_to_lin, time_linearizer
-
-if __name__ == "__main__":
-  filter_reduce = bool(getenv("FILTER_REDUCE"))
-  ast_strs = load_worlds(filter_reduce=filter_reduce, filter_novariable=True)
-  dev = Device[Device.DEFAULT]
-
-  test_n = getenv("TEST_N", 10)
-  single = getenv("NUM", -1)
-  if single != -1: ast_strs = ast_strs[single:single+1]
-
-  beam_won, tested = 0, 0
-
-  for num, ast in enumerate(ast_strs[:test_n]):
-    def new_lin(): return ast_str_to_lin(ast, opts=dev.renderer)
-
-    k = new_lin()
-
-    if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.apply_opts(hand_coded_optimizations(k))
-
-    assert BEAM > 0
-
-    lins = [(("tc" if used_tensor_cores else "hc"), k)]
-    if used_tensor_cores:
-      lins.append(("hc", new_lin()))
-      lins[-1][1].apply_opts(hand_coded_optimizations(lins[-1][1]))
-    kb = new_lin()
-    test_rawbuffers = bufs_from_lin(kb) # allocate scratch buffers for optimization
-    lins.append((f"beam{BEAM.value}", beam_search(kb, test_rawbuffers, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))))
-    timed = sorted([(nm, tk, time_linearizer(tk, test_rawbuffers, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])
-    if DEBUG >= 1: print(" < ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
-
-    tested += 1
-    if timed[0][0].startswith("beam"):
-      beam_won += 1
-
-  print(f"{beam_won=} / {tested=} = {beam_won/tested:.3f}")
\ No newline at end of file
diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index 312fdba932..722fba84a8 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -616,7 +616,8 @@ class TestLinearizer(unittest.TestCase):
     """
     x, y = Tensor.randn(64,64), Tensor.randn(64,64)
     out = x.matmul(y)
-    k = helper_linearizer_opt(out)[-1]
+    with Context(TC=0):
+      k = helper_linearizer_opt(out)[-1]
     uops = get_program(k.ast, k.opts, k.applied_opts).uops
     # check that the float4 cast collapses
     store_vals = [u.src[1] for u in uops if u.op is Ops.STORE and u.src[0].dtype.addrspace != AddrSpace.REG]
diff --git a/tinygrad/codegen/opt/__init__.py b/tinygrad/codegen/opt/__init__.py
index d879a2e561..6024fb8532 100644
--- a/tinygrad/codegen/opt/__init__.py
+++ b/tinygrad/codegen/opt/__init__.py
@@ -3,7 +3,7 @@
 from tinygrad.codegen.opt.kernel import Kernel
 from tinygrad.codegen.opt.heuristic import hand_coded_optimizations
 from tinygrad.uop.ops import UOp, PatternMatcher, UPat, Ops, KernelInfo
-from tinygrad.helpers import NOOPT, BEAM, USE_TC, getenv
+from tinygrad.helpers import NOOPT, BEAM, getenv
 from tinygrad.renderer import Renderer
 from tinygrad.uop.spec import type_verify

@@ -25,7 +25,7 @@ def get_optimized_ast(ast:UOp, renderer:Renderer) -> UOp|None:
   if new_arg is None:
     k = Kernel(ast, opts=renderer)
     if not NOOPT:
-      if not k.apply_tensor_cores(USE_TC.value): k.apply_opts(hand_coded_optimizations(k))
+      k.apply_opts(hand_coded_optimizations(k))
       if BEAM >= 1:
         from tinygrad.codegen.opt.search import beam_search, bufs_from_lin
         kb = Kernel(ast, opts=renderer)
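Reviewer note: the `tinygrad/codegen/opt/__init__.py` hunk above is the behavioral core of this change. Tensor-core matching moves out of the call site and into `hand_coded_optimizations` itself, so `get_optimized_ast` no longer branches on `apply_tensor_cores`. A minimal before/after sketch of the call-site pattern (names taken from the diff; not runnable on its own):

```python
# before: TC matching was a separate step, with heuristics as the fallback
k = Kernel(ast, opts=renderer)
if not NOOPT:
  if not k.apply_tensor_cores(USE_TC.value): k.apply_opts(hand_coded_optimizations(k))

# after: hand_coded_optimizations tries TC internally (gated by USE_TC) and
# returns the tensor-core opt list on success, so the call site is one line
k = Kernel(ast, opts=renderer)
if not NOOPT:
  k.apply_opts(hand_coded_optimizations(k))
```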
diff --git a/tinygrad/codegen/opt/heuristic.py b/tinygrad/codegen/opt/heuristic.py
index 40b3a9d3cc..aa3bc65e40 100644
--- a/tinygrad/codegen/opt/heuristic.py
+++ b/tinygrad/codegen/opt/heuristic.py
@@ -1,10 +1,46 @@
 import itertools
 from tinygrad.codegen.opt.kernel import Kernel, Opt, OptOps, KernelOptError, AxisType
-from tinygrad.helpers import getenv, DEBUG, prod, NOLOCALS
+from tinygrad.helpers import getenv, DEBUG, prod, NOLOCALS, TC_OPT, TC_SELECT, USE_TC, AMX
 from tinygrad.dtype import ImageDType
 from tinygrad.uop.ops import Ops, resolve

 def hand_coded_optimizations(k:Kernel) -> list[Opt]:
+  # first try the tensor cores
+  """ Attempts to apply a tensor core optimization to the kernel. If one exists and applies properly, return true, otherwise return false.
+  Tensor cores are optimized instructions that matrix multiply-accumulate across a wave of threads: D(M, N) = A(M, K) * B(K, N) + C(M, N).
+
+  Keyword arguments:
+  use_tensor_cores -- controls how tensor cores are applied (default 1)
+    0: will disable any tensor core matching
+    1: enable tensor cores
+    2: apply tensor core shape but don't use UOp.WMMA
+  extra_opts -- additional Opt's to apply after the tensor core instead of the hand-coded additional Opt's (default None)
+  tc_select -- specifies which tensor core(s) to use for optimization (default -1)
+    -1: iterates through all available tensor cores in order and uses the first one that matches the requirements (dims and dtypes)
+    [0-N]: uses only the n'th tensor core available; useful for search
+  tc_opt -- controls which kinds of kernels may be eligible for tensor cores application (default 2 during BEAM, 0 otherwise)
+    0: applies to only kernels with a single reduce axis and direct Ops.LOAD into Ops.MUL
+    1: allows kernels with multiple reduce axes and also multiplication of Ops.CAST'd buffers
+    2: allows kernels with M, N, K axes that are not multiples of the tensor core dimensions by applying padding those axes as needed
+  """
+  if USE_TC > 0:
+    try: # check TC first and apply hand-coded opts if successful
+      tk = k.copy()
+      tk.apply_opt(Opt(OptOps.TC, 0, (TC_SELECT.value, TC_OPT.value, USE_TC.value)))
+
+      # skip hand-coded TC opts if AMX, upcasting will make kernel slower
+      if (tc_opts:=tk.tensor_core_opts) is not None and not AMX:
+        # hand-coded TC opts
+        for tc_dim in [tc_dim for tc_dim in [1,0] if tc_opts.axes_exist[tc_dim]]: # attempt to upcast M and N
+          szs = [sz for sz in [5,4,3,2] if tk.full_shape[tc_opts.axes[tc_dim]] % sz == 0]
+          if szs: tk.apply_opt(Opt(OptOps.UPCAST, tc_opts.axes[tc_dim], szs[0]))
+
+        if tc_opts.axes_exist[0] and (szs := [sz for sz in [4,2] if tk.full_shape[tc_opts.axes[0]] % sz == 0]): # attempt to local N
+          tk.apply_opt(Opt(OptOps.LOCAL, tc_opts.axes[0], szs[0]))
+      return tk.applied_opts
+    except KernelOptError:
+      pass
+
   # make a copy so it does not mutate the input
   k = k.copy()
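Since the TC attempt is now gated by context variables instead of a method argument, callers tune it through `Context` (or the matching environment variables), as the `test_linearizer.py` hunk above does with `Context(TC=0)`. A short usage sketch, assuming `TC`, `TC_SELECT`, and `TC_OPT` remain the env-var names behind `USE_TC`, `TC_SELECT`, and `TC_OPT`:

```python
from tinygrad import Tensor
from tinygrad.helpers import Context

x, y = Tensor.randn(64, 64), Tensor.randn(64, 64)

# TC=0 disables tensor-core matching entirely, so hand_coded_optimizations
# falls through to the plain heuristic path (what the test change relies on)
with Context(TC=0):
  (x @ y).realize()

# TC=2 applies the tensor-core shape without emitting UOp.WMMA, and
# TC_OPT=2 additionally allows padding M/N/K to tensor-core multiples
with Context(TC=2, TC_OPT=2):
  (x @ y).realize()
```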
diff --git a/tinygrad/codegen/opt/kernel.py b/tinygrad/codegen/opt/kernel.py
index 7efe47e28a..43615d988d 100644
--- a/tinygrad/codegen/opt/kernel.py
+++ b/tinygrad/codegen/opt/kernel.py
@@ -11,7 +11,7 @@ from tinygrad.device import Device
 from tinygrad.codegen.opt.tc import TensorCore
 from tinygrad.renderer import Renderer
 from tinygrad.dtype import ImageDType
-from tinygrad.helpers import all_same, colored, ansilen, dedup, prod, round_up, to_function_name, unwrap, argfix, DEBUG, TC_SELECT, TC_OPT, AMX
+from tinygrad.helpers import all_same, colored, ansilen, dedup, prod, round_up, to_function_name, unwrap, argfix, DEBUG
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import strides_for_shape, get_contraction
 from tinygrad.codegen.opt.swizzler import view_left, view_left_through_load
@@ -399,41 +399,6 @@ class Kernel:
       return True
     return False

-  def apply_tensor_cores(self, use_tensor_cores=1) -> bool: # , extra_opts:list[Opt]|None=None) -> bool:
-    """ Attempts to apply a tensor core optimization to the kernel. If one exists and applies properly, return true, otherwise return false.
-    Tensor cores are optimized instructions that matrix multiply-accumulate across a wave of threads: D(M, N) = A(M, K) * B(K, N) + C(M, N).
-
-    Keyword arguments:
-    use_tensor_cores -- controls how tensor cores are applied (default 1)
-      0: will disable any tensor core matching
-      1: enable tensor cores
-      2: apply tensor core shape but don't use UOp.WMMA
-    extra_opts -- additional Opt's to apply after the tensor core instead of the hand-coded additional Opt's (default None)
-    tc_select -- specifies which tensor core(s) to use for optimization (default -1)
-      -1: iterates through all available tensor cores in order and uses the first one that matches the requirements (dims and dtypes)
-      [0-N]: uses only the n'th tensor core available; useful for search
-    tc_opt -- controls which kinds of kernels may be eligible for tensor cores application (default 2 during BEAM, 0 otherwise)
-      0: applies to only kernels with a single reduce axis and direct Ops.LOAD into Ops.MUL
-      1: allows kernels with multiple reduce axes and also multiplication of Ops.CAST'd buffers
-      2: allows kernels with M, N, K axes that are not multiples of the tensor core dimensions by applying padding those axes as needed
-    """
-    if not self.opts.tensor_cores: return False
-    try: # check TC first and apply hand-coded opts if successful
-      self.apply_opt(Opt(OptOps.TC, 0, (TC_SELECT.value, TC_OPT.value, use_tensor_cores)))
-
-      if (tc_opts:=self.tensor_core_opts) is not None:
-        if AMX: return True # skip hand-coded TC opts if AMX, upcasting will make kernel slower
-        # hand-coded TC opts
-        for tc_dim in [tc_dim for tc_dim in [1,0] if tc_opts.axes_exist[tc_dim]]: # attempt to upcast M and N
-          szs = [sz for sz in [5,4,3,2] if self.full_shape[tc_opts.axes[tc_dim]] % sz == 0]
-          if szs: self.apply_opt(Opt(OptOps.UPCAST, tc_opts.axes[tc_dim], szs[0]))
-
-        if tc_opts.axes_exist[0] and (szs := [sz for sz in [4,2] if self.full_shape[tc_opts.axes[0]] % sz == 0]): # attempt to local N
-          self.apply_opt(Opt(OptOps.LOCAL, tc_opts.axes[0], szs[0]))
-      return True
-    except KernelOptError:
-      return False
-
   # strings like ['g0', 'g1', 'l0', 'l1', 'l2', 'l3', 'l4', 'l5', 'R0', 'r0', 'r1', 'r2', 'u0', 'u1', 'u2']
   def shape_str(self) -> list[str]:
     ret: list[str] = []
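With `Kernel.apply_tensor_cores` deleted, external callers that branched on its boolean result (as the removed `test/external/speed_beam_v_hcopt.py` did) need another way to tell whether a tensor core was taken. One possible port, a sketch only: since `hand_coded_optimizations` returns the applied `Opt` list, a hypothetical helper can check it for an `OptOps.TC` entry.

```python
from tinygrad.codegen.opt.kernel import Kernel, OptOps
from tinygrad.codegen.opt.heuristic import hand_coded_optimizations

def optimize_with_tc_flag(k: Kernel) -> bool:
  # hypothetical helper, not part of this diff: apply the heuristic opts
  # (which now include the TC attempt when USE_TC > 0) and report whether
  # a tensor-core opt made it into the list
  opts = hand_coded_optimizations(k)
  k.apply_opts(opts)
  return any(opt.op is OptOps.TC for opt in opts)
```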