diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 67ef980192..850f56a604 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -160,7 +160,7 @@ jobs:
         PYTHONPATH="." python test/external/dist/test_collectives.py
     - if: ${{ matrix.task == 'kopt' }}
       name: Test KOPT
-      run: PYTHONPATH="." KOPT=1 BUDGET=10 GPU=1 DEBUG=1 python -m pytest -rA -n=auto test/models/test_real_world.py
+      run: PYTHONPATH="." KOPT=1 BUDGET=20 GPU=1 DEBUG=1 python -m pytest -rA -n=auto test/models/test_real_world.py
 
   testmetalwebgpu:
     name: Metal and WebGPU Tests
diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py
index 9bedd0fbab..1625b8c000 100644
--- a/test/models/test_real_world.py
+++ b/test/models/test_real_world.py
@@ -1,17 +1,41 @@
 import unittest, time
+import numpy as np
 from tinygrad.tensor import Tensor
 from tinygrad.nn import optim
 from tinygrad.nn.state import get_parameters
 from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE
 from tinygrad.ops import GlobalCounters, LazyOp, LoadOps
 from tinygrad.ops import Device
-from tinygrad.helpers import CI, dtypes, getenv
+from tinygrad.helpers import CI, dtypes, getenv, prod
+from tinygrad.codegen.search import kernel_optimize_opts
 from examples.gpt2 import Transformer as GPT2Transformer, MODEL_PARAMS as GPT2_MODEL_PARAMS
 from examples.hlb_cifar10 import SpeedyResNet
 from examples.llama import Transformer as LLaMaTransformer, MODEL_PARAMS as LLAMA_MODEL_PARAMS
 from examples.stable_diffusion import UNetModel
 
+def kopt_search_hook(k, create_k, to_prg, baseline):
+  import nevergrad as ng
+  wanna_output = k.bufs[0].toCPU()
+  def check_opt(x):
+    try:
+      k = create_k()
+      k.process()
+      k.apply_auto_opt(x)
+      prg = to_prg(k)
+      first_tm = prg.exec(k.bufs, force_wait=True, optimizing=True)
+      np.testing.assert_allclose(wanna_output, k.bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
+      return first_tm
+    except Exception:
+      return 10000_000 # 10000 seconds is infinity
+  opts = kernel_optimize_opts(k)
+  if not opts: return "BASELINE"
+  search_space = prod([len(x.choices) for x in opts])
+  budget = getenv("BUDGET", 20) # THIS IS TEST BUDGET
+  optimizer = ng.optimizers.NGOpt(parametrization=ng.p.Tuple(*opts), budget=min(search_space, budget))
+  recommendation = optimizer.minimize(check_opt)
+  return recommendation.value if recommendation.loss < baseline else "BASELINE"
+
 def helper_test(nm, gen, train, max_memory_allowed, max_kernels_allowed):
   tms = []
   for _ in range(4):
@@ -43,6 +67,16 @@ def derandomize_model(model):
     p.realize()
 
 class TestRealWorld(unittest.TestCase):
+  def setUp(self):
+    np.random.seed(2002)
+    if getenv("KOPT"):
+      self.oldfunc = getattr(__import__("tinygrad.codegen.search", fromlist=["kernel_optimize_search"]), "kernel_optimize_search")
+      setattr(__import__("tinygrad.codegen.search", fromlist=["kernel_optimize_search"]), "kernel_optimize_search", kopt_search_hook)
+
+  def tearDown(self):
+    if getenv("KOPT"):
+      setattr(__import__("tinygrad.codegen.search", fromlist=["kernel_optimize_search"]), "kernel_optimize_search", self.oldfunc)
+
   @unittest.skipUnless(not CI, "too big for CI")
   def test_stable_diffusion(self):
     model = UNetModel()
diff --git a/tinygrad/codegen/search.py b/tinygrad/codegen/search.py
index 4bd21b0c7c..1311a0bc85 100644
--- a/tinygrad/codegen/search.py
+++ b/tinygrad/codegen/search.py
@@ -8,6 +8,20 @@ def get_divisors(n, min_div = 1, max_div = 512):
   for d in range(min_div, min(max_div, n//2) + 1):
     if n % d == 0: yield d
 
+def kernel_optimize_opts(k:Linearizer):
+  import nevergrad as ng
+  opts = []
+  if k.first_reduce < k.shape_len: # TODO: Grouped reduces do not work with other locals. More chances to mutate to 1, so locals can be used.
+    opts.append(ng.p.TransitionChoice([(0,s,"G") for s in get_divisors(k.full_shape[k.first_reduce], min_div=16) if all(st.shape[k.first_reduce] % s == 0 or st.shape[k.first_reduce] == 1 for st in k.sts)], transitions=(0.8, 0.2)))
+  for i in range(k.first_reduce):
+    # TODO: the upcast always happen first, you might want to reverse this?
+    # TODO: the order of the locals might improve things too
+    opts.append(ng.p.TransitionChoice([(i,s,"U") for s in get_divisors(k.full_shape[i], max_div=32)]))
+    opts.append(ng.p.TransitionChoice([(i,s,"L") for s in get_divisors(k.full_shape[i])]))
+  for i in range(k.shape_len-k.first_reduce):
+    opts.append(ng.p.TransitionChoice([(i,s,"R") for s in get_divisors(k.full_shape[k.first_reduce+i], max_div=32)]))
+  return opts
+
 def kernel_optimize_search(k:Linearizer, create_k:Callable[[], Linearizer], to_prg, baseline):
   import nevergrad as ng
   def opt(x):
@@ -25,16 +39,7 @@ def kernel_optimize_search(k:Linearizer, create_k:Callable[[], Linearizer], to_p
       import traceback
       traceback.print_exc()
       return 10000_000 # 10000 seconds is infinity
-  opts = []
-  if k.first_reduce < k.shape_len: # TODO: Grouped reduces do not work with other locals. More chances to mutate to 1, so locals can be used.
-    opts.append(ng.p.TransitionChoice([(0,s,"G") for s in get_divisors(k.full_shape[k.first_reduce], min_div=16) if all(st.shape[k.first_reduce] % s == 0 or st.shape[k.first_reduce] == 1 for st in k.sts)], transitions=(0.8, 0.2)))
-  for i in range(k.first_reduce):
-    # TODO: the upcast always happen first, you might want to reverse this?
-    # TODO: the order of the locals might improve things too
-    opts.append(ng.p.TransitionChoice([(i,s,"U") for s in get_divisors(k.full_shape[i], max_div=32)]))
-    opts.append(ng.p.TransitionChoice([(i,s,"L") for s in get_divisors(k.full_shape[i])]))
-  for i in range(k.shape_len-k.first_reduce):
-    opts.append(ng.p.TransitionChoice([(i,s,"R") for s in get_divisors(k.full_shape[k.first_reduce+i], max_div=32)]))
+  opts = kernel_optimize_opts(k)
   if not opts: return "BASELINE"
   search_space = prod([len(x.choices) for x in opts])
   st = time.perf_counter()
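
Note for readers new to nevergrad: both kernel_optimize_search and the test's kopt_search_hook follow the same pattern, building one ng.p.TransitionChoice per tunable axis, wrapping them in an ng.p.Tuple parametrization, and letting NGOpt minimize a timing function with the budget capped at min(search_space, budget). The sketch below is not part of this diff; it assumes nevergrad is installed, and the choice values and the toy objective are made up for illustration.

# Standalone sketch (not part of this diff): the nevergrad pattern used above.
# The choice values and the toy objective are illustrative only.
import nevergrad as ng

# One TransitionChoice per tunable axis, mirroring the (axis, amount, kind)
# tuples that kernel_optimize_opts derives from the kernel's shape.
opts = [
  ng.p.TransitionChoice([(0, s, "U") for s in (1, 2, 4, 8)]),
  ng.p.TransitionChoice([(0, s, "L") for s in (1, 2, 4, 8, 16)]),
]

def check_opt(x):
  # Stand-in for "apply the chosen opts to a fresh kernel, run it, return the runtime".
  # The real check_opt also compares the output against the unoptimized kernel and
  # returns a huge value on failure so broken configurations are never recommended.
  return sum(amount for _, amount, _ in x) * 1e-3

optimizer = ng.optimizers.NGOpt(parametrization=ng.p.Tuple(*opts), budget=10)
recommendation = optimizer.minimize(check_opt)
print(recommendation.value)  # best tuple of per-axis choices found within the budget

The min(search_space, budget) cap in both functions simply avoids spending more evaluations than there are distinct configurations in the search space.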