diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 67ef980192..850f56a604 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -160,7 +160,7 @@ jobs:
         PYTHONPATH="." python test/external/dist/test_collectives.py
     - if: ${{ matrix.task == 'kopt' }}
       name: Test KOPT
-      run: PYTHONPATH="." KOPT=1 BUDGET=10 GPU=1 DEBUG=1 python -m pytest -rA -n=auto test/models/test_real_world.py
+      run: PYTHONPATH="." KOPT=1 BUDGET=20 GPU=1 DEBUG=1 python -m pytest -rA -n=auto test/models/test_real_world.py
 
   testmetalwebgpu:
     name: Metal and WebGPU Tests
diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py
index 9bedd0fbab..1625b8c000 100644
--- a/test/models/test_real_world.py
+++ b/test/models/test_real_world.py
@@ -1,17 +1,41 @@
 import unittest, time
+import numpy as np
 from tinygrad.tensor import Tensor
 from tinygrad.nn import optim
 from tinygrad.nn.state import get_parameters
 from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE
 from tinygrad.ops import GlobalCounters, LazyOp, LoadOps
 from tinygrad.ops import Device
-from tinygrad.helpers import CI, dtypes, getenv
+from tinygrad.helpers import CI, dtypes, getenv, prod
+from tinygrad.codegen.search import kernel_optimize_opts
 from examples.gpt2 import Transformer as GPT2Transformer, MODEL_PARAMS as GPT2_MODEL_PARAMS
 from examples.hlb_cifar10 import SpeedyResNet
 from examples.llama import Transformer as LLaMaTransformer, MODEL_PARAMS as LLAMA_MODEL_PARAMS
 from examples.stable_diffusion import UNetModel
 
+def kopt_search_hook(k, create_k, to_prg, baseline):
+  import nevergrad as ng
+  wanna_output = k.bufs[0].toCPU()
+  def check_opt(x):
+    try:
+      k = create_k()
+      k.process()
+      k.apply_auto_opt(x)
+      prg = to_prg(k)
+      first_tm = prg.exec(k.bufs, force_wait=True, optimizing=True)
+      np.testing.assert_allclose(wanna_output, k.bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
+      return first_tm
+    except Exception:
+      return 10000_000 # 10000 seconds is infinity
+  opts = kernel_optimize_opts(k)
+  if not opts: return "BASELINE"
+  search_space = prod([len(x.choices) for x in opts])
+  budget = getenv("BUDGET", 20) # THIS IS TEST BUDGET
+  optimizer = ng.optimizers.NGOpt(parametrization=ng.p.Tuple(*opts), budget=min(search_space, budget))
+  recommendation = optimizer.minimize(check_opt)
+  return recommendation.value if recommendation.loss < baseline else "BASELINE"
+
 def helper_test(nm, gen, train, max_memory_allowed, max_kernels_allowed):
   tms = []
   for _ in range(4):
@@ -43,6 +67,16 @@ def derandomize_model(model):
     p.realize()
 
 class TestRealWorld(unittest.TestCase):
+  def setUp(self):
+    np.random.seed(2002)
+    if getenv("KOPT"):
+      self.oldfunc = getattr(__import__("tinygrad.codegen.search", fromlist=["kernel_optimize_search"]), "kernel_optimize_search")
+      setattr(__import__("tinygrad.codegen.search", fromlist=["kernel_optimize_search"]), "kernel_optimize_search", kopt_search_hook)
+
+  def tearDown(self):
+    if getenv("KOPT"):
+      setattr(__import__("tinygrad.codegen.search", fromlist=["kernel_optimize_search"]), "kernel_optimize_search", self.oldfunc)
+
   @unittest.skipUnless(not CI, "too big for CI")
   def test_stable_diffusion(self):
     model = UNetModel()
diff --git a/tinygrad/codegen/search.py b/tinygrad/codegen/search.py
index 4bd21b0c7c..1311a0bc85 100644
--- a/tinygrad/codegen/search.py
+++ b/tinygrad/codegen/search.py
@@ -8,6 +8,20 @@ def get_divisors(n, min_div = 1, max_div = 512):
   for d in range(min_div, min(max_div, n//2) + 1):
     if n % d == 0: yield d
 
+def kernel_optimize_opts(k:Linearizer):
+  import nevergrad as ng
+  opts = []
+  if k.first_reduce < k.shape_len: # TODO: Grouped reduces do not work with other locals. More chances to mutate to 1, so locals can be used.
+    opts.append(ng.p.TransitionChoice([(0,s,"G") for s in get_divisors(k.full_shape[k.first_reduce], min_div=16) if all(st.shape[k.first_reduce] % s == 0 or st.shape[k.first_reduce] == 1 for st in k.sts)], transitions=(0.8, 0.2)))
+  for i in range(k.first_reduce):
+    # TODO: the upcast always happen first, you might want to reverse this?
+    # TODO: the order of the locals might improve things too
+    opts.append(ng.p.TransitionChoice([(i,s,"U") for s in get_divisors(k.full_shape[i], max_div=32)]))
+    opts.append(ng.p.TransitionChoice([(i,s,"L") for s in get_divisors(k.full_shape[i])]))
+  for i in range(k.shape_len-k.first_reduce):
+    opts.append(ng.p.TransitionChoice([(i,s,"R") for s in get_divisors(k.full_shape[k.first_reduce+i], max_div=32)]))
+  return opts
+
 def kernel_optimize_search(k:Linearizer, create_k:Callable[[], Linearizer], to_prg, baseline):
   import nevergrad as ng
   def opt(x):
@@ -25,16 +39,7 @@ def kernel_optimize_search(k:Linearizer, create_k:Callable[[], Linearizer], to_p
       import traceback
       traceback.print_exc()
       return 10000_000 # 10000 seconds is infinity
-  opts = []
-  if k.first_reduce < k.shape_len: # TODO: Grouped reduces do not work with other locals. More chances to mutate to 1, so locals can be used.
-    opts.append(ng.p.TransitionChoice([(0,s,"G") for s in get_divisors(k.full_shape[k.first_reduce], min_div=16) if all(st.shape[k.first_reduce] % s == 0 or st.shape[k.first_reduce] == 1 for st in k.sts)], transitions=(0.8, 0.2)))
-  for i in range(k.first_reduce):
-    # TODO: the upcast always happen first, you might want to reverse this?
-    # TODO: the order of the locals might improve things too
-    opts.append(ng.p.TransitionChoice([(i,s,"U") for s in get_divisors(k.full_shape[i], max_div=32)]))
-    opts.append(ng.p.TransitionChoice([(i,s,"L") for s in get_divisors(k.full_shape[i])]))
-  for i in range(k.shape_len-k.first_reduce):
-    opts.append(ng.p.TransitionChoice([(i,s,"R") for s in get_divisors(k.full_shape[k.first_reduce+i], max_div=32)]))
+  opts = kernel_optimize_opts(k)
   if not opts: return "BASELINE"
   search_space = prod([len(x.choices) for x in opts])
   st = time.perf_counter()
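
Note for readers new to nevergrad: both kernel_optimize_search and the test's kopt_search_hook follow the same pattern, building one ng.p.TransitionChoice per tunable axis, wrapping them in an ng.p.Tuple parametrization, and letting NGOpt minimize a timing function with the budget capped at min(search_space, budget). The sketch below is not part of this diff; it assumes nevergrad is installed, and the choice values and the toy objective are made up for illustration.

# Standalone sketch (not part of this diff): the nevergrad pattern used above.
# The choice values and the toy objective are illustrative only.
import nevergrad as ng

# One TransitionChoice per tunable axis, mirroring the (axis, amount, kind)
# tuples that kernel_optimize_opts derives from the kernel's shape.
opts = [
  ng.p.TransitionChoice([(0, s, "U") for s in (1, 2, 4, 8)]),
  ng.p.TransitionChoice([(0, s, "L") for s in (1, 2, 4, 8, 16)]),
]

def check_opt(x):
  # Stand-in for "apply the chosen opts to a fresh kernel, run it, return the runtime".
  # The real check_opt also compares the output against the unoptimized kernel and
  # returns a huge value on failure so broken configurations are never recommended.
  return sum(amount for _, amount, _ in x) * 1e-3

optimizer = ng.optimizers.NGOpt(parametrization=ng.p.Tuple(*opts), budget=10)
recommendation = optimizer.minimize(check_opt)
print(recommendation.value)  # best tuple of per-axis choices found within the budget

The min(search_space, budget) cap in both functions simply avoids spending more evaluations than there are distinct configurations in the search space.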