update tests get_runner (#4522)

George Hotz, 2024-05-10 20:09:22 -07:00 (committed by GitHub)
parent a0448ff595, commit 827058f030
5 changed files with 50 additions and 49 deletions

View File

@@ -25,12 +25,12 @@ if __name__ == "__main__":
   # cuda compile
   culin = ast_str_to_lin(ast, opts=cudev.compiler.compiler_opts)
   culin.hand_coded_optimizations()
-  cuda_prg = cudev.to_program(culin)
+  cuda_prg = cudev.to_runner(culin)
   cubufs = bufs_from_lin(culin)
   nvlin = ast_str_to_lin(ast, opts=nvdev.compiler.compiler_opts)
   nvlin.hand_coded_optimizations()
-  nv_prg = nvdev.to_program(nvlin)
+  nv_prg = nvdev.to_runner(nvlin)
   nvbufs = bufs_from_lin(nvlin)
   # warmup

View File

@@ -1,9 +1,9 @@
 import unittest
 import time
 import numpy as np
-from tinygrad import Tensor, dtypes, Device
+from tinygrad import Tensor, dtypes
 from tinygrad.engine.schedule import create_schedule
-from tinygrad.engine.realize import run_schedule
+from tinygrad.engine.realize import lower_schedule_item, run_schedule

 class TestFusionOp(unittest.TestCase):
   def test_contiguous_add(self):
@@ -27,9 +27,9 @@ class TestFusionOp(unittest.TestCase):
     a = Tensor([1,2,3,4])
     for _ in range(24): a = a + a
     sched = create_schedule([a.lazydata], None)
-    ji = Device[Device.DEFAULT].get_runner(*sched[-1].ast)
+    ei = lower_schedule_item(sched[-1])
     self.assertLess(time.perf_counter()-st, 1.0)
-    assert len(ji.p.src.splitlines()) < 250
+    assert len(ei.prg.p.src.splitlines()) < 250

   def test_recursive_add_cmp(self):
     st = time.perf_counter()
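
For readers following the API change: the test no longer asks the device for a runner via get_runner; it lowers the schedule item and reads the generated Program off the lowered item. A minimal standalone sketch of that pattern, using only names visible in this diff (the tensor values are illustrative):

  from tinygrad import Tensor
  from tinygrad.engine.schedule import create_schedule
  from tinygrad.engine.realize import lower_schedule_item

  a = Tensor([1,2,3,4])
  for _ in range(24): a = a + a                 # build a deep, fuseable expression
  sched = create_schedule([a.lazydata], None)   # schedule without realizing
  ei = lower_schedule_item(sched[-1])           # lower the last schedule item
  print(len(ei.prg.p.src.splitlines()))         # generated kernel source is at ei.prg.p.src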

View File

@@ -1,6 +1,7 @@
 import unittest
-from tinygrad import Tensor, Device
+from tinygrad import Tensor
 from tinygrad.engine.schedule import create_schedule
+from tinygrad.engine.realize import lower_schedule_item

 # TODO: can copy this in here when we remove it
 #from tinygrad.ops import get_lazyop_info
@@ -12,8 +13,8 @@ from tinygrad.engine.schedule import create_schedule
 def get_stats(x:Tensor):
   si = create_schedule([x.lazydata])[-1]
-  runner = Device[Device.DEFAULT].get_runner(*si.ast)
-  return runner.op_estimate, runner.mem_estimate
+  ei = lower_schedule_item(si)
+  return ei.prg.p.op_estimate, ei.prg.p.mem_estimate

 class TestUOpsStats(unittest.TestCase):
   def test_simple_add(self):
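
The same lowering path now feeds the stats helper: op and memory estimates come from the Program attached to the lowered item rather than from a runner returned by get_runner. A hedged usage sketch (the input tensors are illustrative; test_simple_add's actual inputs are not shown in this hunk):

  a, b = Tensor.empty(4), Tensor.empty(4)
  ops, mem = get_stats(a + b)   # reads ei.prg.p.op_estimate and ei.prg.p.mem_estimate under the hood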

View File

@@ -181,43 +181,6 @@ class Runner:
 # **************** for Compiled Devices ****************

-def fake_renderer(name, uops): raise NotImplementedError("needs a renderer")
-
-@dataclass(frozen=True)
-class CompilerOptions:
-  device: str = ""
-  suffix: str = ""
-  # TODO: make this generic with a list of supported types
-  supports_float4: bool = True
-  has_local: bool = True
-  has_shared: bool = True
-  has_tensor_cores: bool = False
-  # NOTE: these two should be in z,y,x(reversed) order for cstyle backends, they are flipped when kernel is rendered
-  global_max: Optional[List[int]] = None
-  local_max: Optional[List[int]] = None
-  shared_max: int = 32768
-  renderer: Callable = fake_renderer
-
-class Compiler:
-  compiler_opts: ClassVar[CompilerOptions]
-  def __init__(self, cachekey:Optional[str]=None): self.cachekey = None if getenv("DISABLE_COMPILER_CACHE") else cachekey
-  def compile(self, src:str) -> bytes: raise NotImplementedError("need a compile function")
-  def compile_cached(self, src:str) -> bytes:
-    if self.cachekey is None or (lib := diskcache_get(self.cachekey, src)) is None:
-      lib = self.compile(src)
-      if self.cachekey is not None: diskcache_put(self.cachekey, src, lib)
-    return lib
-
-  def to_program(self, k:Linearizer, override_device:Optional[str]=None) -> Program:
-    k.linearize()
-    info = get_lazyop_info(k.ast[0])
-    ops, mem = k.uops.flops_mem()
-    run_count = prod((k.global_size if k.global_size else []) + (k.local_size if k.local_size else []))
-    # NOTE: we use min here to ignore the indexing FLOPS
-    return Program(k.name, self.compiler_opts.renderer(to_function_name(k.name), k.uops),
-                   override_device if override_device else self.compiler_opts.device,
-                   k.global_size, k.local_size, k.uops, min(info.flops, ops * run_count), min(info.mem_estimate, mem * run_count))
-
 @dataclass(frozen=True)
 class Program:
   name:str
@@ -246,6 +209,43 @@ class Program:
     local_size = [sym_infer(sz, var_vals) for sz in self.local_size] if self.local_size is not None else None
     return global_size, local_size

+def fake_renderer(name, uops): raise NotImplementedError("needs a renderer")
+
+@dataclass(frozen=True)
+class CompilerOptions:
+  device: str = ""
+  suffix: str = ""
+  # TODO: make this generic with a list of supported types
+  supports_float4: bool = True
+  has_local: bool = True
+  has_shared: bool = True
+  has_tensor_cores: bool = False
+  # NOTE: these two should be in z,y,x(reversed) order for cstyle backends, they are flipped when kernel is rendered
+  global_max: Optional[List[int]] = None
+  local_max: Optional[List[int]] = None
+  shared_max: int = 32768
+  renderer: Callable = fake_renderer
+
+  def to_program(self, k:Linearizer, override_device:Optional[str]=None) -> Program:
+    k.linearize()
+    info = get_lazyop_info(k.ast[0])
+    ops, mem = k.uops.flops_mem()
+    run_count = prod((k.global_size if k.global_size else []) + (k.local_size if k.local_size else []))
+    # NOTE: we use min here to ignore the indexing FLOPS
+    return Program(k.name, self.renderer(to_function_name(k.name), k.uops),
+                   override_device if override_device else self.device,
+                   k.global_size, k.local_size, k.uops, min(info.flops, ops * run_count), min(info.mem_estimate, mem * run_count))
+
+class Compiler:
+  compiler_opts: ClassVar[CompilerOptions]
+  def __init__(self, cachekey:Optional[str]=None): self.cachekey = None if getenv("DISABLE_COMPILER_CACHE") else cachekey
+  def compile(self, src:str) -> bytes: raise NotImplementedError("need a compile function")
+  def compile_cached(self, src:str) -> bytes:
+    if self.cachekey is None or (lib := diskcache_get(self.cachekey, src)) is None:
+      lib = self.compile(src)
+      if self.cachekey is not None: diskcache_put(self.cachekey, src, lib)
+    return lib
+
 class CompiledRunner(Runner):
   def __init__(self, p:Program, precompiled:Optional[bytes]=None):
     if DEBUG >= 4: print(p.src)
@@ -280,7 +280,7 @@ class Compiled:
     self.dname, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler if compiler else Compiler(), runtime, graph
   def synchronize(self): pass # override this in your device
-  def to_runner(self, k:Linearizer) -> CompiledRunner: return CompiledRunner(self.compiler.to_program(k, override_device=self.dname))
+  def to_runner(self, k:Linearizer) -> CompiledRunner: return CompiledRunner(self.compiler.compiler_opts.to_program(k, override_device=self.dname))
   def get_linearizer(self, *ast:LazyOp) -> Linearizer:
     if DEBUG >= 3:
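
Taken together, the device.py move means CompilerOptions now owns Program construction while Compiler is reduced to compiling source. A hedged sketch of the resulting flow on a Compiled device, assembled from names in this diff (the device string and the schedule item `si` are illustrative placeholders):

  from tinygrad import Device

  dev = Device["CUDA"]                             # any Compiled device
  lin = dev.get_linearizer(*si.ast)                # Linearizer for a schedule item's ast
  lin.hand_coded_optimizations()
  p = dev.compiler.compiler_opts.to_program(lin)   # Program: source, launch dims, flop/mem estimates
  runner = dev.to_runner(lin)                      # CompiledRunner wrapping the same to_program call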

View File

@@ -51,7 +51,7 @@ def _try_compile_linearized_w_idx(x:Tuple[int,Linearizer], compiler:Compiler) ->
   try:
     x[1].linearize()
     if len(x[1].uops.uops) >= getenv("BEAM_UOPS_MAX", 3000) > 0: raise RuntimeError("too many uops")
-    p = compiler.to_program(x[1])
+    p = compiler.compiler_opts.to_program(x[1])
     st = time.perf_counter()
     prog = compiler.compile(p.src)
     et = time.perf_counter() - st
@@ -174,7 +174,7 @@ def time_linearizer(lin:Linearizer, rawbufs:List[Buffer], allow_test_size=True,
   rawbufs = _ensure_buffer_alloc(rawbufs)
   var_vals = {k:(k.max+k.min)//2 for k in lin.ast[0].vars()}
-  p = dev.compiler.to_program(lin)
+  p = dev.compiler.compiler_opts.to_program(lin)
   tms = _time_program(p, dev.compiler.compile(p.src), var_vals, rawbufs,
                       max_global_size=max_global_size if allow_test_size else None, clear_l2=clear_l2, cnt=cnt, name=to_function_name(lin.name))
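
In the search path, both the BEAM compile worker and time_linearizer now reach to_program through compiler_opts as well. A hedged sketch of timing a hand-optimized kernel with the updated helper (buffer setup borrowed from the first file in this commit; `dev` and `si` are placeholders as above, and treating the return value as a timing is an assumption):

  lin = dev.get_linearizer(*si.ast)
  lin.hand_coded_optimizations()
  rawbufs = bufs_from_lin(lin)        # allocate buffers matching the kernel's arguments
  tm = time_linearizer(lin, rawbufs, allow_test_size=True, cnt=10)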