diff --git a/test/external/speed_compare_cuda_nv.py b/test/external/speed_compare_cuda_nv.py
index 48b7578864..fc8eb167ab 100644
--- a/test/external/speed_compare_cuda_nv.py
+++ b/test/external/speed_compare_cuda_nv.py
@@ -25,12 +25,12 @@ if __name__ == "__main__":
     # cuda compile
     culin = ast_str_to_lin(ast, opts=cudev.compiler.compiler_opts)
     culin.hand_coded_optimizations()
-    cuda_prg = cudev.to_program(culin)
+    cuda_prg = cudev.to_runner(culin)
     cubufs = bufs_from_lin(culin)
 
     nvlin = ast_str_to_lin(ast, opts=nvdev.compiler.compiler_opts)
     nvlin.hand_coded_optimizations()
-    nv_prg = nvdev.to_program(nvlin)
+    nv_prg = nvdev.to_runner(nvlin)
     nvbufs = bufs_from_lin(nvlin)
 
     # warmup
diff --git a/test/test_fusion_op.py b/test/test_fusion_op.py
index d1b09c1e1c..faab57cf8c 100644
--- a/test/test_fusion_op.py
+++ b/test/test_fusion_op.py
@@ -1,9 +1,9 @@
 import unittest
 import time
 import numpy as np
-from tinygrad import Tensor, dtypes, Device
+from tinygrad import Tensor, dtypes
 from tinygrad.engine.schedule import create_schedule
-from tinygrad.engine.realize import run_schedule
+from tinygrad.engine.realize import lower_schedule_item, run_schedule
 
 class TestFusionOp(unittest.TestCase):
   def test_contiguous_add(self):
@@ -27,9 +27,9 @@ class TestFusionOp(unittest.TestCase):
     a = Tensor([1,2,3,4])
     for _ in range(24): a = a + a
     sched = create_schedule([a.lazydata], None)
-    ji = Device[Device.DEFAULT].get_runner(*sched[-1].ast)
+    ei = lower_schedule_item(sched[-1])
     self.assertLess(time.perf_counter()-st, 1.0)
-    assert len(ji.p.src.splitlines()) < 250
+    assert len(ei.prg.p.src.splitlines()) < 250
 
   def test_recursive_add_cmp(self):
     st = time.perf_counter()
diff --git a/test/test_uops_stats.py b/test/test_uops_stats.py
index 2dcc21df22..29b968e90e 100644
--- a/test/test_uops_stats.py
+++ b/test/test_uops_stats.py
@@ -1,6 +1,7 @@
 import unittest
-from tinygrad import Tensor, Device
+from tinygrad import Tensor
 from tinygrad.engine.schedule import create_schedule
+from tinygrad.engine.realize import lower_schedule_item
 
 # TODO: can copy this in here when we remove it
 #from tinygrad.ops import get_lazyop_info
@@ -12,8 +13,8 @@ from tinygrad.engine.schedule import create_schedule
 
 def get_stats(x:Tensor):
   si = create_schedule([x.lazydata])[-1]
-  runner = Device[Device.DEFAULT].get_runner(*si.ast)
-  return runner.op_estimate, runner.mem_estimate
+  ei = lower_schedule_item(si)
+  return ei.prg.p.op_estimate, ei.prg.p.mem_estimate
 
 class TestUOpsStats(unittest.TestCase):
   def test_simple_add(self):
diff --git a/tinygrad/device.py b/tinygrad/device.py
index d59765e61e..25e9f3b10a 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -181,43 +181,6 @@ class Runner:
 
 # **************** for Compiled Devices ****************
 
-def fake_renderer(name, uops): raise NotImplementedError("needs a renderer")
-
-@dataclass(frozen=True)
-class CompilerOptions:
-  device: str = ""
-  suffix: str = ""
-  # TODO: make this generic with a list of supported types
-  supports_float4: bool = True
-  has_local: bool = True
-  has_shared: bool = True
-  has_tensor_cores: bool = False
-  # NOTE: these two should be in z,y,x(reversed) order for cstyle backends, they are flipped when kernel is rendered
-  global_max: Optional[List[int]] = None
-  local_max: Optional[List[int]] = None
-  shared_max: int = 32768
-  renderer: Callable = fake_renderer
-
-class Compiler:
-  compiler_opts: ClassVar[CompilerOptions]
-  def __init__(self, cachekey:Optional[str]=None): self.cachekey = None if getenv("DISABLE_COMPILER_CACHE") else cachekey
-  def compile(self, src:str) -> bytes: raise NotImplementedError("need a compile function")
-  def compile_cached(self, src:str) -> bytes:
-    if self.cachekey is None or (lib := diskcache_get(self.cachekey, src)) is None:
-      lib = self.compile(src)
-      if self.cachekey is not None: diskcache_put(self.cachekey, src, lib)
-    return lib
-
-  def to_program(self, k:Linearizer, override_device:Optional[str]=None) -> Program:
-    k.linearize()
-    info = get_lazyop_info(k.ast[0])
-    ops, mem = k.uops.flops_mem()
-    run_count = prod((k.global_size if k.global_size else []) + (k.local_size if k.local_size else []))
-    # NOTE: we use min here to ignore the indexing FLOPS
-    return Program(k.name, self.compiler_opts.renderer(to_function_name(k.name), k.uops),
-                   override_device if override_device else self.compiler_opts.device,
-                   k.global_size, k.local_size, k.uops, min(info.flops, ops * run_count), min(info.mem_estimate, mem * run_count))
-
 @dataclass(frozen=True)
 class Program:
   name:str
@@ -246,6 +209,43 @@ class Program:
     local_size = [sym_infer(sz, var_vals) for sz in self.local_size] if self.local_size is not None else None
     return global_size, local_size
 
+def fake_renderer(name, uops): raise NotImplementedError("needs a renderer")
+
+@dataclass(frozen=True)
+class CompilerOptions:
+  device: str = ""
+  suffix: str = ""
+  # TODO: make this generic with a list of supported types
+  supports_float4: bool = True
+  has_local: bool = True
+  has_shared: bool = True
+  has_tensor_cores: bool = False
+  # NOTE: these two should be in z,y,x(reversed) order for cstyle backends, they are flipped when kernel is rendered
+  global_max: Optional[List[int]] = None
+  local_max: Optional[List[int]] = None
+  shared_max: int = 32768
+  renderer: Callable = fake_renderer
+
+  def to_program(self, k:Linearizer, override_device:Optional[str]=None) -> Program:
+    k.linearize()
+    info = get_lazyop_info(k.ast[0])
+    ops, mem = k.uops.flops_mem()
+    run_count = prod((k.global_size if k.global_size else []) + (k.local_size if k.local_size else []))
+    # NOTE: we use min here to ignore the indexing FLOPS
+    return Program(k.name, self.renderer(to_function_name(k.name), k.uops),
+                   override_device if override_device else self.device,
+                   k.global_size, k.local_size, k.uops, min(info.flops, ops * run_count), min(info.mem_estimate, mem * run_count))
+
+class Compiler:
+  compiler_opts: ClassVar[CompilerOptions]
+  def __init__(self, cachekey:Optional[str]=None): self.cachekey = None if getenv("DISABLE_COMPILER_CACHE") else cachekey
+  def compile(self, src:str) -> bytes: raise NotImplementedError("need a compile function")
+  def compile_cached(self, src:str) -> bytes:
+    if self.cachekey is None or (lib := diskcache_get(self.cachekey, src)) is None:
+      lib = self.compile(src)
+      if self.cachekey is not None: diskcache_put(self.cachekey, src, lib)
+    return lib
+
 class CompiledRunner(Runner):
   def __init__(self, p:Program, precompiled:Optional[bytes]=None):
     if DEBUG >= 4: print(p.src)
@@ -280,7 +280,7 @@ class Compiled:
     self.dname, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler if compiler else Compiler(), runtime, graph
   def synchronize(self): pass  # override this in your device
 
-  def to_runner(self, k:Linearizer) -> CompiledRunner: return CompiledRunner(self.compiler.to_program(k, override_device=self.dname))
+  def to_runner(self, k:Linearizer) -> CompiledRunner: return CompiledRunner(self.compiler.compiler_opts.to_program(k, override_device=self.dname))
 
   def get_linearizer(self, *ast:LazyOp) -> Linearizer:
     if DEBUG >= 3:
diff --git a/tinygrad/features/search.py b/tinygrad/features/search.py
index 628004dced..a01fab9472 100644
--- a/tinygrad/features/search.py
+++ b/tinygrad/features/search.py
@@ -51,7 +51,7 @@ def _try_compile_linearized_w_idx(x:Tuple[int,Linearizer], compiler:Compiler) ->
   try:
     x[1].linearize()
     if len(x[1].uops.uops) >= getenv("BEAM_UOPS_MAX", 3000) > 0: raise RuntimeError("too many uops")
-    p = compiler.to_program(x[1])
+    p = compiler.compiler_opts.to_program(x[1])
     st = time.perf_counter()
     prog = compiler.compile(p.src)
     et = time.perf_counter() - st
@@ -174,7 +174,7 @@ def time_linearizer(lin:Linearizer, rawbufs:List[Buffer], allow_test_size=True,
 
   rawbufs = _ensure_buffer_alloc(rawbufs)
   var_vals = {k:(k.max+k.min)//2 for k in lin.ast[0].vars()}
-  p = dev.compiler.to_program(lin)
+  p = dev.compiler.compiler_opts.to_program(lin)
   tms = _time_program(p, dev.compiler.compile(p.src), var_vals, rawbufs, max_global_size=max_global_size if allow_test_size else None,
                       clear_l2=clear_l2, cnt=cnt, name=to_function_name(lin.name))
 
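
A minimal sketch of the call path the updated tests exercise after this change, assuming only the post-change API shown in the hunks above (the tensor values and the printout are illustrative, not part of the diff): a ScheduleItem is lowered with lower_schedule_item, and the resulting ExecItem exposes the rendered Program via ei.prg.p, which is where src, op_estimate, and mem_estimate are now read from.

# sketch: uses the API as it appears in the test hunks above; inputs are illustrative
from tinygrad import Tensor
from tinygrad.engine.schedule import create_schedule
from tinygrad.engine.realize import lower_schedule_item

a = Tensor([1, 2, 3, 4]) + Tensor([5, 6, 7, 8])     # any small elementwise kernel
si = create_schedule([a.lazydata])[-1]              # last ScheduleItem is the kernel of interest
ei = lower_schedule_item(si)                        # ExecItem wrapping a CompiledRunner
print(ei.prg.p.src)                                 # rendered source of the Program
print(ei.prg.p.op_estimate, ei.prg.p.mem_estimate)  # stats read the same way as get_stats above

The lower-level paths in device.py and features/search.py reach the same Program directly via compiler.compiler_opts.to_program(lin), since to_program moves from Compiler onto CompilerOptions in this diff.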