From b47f6cebb232ebf31ba7e4c9472cf93d9ab743ea Mon Sep 17 00:00:00 2001
From: chenyu
Date: Thu, 28 Mar 2024 17:50:23 -0400
Subject: [PATCH] LinearizerOptions -> CompilerOptions (#3978)

---
 examples/handcode_resnet50_opt.py       |  6 ++--
 extra/backends/ops_hip.py               |  5 ++-
 extra/backends/ops_webgl.py             |  6 ++--
 extra/backends/ops_webgpu.py            |  5 ++-
 extra/gemm/tvm_gemm.py                  |  4 +--
 extra/optimization/search.py            |  2 +-
 test/external/speed_beam_v_hcopt.py     |  2 +-
 test/external/speed_compare_cuda_ptx.py |  4 +--
 test/test_linearizer.py                 | 48 ++++++++++++------------
 test/test_uops.py                       |  2 +-
 tinygrad/codegen/kernel.py              | 21 +++--------
 tinygrad/device.py                      | 24 +++++++++----
 tinygrad/runtime/ops_clang.py           |  5 ++-
 tinygrad/runtime/ops_cuda.py            | 11 +++---
 tinygrad/runtime/ops_gpu.py             |  5 ++-
 tinygrad/runtime/ops_hsa.py             |  5 ++-
 tinygrad/runtime/ops_llvm.py            |  5 ++-
 tinygrad/runtime/ops_metal.py           |  5 ++-
 tinygrad/runtime/ops_python.py          |  9 +++--
 19 files changed, 82 insertions(+), 92 deletions(-)

diff --git a/examples/handcode_resnet50_opt.py b/examples/handcode_resnet50_opt.py
index 212d3f02f1..bc3aac7b34 100644
--- a/examples/handcode_resnet50_opt.py
+++ b/examples/handcode_resnet50_opt.py
@@ -43,18 +43,18 @@ if __name__ == "__main__":
     lins:List[Linearizer] = []

     # always try hand coded opt
-    lin = Linearizer(*si.ast, opts=device.compiler.linearizer_opts)
+    lin = Linearizer(*si.ast, opts=device.compiler.compiler_opts)
     lin.hand_coded_optimizations()
     lins.append(lin)

     # maybe try tensor cores
-    lin = Linearizer(*si.ast, opts=device.compiler.linearizer_opts)
+    lin = Linearizer(*si.ast, opts=device.compiler.compiler_opts)
     if lin.apply_tensor_cores():
       lins.append(lin)

     # try a beam search
     if beam:=getenv("BEAM"):
-      lin = Linearizer(*si.ast, opts=device.compiler.linearizer_opts)
+      lin = Linearizer(*si.ast, opts=device.compiler.compiler_opts)
       lin = beam_search(lin, rawbufs, beam, bool(getenv("BEAM_ESTIMATE", 1)))
       lins.append(lin)
diff --git a/extra/backends/ops_hip.py b/extra/backends/ops_hip.py
index 774ed2a829..4c6a834211 100644
--- a/extra/backends/ops_hip.py
+++ b/extra/backends/ops_hip.py
@@ -4,13 +4,12 @@ from typing import Tuple, TypeVar, List, Any, cast, Set
 import tinygrad.runtime.autogen.hip as hip
 from tinygrad.helpers import DEBUG, getenv, init_c_var
 from tinygrad.helpers import from_mv, round_up, to_mv, colored, init_c_struct_t
-from tinygrad.device import Compiled, LRUAllocator, BufferOptions, JITRunner, Device, Buffer, MallocAllocator, update_stats, Compiler
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions, JITRunner, Device, Buffer, MallocAllocator, update_stats, Compiler, CompilerOptions
 from tinygrad.renderer.cstyle import HIPRenderer
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.runtime.driver.hip_comgr import compile_hip

 class HIPCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("HIP", has_tensor_cores=True, shared_max=65536)
+  compiler_opts = CompilerOptions("HIP", has_tensor_cores=True, shared_max=65536)
   def __init__(self, arch:str):
     self.arch = arch
     super().__init__(f"compile_hip_{self.arch}")
diff --git a/extra/backends/ops_webgl.py b/extra/backends/ops_webgl.py
index eef43d108b..93d42971a5 100644
--- a/extra/backends/ops_webgl.py
+++ b/extra/backends/ops_webgl.py
@@ -1,8 +1,8 @@
 import numpy as np
 import functools
 from tinygrad.dtype import dtypes, ImageDType
-from tinygrad.device import Compiled, Allocator
-from tinygrad.codegen.kernel import LinearizerOptions, OptOps
+from tinygrad.device import Compiled, Allocator, CompilerOptions
+from tinygrad.codegen.kernel import OptOps
 from tinygrad.renderer.cstyle import uops_to_cstyle
 from tinygrad.renderer.cstyle import GLSLLanguage
 import moderngl
@@ -47,6 +47,6 @@ class RawWebGLAllocator(Allocator):
 class WebGlDevice(Compiled):
   def __init__(self, device:str):
     super().__init__(RawWebGLAllocator(),
-      LinearizerOptions(device="WEBGL", global_max=[4096*4096,1,1], unsupported_opts=[OptOps.UPCAST, OptOps.UPCASTMID],
+      CompilerOptions(device="WEBGL", global_max=[4096*4096,1,1], unsupported_opts=[OptOps.UPCAST, OptOps.UPCASTMID],
       supports_float4=False, supports_float4_alu=False, has_local=False, has_shared=False, dont_use_locals=True),
       functools.partial(uops_to_cstyle, GLSLLanguage()), lambda x: x, WebGLProgram)
diff --git a/extra/backends/ops_webgpu.py b/extra/backends/ops_webgpu.py
index 70543f97bd..c88c8307dc 100644
--- a/extra/backends/ops_webgpu.py
+++ b/extra/backends/ops_webgpu.py
@@ -1,6 +1,5 @@
 from wgpu.utils.device import get_default_device
-from tinygrad.device import Compiled, Allocator
-from tinygrad.codegen.kernel import LinearizerOptions
+from tinygrad.device import Compiled, Allocator, CompilerOptions
 from tinygrad.renderer.cstyle import WGSLRenderer
 import wgpu
@@ -37,5 +36,5 @@ class WebGpuAllocator(Allocator):

 class WebGpuDevice(Compiled):
   def __init__(self, device:str):
-    super().__init__(WebGpuAllocator(), LinearizerOptions(device="WEBGPU", supports_float4=False, local_max=[256, 256, 64],
+    super().__init__(WebGpuAllocator(), CompilerOptions(device="WEBGPU", supports_float4=False, local_max=[256, 256, 64],
       global_max=[65535, 65535, 65535]), WGSLRenderer, lambda x: x, WebGPUProgram)
diff --git a/extra/gemm/tvm_gemm.py b/extra/gemm/tvm_gemm.py
index 5961354088..3b04c1dc58 100644
--- a/extra/gemm/tvm_gemm.py
+++ b/extra/gemm/tvm_gemm.py
@@ -39,8 +39,8 @@ C = (A.reshape(M, 1, K) * B.permute(1,0).reshape(1, N, K)).sum(axis=2)
 sched = create_schedule([C.lazydata])

 from tinygrad.codegen.linearizer import Linearizer
-from tinygrad.codegen.kernel import LinearizerOptions
-lin = Linearizer(sched[-1].ast, LinearizerOptions(has_local=False, supports_float4=False))
+from tinygrad.device import CompilerOptions
+lin = Linearizer(sched[-1].ast, CompilerOptions(has_local=False, supports_float4=False))
 #lin.hand_coded_optimizations()
 lin.linearize()

 from tinygrad.runtime.ops_clang import renderer
diff --git a/extra/optimization/search.py b/extra/optimization/search.py
index a6486c73f1..37f54f9883 100644
--- a/extra/optimization/search.py
+++ b/extra/optimization/search.py
@@ -25,7 +25,7 @@ if __name__ == '__main__':
   for ast_str in ast_strs:
     print(f"optimizing ast={ast_str}")
-    lin = ast_str_to_lin(ast_str, opts=device.compiler.linearizer_opts)
+    lin = ast_str_to_lin(ast_str, opts=device.compiler.compiler_opts)
     rawbufs = bufs_from_lin(lin)
     lin = beam_search(lin, rawbufs, getenv("BEAM", 8), bool(getenv("BEAM_ESTIMATE", 1)))
diff --git a/test/external/speed_beam_v_hcopt.py b/test/external/speed_beam_v_hcopt.py
index b34d5ed628..4676fbd1cf 100644
--- a/test/external/speed_beam_v_hcopt.py
+++ b/test/external/speed_beam_v_hcopt.py
@@ -15,7 +15,7 @@ if __name__ == "__main__":
   beam_won, tested = 0, 0
   for num, ast in enumerate(ast_strs[:test_n]):
-    def new_lin(): return ast_str_to_lin(ast, opts=dev.compiler.linearizer_opts)
+    def new_lin(): return ast_str_to_lin(ast, opts=dev.compiler.compiler_opts)
     k = new_lin()
     # k.required_optimizations()
diff --git a/test/external/speed_compare_cuda_ptx.py b/test/external/speed_compare_cuda_ptx.py
index 137072def1..a977f5b2a0 100644
--- a/test/external/speed_compare_cuda_ptx.py
+++ b/test/external/speed_compare_cuda_ptx.py
@@ -26,14 +26,14 @@ if __name__ == "__main__":
   average_tm_cuda, average_tm_ptx = 0, 0
   for num,ast in enumerate(ast_strs):
     # cuda compile
-    lin = ast_str_to_lin(ast, opts=dev.compiler.linearizer_opts)
+    lin = ast_str_to_lin(ast, opts=dev.compiler.compiler_opts)
     lin.hand_coded_optimizations()
     cuda_prg = dev.to_program(lin)
     bufs = bufs_from_lin(lin)

     # ptx compile
-    lin = ast_str_to_lin(ast, opts=ptx.linearizer_opts)
+    lin = ast_str_to_lin(ast, opts=ptx.compiler_opts)
     lin.hand_coded_optimizations()
     lin.linearize()
     ptx_src = ptx.render(to_function_name(lin.name), lin.uops)
diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index 7e31bdb94d..cd9dc70019 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -97,7 +97,7 @@ class TestLinearizer(unittest.TestCase):
     assert num_ops <= 1, "more alu uops than needed"

   def test_reduce_upcast(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.supports_float4:
       self.skipTest("device does not support upcast")
     x, w = Tensor.randn((1,1,3)).realize(), Tensor.randn((1,1,2)).realize()
     r = Tensor.conv2d(x,w,padding=1).relu()
@@ -113,7 +113,7 @@ class TestLinearizer(unittest.TestCase):
     assert stores[0].vin[-1].dtype == accs[0].dtype == dtypes.float.vec(4)

   def test_upcast_with_locals(self):
-    if not (opts:=Device[Device.DEFAULT].compiler.linearizer_opts).has_local or not opts.has_shared or not opts.supports_float4:
+    if not (opts:=Device[Device.DEFAULT].compiler.compiler_opts).has_local or not opts.has_shared or not opts.supports_float4:
       self.skipTest("device does not support upcasted reduce with locals")
     x, y = Tensor.rand(1,128), Tensor.rand(128, 128)
@@ -183,9 +183,9 @@ class TestLinearizer(unittest.TestCase):
       helper_arg_acc_dtype(d.conv2d(w, acc_dtype=acc_dtype), expected_dtype)

   def test_tensor_cores(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_tensor_cores:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_tensor_cores:
       self.skipTest("device doesn't have tensor cores")
-    for tc in tensor_cores[Device[Device.DEFAULT].compiler.linearizer_opts.device]:
+    for tc in tensor_cores[Device[Device.DEFAULT].compiler.compiler_opts.device]:
       if getenv("EMULATE_CUDA") and (tc.dtype_in == dtypes.bfloat16 or tc.dtype_out == dtypes.bfloat16): continue
       a, b = Tensor.rand(tc.dims[1], tc.dims[2], dtype=tc.dtype_in), Tensor.rand(tc.dims[2], tc.dims[0], dtype=tc.dtype_in)
       np_a, np_b = a.numpy(), b.numpy()
@@ -281,7 +281,7 @@ def helper_realized_ast(r:Tensor):
   output_buffer = Buffer((out:=s[-1].outputs[0]).device, prod((s if isinstance(s, int) else s.max for s in out.shape)), out.dtype).allocate()
   return s[-1].ast[0], [output_buffer] + [l.realized for l in s[-1].inputs]

-@unittest.skipUnless(Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4, "need backends that support float4")
+@unittest.skipUnless(Device[Device.DEFAULT].compiler.compiler_opts.supports_float4, "need backends that support float4")
 class TestFloat4(unittest.TestCase):
   @staticmethod
   def count_float4(k):
@@ -490,7 +490,7 @@ class TestHandCodedOpts(unittest.TestCase):
     assert prod(k.full_shape[k.shape_len-k.upcasted:k.shape_len]) <= 49

   def test_matvec(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local:
       self.skipTest("Only devices with locals")
     N = 128
     a = Tensor.rand(1, N).realize()
@@ -539,9 +539,9 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False, atol=1e-4, rtol=1e-
   for i, x in enumerate(opts): # Check custom transformations if any.
     check_opt(x, lambda: Linearizer(realized_ast), Device[Device.DEFAULT].to_program, color_sizes[i] if i < len(color_sizes) else None)

-class TestLinearizerOpts(unittest.TestCase):
+class TestKernelOpts(unittest.TestCase):
   def test_local_and_grouped_reduce(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local or not Device[Device.DEFAULT].compiler.compiler_opts.has_shared:
       self.skipTest("Only Compiled uses linearizer with locals and shared")
     N = 128
@@ -587,7 +587,7 @@ class TestLinearizerOpts(unittest.TestCase):
     ])

   def test_matmul(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local or not Device[Device.DEFAULT].compiler.compiler_opts.has_shared:
       self.skipTest("Only Compiled uses linearizer with locals and shared")
     N = 128
@@ -617,7 +617,7 @@ class TestLinearizerOpts(unittest.TestCase):
     ])

   def test_double_reduce(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local or not Device[Device.DEFAULT].compiler.compiler_opts.has_shared:
       self.skipTest("Only Compiled uses linearizer with locals and shared")
     N = 128
@@ -644,7 +644,7 @@ class TestLinearizerOpts(unittest.TestCase):
     ])

   def test_invalid_tensor_core_extra_opts(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_tensor_cores:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_tensor_cores:
       self.skipTest("device doesn't have tensor cores")
     if Device.DEFAULT not in tensor_cores: self.skipTest("No tensor cores for device")
@@ -665,25 +665,25 @@ class TestLinearizerOpts(unittest.TestCase):
       assert k.apply_tensor_cores(use_tensor_cores=1, extra_opts=x), "no valid tensor core" # for METAL in runners

   def test_buf_index_not_found_tensor_core(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_tensor_cores:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_tensor_cores:
       self.skipTest("device doesn't have tensor cores")
     if Device.DEFAULT not in tensor_cores: self.skipTest("No tensor cores for device")
     ast = LazyOp(op=BufferOps.STORE, src=(LazyOp(op=ReduceOps.SUM, src=(LazyOp(op=BinaryOps.MUL, src=(LazyOp(op=UnaryOps.CAST, src=(LazyOp(op=BinaryOps.CMPEQ, src=(LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(1243, 256), strides=(0, 1), offset=0, mask=None, contiguous=False),)))), LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=2, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(1243, 256), strides=(1, 0), offset=0, mask=None, contiguous=False),))))), arg=None),), arg=(dtypes.float, False)), LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=3, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(1243, 256), strides=(1, 0), offset=0, mask=None, contiguous=False),))))), arg=None),), arg=(0,)),), arg=MemBuffer(idx=0, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(1, 256), strides=(0, 1), offset=0, mask=None, contiguous=True),)))) # noqa: E501
-    k = Linearizer(ast, opts=Device[Device.DEFAULT].compiler.linearizer_opts)
+    k = Linearizer(ast, opts=Device[Device.DEFAULT].compiler.compiler_opts)
     with self.assertRaises(KernelOptError):
       k.apply_opt(Opt(OptOps.TC, 0, 1))

   def test_tensor_core_opts(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_tensor_cores:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_tensor_cores:
       self.skipTest("device doesn't have tensor cores")
     if Device.DEFAULT not in tensor_cores: self.skipTest("No tensor cores for device")
     N = 128
     Tensor.manual_seed(1552)
-    for tc in tensor_cores[Device[Device.DEFAULT].compiler.linearizer_opts.device]:
+    for tc in tensor_cores[Device[Device.DEFAULT].compiler.compiler_opts.device]:
       a, b = Tensor.rand(N, N, dtype=tc.dtype_in), Tensor.rand(N, N, dtype=tc.dtype_in)
       r = a.matmul(b, acc_dtype=tc.dtype_out)
       (atol, rtol) = ((0.25, 0.01) if tc.dtype_out == dtypes.half else (3e-2, 1e-3)) if tc.dtype_in == dtypes.half else (1e-4, 1e-4)
@@ -749,7 +749,7 @@ class TestLinearizerOpts(unittest.TestCase):
     ])

   def test_color_shapes_with_local(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local or not Device[Device.DEFAULT].compiler.compiler_opts.has_shared:
       self.skipTest("Only Compiled uses linearizer with locals and shared")
     N = 32
@@ -819,7 +819,7 @@ class TestLinearizerHelper(unittest.TestCase):
     assert expand_idxs(idxs) == (uidx0, NumNode(0), uidx1)

 class TestLinearizerUOptimize(unittest.TestCase):
-  @unittest.skipUnless(Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4, "device doesn't support float4")
+  @unittest.skipUnless(Device[Device.DEFAULT].compiler.compiler_opts.supports_float4, "device doesn't support float4")
   def test_grouped_store_phis(self):
     x, y = Tensor.randn(64,64), Tensor.randn(64,64)
     out = x.matmul(y)
@@ -833,7 +833,7 @@ class TestLinearizerUOptimize(unittest.TestCase):
     for val in store_vals:
       assert val.dtype == dtypes.float.vec(4) and val.uop != UOps.CAST

-  @unittest.skipUnless(Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4, "device doesn't support float4")
+  @unittest.skipUnless(Device[Device.DEFAULT].compiler.compiler_opts.supports_float4, "device doesn't support float4")
   def test_grouped_store_values(self):
     x = Tensor.randn((4,3,6,6)).realize()
     out = x.flip((0,1)).contiguous()
@@ -846,8 +846,8 @@ class TestLinearizerUOptimize(unittest.TestCase):
     assert store_val.dtype == dtypes.float.vec(4) and store_val.uop != UOps.CAST

   def test_grouped_store_locals_and_globals(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared or \
-      not Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local or not Device[Device.DEFAULT].compiler.compiler_opts.has_shared or \
+      not Device[Device.DEFAULT].compiler.compiler_opts.supports_float4:
       self.skipTest("Only Compiled uses linearizer with locals, shared, and float4")
     x, y = Tensor.rand(128, 128), Tensor.rand(128, 128)
@@ -871,8 +871,8 @@ class TestLinearizerUOptimize(unittest.TestCase):
     assert len([u for u in k.uops if u.uop is UOps.IF and u.vin[-1] == barrier]) == 1

   def test_grouped_store_local_only(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared or \
-      not Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local or not Device[Device.DEFAULT].compiler.compiler_opts.has_shared or \
+      not Device[Device.DEFAULT].compiler.compiler_opts.supports_float4:
       self.skipTest("Only Compiled uses linearizer with locals, shared, and float4")
     x, y = Tensor.rand(1,128), Tensor.rand(128, 128)
@@ -891,7 +891,7 @@ class TestLinearizerUOptimize(unittest.TestCase):
     assert stores[1].vin[-1].dtype == dtypes.float

   def test_skip_unmatching_upcasts(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local or not Device[Device.DEFAULT].compiler.compiler_opts.supports_float4:
       self.skipTest("Needs locals and float4")
     ast = LazyOp(op=BufferOps.STORE, src=(LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=1, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(240, 40, 1, 1), strides=(1, 240, 0, 0), offset=0, mask=None, contiguous=False),)))),), arg=MemBuffer(idx=0, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(240, 40, 1, 1), strides=(40, 1, 0, 0), offset=0, mask=None, contiguous=True),)))) # noqa: E501
     opts = [
@@ -907,7 +907,7 @@ class TestLinearizerUOptimize(unittest.TestCase):
     assert out.vin[-1].uop is UOps.CAST and out.vin[-1].dtype == dtypes.float.vec(4)

   def test_skip_unmatching_upcasts_with_gep(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local or not Device[Device.DEFAULT].compiler.compiler_opts.supports_float4:
       self.skipTest("Needs locals and float4")
     ast = LazyOp(op=BufferOps.STORE, src=(LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=1, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(8, 32, 1, 1), strides=(1, 8, 0, 0), offset=0, mask=None, contiguous=False),)))),), arg=MemBuffer(idx=0, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(8, 32, 1, 1), strides=(32, 1, 0, 0), offset=0, mask=None, contiguous=True),)))) # noqa: E501
     opts = [Opt(op=OptOps.LOCAL, axis=1, amt=4), Opt(op=OptOps.UPCAST, axis=2, amt=2), Opt(op=OptOps.LOCAL, axis=1, amt=8),
diff --git a/test/test_uops.py b/test/test_uops.py
index 2dc3c51dae..d0dc014532 100644
--- a/test/test_uops.py
+++ b/test/test_uops.py
@@ -12,7 +12,7 @@ from test.helpers import is_dtype_supported

 def _uops_to_prg(uops):
   src = Device[Device.DEFAULT].compiler.render("test", uops)
-  has_local = Device[Device.DEFAULT].compiler.linearizer_opts.has_local
+  has_local = Device[Device.DEFAULT].compiler.compiler_opts.has_local
   return CompiledASTRunner("test", src, Device.DEFAULT, [1] if has_local else None, [1] if has_local else None)

 def uop(uops:List[UOp], uop:UOps, dtype:Optional[DType], vin:Tuple[UOp, ...], arg:Any=None) -> UOp:
diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py
index d6711223e1..4e58536889 100644
--- a/tinygrad/codegen/kernel.py
+++ b/tinygrad/codegen/kernel.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 import math, itertools
 from typing import NamedTuple, Optional, List, Tuple, cast, Dict, Union
 from tinygrad.ops import LazyOp, FlopCounter, get_lazyop_info, UnaryOps, BinaryOps, ReduceOps, MemBuffer, ConstBuffer, BufferOps
-from tinygrad.device import Device
+from tinygrad.device import Device, CompilerOptions
 from tinygrad.dtype import dtypes, ImageDType, DType
 from tinygrad.helpers import colored, ansilen, dedup, flatten, getenv, prod, DEBUG, round_up, all_int, get_contraction
 from tinygrad.shape.shapetracker import ShapeTracker
@@ -61,23 +61,10 @@ class LocalBuffer(NamedTuple):
   realized: None = None
   def __str__(self): return f"localbuffer<{self.name}[{self.size}]>"

-class LinearizerOptions(NamedTuple):
-  device: str = ""
-  suffix: str = ""
-  # TODO: make this generic with a list of supported types
-  supports_float4: bool = True
-  has_local: bool = True
-  has_shared: bool = True
-  has_tensor_cores: bool = False
-  # NOTE: these two should be in z,y,x(reversed) order for cstyle backends, they are flipped when kernel is rendered
-  global_max: Optional[List[int]] = None
-  local_max: Optional[List[int]] = None
-  shared_max: int = 32768
-
 class Kernel:
-  def __init__(self, *ast:LazyOp, opts:Optional[LinearizerOptions]=None):
-    self.opts = opts if opts is not None else (device.compiler.linearizer_opts if (device:=Device[Device.DEFAULT]).compiler is not None else
-                                               LinearizerOptions(Device.DEFAULT))
+  def __init__(self, *ast:LazyOp, opts:Optional[CompilerOptions]=None):
+    self.opts = opts if opts is not None else (device.compiler.compiler_opts if (device:=Device[Device.DEFAULT]).compiler is not None else
+                                               CompilerOptions(Device.DEFAULT))
     assert all(op.op is BufferOps.STORE for op in ast), f"kernels must have stores as the output, got {ast}"
     assert len(set(op.arg.st.size for op in ast)) == 1, f"all outbufs should have the same size, got {[op.arg.st for op in ast]}"
     self.ast = ast
diff --git a/tinygrad/device.py b/tinygrad/device.py
index df07b248c2..b42ab551d0 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 from collections import defaultdict
-from typing import TYPE_CHECKING, Any, List, Optional, Dict, Tuple, ClassVar
+from typing import TYPE_CHECKING, Any, List, Optional, Dict, Tuple, ClassVar, NamedTuple
 import importlib, inspect, functools, pathlib, time, ctypes
 from tinygrad.helpers import ansilen, DEBUG, getenv, colored, BEAM, NOOPT, all_int, to_function_name, from_mv, flat_mv, diskcache_get, diskcache_put
 from tinygrad.helpers import prod, CACHECOLLECTING
@@ -11,7 +11,6 @@ from tinygrad.codegen.uops import UOpGraph

 if TYPE_CHECKING:
   from tinygrad.codegen.linearizer import Linearizer
-  from tinygrad.codegen.kernel import LinearizerOptions

 # **************** Device ****************
@@ -134,8 +133,21 @@ MallocAllocator = _MallocAllocator()

 # **************** for Compiled Devices ****************

+class CompilerOptions(NamedTuple):
+  device: str = ""
+  suffix: str = ""
+  # TODO: make this generic with a list of supported types
+  supports_float4: bool = True
+  has_local: bool = True
+  has_shared: bool = True
+  has_tensor_cores: bool = False
+  # NOTE: these two should be in z,y,x(reversed) order for cstyle backends, they are flipped when kernel is rendered
+  global_max: Optional[List[int]] = None
+  local_max: Optional[List[int]] = None
+  shared_max: int = 32768
+
 class Compiler:
-  linearizer_opts: ClassVar[LinearizerOptions]
+  compiler_opts: ClassVar[CompilerOptions]
   def __init__(self, cachekey:Optional[str]=None): self.cachekey = None if getenv("DISABLE_COMPILER_CACHE") else cachekey
   def render(self, name:str, uops:UOpGraph) -> str: raise NotImplementedError("need a render function")
   def compile(self, src:str) -> bytes: raise NotImplementedError("need a compile function")
@@ -215,16 +227,16 @@ class Compiled:
       from tinygrad.features.graph import print_tree
       for op in ast: print_tree(op)
     from tinygrad.codegen.linearizer import Linearizer
-    k = Linearizer(*ast, opts=self.compiler.linearizer_opts)
+    k = Linearizer(*ast, opts=self.compiler.compiler_opts)
     k.required_optimizations()
     if not NOOPT:
       if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations()
       if BEAM >= 1:
         lins = [(("tc" if used_tensor_cores else "hc"), k)]
         if used_tensor_cores:
-          lins.append(("hc", Linearizer(*ast, opts=self.compiler.linearizer_opts)))
+          lins.append(("hc", Linearizer(*ast, opts=self.compiler.compiler_opts)))
           lins[-1][1].hand_coded_optimizations()
-        kb = Linearizer(*ast, opts=self.compiler.linearizer_opts)
+        kb = Linearizer(*ast, opts=self.compiler.compiler_opts)
         kb.required_optimizations()
         from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin
         test_rawbuffers = bufs_from_lin(kb) # allocate scratch buffers for optimization
diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py
index 13607453c4..44d5fd3979 100644
--- a/tinygrad/runtime/ops_clang.py
+++ b/tinygrad/runtime/ops_clang.py
@@ -1,13 +1,12 @@
 import ctypes, subprocess, pathlib, tempfile
-from tinygrad.device import Compiled, MallocAllocator, Compiler
+from tinygrad.device import Compiled, MallocAllocator, Compiler, CompilerOptions
 from tinygrad.helpers import cpu_time_execution
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage

 CLANG_PROGRAM_HEADER = '#include \n#include \n#define max(x,y) ((x>y)?x:y)\n#define half __fp16\n'

 class ClangCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("CLANG", supports_float4=False, has_local=False)
+  compiler_opts = CompilerOptions("CLANG", supports_float4=False, has_local=False)
   def render(self, name:str, uops) -> str: return uops_to_cstyle(CStyleLanguage(buffer_suffix=" restrict"), name, uops)
   def compile(self, src:str) -> bytes:
     # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py
index 367b3327ac..ed6eb3d388 100644
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@@ -4,8 +4,7 @@ from pathlib import Path
 from typing import Tuple, Optional, List
 import tinygrad.runtime.autogen.cuda as cuda
 from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler, BufferOptions
-from tinygrad.codegen.kernel import LinearizerOptions
+from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler, BufferOptions, CompilerOptions
 from tinygrad.renderer.cstyle import CUDARenderer
 from tinygrad.renderer.assembly import PTXRenderer
 if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
@@ -53,20 +52,20 @@ def _get_bytes(arg, get_str, get_sz, check) -> bytes:
   return ctypes.string_at(init_c_var(ctypes.create_string_buffer(sz.value), lambda x: check(get_str(arg, x))), size=sz.value)

 class PTXCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("CUDA", suffix="PTX", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
+  compiler_opts = CompilerOptions("CUDA", suffix="PTX", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
   def __init__(self, arch:str):
     self.arch = arch
     self.version = "7.8" if arch >= "sm_89" else "7.5"
-    PTXCompiler.linearizer_opts = PTXCompiler.linearizer_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    PTXCompiler.compiler_opts = PTXCompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
     super().__init__(f"compile_ptx_{self.arch}")
   def render(self, name:str, uops) -> str: return PTXRenderer(name, uops).replace("TARGET", self.arch).replace("VERSION", self.version)
   def compile(self, src:str) -> bytes: return src.encode()

 class CUDACompiler(Compiler):
-  linearizer_opts = LinearizerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
+  compiler_opts = CompilerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
   def __init__(self, arch:str):
     self.arch = arch
-    CUDACompiler.linearizer_opts = CUDACompiler.linearizer_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    CUDACompiler.compiler_opts = CUDACompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
     check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
     self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
     if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py
index 4ce0aa2c53..f0449990db 100644
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -3,9 +3,8 @@ from typing import Tuple, Optional, List, cast
 import ctypes, functools, hashlib
 import tinygrad.runtime.autogen.opencl as cl
 from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import OpenCLRenderer
-from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler, CompilerOptions

 # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
 OSX_TIMING_RATIO = (125/3) if OSX else 1.0
@@ -15,7 +14,7 @@ def check(status):
 def checked(ret, status): return (check(status.value), ret)[1]

 class CLCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("GPU")
+  compiler_opts = CompilerOptions("GPU")
   def __init__(self, device:CLDevice, compile_key:str):
     self.device = device
     super().__init__(f"compile_cl_{compile_key}")
diff --git a/tinygrad/runtime/ops_hsa.py b/tinygrad/runtime/ops_hsa.py
index ee21a025fc..c0cc4ec81c 100644
--- a/tinygrad/runtime/ops_hsa.py
+++ b/tinygrad/runtime/ops_hsa.py
@@ -3,8 +3,7 @@ import ctypes, functools, subprocess, io, atexit, collections, json
 from typing import Tuple, TypeVar, List, Dict, Any
 import tinygrad.runtime.autogen.hsa as hsa
 from tinygrad.helpers import DEBUG, init_c_var, from_mv, round_up, to_mv, init_c_struct_t, getenv
-from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler
-from tinygrad.codegen.kernel import LinearizerOptions
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler, CompilerOptions
 from tinygrad.runtime.driver.hsa import check, scan_agents, find_memory_pool, AQLQueue
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.runtime.driver.hip_comgr import compile_hip
@@ -43,7 +42,7 @@ class HSAProfiler:
 Profiler = HSAProfiler()

 class HSACompiler(Compiler):
-  linearizer_opts = LinearizerOptions("HSA", has_tensor_cores=True, shared_max=65536)
+  compiler_opts = CompilerOptions("HSA", has_tensor_cores=True, shared_max=65536)
   def __init__(self, arch:str):
     self.arch = arch
     super().__init__(f"compile_hip_{self.arch}")
diff --git a/tinygrad/runtime/ops_llvm.py b/tinygrad/runtime/ops_llvm.py
index 869940652d..32d5db0d41 100644
--- a/tinygrad/runtime/ops_llvm.py
+++ b/tinygrad/runtime/ops_llvm.py
@@ -1,14 +1,13 @@
 from __future__ import annotations
 import ctypes, functools
 from typing import Tuple
-from tinygrad.device import Compiled, MallocAllocator, Compiler
+from tinygrad.device import Compiled, MallocAllocator, Compiler, CompilerOptions
 from tinygrad.helpers import DEBUG, cpu_time_execution
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.llvmir import uops_to_llvm_ir
 import llvmlite.binding as llvm

 class LLVMCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False)
+  compiler_opts = CompilerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False)
   def __init__(self, device:LLVMDevice):
     self.device = device
     super().__init__("compile_llvm")
diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py
index f209240798..b6050b6cc5 100644
--- a/tinygrad/runtime/ops_metal.py
+++ b/tinygrad/runtime/ops_metal.py
@@ -2,9 +2,8 @@ from __future__ import annotations
 import os, subprocess, pathlib, ctypes, tempfile, functools
 import Metal, libdispatch
 from typing import List, Set, Any, Tuple, Optional
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
-from tinygrad.device import Compiled, LRUAllocator, Compiler
+from tinygrad.device import Compiled, LRUAllocator, Compiler, CompilerOptions
 from tinygrad.renderer.cstyle import MetalRenderer

 def wait_check(cbuf: Any):
@@ -13,7 +12,7 @@ def wait_check(cbuf: Any):
     raise RuntimeError(error)

 class MetalCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("METAL", has_tensor_cores=os.uname().machine == "arm64", shared_max=32768)
+  compiler_opts = CompilerOptions("METAL", has_tensor_cores=os.uname().machine == "arm64", shared_max=32768)
   def __init__(self, device:Optional[MetalDevice]):
     self.device = device
     super().__init__("compile_metal")
diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py
index 0cad1b91c7..1febe89117 100644
--- a/tinygrad/runtime/ops_python.py
+++ b/tinygrad/runtime/ops_python.py
@@ -5,10 +5,9 @@ from typing import Tuple, List, Optional, Any, Dict
 import pickle, base64, itertools, time, struct
 from tinygrad.dtype import DType, dtypes, ImageDType
 from tinygrad.helpers import all_same, getenv, flatten
-from tinygrad.device import Compiled, Allocator, Compiler
+from tinygrad.device import Compiled, Allocator, Compiler, CompilerOptions
 from tinygrad.codegen.uops import UOpGraph, UOps, exec_alu
 from tinygrad.ops import BinaryOps, TernaryOps
-from tinygrad.codegen.kernel import LinearizerOptions

 def _load(m, i):
   if i < 0 or i >= len(m): raise IndexError(f"load out of bounds, size is {len(m)} and access is {i}")
@@ -179,9 +178,9 @@ class PythonProgram:
     return time.perf_counter() - st

 class PythonCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("METAL", has_tensor_cores=True) if getenv("EMULATE_METAL") else \
-    (LinearizerOptions("HSA", has_tensor_cores=True) if getenv("EMULATE_HSA") else \
-    (LinearizerOptions("CUDA", has_tensor_cores=True) if getenv("EMULATE_CUDA") else LinearizerOptions("PYTHON")))
+  compiler_opts = CompilerOptions("METAL", has_tensor_cores=True) if getenv("EMULATE_METAL") else \
+    (CompilerOptions("HSA", has_tensor_cores=True) if getenv("EMULATE_HSA") else \
+    (CompilerOptions("CUDA", has_tensor_cores=True) if getenv("EMULATE_CUDA") else CompilerOptions("PYTHON")))
   def render(self, name:str, uops:UOpGraph) -> str:
     lops = [(u.uop, u.dtype, [uops.uops.index(v) for v in u.vin], u.arg) for u in uops]
     return base64.b64encode(pickle.dumps(lops)).decode()