mirror of https://github.com/tinygrad/tinygrad.git (synced 2026-02-13 08:05:10 -05:00)
LinearizerOptions -> CompilerOptions (#3978)
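
The hunks below are a mechanical rename of the per-backend options struct: LinearizerOptions, previously imported from tinygrad.codegen.kernel, becomes CompilerOptions, exported from tinygrad.device, with no behavioral change. The `_replace` calls in the CUDA hunks suggest the type is a NamedTuple. As a reading aid, here is a hypothetical sketch of its shape; the field names come from the constructor calls in this diff, but the defaults are assumptions, not the actual tinygrad definition:

from typing import List, Optional, NamedTuple

# Hypothetical sketch of CompilerOptions, inferred from the keyword
# arguments used at the call sites in this diff. Defaults are assumed.
class CompilerOptions(NamedTuple):
  device: str                             # "CLANG", "CUDA", "METAL", ...
  suffix: str = ""                        # e.g. "PTX" for the PTX backend
  supports_float4: bool = True            # CLANG/LLVM pass False
  has_local: bool = True                  # local/workgroup dimensions exist
  has_shared: bool = True                 # shared/LDS memory exists
  has_tensor_cores: bool = False          # toggled per-arch via _replace
  global_max: Optional[List[int]] = None  # max global launch dimensions
  local_max: Optional[List[int]] = None   # max local launch dimensions
  shared_max: int = 32768                 # shared memory budget in bytes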
@@ -1,13 +1,12 @@
 import ctypes, subprocess, pathlib, tempfile
-from tinygrad.device import Compiled, MallocAllocator, Compiler
+from tinygrad.device import Compiled, MallocAllocator, Compiler, CompilerOptions
 from tinygrad.helpers import cpu_time_execution
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage

 CLANG_PROGRAM_HEADER = '#include <stdbool.h>\n#include <tgmath.h>\n#define max(x,y) ((x>y)?x:y)\n#define half __fp16\n'

 class ClangCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("CLANG", supports_float4=False, has_local=False)
+  compiler_opts = CompilerOptions("CLANG", supports_float4=False, has_local=False)
   def render(self, name:str, uops) -> str: return uops_to_cstyle(CStyleLanguage(buffer_suffix=" restrict"), name, uops)
   def compile(self, src:str) -> bytes:
     # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
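
Every backend in this commit follows the same pattern: a Compiler subclass declares compiler_opts as a class attribute and passes a cache-key string to super().__init__. A minimal sketch of a hypothetical backend following that pattern (NullCompiler and the "NULL" device are illustrative, not part of this commit):

from tinygrad.device import Compiler, CompilerOptions

# Hypothetical no-op backend showing the class-attribute pattern used
# by every compiler touched in this diff. Not part of the commit.
class NullCompiler(Compiler):
  compiler_opts = CompilerOptions("NULL", supports_float4=False, has_local=False)
  def __init__(self): super().__init__("compile_null")  # cache key, as in the other backends
  def render(self, name:str, uops) -> str: return ""    # emit no source
  def compile(self, src:str) -> bytes: return b""       # nothing to compile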
@@ -4,8 +4,7 @@ from pathlib import Path
 from typing import Tuple, Optional, List
 import tinygrad.runtime.autogen.cuda as cuda
 from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler, BufferOptions
-from tinygrad.codegen.kernel import LinearizerOptions
+from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler, BufferOptions, CompilerOptions
 from tinygrad.renderer.cstyle import CUDARenderer
 from tinygrad.renderer.assembly import PTXRenderer
 if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
@@ -53,20 +52,20 @@ def _get_bytes(arg, get_str, get_sz, check) -> bytes:
   return ctypes.string_at(init_c_var(ctypes.create_string_buffer(sz.value), lambda x: check(get_str(arg, x))), size=sz.value)

 class PTXCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("CUDA", suffix="PTX", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
+  compiler_opts = CompilerOptions("CUDA", suffix="PTX", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
   def __init__(self, arch:str):
     self.arch = arch
     self.version = "7.8" if arch >= "sm_89" else "7.5"
-    PTXCompiler.linearizer_opts = PTXCompiler.linearizer_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    PTXCompiler.compiler_opts = PTXCompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
     super().__init__(f"compile_ptx_{self.arch}")
   def render(self, name:str, uops) -> str: return PTXRenderer(name, uops).replace("TARGET", self.arch).replace("VERSION", self.version)
   def compile(self, src:str) -> bytes: return src.encode()

 class CUDACompiler(Compiler):
-  linearizer_opts = LinearizerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
+  compiler_opts = CompilerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
   def __init__(self, arch:str):
     self.arch = arch
-    CUDACompiler.linearizer_opts = CUDACompiler.linearizer_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    CUDACompiler.compiler_opts = CUDACompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
     check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
     self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
     if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
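
Because a NamedTuple is immutable, the per-arch tensor-core toggle above cannot mutate the options in place; `_replace` returns a fresh copy that is rebound to the class attribute. A small illustration of that semantics, reusing the hypothetical CompilerOptions sketch from the top:

# _replace returns a new instance; the old one is unchanged.
opts = CompilerOptions("CUDA", global_max=[65535, 65535, 2147483647])
assert not opts.has_tensor_cores
arch = "sm_89"
opts = opts._replace(has_tensor_cores=int(arch[3:]) >= 80)  # "89" -> 89 >= 80
assert opts.has_tensor_cores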
@@ -3,9 +3,8 @@ from typing import Tuple, Optional, List, cast
 import ctypes, functools, hashlib
 import tinygrad.runtime.autogen.opencl as cl
 from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import OpenCLRenderer
-from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler, CompilerOptions

 # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
 OSX_TIMING_RATIO = (125/3) if OSX else 1.0
@@ -15,7 +14,7 @@ def check(status):
 def checked(ret, status): return (check(status.value), ret)[1]

 class CLCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("GPU")
+  compiler_opts = CompilerOptions("GPU")
   def __init__(self, device:CLDevice, compile_key:str):
     self.device = device
     super().__init__(f"compile_cl_{compile_key}")
@@ -3,8 +3,7 @@ import ctypes, functools, subprocess, io, atexit, collections, json
 from typing import Tuple, TypeVar, List, Dict, Any
 import tinygrad.runtime.autogen.hsa as hsa
 from tinygrad.helpers import DEBUG, init_c_var, from_mv, round_up, to_mv, init_c_struct_t, getenv
-from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler
-from tinygrad.codegen.kernel import LinearizerOptions
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler, CompilerOptions
 from tinygrad.runtime.driver.hsa import check, scan_agents, find_memory_pool, AQLQueue
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.runtime.driver.hip_comgr import compile_hip
@@ -43,7 +42,7 @@ class HSAProfiler:
 Profiler = HSAProfiler()

 class HSACompiler(Compiler):
-  linearizer_opts = LinearizerOptions("HSA", has_tensor_cores=True, shared_max=65536)
+  compiler_opts = CompilerOptions("HSA", has_tensor_cores=True, shared_max=65536)
   def __init__(self, arch:str):
     self.arch = arch
     super().__init__(f"compile_hip_{self.arch}")
@@ -1,14 +1,13 @@
 from __future__ import annotations
 import ctypes, functools
 from typing import Tuple
-from tinygrad.device import Compiled, MallocAllocator, Compiler
+from tinygrad.device import Compiled, MallocAllocator, Compiler, CompilerOptions
 from tinygrad.helpers import DEBUG, cpu_time_execution
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.llvmir import uops_to_llvm_ir
 import llvmlite.binding as llvm

 class LLVMCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False)
+  compiler_opts = CompilerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False)
   def __init__(self, device:LLVMDevice):
     self.device = device
     super().__init__("compile_llvm")
@@ -2,9 +2,8 @@ from __future__ import annotations
 import os, subprocess, pathlib, ctypes, tempfile, functools
 import Metal, libdispatch
 from typing import List, Set, Any, Tuple, Optional
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
-from tinygrad.device import Compiled, LRUAllocator, Compiler
+from tinygrad.device import Compiled, LRUAllocator, Compiler, CompilerOptions
 from tinygrad.renderer.cstyle import MetalRenderer

 def wait_check(cbuf: Any):
@@ -13,7 +12,7 @@ def wait_check(cbuf: Any):
     raise RuntimeError(error)

 class MetalCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("METAL", has_tensor_cores=os.uname().machine == "arm64", shared_max=32768)
+  compiler_opts = CompilerOptions("METAL", has_tensor_cores=os.uname().machine == "arm64", shared_max=32768)
   def __init__(self, device:Optional[MetalDevice]):
     self.device = device
     super().__init__("compile_metal")
@@ -5,10 +5,9 @@ from typing import Tuple, List, Optional, Any, Dict
 import pickle, base64, itertools, time, struct
 from tinygrad.dtype import DType, dtypes, ImageDType
 from tinygrad.helpers import all_same, getenv, flatten
-from tinygrad.device import Compiled, Allocator, Compiler
+from tinygrad.device import Compiled, Allocator, Compiler, CompilerOptions
 from tinygrad.codegen.uops import UOpGraph, UOps, exec_alu
 from tinygrad.ops import BinaryOps, TernaryOps
-from tinygrad.codegen.kernel import LinearizerOptions

 def _load(m, i):
   if i < 0 or i >= len(m): raise IndexError(f"load out of bounds, size is {len(m)} and access is {i}")
@@ -179,9 +178,9 @@ class PythonProgram:
     return time.perf_counter() - st

 class PythonCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("METAL", has_tensor_cores=True) if getenv("EMULATE_METAL") else \
-    (LinearizerOptions("HSA", has_tensor_cores=True) if getenv("EMULATE_HSA") else \
-    (LinearizerOptions("CUDA", has_tensor_cores=True) if getenv("EMULATE_CUDA") else LinearizerOptions("PYTHON")))
+  compiler_opts = CompilerOptions("METAL", has_tensor_cores=True) if getenv("EMULATE_METAL") else \
+    (CompilerOptions("HSA", has_tensor_cores=True) if getenv("EMULATE_HSA") else \
+    (CompilerOptions("CUDA", has_tensor_cores=True) if getenv("EMULATE_CUDA") else CompilerOptions("PYTHON")))
   def render(self, name:str, uops:UOpGraph) -> str:
     lops = [(u.uop, u.dtype, [uops.uops.index(v) for v in u.vin], u.arg) for u in uops]
     return base64.b64encode(pickle.dumps(lops)).decode()
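
The PythonCompiler hunk keeps the emulation chain intact: the pure-Python backend can masquerade as METAL, HSA, or CUDA (to exercise tensor-core paths) depending on which EMULATE_* environment variable is set, falling back to plain PYTHON options. The chained conditional reads as a first-match lookup; a sketch of equivalent logic, assuming only the imports shown in this diff:

from tinygrad.helpers import getenv
from tinygrad.device import CompilerOptions

# First matching EMULATE_* env var wins, mirroring the chained
# conditional in the hunk above; falls back to plain PYTHON options.
def pick_emulated_opts() -> CompilerOptions:
  for env, device in [("EMULATE_METAL", "METAL"), ("EMULATE_HSA", "HSA"), ("EMULATE_CUDA", "CUDA")]:
    if getenv(env): return CompilerOptions(device, has_tensor_cores=True)
  return CompilerOptions("PYTHON")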