LinearizerOptions -> CompilerOptions (#3978)

chenyu authored 2024-03-28 17:50:23 -04:00, committed by GitHub
parent 2bfb1d3e39
commit b47f6cebb2
19 changed files with 82 additions and 92 deletions
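Every backend in this diff constructs the renamed options object the same way: a positional device name plus keyword feature flags, sometimes adjusted later with ._replace(). A minimal sketch of what such a class looks like, assuming a typing.NamedTuple (which is what ._replace() implies); the field names come from the call sites below, but the defaults are illustrative assumptions, not tinygrad's exact values:

# Sketch of the renamed class, inferred from call sites in this diff.
# ._replace() implies typing.NamedTuple; defaults are illustrative.
from typing import List, NamedTuple, Optional

class CompilerOptions(NamedTuple):
  device: str                            # "CLANG", "CUDA", "GPU", "HSA", ...
  suffix: str = ""                       # e.g. "PTX" for the PTX path
  supports_float4: bool = True           # CLANG and LLVM pass False
  has_local: bool = True                 # CLANG and LLVM pass False
  has_shared: bool = True                # LLVM passes False
  has_tensor_cores: bool = False         # toggled per-arch via _replace
  global_max: Optional[List[int]] = None
  local_max: Optional[List[int]] = None
  shared_max: int = 32768                # Metal 32768, CUDA 49152, HSA 65536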

View File

@@ -1,13 +1,12 @@
 import ctypes, subprocess, pathlib, tempfile
-from tinygrad.device import Compiled, MallocAllocator, Compiler
+from tinygrad.device import Compiled, MallocAllocator, Compiler, CompilerOptions
 from tinygrad.helpers import cpu_time_execution
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
 
 CLANG_PROGRAM_HEADER = '#include <stdbool.h>\n#include <tgmath.h>\n#define max(x,y) ((x>y)?x:y)\n#define half __fp16\n'
 
 class ClangCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("CLANG", supports_float4=False, has_local=False)
+  compiler_opts = CompilerOptions("CLANG", supports_float4=False, has_local=False)
   def render(self, name:str, uops) -> str: return uops_to_cstyle(CStyleLanguage(buffer_suffix=" restrict"), name, uops)
   def compile(self, src:str) -> bytes:
     # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here

View File

@@ -4,8 +4,7 @@ from pathlib import Path
 from typing import Tuple, Optional, List
 import tinygrad.runtime.autogen.cuda as cuda
 from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler, BufferOptions
-from tinygrad.codegen.kernel import LinearizerOptions
+from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler, BufferOptions, CompilerOptions
 from tinygrad.renderer.cstyle import CUDARenderer
 from tinygrad.renderer.assembly import PTXRenderer
 if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
@@ -53,20 +52,20 @@ def _get_bytes(arg, get_str, get_sz, check) -> bytes:
   return ctypes.string_at(init_c_var(ctypes.create_string_buffer(sz.value), lambda x: check(get_str(arg, x))), size=sz.value)
 
 class PTXCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("CUDA", suffix="PTX", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
+  compiler_opts = CompilerOptions("CUDA", suffix="PTX", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
   def __init__(self, arch:str):
     self.arch = arch
     self.version = "7.8" if arch >= "sm_89" else "7.5"
-    PTXCompiler.linearizer_opts = PTXCompiler.linearizer_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    PTXCompiler.compiler_opts = PTXCompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
     super().__init__(f"compile_ptx_{self.arch}")
   def render(self, name:str, uops) -> str: return PTXRenderer(name, uops).replace("TARGET", self.arch).replace("VERSION", self.version)
   def compile(self, src:str) -> bytes: return src.encode()
 
 class CUDACompiler(Compiler):
-  linearizer_opts = LinearizerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
+  compiler_opts = CompilerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
   def __init__(self, arch:str):
     self.arch = arch
-    CUDACompiler.linearizer_opts = CUDACompiler.linearizer_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    CUDACompiler.compiler_opts = CUDACompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
     check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
     self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
     if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
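The reassignment pattern in both __init__ methods above works because a NamedTuple is immutable: ._replace() returns a fresh tuple, so the adjusted copy has to be assigned back, and assigning to the class attribute changes the options for every instance of that compiler. A small self-contained illustration (Opts and Demo are hypothetical stand-ins, not tinygrad code):

# NamedTuples are immutable; _replace returns a new tuple that must be
# assigned back. Assigning to the class attribute affects all instances.
from typing import NamedTuple

class Opts(NamedTuple):  # hypothetical stand-in for CompilerOptions
  device: str
  has_tensor_cores: bool = False

class Demo:
  opts = Opts("CUDA")
  def __init__(self, arch: str):
    # sm_80 and newer have tensor cores; mirrors int(arch[3:]) >= 80 above
    Demo.opts = Demo.opts._replace(has_tensor_cores=int(arch[3:]) >= 80)

Demo("sm_89")
assert Demo.opts.has_tensor_cores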

View File

@@ -3,9 +3,8 @@ from typing import Tuple, Optional, List, cast
 import ctypes, functools, hashlib
 import tinygrad.runtime.autogen.opencl as cl
 from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import OpenCLRenderer
-from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler, CompilerOptions
 
 # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
 OSX_TIMING_RATIO = (125/3) if OSX else 1.0
@@ -15,7 +14,7 @@ def check(status):
 def checked(ret, status): return (check(status.value), ret)[1]
 
 class CLCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("GPU")
+  compiler_opts = CompilerOptions("GPU")
   def __init__(self, device:CLDevice, compile_key:str):
     self.device = device
     super().__init__(f"compile_cl_{compile_key}")

View File

@@ -3,8 +3,7 @@ import ctypes, functools, subprocess, io, atexit, collections, json
 from typing import Tuple, TypeVar, List, Dict, Any
 import tinygrad.runtime.autogen.hsa as hsa
 from tinygrad.helpers import DEBUG, init_c_var, from_mv, round_up, to_mv, init_c_struct_t, getenv
-from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler
-from tinygrad.codegen.kernel import LinearizerOptions
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler, CompilerOptions
 from tinygrad.runtime.driver.hsa import check, scan_agents, find_memory_pool, AQLQueue
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.runtime.driver.hip_comgr import compile_hip
@@ -43,7 +42,7 @@ class HSAProfiler:
 Profiler = HSAProfiler()
 
 class HSACompiler(Compiler):
-  linearizer_opts = LinearizerOptions("HSA", has_tensor_cores=True, shared_max=65536)
+  compiler_opts = CompilerOptions("HSA", has_tensor_cores=True, shared_max=65536)
   def __init__(self, arch:str):
     self.arch = arch
     super().__init__(f"compile_hip_{self.arch}")

View File

@@ -1,14 +1,13 @@
 from __future__ import annotations
 import ctypes, functools
 from typing import Tuple
-from tinygrad.device import Compiled, MallocAllocator, Compiler
+from tinygrad.device import Compiled, MallocAllocator, Compiler, CompilerOptions
 from tinygrad.helpers import DEBUG, cpu_time_execution
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.llvmir import uops_to_llvm_ir
 import llvmlite.binding as llvm
 
 class LLVMCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False)
+  compiler_opts = CompilerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False)
   def __init__(self, device:LLVMDevice):
     self.device = device
     super().__init__("compile_llvm")

View File

@@ -2,9 +2,8 @@ from __future__ import annotations
 import os, subprocess, pathlib, ctypes, tempfile, functools
 import Metal, libdispatch
 from typing import List, Set, Any, Tuple, Optional
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
-from tinygrad.device import Compiled, LRUAllocator, Compiler
+from tinygrad.device import Compiled, LRUAllocator, Compiler, CompilerOptions
 from tinygrad.renderer.cstyle import MetalRenderer
 
 def wait_check(cbuf: Any):
def wait_check(cbuf: Any):
@@ -13,7 +12,7 @@ def wait_check(cbuf: Any):
     raise RuntimeError(error)
 
 class MetalCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("METAL", has_tensor_cores=os.uname().machine == "arm64", shared_max=32768)
+  compiler_opts = CompilerOptions("METAL", has_tensor_cores=os.uname().machine == "arm64", shared_max=32768)
   def __init__(self, device:Optional[MetalDevice]):
     self.device = device
     super().__init__("compile_metal")

View File

@@ -5,10 +5,9 @@ from typing import Tuple, List, Optional, Any, Dict
 import pickle, base64, itertools, time, struct
 from tinygrad.dtype import DType, dtypes, ImageDType
 from tinygrad.helpers import all_same, getenv, flatten
-from tinygrad.device import Compiled, Allocator, Compiler
+from tinygrad.device import Compiled, Allocator, Compiler, CompilerOptions
 from tinygrad.codegen.uops import UOpGraph, UOps, exec_alu
 from tinygrad.ops import BinaryOps, TernaryOps
-from tinygrad.codegen.kernel import LinearizerOptions
 
 def _load(m, i):
   if i < 0 or i >= len(m): raise IndexError(f"load out of bounds, size is {len(m)} and access is {i}")
@@ -179,9 +178,9 @@ class PythonProgram:
     return time.perf_counter() - st
 
 class PythonCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("METAL", has_tensor_cores=True) if getenv("EMULATE_METAL") else \
-    (LinearizerOptions("HSA", has_tensor_cores=True) if getenv("EMULATE_HSA") else \
-    (LinearizerOptions("CUDA", has_tensor_cores=True) if getenv("EMULATE_CUDA") else LinearizerOptions("PYTHON")))
+  compiler_opts = CompilerOptions("METAL", has_tensor_cores=True) if getenv("EMULATE_METAL") else \
+    (CompilerOptions("HSA", has_tensor_cores=True) if getenv("EMULATE_HSA") else \
+    (CompilerOptions("CUDA", has_tensor_cores=True) if getenv("EMULATE_CUDA") else CompilerOptions("PYTHON")))
   def render(self, name:str, uops:UOpGraph) -> str:
     lops = [(u.uop, u.dtype, [uops.uops.index(v) for v in u.vin], u.arg) for u in uops]
     return base64.b64encode(pickle.dumps(lops)).decode()
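This render method serializes the uop list instead of emitting source code: the pickled, base64-encoded string travels through the compile cache like any other rendered "source". A tiny stdlib round-trip of that encoding, with a dummy payload standing in for the real (uop, dtype, vin, arg) tuples:

# Round-trip of the encoding PythonCompiler.render uses above.
# The payload is a dummy stand-in for the real lops tuples.
import base64, pickle

lops = [("ALU", "float32", [0, 1], "ADD")]
src = base64.b64encode(pickle.dumps(lops)).decode()
assert pickle.loads(base64.b64decode(src)) == lops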