bring ptx back (#3623)
* bring ptx back
* ptx back
* fix define var
* fix a few bugs
* bugfixes
* fixes
* fix llvm bug
* fix test bug
@@ -7,6 +7,7 @@ from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, co
 from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import CUDARenderer
+from tinygrad.renderer.assembly import PTXRenderer
 
 def pretty_ptx(s):
   # all expressions match `<valid_before><expr><valid_after>` and replace it with `<valid_before>color(<expr>)<valid_after>`

@@ -33,6 +34,15 @@ def _get_bytes(arg, get_str, get_sz, check) -> bytes:
   sz = init_c_var(ctypes.c_size_t(), lambda x: check(get_sz(arg, ctypes.byref(x))))
   return ctypes.string_at(init_c_var(ctypes.create_string_buffer(sz.value), lambda x: check(get_str(arg, x))), size=sz.value)
 
+class PTXCompiler(Compiler):
+  linearizer_opts = LinearizerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], supports_float4=False)
+  def __init__(self, arch:str):
+    self.arch = arch
+    PTXCompiler.linearizer_opts = PTXCompiler.linearizer_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    super().__init__(f"compile_ptx_{self.arch}")
+  def render(self, name:str, uops) -> str: return PTXRenderer(name, uops).replace("TARGET", self.arch)
+  def compile(self, src:str) -> bytes: return src.encode()
+
 class CUDACompiler(Compiler):
   linearizer_opts = LinearizerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024])
   def __init__(self, arch:str):
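
The new PTXCompiler skips nvrtc entirely: render emits PTX text and compile just encodes it to bytes, since the CUDA driver can JIT PTX itself at module-load time. The has_tensor_cores gate parses the compute capability out of the arch string. A standalone sketch of that check (plain Python, not part of the commit):

# "sm_86"[3:] == "86", so the gate enables tensor cores for sm_80 (Ampere) and up
for arch in ["sm_35", "sm_75", "sm_80", "sm_86"]:
  print(arch, int(arch[3:]) >= 80)  # False, False, True, True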

@@ -100,7 +110,8 @@ class CUDADevice(Compiled):
     self.arch = f"sm_{major.value}{minor.value}" if not CUDACPU else "sm_35"
 
     from tinygrad.runtime.graph.cuda import CUDAGraph
-    super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator, CUDACompiler(self.arch),
+    super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator,
+                     PTXCompiler(self.arch) if getenv("PTX") else CUDACompiler(self.arch),
                      functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None)
   def synchronize(self):
     if not CUDACPU:
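
Backend selection now happens once at device construction: if the PTX environment variable is set, CUDADevice builds a PTXCompiler, otherwise the existing CUDACompiler. A minimal sketch of the same getenv pattern (illustrative only; getenv here is tinygrad.helpers.getenv, which returns 0 when the variable is unset):

from tinygrad.helpers import getenv

# run as `PTX=1 python3 this_file.py` to take the PTX branch
use_ptx = getenv("PTX")  # 0 unless PTX is set in the environment
print("PTXCompiler" if use_ptx else "CUDACompiler")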

@@ -6,7 +6,7 @@ import pickle, base64, itertools, time, struct
 from tinygrad.dtype import DType, dtypes, ImageDType
 from tinygrad.helpers import all_same, getenv, flatten
 from tinygrad.device import Compiled, Allocator, Compiler
-from tinygrad.codegen.uops import UOp, UOps, exec_alu
+from tinygrad.codegen.uops import UOpGraph, UOps, exec_alu
 from tinygrad.ops import BinaryOps, TernaryOps
 from tinygrad.codegen.kernel import LinearizerOptions

@@ -188,8 +188,8 @@ class PythonCompiler(Compiler):
   linearizer_opts = LinearizerOptions("METAL", has_tensor_cores=True) if getenv("EMULATE_METAL") else \
     (LinearizerOptions("HIP", has_tensor_cores=True) if getenv("EMULATE_HIP") else \
     (LinearizerOptions("CUDA", has_tensor_cores=True) if getenv("EMULATE_CUDA") else LinearizerOptions("PYTHON")))
-  def render(self, name:str, uops:List[UOp]) -> str:
-    lops = [(u.uop, u.dtype, [uops.index(v) for v in u.vin], u.arg) for u in uops]
+  def render(self, name:str, uops:UOpGraph) -> str:
+    lops = [(u.uop, u.dtype, [uops.uops.index(v) for v in u.vin], u.arg) for u in uops]
     return base64.b64encode(pickle.dumps(lops)).decode()
   def compile(self, src:str) -> bytes: return base64.b64decode(src)
 
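
The signature change is the point of this hunk: render now receives a UOpGraph instead of a plain List[UOp]. The wrapper is still iterable over its UOps, but list methods like index live on its uops attribute, hence uops.index(v) becoming uops.uops.index(v). A hypothetical stand-in class, just to show the shape of the API (FakeUOpGraph is not tinygrad code):

class FakeUOpGraph:
  def __init__(self, uops): self.uops = uops  # the underlying list of ops
  def __iter__(self): return iter(self.uops)  # iteration still yields ops

g = FakeUOpGraph(["add", "mul", "store"])
print([g.uops.index(u) for u in g])  # [0, 1, 2]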