move all to compile api (#2203)

* move metal+clang to compile api
* all to the new style
* remove binary arg
* fix triton
* fixup tests
* fix clang
* diskcache is generic
* __wrapped__
* compile_gpu
* fix thneed
* keep the src in the ASTRunner
* lib
* move compile_gpu
* compile_gpu in device
* put compiler in astrunner
* test reverts
* triton compiler
* ugh, that too
2026-02-13 08:05:10 -05:00 · 2023-11-01 23:01:32 -07:00
parent 8932816816
commit 03cf0afa4f
18 changed files with 128 additions and 136 deletions
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -1,9 +1,11 @@
 from __future__ import annotations
+import os
+os.environ['PYOPENCL_NO_CACHE'] = '1'
 import pathlib
 import numpy as np
 import pyopencl as cl  # type: ignore
 from typing import Optional, List
-from tinygrad.helpers import DEBUG, getenv, prod, ImageDType, OSX, fromimport
+from tinygrad.helpers import DEBUG, getenv, prod, ImageDType, OSX, fromimport, diskcache
 from tinygrad.ops import Compiled
 from tinygrad.renderer.opencl import OpenCLRenderer
 from tinygrad.runtime.lib import RawBufferCopyInOut, LRUAllocator, RawBufferTransfer
@@ -61,23 +63,28 @@ class CLBuffer(RawBufferCopyInOut, RawBufferTransfer):
      cl.enqueue_copy_buffer_p2p_amd(CL.cl_platform, CL.cl_queue[x._buf.device], x._buf, self._buf, x.size * x.dtype.itemsize).wait()
    else: raise NotImplementedError("p2p transfer between devices not implemented on non-amd")

+@diskcache
+def compile_gpu(prg:str) -> bytes:
+  clprg = cl.Program(CL.cl_ctxs[0], prg)
+  clprg.build()
+  return clprg.get_info(cl.program_info.BINARIES)[0]
+
 class CLProgram:
-  def __init__(self, name:str, prg:str, binary=False, argdtypes=None, options=None):
-    self.name, self.clprograms = name, [cl.Program(ctx, ctx.devices, [prg]*len(ctx.devices)) if binary else cl.Program(ctx, prg) for ctx in CL.cl_ctxs]  # type: ignore
+  def __init__(self, name:str, prg:bytes, argdtypes=None, options=None):
+    self.name, self.clprograms = name, [cl.Program(ctx, ctx.devices, [prg]*len(ctx.devices)) for ctx in CL.cl_ctxs]  # type: ignore
    self._clprgs = [clprogram.build(options=options) for clprogram in self.clprograms]
    self.clprgs = [clprg.__getattr__(name) for clprg in self._clprgs]
    if DEBUG >= 5 and not OSX:
      if 'Adreno' in CL.cl_ctxs[0].devices[0].name:
-        fromimport('disassemblers.adreno', 'disasm')(self.binary())
+        fromimport('disassemblers.adreno', 'disasm')(prg)
      elif CL.cl_ctxs[0].devices[0].name.startswith('gfx'):
-        asm = early_exec(([ROCM_LLVM_PATH / "llvm-objdump", '-d', '-'], self.binary()))
+        asm = early_exec(([ROCM_LLVM_PATH / "llvm-objdump", '-d', '-'], prg))
        print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
      else:
        # print the PTX for NVIDIA. TODO: probably broken for everything else
-        print(self.binary().decode('utf-8'))
+        print(prg.decode('utf-8'))
    if argdtypes is not None: self.set_argdtypes(argdtypes)

-  def binary(self): return self.clprograms[0].get_info(cl.program_info.BINARIES)[0]
  def set_argdtypes(self, argdtypes): self.argdtypes, _ = argdtypes, [clprg.set_scalar_arg_dtypes(argdtypes) for clprg in self.clprgs]

  @staticmethod
@@ -100,4 +107,4 @@ class CLProgram:
        return None
    return None

-GPUBuffer = Compiled(CLBuffer, LinearizerOptions(), OpenCLRenderer, CLProgram, CL.synchronize)
+GPUBuffer = Compiled(CLBuffer, LinearizerOptions(), OpenCLRenderer, compile_gpu, CLProgram, CL.synchronize)