linearizer! (#714)

* linearizer outputs something
* working ish
* cstyle codegen
* clang mostly works
* fix load valid
* fix numberless loop
* fancy gen
* working
* fix enet compiler
* cleanups
* float4 upcasting
* less lines
* supports_float4
* constant folding
* mulacc
* internet tests flaky in CI
* 90% image support
* fix image generic
* bugs exposed with shapetracker and single view
* new llvm
* use vload, remove OLD
* that's really poorly done
* ending up being more lines
2026-02-10 14:45:35 -05:00 · 2023-03-19 23:43:49 -07:00
parent b629fd4cd8
commit 5495c7d64e
20 changed files with 792 additions and 776 deletions
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -6,7 +6,7 @@ from typing import Optional, List
 from tinygrad.helpers import DEBUG, getenv, prod, ImageDType
 from tinygrad.ops import Compiled
 from tinygrad.runtime.lib import RawBufferCopyInOut
-from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
+from tinygrad.codegen.cstyle import CStyleCodegen, CStyleLanguage

 OSX = platform.system() == "Darwin"
 OSX_TIMING_RATIO = (125/3) if OSX else 1.0   # see test/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
@@ -68,11 +68,11 @@ class CLProgram:
      return ((e.profile.end - e.profile.start) * OSX_TIMING_RATIO) * 1e-9
    return None

-class CLCodegen(GPUCodegen):
-  lang = GPULanguage(
+class CLCodegen(CStyleCodegen):
+  lang = CStyleLanguage(
    kernel_prefix = "__kernel", buffer_prefix = "__global ", smem_prefix = "__local ",
    half_prekernel = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable",
    barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)",
-    gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)])
+    gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)], uses_vload=True)

 GPUBuffer = Compiled(CLBuffer, CLCodegen, CLProgram)