From cb7289f9c99be4871517b23e23eb85e2608feb33 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Sat, 4 May 2024 08:38:01 -0700
Subject: [PATCH] remove clang program header (#4422)

* remove clang program header

* proper max

* bools are numbers

* fix compile enet
---
 examples/efficientnet.py      |  7 ++++---
 extra/export_model.py         |  3 +--
 test/unit/test_disk_tensor.py |  4 ++--
 tinygrad/renderer/cstyle.py   |  9 ++++++++-
 tinygrad/runtime/ops_clang.py | 10 ++++------
 5 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/examples/efficientnet.py b/examples/efficientnet.py
index 863af3d124..e8e8bd916b 100644
--- a/examples/efficientnet.py
+++ b/examples/efficientnet.py
@@ -83,6 +83,7 @@ if __name__ == "__main__":
     cv2.destroyAllWindows()
   else:
     img = Image.open(fetch(url))
-    with Timing("did inference in "):
-      out, _ = infer(model, img)
-      print(np.argmax(out), np.max(out), lbls[np.argmax(out)])
+    for i in range(getenv("CNT", 1)):
+      with Timing("did inference in "):
+        out, _ = infer(model, img)
+        print(np.argmax(out), np.max(out), lbls[np.argmax(out)])
diff --git a/extra/export_model.py b/extra/export_model.py
index 20b1b78096..ba97111f3d 100644
--- a/extra/export_model.py
+++ b/extra/export_model.py
@@ -64,8 +64,7 @@ def jit_model(model, *args) -> Tuple[TinyJit,Dict[int,str]]:
   return run, special_names
 
 def export_model_clang(functions:Dict[str,str], statements:Dict[str,Tuple[str,int,int]], bufs:Dict[str,Tuple[str,int,int]], bufs_to_save:Dict[str,Tensor], input_names:List[str], output_names:List[str]) -> str:
-  from tinygrad.runtime.ops_clang import CLANG_PROGRAM_HEADER
-  cprog = [CLANG_PROGRAM_HEADER]
+  cprog = ["#include <tgmath.h>"]
 
   for name,cl in bufs_to_save.items():
     weight = ''.join(["\\x%02X"%x for x in bytes(cl._buf)])
diff --git a/test/unit/test_disk_tensor.py b/test/unit/test_disk_tensor.py
index 6095f716e8..4017aa6eaf 100644
--- a/test/unit/test_disk_tensor.py
+++ b/test/unit/test_disk_tensor.py
@@ -279,8 +279,8 @@ class TestDiskTensor(unittest.TestCase):
     np.testing.assert_array_equal(t.numpy(), np.array([3] * 10))
 
   def test_bitcast(self):
-    with open(temp('bf16'), "wb") as f: f.write(bytes(range(10,20)))
-    t = Tensor.empty(5, dtype=dtypes.int16, device=f"disk:{temp('bf16')}")
+    with open(temp('range_1020'), "wb") as f: f.write(bytes(range(10,20)))
+    t = Tensor.empty(5, dtype=dtypes.int16, device=f"disk:{temp('range_1020')}")
     ret = t.to("CLANG").bitcast(dtypes.uint16) + 1
     assert ret.tolist() == [2827, 3341, 3855, 4369, 4883]
 
diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py
index 3b81da302a..71f13fc9a8 100644
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -45,7 +45,8 @@ class CStyleLanguage(NamedTuple):
     if math.isnan(x): val = "NAN"
     elif math.isinf(x): val = ("-" if x < 0 else "") + "INFINITY"
     elif dtype == dtypes.float64: val = f"{x}"
-    else: val = f"{x}f" if dtypes.is_float(dtype) else f"{x}" if dtypes.is_int(dtype) else f"{x}".lower()
+    elif dtype == dtypes.bool: val = "1" if x else "0"
+    else: val = f"{x}f" if dtypes.is_float(dtype) else f"{x}"
     return (self.render_cast([val] * dtype.count, dtype) if dtype.count > 1 or dtype not in [dtypes.float, dtypes.int, dtypes.bool] else val)
 
   # returns a str expression of the loaded value with the output type
@@ -174,6 +175,12 @@ def uops_to_cstyle(lang:CStyleLanguage, function_name:str, uops:UOpGraph) -> str
 
   return lang.render_kernel(function_name, kernel, bufs, uops)
 
+class ClangLanguage(CStyleLanguage):
+  buffer_suffix = " restrict"
+  type_map = {dtypes.bool:"_Bool", dtypes.half:"__fp16"}
+  code_for_op = {**CStyleLanguage().code_for_op, BinaryOps.MAX: lambda a,b,dtype: f"(({a}>{b})?{a}:{b})"}
+ClangRenderer = functools.partial(uops_to_cstyle, ClangLanguage())
+
 class OpenCLLanguage(CStyleLanguage):
   kernel_prefix = "__kernel "
   buffer_prefix = "__global "
diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py
index 30a3446684..b13e026122 100644
--- a/tinygrad/runtime/ops_clang.py
+++ b/tinygrad/runtime/ops_clang.py
@@ -1,18 +1,16 @@
 import ctypes, subprocess, pathlib, tempfile
 from tinygrad.device import Compiled, MallocAllocator, Compiler, CompilerOptions
 from tinygrad.helpers import cpu_time_execution
-from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
-
-CLANG_PROGRAM_HEADER = '#include <stdbool.h>\n#include <tgmath.h>\n#define max(x,y) ((x>y)?x:y)\n#define half __fp16\n'
+from tinygrad.renderer.cstyle import ClangRenderer
 
 class ClangCompiler(Compiler):
   compiler_opts = CompilerOptions("CLANG", supports_float4=False, has_local=False)
-  def render(self, name:str, uops) -> str: return CLANG_PROGRAM_HEADER + uops_to_cstyle(CStyleLanguage(buffer_suffix=" restrict"), name, uops)
+  def render(self, name:str, uops) -> str: return ClangRenderer(name, uops)
   def compile(self, src:str) -> bytes:
     # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
     with tempfile.NamedTemporaryFile(delete=True) as output_file:
-      subprocess.check_output(args=('clang -shared -march=native -O2 -Wall -Werror -x c -fPIC - -o '+ str(output_file.name)).split(),
-                              input=src.encode('utf-8'))
+      subprocess.check_output(['clang', '-include', 'tgmath.h', '-shared', '-march=native', '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-',
+                               '-o', str(output_file.name)], input=src.encode('utf-8'))
       return pathlib.Path(output_file.name).read_bytes()
 
 class ClangProgram: