From 5f13e7c3cf10144929649637df8e3312fad6f3e7 Mon Sep 17 00:00:00 2001
From: cloud11665 <cloud11665@users.noreply.github.com>
Date: Mon, 12 Jun 2023 20:15:44 +0200
Subject: [PATCH] cuda: fix fp16, uint8, int64, half4 codegen (#968)

* cuda: add uchar, int64 typedefs

* cuda: fix float16 codegen

* fuck it, half4 stub. llama time!

* inline fp16 half4, revert changes to CStyleLanguage

* add inline just in case

* remove half4 operators

* use dict
---
 tinygrad/codegen/cstyle.py   |  4 ++--
 tinygrad/runtime/ops_cuda.py | 12 ++++++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/tinygrad/codegen/cstyle.py b/tinygrad/codegen/cstyle.py
index 590478b586..36aca36125 100644
--- a/tinygrad/codegen/cstyle.py
+++ b/tinygrad/codegen/cstyle.py
@@ -141,8 +141,8 @@ def uops_to_cstyle(uops:List[UOp], bufs:List[Union[LocalBuffer,LazyBuffer]], lan
       # NOTE: if min and max are both 0, it should be a CONST in the Linearizer
       if args.valid.min == 1: kk(f"{newvar.render(True)} = {val};")
       else:
-        zero = f"{lang.float4}(0.0f, 0.0f, 0.0f, 0.0f);" if newvar.ltype == LocalTypes.float4 else "0.0f"
-        kk(f"{newvar.render(True)} = ({args.valid.render(render_cl)}) ? ({val}) : {zero};")
+        casts = {LocalTypes.float4: ("", f"{lang.float4}(0.0f, 0.0f, 0.0f, 0.0f)"), LocalTypes.half: ("(half)", "(half)(0.0f)"), LocalTypes.float: ("(float)", "0.0f")}[newvar.ltype]
+        kk(f"{newvar.render(True)} = ({args.valid.render(render_cl)}) ? {casts[0]}({val}) : {casts[1]};")
     elif uop == UOps.STORE and (vin[0].ltype == LocalTypes.float or (vin[0].ltype == LocalTypes.float4 and vin[0].offset is not None)):
       assert not isinstance(bufs[args.i].dtype, ImageDType), "image store must be float4"
       assert args.valid.min == 1, "store must be valid"
diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py
index 4784beaae7..3c1ce004ae 100644
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@@ -47,8 +47,16 @@ class CUDAProgram:
 class CUDACodegen(CStyleCodegen):
   lang = CStyleLanguage(
     kernel_prefix = "__global__", smem_prefix = "__shared__ ", barrier = "__syncthreads();", float4 = "make_float4",
-    half_prekernel = "#include <cuda_fp16.h>",
     gid = [f'blockDim.{chr(120+i)}*blockIdx.{chr(120+i)}+threadIdx.{chr(120+i)}' for i in range(3)],
-    lid = [f'threadIdx.{chr(120+i)}' for i in range(3)])
+    lid = [f'threadIdx.{chr(120+i)}' for i in range(3)],
+    half_prekernel = """
+      #include <cuda_fp16.h>
+      struct __align__(8) half4 {
+        half2 x, y;
+        __device__ __forceinline__ explicit operator float4() const {return make_float4(__half2float(x.x), __half2float(x.y), __half2float(y.x), __half2float(y.y)); }
+      };
+      typedef unsigned char uchar;
+      typedef long long int64;
+    """)
   supports_float4_alu = False
 CUDABuffer = Compiled(RawCUDABuffer, CUDACodegen, CUDAProgram, cuda.Context.synchronize)