diff --git a/tinygrad/codegen/cstyle.py b/tinygrad/codegen/cstyle.py index 590478b586..36aca36125 100644 --- a/tinygrad/codegen/cstyle.py +++ b/tinygrad/codegen/cstyle.py @@ -141,8 +141,8 @@ def uops_to_cstyle(uops:List[UOp], bufs:List[Union[LocalBuffer,LazyBuffer]], lan # NOTE: if min and max are both 0, it should be a CONST in the Linearizer if args.valid.min == 1: kk(f"{newvar.render(True)} = {val};") else: - zero = f"{lang.float4}(0.0f, 0.0f, 0.0f, 0.0f);" if newvar.ltype == LocalTypes.float4 else "0.0f" - kk(f"{newvar.render(True)} = ({args.valid.render(render_cl)}) ? ({val}) : {zero};") + casts = {LocalTypes.float4: ("", f"{lang.float4}(0.0f, 0.0f, 0.0f, 0.0f)"), LocalTypes.half: ("(half)", "(half)(0.0f)"), LocalTypes.float: ("(float)", "0.0f")}[newvar.ltype] + kk(f"{newvar.render(True)} = ({args.valid.render(render_cl)}) ? {casts[0]}({val}) : {casts[1]};") elif uop == UOps.STORE and (vin[0].ltype == LocalTypes.float or (vin[0].ltype == LocalTypes.float4 and vin[0].offset is not None)): assert not isinstance(bufs[args.i].dtype, ImageDType), "image store must be float4" assert args.valid.min == 1, "store must be valid" diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py index 4784beaae7..3c1ce004ae 100644 --- a/tinygrad/runtime/ops_cuda.py +++ b/tinygrad/runtime/ops_cuda.py @@ -47,8 +47,16 @@ class CUDAProgram: class CUDACodegen(CStyleCodegen): lang = CStyleLanguage( kernel_prefix = "__global__", smem_prefix = "__shared__ ", barrier = "__syncthreads();", float4 = "make_float4", - half_prekernel = "#include ", gid = [f'blockDim.{chr(120+i)}*blockIdx.{chr(120+i)}+threadIdx.{chr(120+i)}' for i in range(3)], - lid = [f'threadIdx.{chr(120+i)}' for i in range(3)]) + lid = [f'threadIdx.{chr(120+i)}' for i in range(3)], + half_prekernel = """ + #include + struct __align__(8) half4 { + half2 x, y; + __device__ __forceinline__ explicit operator float4() const {return make_float4(__half2float(x.x), __half2float(x.y), __half2float(y.x), __half2float(y.y)); } + }; + typedef unsigned char uchar; + typedef long long int64; + """) supports_float4_alu = False CUDABuffer = Compiled(RawCUDABuffer, CUDACodegen, CUDAProgram, cuda.Context.synchronize)