cuda: fix fp16, uint8, int64, half4 codegen (#968)

* cuda: add uchar, int64 typedefs

* cuda: fix float16 codegen

* fuck it, half4 stub. llama time!

* inline fp16 half4, revert changes to CStyleLanguage

* add inline just in case

* remove half4 operators

* use dict
This commit is contained in:
cloud11665
2023-06-12 20:15:44 +02:00
committed by GitHub
parent e54b6c5e7f
commit 5f13e7c3cf
2 changed files with 12 additions and 4 deletions

View File

@@ -141,8 +141,8 @@ def uops_to_cstyle(uops:List[UOp], bufs:List[Union[LocalBuffer,LazyBuffer]], lan
# NOTE: if min and max are both 0, it should be a CONST in the Linearizer
if args.valid.min == 1: kk(f"{newvar.render(True)} = {val};")
else:
zero = f"{lang.float4}(0.0f, 0.0f, 0.0f, 0.0f);" if newvar.ltype == LocalTypes.float4 else "0.0f"
kk(f"{newvar.render(True)} = ({args.valid.render(render_cl)}) ? ({val}) : {zero};")
casts = {LocalTypes.float4: ("", f"{lang.float4}(0.0f, 0.0f, 0.0f, 0.0f)"), LocalTypes.half: ("(half)", "(half)(0.0f)"), LocalTypes.float: ("(float)", "0.0f")}[newvar.ltype]
kk(f"{newvar.render(True)} = ({args.valid.render(render_cl)}) ? {casts[0]}({val}) : {casts[1]};")
elif uop == UOps.STORE and (vin[0].ltype == LocalTypes.float or (vin[0].ltype == LocalTypes.float4 and vin[0].offset is not None)):
assert not isinstance(bufs[args.i].dtype, ImageDType), "image store must be float4"
assert args.valid.min == 1, "store must be valid"