* disabled float4 ALU ops for CUDA; small fix to add half_prekernel before kernel_prefix

* added supports_float4_alu option and disabled it for ops_cuda
This commit is contained in:
crthilakraj
2023-05-29 16:59:36 +02:00
committed by GitHub
parent 6ea5df19b2
commit 7925fa58d9
4 changed files with 10 additions and 5 deletions

View File

@@ -91,5 +91,6 @@ class CLCodegen(CStyleCodegen):
half_prekernel = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable",
barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)",
gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)], uses_vload=True)
supports_float4_alu = True
supports_float4 = True
GPUBuffer = Compiled(CLBuffer, CLCodegen, CLProgram, CL.synchronize)