diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py index 76afbc8543..286a32e601 100644 --- a/tinygrad/renderer/cstyle.py +++ b/tinygrad/renderer/cstyle.py @@ -264,17 +264,13 @@ class CUDALanguage(CStyleLanguage): """ CUDARenderer = functools.partial(uops_to_cstyle, CUDALanguage()) -class HIPLanguage(CStyleLanguage): +class HIPLanguage(CUDALanguage): kernel_prefix = "#include \n#define INFINITY (__builtin_inff())\n#define NAN (__builtin_nanf(\"\"))" + """ typedef float float8 __attribute__((ext_vector_type(8))); __device__ float8 make_float8(float x, float y, float z, float w, float a, float b, float c, float d) { return {x, y, z, w, a, b, c, d}; } extern "C" __global__ """ launch_bounds = True - smem_prefix = "__shared__ " - smem_prefix_for_cast=False - barrier = "__syncthreads();" - float4 = "make_float4" uses_ptr_arithmetic=True half_prekernel = "#include \n" + """ typedef union { struct { half x, y, z, w; } __attribute__((aligned(8))); half data[4]; } half4; @@ -286,10 +282,6 @@ __device__ half16 make_half16(half x, half y, half z, half w, half a, half b, ha half e, half f, half g, half h, half i, half j, half k, half l) { return {x, y, z, w, a, b, c, d, e, f, g, h, i, j, k, l}; } """ - gid = [f'blockIdx.{chr(120+i)}' for i in range(3)] - lid = [f'threadIdx.{chr(120+i)}' for i in range(3)] - xid = [f'(blockIdx.{chr(120+i)}*blockDim.{chr(120+i)}+threadIdx.{chr(120+i)})' for i in range(3)] - code_for_op = {**CStyleLanguage().code_for_op, **code_for_op_half} HIPRenderer = functools.partial(uops_to_cstyle, HIPLanguage()) # TODO: how much of this can be merged with above?