diff --git a/extra/hiprtc/hiprtc_runtime.h b/extra/hiprtc/hiprtc_runtime.h new file mode 100644 index 0000000000..b390504ec7 --- /dev/null +++ b/extra/hiprtc/hiprtc_runtime.h @@ -0,0 +1,15143 @@ +#pragma clang diagnostic ignored "-Weverything" + +# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/src/hiprtc/hip_rtc_gen/hipRTC_header.h" +# 1 "" 1 +# 1 "" 3 +# 845 "" 3 +# 1 "" 1 +# 1 "" 2 +# 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 1 3 +# 33 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3 +extern "C" { + __attribute__((__visibility__("default"))) + __attribute__((weak)) + __attribute__((noreturn)) + __attribute__((device)) void __cxa_pure_virtual(void) { + __builtin_trap(); + } + __attribute__((__visibility__("default"))) + __attribute__((weak)) + __attribute__((noreturn)) + __attribute__((device)) void __cxa_deleted_virtual(void) { + __builtin_trap(); + } +} +# 57 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3 +typedef long unsigned int size_t; +# 74 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3 +typedef long unsigned int __hip_size_t; + + +extern "C" { + + + +extern "C" __attribute__((device)) unsigned long long __ockl_dm_alloc(unsigned long long __size); +extern "C" __attribute__((device)) void __ockl_dm_dealloc(unsigned long long __addr); +# 95 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3 +__attribute__((weak)) inline __attribute__((device)) void *malloc(__hip_size_t __size) { + return (void *) __ockl_dm_alloc(__size); +} +__attribute__((weak)) inline __attribute__((device)) void free(void *__ptr) { + __ockl_dm_dealloc((unsigned long long)__ptr); +} +# 124 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3 +} + + +# 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_libdevice_declares.h" 1 3 +# 14 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_libdevice_declares.h" 3 +extern "C" { + + + +__attribute__((device)) __attribute__((const)) float __ocml_acos_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_acosh_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_asin_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_asinh_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_atan2_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_atan_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_atanh_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_cbrt_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_ceil_f32(float); +__attribute__((device)) __attribute__((const)) __attribute__((device)) float __ocml_copysign_f32(float, + float); +__attribute__((device)) float __ocml_cos_f32(float); +__attribute__((device)) float __ocml_native_cos_f32(float); +__attribute__((device)) __attribute__((pure)) __attribute__((device)) float __ocml_cosh_f32(float); +__attribute__((device)) float __ocml_cospi_f32(float); +__attribute__((device)) float __ocml_i0_f32(float); +__attribute__((device)) float __ocml_i1_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_erfc_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_erfcinv_f32(float); +__attribute__((device)) __attribute__((pure)) float 
__ocml_erfcx_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_erf_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_erfinv_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_exp10_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_native_exp10_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_exp2_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_exp_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_native_exp_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_expm1_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_fabs_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_fdim_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_floor_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_fma_f32(float, float, float); +__attribute__((device)) __attribute__((const)) float __ocml_fmax_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_fmin_f32(float, float); +__attribute__((device)) __attribute__((const)) __attribute__((device)) float __ocml_fmod_f32(float, + float); +__attribute__((device)) float __ocml_frexp_f32(float, + __attribute__((address_space(5))) int *); +__attribute__((device)) __attribute__((const)) float __ocml_hypot_f32(float, float); +__attribute__((device)) __attribute__((const)) int __ocml_ilogb_f32(float); +__attribute__((device)) __attribute__((const)) int __ocml_isfinite_f32(float); +__attribute__((device)) __attribute__((const)) int __ocml_isinf_f32(float); +__attribute__((device)) __attribute__((const)) int __ocml_isnan_f32(float); +__attribute__((device)) float __ocml_j0_f32(float); +__attribute__((device)) float __ocml_j1_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_ldexp_f32(float, int); +__attribute__((device)) float __ocml_lgamma_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_log10_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_native_log10_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_log1p_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_log2_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_native_log2_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_logb_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_log_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_native_log_f32(float); +__attribute__((device)) float __ocml_modf_f32(float, + __attribute__((address_space(5))) float *); +__attribute__((device)) __attribute__((const)) float __ocml_nearbyint_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_nextafter_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_len3_f32(float, float, float); +__attribute__((device)) __attribute__((const)) float __ocml_len4_f32(float, float, float, + float); +__attribute__((device)) __attribute__((pure)) float __ocml_ncdf_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_ncdfinv_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_pow_f32(float, float); +__attribute__((device)) __attribute__((pure)) float __ocml_pown_f32(float, int); +__attribute__((device)) __attribute__((pure)) float __ocml_rcbrt_f32(float); +__attribute__((device)) 
__attribute__((const)) float __ocml_remainder_f32(float, float); +__attribute__((device)) float __ocml_remquo_f32(float, float, + __attribute__((address_space(5))) int *); +__attribute__((device)) __attribute__((const)) float __ocml_rhypot_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_rint_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_rlen3_f32(float, float, float); +__attribute__((device)) __attribute__((const)) float __ocml_rlen4_f32(float, float, float, + float); +__attribute__((device)) __attribute__((const)) float __ocml_round_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_rsqrt_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_scalb_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_scalbn_f32(float, int); +__attribute__((device)) __attribute__((const)) int __ocml_signbit_f32(float); +__attribute__((device)) float __ocml_sincos_f32(float, + __attribute__((address_space(5))) float *); +__attribute__((device)) float __ocml_sincospi_f32(float, + __attribute__((address_space(5))) float *); +__attribute__((device)) float __ocml_sin_f32(float); +__attribute__((device)) float __ocml_native_sin_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_sinh_f32(float); +__attribute__((device)) float __ocml_sinpi_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_sqrt_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_native_sqrt_f32(float); +__attribute__((device)) float __ocml_tan_f32(float); +__attribute__((device)) __attribute__((pure)) float __ocml_tanh_f32(float); +__attribute__((device)) float __ocml_tgamma_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_trunc_f32(float); +__attribute__((device)) float __ocml_y0_f32(float); +__attribute__((device)) float __ocml_y1_f32(float); + + +__attribute__((device)) __attribute__((const)) float __ocml_add_rte_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_add_rtn_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_add_rtp_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_add_rtz_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_sub_rte_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_sub_rtn_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_sub_rtp_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_sub_rtz_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_mul_rte_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_mul_rtn_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_mul_rtp_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_mul_rtz_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_div_rte_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_div_rtn_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_div_rtp_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_div_rtz_f32(float, float); +__attribute__((device)) __attribute__((const)) float __ocml_sqrt_rte_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_sqrt_rtn_f32(float); +__attribute__((device)) __attribute__((const)) 
float __ocml_sqrt_rtp_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_sqrt_rtz_f32(float); +__attribute__((device)) __attribute__((const)) float __ocml_fma_rte_f32(float, float, float); +__attribute__((device)) __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float); +__attribute__((device)) __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float); +__attribute__((device)) __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float); + +__attribute__((device)) inline __attribute__((const)) float +__llvm_amdgcn_cos_f32(float __x) { + return __builtin_amdgcn_cosf(__x); +} +__attribute__((device)) inline __attribute__((const)) float +__llvm_amdgcn_rcp_f32(float __x) { + return __builtin_amdgcn_rcpf(__x); +} +__attribute__((device)) inline __attribute__((const)) float +__llvm_amdgcn_rsq_f32(float __x) { + return __builtin_amdgcn_rsqf(__x); +} +__attribute__((device)) inline __attribute__((const)) float +__llvm_amdgcn_sin_f32(float __x) { + return __builtin_amdgcn_sinf(__x); +} + + + + +__attribute__((device)) __attribute__((const)) double __ocml_acos_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_acosh_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_asin_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_asinh_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_atan2_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_atan_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_atanh_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_cbrt_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_ceil_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_copysign_f64(double, double); +__attribute__((device)) double __ocml_cos_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_cosh_f64(double); +__attribute__((device)) double __ocml_cospi_f64(double); +__attribute__((device)) double __ocml_i0_f64(double); +__attribute__((device)) double __ocml_i1_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_erfc_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_erfcinv_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_erfcx_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_erf_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_erfinv_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_exp10_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_exp2_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_exp_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_expm1_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_fabs_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_fdim_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_floor_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_fma_f64(double, double, double); +__attribute__((device)) __attribute__((const)) double __ocml_fmax_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_fmin_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_fmod_f64(double, double); +__attribute__((device)) double 
__ocml_frexp_f64(double, + __attribute__((address_space(5))) int *); +__attribute__((device)) __attribute__((const)) double __ocml_hypot_f64(double, double); +__attribute__((device)) __attribute__((const)) int __ocml_ilogb_f64(double); +__attribute__((device)) __attribute__((const)) int __ocml_isfinite_f64(double); +__attribute__((device)) __attribute__((const)) int __ocml_isinf_f64(double); +__attribute__((device)) __attribute__((const)) int __ocml_isnan_f64(double); +__attribute__((device)) double __ocml_j0_f64(double); +__attribute__((device)) double __ocml_j1_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_ldexp_f64(double, int); +__attribute__((device)) double __ocml_lgamma_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_log10_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_log1p_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_log2_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_logb_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_log_f64(double); +__attribute__((device)) double __ocml_modf_f64(double, + __attribute__((address_space(5))) double *); +__attribute__((device)) __attribute__((const)) double __ocml_nearbyint_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_nextafter_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_len3_f64(double, double, + double); +__attribute__((device)) __attribute__((const)) double __ocml_len4_f64(double, double, double, + double); +__attribute__((device)) __attribute__((pure)) double __ocml_ncdf_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_ncdfinv_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_pow_f64(double, double); +__attribute__((device)) __attribute__((pure)) double __ocml_pown_f64(double, int); +__attribute__((device)) __attribute__((pure)) double __ocml_rcbrt_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_remainder_f64(double, double); +__attribute__((device)) double __ocml_remquo_f64(double, double, + __attribute__((address_space(5))) int *); +__attribute__((device)) __attribute__((const)) double __ocml_rhypot_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_rint_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_rlen3_f64(double, double, + double); +__attribute__((device)) __attribute__((const)) double __ocml_rlen4_f64(double, double, + double, double); +__attribute__((device)) __attribute__((const)) double __ocml_round_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_rsqrt_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_scalb_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_scalbn_f64(double, int); +__attribute__((device)) __attribute__((const)) int __ocml_signbit_f64(double); +__attribute__((device)) double __ocml_sincos_f64(double, + __attribute__((address_space(5))) double *); +__attribute__((device)) double +__ocml_sincospi_f64(double, __attribute__((address_space(5))) double *); +__attribute__((device)) double __ocml_sin_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_sinh_f64(double); +__attribute__((device)) double __ocml_sinpi_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_sqrt_f64(double); +__attribute__((device)) double 
__ocml_tan_f64(double); +__attribute__((device)) __attribute__((pure)) double __ocml_tanh_f64(double); +__attribute__((device)) double __ocml_tgamma_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_trunc_f64(double); +__attribute__((device)) double __ocml_y0_f64(double); +__attribute__((device)) double __ocml_y1_f64(double); + + +__attribute__((device)) __attribute__((const)) double __ocml_add_rte_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_add_rtn_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_add_rtp_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_add_rtz_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_sub_rte_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_sub_rtn_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_sub_rtp_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_sub_rtz_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_mul_rte_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_mul_rtn_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_mul_rtp_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_mul_rtz_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_div_rte_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_div_rtn_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_div_rtp_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_div_rtz_f64(double, double); +__attribute__((device)) __attribute__((const)) double __ocml_sqrt_rte_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_sqrt_rtn_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_sqrt_rtp_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_sqrt_rtz_f64(double); +__attribute__((device)) __attribute__((const)) double __ocml_fma_rte_f64(double, double, + double); +__attribute__((device)) __attribute__((const)) double __ocml_fma_rtn_f64(double, double, + double); +__attribute__((device)) __attribute__((const)) double __ocml_fma_rtp_f64(double, double, + double); +__attribute__((device)) __attribute__((const)) double __ocml_fma_rtz_f64(double, double, + double); + +__attribute__((device)) inline __attribute__((const)) double +__llvm_amdgcn_rcp_f64(double __x) { + return __builtin_amdgcn_rcp(__x); +} +__attribute__((device)) inline __attribute__((const)) double +__llvm_amdgcn_rsq_f64(double __x) { + return __builtin_amdgcn_rsq(__x); +} + +__attribute__((device)) __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16); +__attribute__((device)) _Float16 __ocml_cos_f16(_Float16); +__attribute__((device)) __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float); +__attribute__((device)) __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float); +__attribute__((device)) __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float); +__attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16); +__attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16); +__attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16); +__attribute__((device)) __attribute__((const)) _Float16 
__ocml_floor_f16(_Float16); +__attribute__((device)) __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16, + _Float16); +__attribute__((device)) __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16); +__attribute__((device)) __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16); +__attribute__((device)) __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16); +__attribute__((device)) __attribute__((const)) int __ocml_isinf_f16(_Float16); +__attribute__((device)) __attribute__((const)) int __ocml_isnan_f16(_Float16); +__attribute__((device)) __attribute__((pure)) _Float16 __ocml_log_f16(_Float16); +__attribute__((device)) __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16); +__attribute__((device)) __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16); +__attribute__((device)) __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16); +__attribute__((device)) __attribute__((const)) _Float16 __ocml_rint_f16(_Float16); +__attribute__((device)) __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16); +__attribute__((device)) _Float16 __ocml_sin_f16(_Float16); +__attribute__((device)) __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16); +__attribute__((device)) __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16); +__attribute__((device)) __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int); + +typedef _Float16 __2f16 __attribute__((ext_vector_type(2))); +typedef short __2i16 __attribute__((ext_vector_type(2))); + + +__attribute__((device)) __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b, + float c, bool s); + + + + +__attribute__((device)) __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16); +__attribute__((device)) __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16); +__attribute__((device)) __2f16 __ocml_cos_2f16(__2f16); +__attribute__((device)) __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16); +__attribute__((device)) __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16); +__attribute__((device)) __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16); +__attribute__((device)) __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16); +__attribute__((device)) __attribute__((const)) +__2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16); +__attribute__((device)) __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16); +__attribute__((device)) __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16); +__attribute__((device)) __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16); +__attribute__((device)) __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16); +__attribute__((device)) __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16); +__attribute__((device)) inline __2f16 +__llvm_amdgcn_rcp_2f16(__2f16 __x) +{ + return (__2f16)(__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y)); +} +__attribute__((device)) __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16); +__attribute__((device)) __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16); +__attribute__((device)) __2f16 __ocml_sin_2f16(__2f16); +__attribute__((device)) __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16); +__attribute__((device)) __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16); +__attribute__((device)) __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16); + + +} +# 128 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 2 3 +# 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 1 3 +# 94 
"/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3 +static __attribute__((device)) inline __attribute__((always_inline)) +long unsigned int __make_mantissa_base8(const char *__tagp __attribute__((nonnull))) { + long unsigned int __r = 0; + while (*__tagp != '\0') { + char __tmp = *__tagp; + + if (__tmp >= '0' && __tmp <= '7') + __r = (__r * 8u) + __tmp - '0'; + else + return 0; + + ++__tagp; + } + + return __r; +} + +static __attribute__((device)) inline __attribute__((always_inline)) +long unsigned int __make_mantissa_base10(const char *__tagp __attribute__((nonnull))) { + long unsigned int __r = 0; + while (*__tagp != '\0') { + char __tmp = *__tagp; + + if (__tmp >= '0' && __tmp <= '9') + __r = (__r * 10u) + __tmp - '0'; + else + return 0; + + ++__tagp; + } + + return __r; +} + +static __attribute__((device)) inline __attribute__((always_inline)) +long unsigned int __make_mantissa_base16(const char *__tagp __attribute__((nonnull))) { + long unsigned int __r = 0; + while (*__tagp != '\0') { + char __tmp = *__tagp; + + if (__tmp >= '0' && __tmp <= '9') + __r = (__r * 16u) + __tmp - '0'; + else if (__tmp >= 'a' && __tmp <= 'f') + __r = (__r * 16u) + __tmp - 'a' + 10; + else if (__tmp >= 'A' && __tmp <= 'F') + __r = (__r * 16u) + __tmp - 'A' + 10; + else + return 0; + + ++__tagp; + } + + return __r; +} + +static __attribute__((device)) inline __attribute__((always_inline)) +long unsigned int __make_mantissa(const char *__tagp __attribute__((nonnull))) { + if (*__tagp == '0') { + ++__tagp; + + if (*__tagp == 'x' || *__tagp == 'X') + return __make_mantissa_base16(__tagp); + else + return __make_mantissa_base8(__tagp); + } + + return __make_mantissa_base10(__tagp); +} + + + +static __attribute__((device)) inline __attribute__((always_inline)) +int abs(int __x) { + int __sgn = __x >> (sizeof(int) * 8 - 1); + return (__x ^ __sgn) - __sgn; +} +static __attribute__((device)) inline __attribute__((always_inline)) +long labs(long __x) { + long __sgn = __x >> (sizeof(long) * 8 - 1); + return (__x ^ __sgn) - __sgn; +} +static __attribute__((device)) inline __attribute__((always_inline)) +long long llabs(long long __x) { + long long __sgn = __x >> (sizeof(long long) * 8 - 1); + return (__x ^ __sgn) - __sgn; +} + + +static __attribute__((device)) inline __attribute__((always_inline)) +float acosf(float __x) { return __ocml_acos_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float acoshf(float __x) { return __ocml_acosh_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float asinf(float __x) { return __ocml_asin_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float asinhf(float __x) { return __ocml_asinh_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float atanf(float __x) { return __ocml_atan_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float atanhf(float __x) { return __ocml_atanh_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float cbrtf(float __x) { return __ocml_cbrt_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float ceilf(float __x) { return __ocml_ceil_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float copysignf(float __x, float 
__y) { return __ocml_copysign_f32(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float cosf(float __x) { return __ocml_cos_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float coshf(float __x) { return __ocml_cosh_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float cospif(float __x) { return __ocml_cospi_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float erfcf(float __x) { return __ocml_erfc_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float erfcxf(float __x) { return __ocml_erfcx_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float erff(float __x) { return __ocml_erf_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float erfinvf(float __x) { return __ocml_erfinv_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float exp10f(float __x) { return __ocml_exp10_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float exp2f(float __x) { return __ocml_exp2_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float expf(float __x) { return __ocml_exp_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float expm1f(float __x) { return __ocml_expm1_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float fabsf(float __x) { return __builtin_fabsf(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float fdividef(float __x, float __y) { return __x / __y; } + +static __attribute__((device)) inline __attribute__((always_inline)) +float floorf(float __x) { return __ocml_floor_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float fmaf(float __x, float __y, float __z) { + return __ocml_fma_f32(__x, __y, __z); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float fmaxf(float __x, float __y) { return __ocml_fmax_f32(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float fminf(float __x, float __y) { return __ocml_fmin_f32(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float frexpf(float __x, int *__nptr) { + int __tmp; + + + + float __r = + __ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp); + *__nptr = __tmp; + + return __r; +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +int ilogbf(float __x) { return __ocml_ilogb_f32(__x); } + 
+static __attribute__((device)) inline __attribute__((always_inline)) +bool __finitef(float __x) { return __ocml_isfinite_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +bool __isinff(float __x) { return __ocml_isinf_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +bool __isnanf(float __x) { return __ocml_isnan_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float j0f(float __x) { return __ocml_j0_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float j1f(float __x) { return __ocml_j1_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float jnf(int __n, float __x) { + + + + if (__n == 0) + return j0f(__x); + if (__n == 1) + return j1f(__x); + + float __x0 = j0f(__x); + float __x1 = j1f(__x); + for (int __i = 1; __i < __n; ++__i) { + float __x2 = (2 * __i) / __x * __x1 - __x0; + __x0 = __x1; + __x1 = __x2; + } + + return __x1; +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float ldexpf(float __x, int __e) { return __ocml_ldexp_f32(__x, __e); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float lgammaf(float __x) { return __ocml_lgamma_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +long long int llrintf(float __x) { return __ocml_rint_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +long long int llroundf(float __x) { return __ocml_round_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float log10f(float __x) { return __ocml_log10_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float log1pf(float __x) { return __ocml_log1p_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float log2f(float __x) { return __ocml_log2_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float log2fi(int __x) { return __ocml_log2_f32((float) __x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float logbf(float __x) { return __ocml_logb_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float logf(float __x) { return __ocml_log_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +long int lrintf(float __x) { return __ocml_rint_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +long int lroundf(float __x) { return __ocml_round_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float modff(float __x, float *__iptr) { + float __tmp; + + + + float __r = + __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); + *__iptr = __tmp; + return __r; +} + + + +static __attribute__((device)) inline __attribute__((always_inline)) +float nanf(const char *__tagp __attribute__((nonnull))) { + union { + float val; + struct ieee_float { + unsigned int mantissa : 22; + unsigned int quiet : 1; + unsigned int exponent : 8; + unsigned int sign : 1; + } bits; + } __tmp; + static_assert((sizeof(__tmp.val)) == (sizeof(__tmp.bits)), ""); + + __tmp.bits.sign = 0u; + __tmp.bits.exponent = ~0u; + __tmp.bits.quiet = 1u; + __tmp.bits.mantissa = __make_mantissa(__tagp); + + return __tmp.val; +} + + +static __attribute__((device)) inline __attribute__((always_inline)) +float nearbyintf(float __x) { return 
__ocml_nearbyint_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float nextafterf(float __x, float __y) { + return __ocml_nextafter_f32(__x, __y); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float norm3df(float __x, float __y, float __z) { + return __ocml_len3_f32(__x, __y, __z); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float norm4df(float __x, float __y, float __z, float __w) { + return __ocml_len4_f32(__x, __y, __z, __w); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float normcdff(float __x) { return __ocml_ncdf_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float normf(int __dim, + const float *__a) { + float __r = 0; + while (__dim--) { + __r += __a[0] * __a[0]; + ++__a; + } + + return __ocml_sqrt_f32(__r); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float powif(float __x, int __y) { return __ocml_pown_f32(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +int powii(int __base, int __exp) { + if (__exp < 0 ) + return -1; + int __result = 1; + for (;;) { + if (__exp & 1) + __result *= __base; + __exp >>= 1; + if (!__exp) + break; + __base *= __base; + } + return __result; +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float remainderf(float __x, float __y) { + return __ocml_remainder_f32(__x, __y); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float remquof(float __x, float __y, int *__quo) { + int __tmp; + + + + float __r = __ocml_remquo_f32( + __x, __y, (__attribute__((address_space(5))) int *)&__tmp); + *__quo = __tmp; + + return __r; +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float rhypotf(float __x, float __y) { return __ocml_rhypot_f32(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float rintf(float __x) { return __ocml_rint_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float rnorm3df(float __x, float __y, float __z) { + return __ocml_rlen3_f32(__x, __y, __z); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float rnorm4df(float __x, float __y, float __z, float __w) { + return __ocml_rlen4_f32(__x, __y, __z, __w); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float rnormf(int __dim, + const float *__a) { + float __r = 0; + while (__dim--) { + __r += __a[0] * __a[0]; + ++__a; + } + + return __ocml_rsqrt_f32(__r); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float roundf(float __x) { return __ocml_round_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float scalblnf(float __x, long int __n) { + return (__n < 9223372036854775807L) ? 
__ocml_scalbn_f32(__x, __n) + : __ocml_scalb_f32(__x, __n); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float scalbnf(float __x, int __n) { return __ocml_scalbn_f32(__x, __n); } + +static __attribute__((device)) inline __attribute__((always_inline)) +bool __signbitf(float __x) { return __ocml_signbit_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +void sincosf(float __x, float *__sinptr, float *__cosptr) { + float __tmp; + + + + *__sinptr = + __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); + *__cosptr = __tmp; +} + +static __attribute__((device)) inline __attribute__((always_inline)) +void sincospif(float __x, float *__sinptr, float *__cosptr) { + float __tmp; + + + + *__sinptr = __ocml_sincospi_f32( + __x, (__attribute__((address_space(5))) float *)&__tmp); + *__cosptr = __tmp; +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float sinf(float __x) { return __ocml_sin_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float sinhf(float __x) { return __ocml_sinh_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float sinpif(float __x) { return __ocml_sinpi_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float sqrtf(float __x) { return __ocml_sqrt_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float tanf(float __x) { return __ocml_tan_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float tanhf(float __x) { return __ocml_tanh_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float tgammaf(float __x) { return __ocml_tgamma_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float truncf(float __x) { return __ocml_trunc_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float y0f(float __x) { return __ocml_y0_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float y1f(float __x) { return __ocml_y1_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float ynf(int __n, float __x) { + + + + + if (__n == 0) + return y0f(__x); + if (__n == 1) + return y1f(__x); + + float __x0 = y0f(__x); + float __x1 = y1f(__x); + for (int __i = 1; __i < __n; ++__i) { + float __x2 = (2 * __i) / __x * __x1 - __x0; + __x0 = __x1; + __x1 = __x2; + } + + return __x1; +} + + + +static __attribute__((device)) inline __attribute__((always_inline)) +float __cosf(float __x) { return __ocml_native_cos_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float __expf(float __x) { return __ocml_native_exp_f32(__x); } +# 627 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3 +static __attribute__((device)) inline __attribute__((always_inline)) +float __fadd_rn(float __x, float __y) { return __x + __y; } +# 641 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3 +static __attribute__((device)) inline __attribute__((always_inline)) +float __fdiv_rn(float __x, float __y) { return __x / __y; } + + +static __attribute__((device)) inline __attribute__((always_inline)) +float __fdividef(float __x, float __y) { return __x / __y; } +# 666 
"/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3 +static __attribute__((device)) inline __attribute__((always_inline)) +float __fmaf_rn(float __x, float __y, float __z) { + return __ocml_fma_f32(__x, __y, __z); +} +# 682 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3 +static __attribute__((device)) inline __attribute__((always_inline)) +float __fmul_rn(float __x, float __y) { return __x * __y; } +# 696 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3 +static __attribute__((device)) inline __attribute__((always_inline)) +float __frcp_rn(float __x) { return 1.0f / __x; } + + +static __attribute__((device)) inline __attribute__((always_inline)) +float __frsqrt_rn(float __x) { return __llvm_amdgcn_rsq_f32(__x); } +# 713 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3 +static __attribute__((device)) inline __attribute__((always_inline)) +float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); } +# 727 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3 +static __attribute__((device)) inline __attribute__((always_inline)) +float __fsub_rn(float __x, float __y) { return __x - __y; } + + +static __attribute__((device)) inline __attribute__((always_inline)) +float __log10f(float __x) { return __ocml_native_log10_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float __log2f(float __x) { return __ocml_native_log2_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float __logf(float __x) { return __ocml_native_log_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 
1 : __x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +void __sincosf(float __x, float *__sinptr, float *__cosptr) { + *__sinptr = __ocml_native_sin_f32(__x); + *__cosptr = __ocml_native_cos_f32(__x); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float __sinf(float __x) { return __ocml_native_sin_f32(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float __tanf(float __x) { return __ocml_tan_f32(__x); } + + + + +static __attribute__((device)) inline __attribute__((always_inline)) +double acos(double __x) { return __ocml_acos_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double acosh(double __x) { return __ocml_acosh_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double asin(double __x) { return __ocml_asin_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double asinh(double __x) { return __ocml_asinh_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double atan(double __x) { return __ocml_atan_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double atan2(double __x, double __y) { return __ocml_atan2_f64(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double atanh(double __x) { return __ocml_atanh_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double cbrt(double __x) { return __ocml_cbrt_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double ceil(double __x) { return __ocml_ceil_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double copysign(double __x, double __y) { + return __ocml_copysign_f64(__x, __y); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +double cos(double __x) { return __ocml_cos_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double cosh(double __x) { return __ocml_cosh_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double cospi(double __x) { return __ocml_cospi_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double erf(double __x) { return __ocml_erf_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double erfc(double __x) { return __ocml_erfc_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double erfcx(double __x) { return __ocml_erfcx_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double erfinv(double __x) { return __ocml_erfinv_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double exp(double __x) { return __ocml_exp_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double exp10(double __x) { return __ocml_exp10_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double exp2(double __x) { return 
__ocml_exp2_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double expm1(double __x) { return __ocml_expm1_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double fabs(double __x) { return __builtin_fabs(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double floor(double __x) { return __ocml_floor_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double fma(double __x, double __y, double __z) { + return __ocml_fma_f64(__x, __y, __z); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +double fmax(double __x, double __y) { return __ocml_fmax_f64(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double fmin(double __x, double __y) { return __ocml_fmin_f64(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double frexp(double __x, int *__nptr) { + int __tmp; + + + + double __r = + __ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp); + *__nptr = __tmp; + return __r; +} + +static __attribute__((device)) inline __attribute__((always_inline)) +double hypot(double __x, double __y) { return __ocml_hypot_f64(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +int ilogb(double __x) { return __ocml_ilogb_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +bool __finite(double __x) { return __ocml_isfinite_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +bool __isinf(double __x) { return __ocml_isinf_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +bool __isnan(double __x) { return __ocml_isnan_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double j0(double __x) { return __ocml_j0_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double j1(double __x) { return __ocml_j1_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double jn(int __n, double __x) { + + + + + if (__n == 0) + return j0(__x); + if (__n == 1) + return j1(__x); + + double __x0 = j0(__x); + double __x1 = j1(__x); + for (int __i = 1; __i < __n; ++__i) { + double __x2 = (2 * __i) / __x * __x1 - __x0; + __x0 = __x1; + __x1 = __x2; + } + return __x1; +} + +static __attribute__((device)) inline __attribute__((always_inline)) +double ldexp(double __x, int __e) { return __ocml_ldexp_f64(__x, __e); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double lgamma(double __x) { return __ocml_lgamma_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +long long int llrint(double __x) { return __ocml_rint_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +long long int llround(double __x) { return __ocml_round_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double log(double __x) { return __ocml_log_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double log10(double __x) { return __ocml_log10_f64(__x); } + 
+static __attribute__((device)) inline __attribute__((always_inline)) +double log1p(double __x) { return __ocml_log1p_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double log2(double __x) { return __ocml_log2_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double logb(double __x) { return __ocml_logb_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +long int lrint(double __x) { return __ocml_rint_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +long int lround(double __x) { return __ocml_round_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double modf(double __x, double *__iptr) { + double __tmp; + + + + double __r = + __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp); + *__iptr = __tmp; + + return __r; +} + + + +static __attribute__((device)) inline __attribute__((always_inline)) +double nan(const char *__tagp) { + + union { + double val; + struct ieee_double { + long unsigned int mantissa : 51; + unsigned int quiet : 1; + unsigned int exponent : 11; + unsigned int sign : 1; + } bits; + } __tmp; + static_assert((sizeof(__tmp.val)) == (sizeof(__tmp.bits)), ""); + + __tmp.bits.sign = 0u; + __tmp.bits.exponent = ~0u; + __tmp.bits.quiet = 1u; + __tmp.bits.mantissa = __make_mantissa(__tagp); + + return __tmp.val; + + + + + + +} + + +static __attribute__((device)) inline __attribute__((always_inline)) +double nearbyint(double __x) { return __ocml_nearbyint_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double nextafter(double __x, double __y) { + return __ocml_nextafter_f64(__x, __y); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +double norm(int __dim, + const double *__a) { + double __r = 0; + while (__dim--) { + __r += __a[0] * __a[0]; + ++__a; + } + + return __ocml_sqrt_f64(__r); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +double norm3d(double __x, double __y, double __z) { + return __ocml_len3_f64(__x, __y, __z); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +double norm4d(double __x, double __y, double __z, double __w) { + return __ocml_len4_f64(__x, __y, __z, __w); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +double normcdf(double __x) { return __ocml_ncdf_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double powi(double __x, int __y) { return __ocml_pown_f64(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double remainder(double __x, double __y) { + return __ocml_remainder_f64(__x, __y); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +double remquo(double __x, double __y, int *__quo) { + int __tmp; + + + + double __r = __ocml_remquo_f64( + __x, __y, (__attribute__((address_space(5))) int *)&__tmp); + *__quo = __tmp; + + return __r; +} + +static __attribute__((device)) inline __attribute__((always_inline)) +double 
rhypot(double __x, double __y) { return __ocml_rhypot_f64(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double rint(double __x) { return __ocml_rint_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double rnorm(int __dim, + const double *__a) { + double __r = 0; + while (__dim--) { + __r += __a[0] * __a[0]; + ++__a; + } + + return __ocml_rsqrt_f64(__r); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +double rnorm3d(double __x, double __y, double __z) { + return __ocml_rlen3_f64(__x, __y, __z); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +double rnorm4d(double __x, double __y, double __z, double __w) { + return __ocml_rlen4_f64(__x, __y, __z, __w); +} + +static __attribute__((device)) inline __attribute__((always_inline)) +double round(double __x) { return __ocml_round_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double scalbln(double __x, long int __n) { + return (__n < 9223372036854775807L) ? __ocml_scalbn_f64(__x, __n) + : __ocml_scalb_f64(__x, __n); +} +static __attribute__((device)) inline __attribute__((always_inline)) +double scalbn(double __x, int __n) { return __ocml_scalbn_f64(__x, __n); } + +static __attribute__((device)) inline __attribute__((always_inline)) +bool __signbit(double __x) { return __ocml_signbit_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double sin(double __x) { return __ocml_sin_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +void sincos(double __x, double *__sinptr, double *__cosptr) { + double __tmp; + + + + *__sinptr = __ocml_sincos_f64( + __x, (__attribute__((address_space(5))) double *)&__tmp); + *__cosptr = __tmp; +} + +static __attribute__((device)) inline __attribute__((always_inline)) +void sincospi(double __x, double *__sinptr, double *__cosptr) { + double __tmp; + + + + *__sinptr = __ocml_sincospi_f64( + __x, (__attribute__((address_space(5))) double *)&__tmp); + *__cosptr = __tmp; +} + +static __attribute__((device)) inline __attribute__((always_inline)) +double sinh(double __x) { return __ocml_sinh_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double sinpi(double __x) { return __ocml_sinpi_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double sqrt(double __x) { return __ocml_sqrt_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double tan(double __x) { return __ocml_tan_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double tanh(double __x) { return __ocml_tanh_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double tgamma(double __x) { return __ocml_tgamma_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double trunc(double __x) { return __ocml_trunc_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double y0(double __x) { return __ocml_y0_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double y1(double __x) { return __ocml_y1_f64(__x); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double yn(int __n, double __x) { + + + + + if (__n == 0) + return y0(__x); 
+ if (__n == 1)
+ return y1(__x);
+
+ double __x0 = y0(__x);
+ double __x1 = y1(__x);
+ for (int __i = 1; __i < __n; ++__i) {
+ double __x2 = (2 * __i) / __x * __x1 - __x0;
+ __x0 = __x1;
+ __x1 = __x2;
+ }
+
+ return __x1;
+}
+# 1190 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
+static __attribute__((device)) inline __attribute__((always_inline))
+double __dadd_rn(double __x, double __y) { return __x + __y; }
+# 1212 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
+static __attribute__((device)) inline __attribute__((always_inline))
+double __ddiv_rn(double __x, double __y) { return __x / __y; }
+# 1234 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
+static __attribute__((device)) inline __attribute__((always_inline))
+double __dmul_rn(double __x, double __y) { return __x * __y; }
+# 1248 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
+static __attribute__((device)) inline __attribute__((always_inline))
+double __drcp_rn(double __x) { return 1.0 / __x; }
+# 1262 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
+static __attribute__((device)) inline __attribute__((always_inline))
+double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); }
+# 1284 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
+static __attribute__((device)) inline __attribute__((always_inline))
+double __dsub_rn(double __x, double __y) { return __x - __y; }
+# 1306 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
+static __attribute__((device)) inline __attribute__((always_inline))
+double __fma_rn(double __x, double __y, double __z) {
+ return __ocml_fma_f64(__x, __y, __z);
+}
+# 1325 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
+template <class T> static __attribute__((device)) inline __attribute__((always_inline)) T min(T __arg1, T __arg2) {
+ return (__arg1 < __arg2) ? __arg1 : __arg2;
+}
+
+template <class T> static __attribute__((device)) inline __attribute__((always_inline)) T max(T __arg1, T __arg2) {
+ return (__arg1 > __arg2) ? __arg1 : __arg2;
+}
+
+
+static __attribute__((device)) inline __attribute__((always_inline)) int min(int __arg1, int __arg2) {
+ return (__arg1 < __arg2) ? __arg1 : __arg2;
+}
+static __attribute__((device)) inline __attribute__((always_inline)) int max(int __arg1, int __arg2) {
+ return (__arg1 > __arg2) ?
__arg1 : __arg2; +} + +static __attribute__((device)) inline __attribute__((always_inline)) +float max(float __x, float __y) { return fmaxf(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double max(double __x, double __y) { return fmax(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +float min(float __x, float __y) { return fminf(__x, __y); } + +static __attribute__((device)) inline __attribute__((always_inline)) +double min(double __x, double __y) { return fmin(__x, __y); } +# 129 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 2 3 +# 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_stdlib.h" 1 3 +# 130 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 2 3 + + +# 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 1 3 +# 41 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3 +static __attribute__((device)) inline __attribute__((always_inline)) double abs(double __x) { return ::fabs(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float abs(float __x) { return ::fabsf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) long long abs(long long __n) { return ::llabs(__n); } +static __attribute__((device)) inline __attribute__((always_inline)) long abs(long __n) { return ::labs(__n); } +static __attribute__((device)) inline __attribute__((always_inline)) float fma(float __x, float __y, float __z) { + return ::fmaf(__x, __y, __z); +} +# 61 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3 +static __attribute__((device)) inline __attribute__((always_inline)) float frexp(float __arg, int *__exp) { + return ::frexpf(__arg, __exp); +} +# 93 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3 +static __attribute__((device)) inline __attribute__((always_inline)) bool isinf(float __x) { return ::__isinff(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) bool isinf(double __x) { return ::__isinf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) bool isfinite(float __x) { return ::__finitef(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) bool isfinite(double __x) { return ::__finite(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) bool isnan(float __x) { return ::__isnanf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) bool isnan(double __x) { return ::__isnan(__x); } + + + + + +static __attribute__((device)) inline __attribute__((always_inline)) bool isgreater(float __x, float __y) { + return __builtin_isgreater(__x, __y); +} +static __attribute__((device)) inline __attribute__((always_inline)) bool isgreater(double __x, double __y) { + return __builtin_isgreater(__x, __y); +} +static __attribute__((device)) inline __attribute__((always_inline)) bool isgreaterequal(float __x, float __y) { + return __builtin_isgreaterequal(__x, __y); +} +static __attribute__((device)) inline __attribute__((always_inline)) bool isgreaterequal(double __x, double __y) { + return __builtin_isgreaterequal(__x, __y); +} +static __attribute__((device)) inline __attribute__((always_inline)) bool isless(float __x, float __y) { + return __builtin_isless(__x, __y); +} +static __attribute__((device)) inline __attribute__((always_inline)) bool isless(double __x, double __y) { + return 
__builtin_isless(__x, __y); +} +static __attribute__((device)) inline __attribute__((always_inline)) bool islessequal(float __x, float __y) { + return __builtin_islessequal(__x, __y); +} +static __attribute__((device)) inline __attribute__((always_inline)) bool islessequal(double __x, double __y) { + return __builtin_islessequal(__x, __y); +} +static __attribute__((device)) inline __attribute__((always_inline)) bool islessgreater(float __x, float __y) { + return __builtin_islessgreater(__x, __y); +} +static __attribute__((device)) inline __attribute__((always_inline)) bool islessgreater(double __x, double __y) { + return __builtin_islessgreater(__x, __y); +} +static __attribute__((device)) inline __attribute__((always_inline)) bool isnormal(float __x) { + return __builtin_isnormal(__x); +} +static __attribute__((device)) inline __attribute__((always_inline)) bool isnormal(double __x) { + return __builtin_isnormal(__x); +} +static __attribute__((device)) inline __attribute__((always_inline)) bool isunordered(float __x, float __y) { + return __builtin_isunordered(__x, __y); +} +static __attribute__((device)) inline __attribute__((always_inline)) bool isunordered(double __x, double __y) { + return __builtin_isunordered(__x, __y); +} +static __attribute__((device)) inline __attribute__((always_inline)) float modf(float __x, float *__iptr) { + return ::modff(__x, __iptr); +} +static __attribute__((device)) inline __attribute__((always_inline)) float pow(float __base, int __iexp) { + return ::powif(__base, __iexp); +} +static __attribute__((device)) inline __attribute__((always_inline)) double pow(double __base, int __iexp) { + return ::powi(__base, __iexp); +} +static __attribute__((device)) inline __attribute__((always_inline)) float remquo(float __x, float __y, int *__quo) { + return ::remquof(__x, __y, __quo); +} +static __attribute__((device)) inline __attribute__((always_inline)) float scalbln(float __x, long int __n) { + return ::scalblnf(__x, __n); +} +static __attribute__((device)) inline __attribute__((always_inline)) bool signbit(float __x) { return ::__signbitf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) bool signbit(double __x) { return ::__signbit(__x); } + + + + + + +static __attribute__((device)) inline __attribute__((always_inline)) _Float16 fma(_Float16 __x, _Float16 __y, + _Float16 __z) { + return __ocml_fma_f16(__x, __y, __z); +} +static __attribute__((device)) inline __attribute__((always_inline)) _Float16 pow(_Float16 __base, int __iexp) { + return __ocml_pown_f16(__base, __iexp); +} +# 202 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3 +static __attribute__((device)) inline __attribute__((always_inline)) float acos(float __x) { return acosf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float acosh(float __x) { return acoshf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float asin(float __x) { return asinf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float asinh(float __x) { return asinhf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float atan(float __x) { return atanf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float atan2(float __x, float __y) { return atan2f(__x, __y); } +static __attribute__((device)) inline __attribute__((always_inline)) float atanh(float __x) { return atanhf(__x); } +static __attribute__((device)) inline 
__attribute__((always_inline)) float cbrt(float __x) { return cbrtf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float ceil(float __x) { return ceilf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float copysign(float __x, float __y) { return copysignf(__x, __y); } +static __attribute__((device)) inline __attribute__((always_inline)) float cos(float __x) { return cosf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float cosh(float __x) { return coshf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float erf(float __x) { return erff(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float erfc(float __x) { return erfcf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float exp(float __x) { return expf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float exp2(float __x) { return exp2f(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float expm1(float __x) { return expm1f(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float fabs(float __x) { return fabsf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float fdim(float __x, float __y) { return fdimf(__x, __y); } +static __attribute__((device)) inline __attribute__((always_inline)) float floor(float __x) { return floorf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float fmax(float __x, float __y) { return fmaxf(__x, __y); } +static __attribute__((device)) inline __attribute__((always_inline)) float fmin(float __x, float __y) { return fminf(__x, __y); } +static __attribute__((device)) inline __attribute__((always_inline)) float fmod(float __x, float __y) { return fmodf(__x, __y); } +static __attribute__((device)) inline __attribute__((always_inline)) float hypot(float __x, float __y) { return hypotf(__x, __y); } +static __attribute__((device)) inline __attribute__((always_inline)) int ilogb(float __x) { return ilogbf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float ldexp(float __x, int __y) { return ldexpf(__x, __y); } +static __attribute__((device)) inline __attribute__((always_inline)) float lgamma(float __x) { return lgammaf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float log(float __x) { return logf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float log10(float __x) { return log10f(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float log1p(float __x) { return log1pf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float log2(float __x) { return log2f(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float logb(float __x) { return logbf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) long long llrint(float __x) { return llrintf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) long long llround(float __x) { return llroundf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) long lrint(float __x) { return lrintf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) long lround(float __x) { return lroundf(__x); } +static __attribute__((device)) inline __attribute__((always_inline)) float nearbyint(float __x) { return 
nearbyintf(__x); }
+static __attribute__((device)) inline __attribute__((always_inline)) float nextafter(float __x, float __y) { return nextafterf(__x, __y); }
+static __attribute__((device)) inline __attribute__((always_inline)) float pow(float __x, float __y) { return powf(__x, __y); }
+static __attribute__((device)) inline __attribute__((always_inline)) float remainder(float __x, float __y) { return remainderf(__x, __y); }
+static __attribute__((device)) inline __attribute__((always_inline)) float rint(float __x) { return rintf(__x); }
+static __attribute__((device)) inline __attribute__((always_inline)) float round(float __x) { return roundf(__x); }
+static __attribute__((device)) inline __attribute__((always_inline)) float scalbn(float __x, int __y) { return scalbnf(__x, __y); }
+static __attribute__((device)) inline __attribute__((always_inline)) float sin(float __x) { return sinf(__x); }
+static __attribute__((device)) inline __attribute__((always_inline)) float sinh(float __x) { return sinhf(__x); }
+static __attribute__((device)) inline __attribute__((always_inline)) float sqrt(float __x) { return sqrtf(__x); }
+static __attribute__((device)) inline __attribute__((always_inline)) float tan(float __x) { return tanf(__x); }
+static __attribute__((device)) inline __attribute__((always_inline)) float tanh(float __x) { return tanhf(__x); }
+static __attribute__((device)) inline __attribute__((always_inline)) float tgamma(float __x) { return tgammaf(__x); }
+static __attribute__((device)) inline __attribute__((always_inline)) float trunc(float __x) { return truncf(__x); }
+# 265 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
+template <bool __B, class __T = void> struct __hip_enable_if {};
+
+template <class __T> struct __hip_enable_if<true, __T> { typedef __T type; };
+
+namespace __hip {
+template <class _Tp> struct is_integral {
+ enum { value = 0 };
+};
+template <> struct is_integral<bool> {
+ enum { value = 1 };
+};
+template <> struct is_integral<char> {
+ enum { value = 1 };
+};
+template <> struct is_integral<signed char> {
+ enum { value = 1 };
+};
+template <> struct is_integral<unsigned char> {
+ enum { value = 1 };
+};
+template <> struct is_integral<wchar_t> {
+ enum { value = 1 };
+};
+template <> struct is_integral<short> {
+ enum { value = 1 };
+};
+template <> struct is_integral<unsigned short> {
+ enum { value = 1 };
+};
+template <> struct is_integral<int> {
+ enum { value = 1 };
+};
+template <> struct is_integral<unsigned int> {
+ enum { value = 1 };
+};
+template <> struct is_integral<long> {
+ enum { value = 1 };
+};
+template <> struct is_integral<unsigned long> {
+ enum { value = 1 };
+};
+template <> struct is_integral<long long> {
+ enum { value = 1 };
+};
+template <> struct is_integral<unsigned long long> {
+ enum { value = 1 };
+};
+
+
+template <class _Tp> struct is_arithmetic {
+ enum { value = 0 };
+};
+template <> struct is_arithmetic<bool> {
+ enum { value = 1 };
+};
+template <> struct is_arithmetic<char> {
+ enum { value = 1 };
+};
+template <> struct is_arithmetic<signed char> {
+ enum { value = 1 };
+};
+template <> struct is_arithmetic<unsigned char> {
+ enum { value = 1 };
+};
+template <> struct is_arithmetic<wchar_t> {
+ enum { value = 1 };
+};
+template <> struct is_arithmetic<short> {
+ enum { value = 1 };
+};
+template <> struct is_arithmetic<unsigned short> {
+ enum { value = 1 };
+};
+template <> struct is_arithmetic<int> {
+ enum { value = 1 };
+};
+template <> struct is_arithmetic<unsigned int> {
+ enum { value = 1 };
+};
+template <> struct is_arithmetic<long> {
+ enum { value = 1 };
+};
+template <> struct is_arithmetic<unsigned long> {
+ enum { value = 1 };
+};
+template <> struct is_arithmetic<long long> {
+ enum { value = 1 };
+};
+template <> struct is_arithmetic<unsigned long long> {
+ enum { value = 1 };
+};
+template <> struct is_arithmetic<float> {
+ enum { value = 1 };
+};
+template <> struct is_arithmetic<double> {
+ enum { value = 1 };
+};
+
+struct true_type {
+ static const __attribute__((constant)) bool value = true;
+};
+struct false_type {
+ static const __attribute__((constant)) bool value = false;
+};
+
+template <typename __T, typename __U> struct is_same : public false_type {};
+template <typename __T> struct is_same<__T, __T> : public true_type {};
+
+template <typename __T> struct add_rvalue_reference { typedef __T &&type; };
+
+template <typename __T> typename add_rvalue_reference<__T>::type declval();
+
+
+
+
+template <class _Tp> struct __numeric_type {
+ static void __test(...);
+ static _Float16 __test(_Float16);
+ static float __test(float);
+ static double __test(char);
+ static double __test(int);
+ static double __test(unsigned);
+ static double __test(long);
+ static double __test(unsigned long);
+ static double __test(long long);
+ static double __test(unsigned long long);
+ static double __test(double);
+
+ static double __test(long double);
+
+ typedef decltype(__test(declval<_Tp>())) type;
+ static const bool value = !is_same<type, void>::value;
+};
+
+template <> struct __numeric_type<void> { static const bool value = true; };
+
+template <class _A1, class _A2 = void, class _A3 = void,
+ bool = __numeric_type<_A1>::value &&__numeric_type<_A2>::value
+ &&__numeric_type<_A3>::value>
+class __promote_imp {
+public:
+ static const bool value = false;
+};
+
+template <class _A1, class _A2, class _A3>
+class __promote_imp<_A1, _A2, _A3, true> {
+private:
+ typedef typename __promote_imp<_A1>::type __type1;
+ typedef typename __promote_imp<_A2>::type __type2;
+ typedef typename __promote_imp<_A3>::type __type3;
+
+public:
+ typedef decltype(__type1() + __type2() + __type3()) type;
+ static const bool value = true;
+};
+
+template <class _A1, class _A2> class __promote_imp<_A1, _A2, void, true> {
+private:
+ typedef typename __promote_imp<_A1>::type __type1;
+ typedef typename __promote_imp<_A2>::type __type2;
+
+public:
+ typedef decltype(__type1() + __type2()) type;
+ static const bool value = true;
+};
+
+template <class _A1> class __promote_imp<_A1, void, void, true> {
+public:
+ typedef typename __numeric_type<_A1>::type type;
+ static const bool value = true;
+};
+
+template <class _A1, class _A2 = void, class _A3 = void>
+class __promote : public __promote_imp<_A1, _A2, _A3> {};
+
+}
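+// The overloads below extend the double-precision math functions to integral
+// and mixed arithmetic arguments: __hip_enable_if selects an overload via
+// SFINAE, and __hip::__promote computes the usual arithmetic promotion
+// (decltype of the summed operands). For example, sqrt(4) dispatches to
+// ::sqrt(4.0), and pow(2, 3.5f) evaluates as pow((double)2, (double)3.5f).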
+# 478 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type acos(__T __x) { return ::acos((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type acosh(__T __x) { return ::acosh((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type asin(__T __x) { return ::asin((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type asinh(__T __x) { return ::asinh((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type atan(__T __x) { return ::atan((double)__x); }
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type atan2(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return atan2((__result_type)__x, (__result_type)__y); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type atanh(__T __x) { return ::atanh((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type cbrt(__T __x) { return ::cbrt((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type ceil(__T __x) { return ::ceil((double)__x); }
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type copysign(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return copysign((__result_type)__x, (__result_type)__y); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type cos(__T __x) { return ::cos((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type cosh(__T __x) { return ::cosh((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type erf(__T __x) { return ::erf((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type erfc(__T __x) { return ::erfc((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type exp(__T __x) { return ::exp((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type exp2(__T __x) { return ::exp2((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type expm1(__T __x) { return ::expm1((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type fabs(__T __x) { return ::fabs((double)__x); }
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type fdim(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return fdim((__result_type)__x, (__result_type)__y); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type floor(__T __x) { return ::floor((double)__x); }
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type fmax(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return fmax((__result_type)__x, (__result_type)__y); }
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type fmin(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return fmin((__result_type)__x, (__result_type)__y); }
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type fmod(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return fmod((__result_type)__x, (__result_type)__y); }
+
+
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type hypot(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return hypot((__result_type)__x, (__result_type)__y); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, int>::type ilogb(__T __x) { return ::ilogb((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, bool>::type isfinite(__T __x) { return ::isfinite((double)__x); }
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type isgreater(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return isgreater((__result_type)__x, (__result_type)__y); }
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type isgreaterequal(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return isgreaterequal((__result_type)__x, (__result_type)__y); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, bool>::type isinf(__T __x) { return ::isinf((double)__x); }
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type isless(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return isless((__result_type)__x, (__result_type)__y); }
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type islessequal(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return islessequal((__result_type)__x, (__result_type)__y); }
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type islessgreater(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return islessgreater((__result_type)__x, (__result_type)__y); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, bool>::type isnan(__T __x) { return ::isnan((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, bool>::type isnormal(__T __x) { return ::isnormal((double)__x); }
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type isunordered(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return isunordered((__result_type)__x, (__result_type)__y); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type lgamma(__T __x) { return ::lgamma((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type log(__T __x) { return ::log((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type log10(__T __x) { return ::log10((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type log1p(__T __x) { return ::log1p((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type log2(__T __x) { return ::log2((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type logb(__T __x) { return ::logb((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, long long>::type llrint(__T __x) { return ::llrint((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, long long>::type llround(__T __x) { return ::llround((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, long>::type lrint(__T __x) { return ::lrint((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, long>::type lround(__T __x) { return ::lround((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type nearbyint(__T __x) { return ::nearbyint((double)__x); }
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type nextafter(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return nextafter((__result_type)__x, (__result_type)__y); }
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type pow(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return pow((__result_type)__x, (__result_type)__y); }
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type remainder(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return remainder((__result_type)__x, (__result_type)__y); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type rint(__T __x) { return ::rint((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type round(__T __x) { return ::round((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, bool>::type signbit(__T __x) { return ::signbit((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type sin(__T __x) { return ::sin((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type sinh(__T __x) { return ::sinh((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type sqrt(__T __x) { return ::sqrt((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type tan(__T __x) { return ::tan((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type tanh(__T __x) { return ::tanh((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type tgamma(__T __x) { return ::tgamma((double)__x); }
+template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type trunc(__T __x) { return ::trunc((double)__x); }
+
+
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type max(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return max((__result_type)__x, (__result_type)__y); }
+template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type min(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return min((__result_type)__x, (__result_type)__y); }
+
+
+
+template <typename __T1, typename __T2, typename __T3>
+static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<
+ __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value &&
+ __hip::is_arithmetic<__T3>::value,
+ typename __hip::__promote<__T1, __T2, __T3>::type>::type
+fma(__T1 __x, __T2 __y, __T3 __z) {
+ typedef typename __hip::__promote<__T1, __T2, __T3>::type __result_type;
+ return ::fma((__result_type)__x, (__result_type)__y, (__result_type)__z);
+}
+# 568 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
+template <typename __T>
+static __attribute__((device)) inline __attribute__((always_inline))
+ typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
+ frexp(__T __x, int *__exp) {
+ return ::frexp((double)__x, __exp);
+}
+
+template <typename __T>
+static __attribute__((device)) inline __attribute__((always_inline))
+ typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
+ ldexp(__T __x, int __exp) {
+ return ::ldexp((double)__x, __exp);
+}
+
+template <typename __T>
+static __attribute__((device)) inline __attribute__((always_inline))
+ typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
+ modf(__T __x, double *__exp) {
+ return ::modf((double)__x, __exp);
+}
+
+
+template <typename __T1, typename __T2>
+static __attribute__((device)) inline __attribute__((always_inline))
+ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
+ __hip::is_arithmetic<__T2>::value,
+ typename __hip::__promote<__T1, __T2>::type>::type
+ remquo(__T1 __x, __T2 __y, int *__quo) {
+ typedef typename __hip::__promote<__T1, __T2>::type __result_type;
+ return ::remquo((__result_type)__x, (__result_type)__y, __quo);
+}
+# 610 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
+template <typename __T>
+static __attribute__((device)) inline __attribute__((always_inline))
+ typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
+ scalbln(__T __x, long int __exp) {
+ return ::scalbln((double)__x, __exp);
+}
+
+template <typename __T>
+static __attribute__((device)) inline __attribute__((always_inline))
+ typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
+ scalbn(__T __x, int __exp) {
+ return ::scalbn((double)__x, __exp);
+}
+# 133 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 2 3
+# 2 "<built-in>" 2
+# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/src/hiprtc/hip_rtc_gen/hipRTC_header.h" 2
+
+
+
+
+# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 1 3
+# 58 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 3
+# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/include/hip/hip_version.h" 1 3
+# 59 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 2 3
+# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_common.h" 1 3
+# 27 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_common.h" 3
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+# 97 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_common.h" 3
+#pragma clang diagnostic pop
+# 60 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 2 3
+
+
+# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 1 3
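+// What follows (from amd_hip_runtime.h and its amd_detail headers) re-declares
+// fixed-width integer typedefs and a small hand-rolled subset of <type_traits>
+// in namespaces __hip_internal and std, since hipRTC compiles device code
+// without the host C++ standard library headers.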
"/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3 +# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_common.h" 1 3 +# 33 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 2 3 +# 43 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3 +extern "C" { +# 54 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3 +const char* amd_dbgapi_get_build_name(); +# 63 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3 +const char* amd_dbgapi_get_git_hash(); +# 72 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3 +size_t amd_dbgapi_get_build_id(); + + +} +# 92 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3 +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; +typedef signed int int32_t; +typedef signed long long int64_t; +namespace std { +using ::uint32_t; +using ::uint64_t; +using ::int32_t; +using ::int64_t; +} +# 124 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3 +# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/hip_ldg.h" 1 3 +# 27 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/hip_ldg.h" 3 +# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 1 3 +# 31 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 3 +# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/host_defines.h" 1 3 +# 38 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/host_defines.h" 3 +namespace __hip_internal { +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; +typedef signed char int8_t; +typedef signed short int16_t; +typedef signed int int32_t; +typedef signed long long int64_t; + +template struct integral_constant { + static constexpr const _Tp value = __v; + typedef _Tp value_type; + typedef integral_constant type; + constexpr operator value_type() const { return value; } + constexpr value_type operator()() const { return value; } +}; +template constexpr const _Tp integral_constant<_Tp, __v>::value; + +typedef integral_constant true_type; +typedef integral_constant false_type; + +template using bool_constant = integral_constant; +typedef bool_constant true_type; +typedef bool_constant false_type; + +template struct enable_if {}; +template struct enable_if { typedef __T type; }; + +template struct true_or_false_type : public false_type {}; +template<> struct true_or_false_type : public true_type {}; + +template struct is_integral : public false_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; 
+template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; + +template struct is_arithmetic : public false_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; + +template struct is_floating_point : public false_type {}; +template<> struct is_floating_point : public true_type {}; +template<> struct is_floating_point : public true_type {}; +template<> struct is_floating_point : public true_type {}; + +template struct is_same : public false_type {}; +template struct is_same<__T, __T> : public true_type {}; + +template::value> + struct is_signed : public false_type {}; +template + struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {}; + +template struct char_traits; +template> class basic_istream; +template> class basic_ostream; +typedef basic_istream istream; +typedef basic_ostream ostream; + +template + struct is_standard_layout + : public integral_constant + { }; + +template + struct is_trivial + : public integral_constant + { }; +} +typedef __hip_internal::uint8_t __hip_uint8_t; +typedef __hip_internal::uint16_t __hip_uint16_t; +typedef __hip_internal::uint32_t __hip_uint32_t; +typedef __hip_internal::uint64_t __hip_uint64_t; +typedef __hip_internal::int8_t __hip_int8_t; +typedef __hip_internal::int16_t __hip_int16_t; +typedef __hip_internal::int32_t __hip_int32_t; +typedef __hip_internal::int64_t __hip_int64_t; +# 32 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 2 3 +# 52 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 3 +namespace std { +using ::size_t; + +template struct integral_constant { + static constexpr const _Tp value = __v; + typedef _Tp value_type; + typedef integral_constant type; + constexpr operator value_type() const { return value; } + constexpr value_type operator()() const { return value; } +}; +template constexpr const _Tp integral_constant<_Tp, __v>::value; + +typedef integral_constant true_type; +typedef integral_constant false_type; + +template using bool_constant = integral_constant; +typedef bool_constant true_type; +typedef bool_constant false_type; + +template struct enable_if {}; +template struct 
enable_if { typedef __T type; }; + +template struct true_or_false_type : public false_type {}; +template<> struct true_or_false_type : public true_type {}; + +template struct is_integral : public false_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; +template <> struct is_integral : public true_type {}; + +template struct is_arithmetic : public false_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; +template <> struct is_arithmetic : public true_type {}; + +template struct is_floating_point : public false_type {}; +template<> struct is_floating_point : public true_type {}; +template<> struct is_floating_point : public true_type {}; +template<> struct is_floating_point : public true_type {}; + +template struct is_same : public false_type {}; +template struct is_same<__T, __T> : public true_type {}; + +template::value> + struct is_signed : public false_type {}; +template + struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {}; + +template struct is_convertible + : public true_or_false_type<__is_convertible_to(_T1, _T2)> {}; + +template struct char_traits; +template> class basic_istream; +template> class basic_ostream; +typedef basic_istream istream; +typedef basic_ostream ostream; + +template struct is_scalar : public integral_constant {}; +} + + + namespace hip_impl { + inline + constexpr + unsigned int next_pot(unsigned int x) { + + return 1u << (32u - __builtin_clz(x - 1u)); + } + } + + template struct HIP_vector_base; + + template + struct HIP_vector_base { + using Native_vec_ = T __attribute__((ext_vector_type(1))); + + union { + Native_vec_ data; + struct { + T x; + }; + }; + + using value_type = T; + + __attribute__((device)) + HIP_vector_base() = default; + __attribute__((device)) + explicit + constexpr + HIP_vector_base(T x_) noexcept : data{x_} {} + __attribute__((device)) + constexpr + HIP_vector_base(const HIP_vector_base&) = default; + __attribute__((device)) + constexpr + HIP_vector_base(HIP_vector_base&&) = default; + __attribute__((device)) + ~HIP_vector_base() = default; + __attribute__((device)) + HIP_vector_base& operator=(const HIP_vector_base&) = default; + }; + + template + struct 
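+ // HIP_vector_base is specialized per rank: ranks 1, 2 and 4 wrap a clang
+ // ext_vector_type and expose the named fields (x, y, z, w) through an
+ // anonymous union, while rank 3 below hand-rolls a T[3] aggregate,
+ // presumably because a 3-wide ext vector would pad out to 4 elements.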
+ template<typename T>
+ struct HIP_vector_base<T, 2> {
+ using Native_vec_ = T __attribute__((ext_vector_type(2)));
+
+ union
+
+
+
+ {
+ Native_vec_ data;
+ struct {
+ T x;
+ T y;
+ };
+ };
+
+ using value_type = T;
+
+ __attribute__((device))
+ HIP_vector_base() = default;
+ __attribute__((device))
+ explicit
+ constexpr
+ HIP_vector_base(T x_) noexcept : data{x_, x_} {}
+ __attribute__((device))
+ constexpr
+ HIP_vector_base(T x_, T y_) noexcept : data{x_, y_} {}
+ __attribute__((device))
+ constexpr
+ HIP_vector_base(const HIP_vector_base&) = default;
+ __attribute__((device))
+ constexpr
+ HIP_vector_base(HIP_vector_base&&) = default;
+ __attribute__((device))
+ ~HIP_vector_base() = default;
+ __attribute__((device))
+ HIP_vector_base& operator=(const HIP_vector_base&) = default;
+ };
+
+ template<typename T>
+ struct HIP_vector_base<T, 3> {
+ struct Native_vec_ {
+ T d[3];
+
+ __attribute__((device))
+ Native_vec_() = default;
+
+ __attribute__((device))
+ explicit
+ constexpr
+ Native_vec_(T x_) noexcept : d{x_, x_, x_} {}
+ __attribute__((device))
+ constexpr
+ Native_vec_(T x_, T y_, T z_) noexcept : d{x_, y_, z_} {}
+ __attribute__((device))
+ constexpr
+ Native_vec_(const Native_vec_&) = default;
+ __attribute__((device))
+ constexpr
+ Native_vec_(Native_vec_&&) = default;
+ __attribute__((device))
+ ~Native_vec_() = default;
+
+ __attribute__((device))
+ Native_vec_& operator=(const Native_vec_&) = default;
+ __attribute__((device))
+ Native_vec_& operator=(Native_vec_&&) = default;
+
+ __attribute__((device))
+ T& operator[](unsigned int idx) noexcept { return d[idx]; }
+ __attribute__((device))
+ T operator[](unsigned int idx) const noexcept { return d[idx]; }
+
+ __attribute__((device))
+ Native_vec_& operator+=(const Native_vec_& x_) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] += x_.d[i];
+ return *this;
+ }
+ __attribute__((device))
+ Native_vec_& operator-=(const Native_vec_& x_) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] -= x_.d[i];
+ return *this;
+ }
+
+ __attribute__((device))
+ Native_vec_& operator*=(const Native_vec_& x_) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] *= x_.d[i];
+ return *this;
+ }
+ __attribute__((device))
+ Native_vec_& operator/=(const Native_vec_& x_) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] /= x_.d[i];
+ return *this;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
+ __attribute__((device))
+ Native_vec_ operator-() const noexcept
+ {
+ auto r{*this};
+ for (auto&& x : r.d) x = -x;
+ return r;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __attribute__((device))
+ Native_vec_ operator~() const noexcept
+ {
+ auto r{*this};
+ for (auto&& x : r.d) x = ~x;
+ return r;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __attribute__((device))
+ Native_vec_& operator%=(const Native_vec_& x_) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] %= x_.d[i];
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __attribute__((device))
+ Native_vec_& operator^=(const Native_vec_& x_) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] ^= x_.d[i];
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __attribute__((device))
+ Native_vec_& operator|=(const Native_vec_& x_) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] |= x_.d[i];
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __attribute__((device))
+ Native_vec_& operator&=(const Native_vec_& x_) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] &= x_.d[i];
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __attribute__((device))
+ Native_vec_& operator>>=(const Native_vec_& x_) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] >>= x_.d[i];
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __attribute__((device))
+ Native_vec_& operator<<=(const Native_vec_& x_) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] <<= x_.d[i];
+ return *this;
+ }
+
+
+
+
+
+
+ using Vec3_cmp = int __attribute__((vector_size(4 * sizeof(int))));
+
+ __attribute__((device))
+ Vec3_cmp operator==(const Native_vec_& x_) const noexcept
+ {
+ return Vec3_cmp{d[0] == x_.d[0], d[1] == x_.d[1], d[2] == x_.d[2]};
+ }
+ };
+
+ union {
+ Native_vec_ data;
+ struct {
+ T x;
+ T y;
+ T z;
+ };
+ };
+
+ using value_type = T;
+
+ __attribute__((device))
+ HIP_vector_base() = default;
+ __attribute__((device))
+ explicit
+ constexpr
+ HIP_vector_base(T x_) noexcept : data{x_, x_, x_} {}
+ __attribute__((device))
+ constexpr
+ HIP_vector_base(T x_, T y_, T z_) noexcept : data{x_, y_, z_} {}
+ __attribute__((device))
+ constexpr
+ HIP_vector_base(const HIP_vector_base&) = default;
+ __attribute__((device))
+ constexpr
+ HIP_vector_base(HIP_vector_base&&) = default;
+ __attribute__((device))
+ ~HIP_vector_base() = default;
+
+ __attribute__((device))
+ HIP_vector_base& operator=(const HIP_vector_base&) = default;
+ __attribute__((device))
+ HIP_vector_base& operator=(HIP_vector_base&&) = default;
+ };
+
+ template<typename T>
+ struct HIP_vector_base<T, 4> {
+ using Native_vec_ = T __attribute__((ext_vector_type(4)));
+
+ union
+
+
+
+ {
+ Native_vec_ data;
+ struct {
+ T x;
+ T y;
+ T z;
+ T w;
+ };
+ };
+
+ using value_type = T;
+
+ __attribute__((device))
+ HIP_vector_base() = default;
+ __attribute__((device))
+ explicit
+ constexpr
+ HIP_vector_base(T x_) noexcept : data{x_, x_, x_, x_} {}
+ __attribute__((device))
+ constexpr
+ HIP_vector_base(T x_, T y_, T z_, T w_) noexcept : data{x_, y_, z_, w_} {}
+ __attribute__((device))
+ constexpr
+ HIP_vector_base(const HIP_vector_base&) = default;
+ __attribute__((device))
+ constexpr
+ HIP_vector_base(HIP_vector_base&&) = default;
+ __attribute__((device))
+ ~HIP_vector_base() = default;
+ __attribute__((device))
+ HIP_vector_base& operator=(const HIP_vector_base&) = default;
+ };
+
+ template<typename T, unsigned int rank>
+ struct HIP_vector_type : public HIP_vector_base<T, rank> {
+ using HIP_vector_base<T, rank>::data;
+ using typename HIP_vector_base<T, rank>::Native_vec_;
+
+ __attribute__((device))
+ HIP_vector_type() = default;
+ template<
+ typename U,
+ typename std::enable_if<
+ std::is_convertible<U, T>::value>::type* = nullptr>
+ __attribute__((device))
+ explicit
+ constexpr
+ HIP_vector_type(U x_) noexcept
+ : HIP_vector_base<T, rank>{static_cast<T>(x_)}
+ {}
+ template<
+ typename... Us,
+ typename std::enable_if<
+ (rank > 1) && sizeof...(Us) == rank>::type* = nullptr>
+ __attribute__((device))
+ constexpr
+ HIP_vector_type(Us... xs) noexcept
+ : HIP_vector_base<T, rank>{static_cast<T>(xs)...}
+ {}
+ __attribute__((device))
+ constexpr
+ HIP_vector_type(const HIP_vector_type&) = default;
+ __attribute__((device))
+ constexpr
+ HIP_vector_type(HIP_vector_type&&) = default;
+ __attribute__((device))
+ ~HIP_vector_type() = default;
+
+ __attribute__((device))
+ HIP_vector_type& operator=(const HIP_vector_type&) = default;
+ __attribute__((device))
+ HIP_vector_type& operator=(HIP_vector_type&&) = default;
+
+
+ __attribute__((device))
+ HIP_vector_type& operator++() noexcept
+ {
+ return *this += HIP_vector_type{1};
+ }
+ __attribute__((device))
+ HIP_vector_type operator++(int) noexcept
+ {
+ auto tmp(*this);
+ ++*this;
+ return tmp;
+ }
+
+ __attribute__((device))
+ HIP_vector_type& operator--() noexcept
+ {
+ return *this -= HIP_vector_type{1};
+ }
+ __attribute__((device))
+ HIP_vector_type operator--(int) noexcept
+ {
+ auto tmp(*this);
+ --*this;
+ return tmp;
+ }
+
+ __attribute__((device))
+ HIP_vector_type& operator+=(const HIP_vector_type& x) noexcept
+ {
+ data += x.data;
+ return *this;
+ }
+ template<
+ typename U,
+ typename std::enable_if<
+ std::is_convertible<U, T>{}>::type* = nullptr>
+ __attribute__((device))
+ HIP_vector_type& operator+=(U x) noexcept
+ {
+ return *this += HIP_vector_type{x};
+ }
+
+ __attribute__((device))
+ HIP_vector_type& operator-=(const HIP_vector_type& x) noexcept
+ {
+ data -= x.data;
+ return *this;
+ }
+ template<
+ typename U,
+ typename std::enable_if<
+ std::is_convertible<U, T>{}>::type* = nullptr>
+ __attribute__((device))
+ HIP_vector_type& operator-=(U x) noexcept
+ {
+ return *this -= HIP_vector_type{x};
+ }
+
+ __attribute__((device))
+ HIP_vector_type& operator*=(const HIP_vector_type& x) noexcept
+ {
+ data *= x.data;
+ return *this;
+ }
+
+ friend __attribute__((device)) inline constexpr HIP_vector_type operator*(
+ HIP_vector_type x, const HIP_vector_type& y) noexcept
+ {
+ return HIP_vector_type{ x } *= y;
+ }
+
+ template<
+ typename U,
+ typename std::enable_if<
+ std::is_convertible<U, T>{}>::type* = nullptr>
+ __attribute__((device))
+ HIP_vector_type& operator*=(U x) noexcept
+ {
+ return *this *= HIP_vector_type{x};
+ }
+
+ friend __attribute__((device)) inline constexpr HIP_vector_type operator/(
+ HIP_vector_type x, const HIP_vector_type& y) noexcept
+ {
+ return HIP_vector_type{ x } /= y;
+ }
+
+ __attribute__((device))
+ HIP_vector_type& operator/=(const HIP_vector_type& x) noexcept
+ {
+ data /= x.data;
+ return *this;
+ }
+ template<
+ typename U,
+ typename std::enable_if<
+ std::is_convertible<U, T>{}>::type* = nullptr>
+ __attribute__((device))
+ HIP_vector_type& operator/=(U x) noexcept
+ {
+ return *this /= HIP_vector_type{x};
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
+ __attribute__((device))
+ HIP_vector_type operator-() const noexcept
+ {
+ auto tmp(*this);
+ tmp.data = -tmp.data;
+ return tmp;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __attribute__((device))
+ HIP_vector_type operator~() const noexcept
+ {
+ HIP_vector_type r{*this};
+ r.data = ~r.data;
+ return r;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __attribute__((device))
+ HIP_vector_type& operator%=(const HIP_vector_type& x) noexcept
+ {
+ data %= x.data;
+ return *this;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __attribute__((device))
+ HIP_vector_type& operator^=(const HIP_vector_type& x) noexcept
+ {
+ data ^= x.data;
+ return *this;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __attribute__((device))
+ HIP_vector_type& operator|=(const HIP_vector_type& x) noexcept
+ {
+ data |= x.data;
+ return *this;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __attribute__((device))
+ HIP_vector_type& operator&=(const HIP_vector_type& x) noexcept
+ {
+ data &= x.data;
+ return *this;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __attribute__((device))
+ HIP_vector_type& operator>>=(const HIP_vector_type& x) noexcept
+ {
+ data >>= x.data;
+ return *this;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __attribute__((device))
+ HIP_vector_type& operator<<=(const HIP_vector_type& x) noexcept
+ {
+ data <<= x.data;
+ return *this;
+ }
+ };
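+ // The non-member operators below are written in terms of the compound
+ // assignments above; scalar operands broadcast through the explicit
+ // converting constructor. A usage sketch (hypothetical device code, assuming
+ // the float2 typedef for HIP_vector_type<float, 2> defined later in this
+ // header):
+ //   float2 a{1.0f, 2.0f};
+ //   float2 b = a + 3.0f; // broadcasts: {4.0f, 5.0f}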
+        template<
+            typename U = T,
+            typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+        __attribute__((device))
+        HIP_vector_type& operator|=(const HIP_vector_type& x) noexcept
+        {
+            data |= x.data;
+            return *this;
+        }
+
+        template<
+            typename U = T,
+            typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+        __attribute__((device))
+        HIP_vector_type& operator&=(const HIP_vector_type& x) noexcept
+        {
+            data &= x.data;
+            return *this;
+        }
+
+        template<
+            typename U = T,
+            typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+        __attribute__((device))
+        HIP_vector_type& operator>>=(const HIP_vector_type& x) noexcept
+        {
+            data >>= x.data;
+            return *this;
+        }
+
+        template<
+            typename U = T,
+            typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+        __attribute__((device))
+        HIP_vector_type& operator<<=(const HIP_vector_type& x) noexcept
+        {
+            data <<= x.data;
+            return *this;
+        }
+    };
+
+    template<typename T, unsigned int n>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator+(
+        const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} += y;
+    }
+    template<typename T, unsigned int n, typename U>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator+(
+        const HIP_vector_type<T, n>& x, U y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} += HIP_vector_type<T, n>{y};
+    }
+    template<typename T, unsigned int n, typename U>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator+(
+        U x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} += y;
+    }
+
+    template<typename T, unsigned int n>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator-(
+        const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} -= y;
+    }
+    template<typename T, unsigned int n, typename U>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator-(
+        const HIP_vector_type<T, n>& x, U y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} -= HIP_vector_type<T, n>{y};
+    }
+    template<typename T, unsigned int n, typename U>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator-(
+        U x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} -= y;
+    }
+
+    template<typename T, unsigned int n, typename U>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator*(
+        const HIP_vector_type<T, n>& x, U y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} *= HIP_vector_type<T, n>{y};
+    }
+    template<typename T, unsigned int n, typename U>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator*(
+        U x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} *= y;
+    }
+
+    template<typename T, unsigned int n, typename U>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator/(
+        const HIP_vector_type<T, n>& x, U y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} /= HIP_vector_type<T, n>{y};
+    }
+    template<typename T, unsigned int n, typename U>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator/(
+        U x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} /= y;
+    }
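+    // Usage sketch (added commentary, illustrative only): the scalar overloads
+    // above broadcast the scalar through HIP_vector_type<T, n>{y} before the
+    // compound op, so mixed expressions work with the scalar on either side:
+    //   float4 v{1.f, 2.f, 3.f, 4.f};
+    //   float4 w = v + 1.0f;   // {2, 3, 4, 5}
+    //   float4 u = 2.0f * v;   // same result as v * 2.0f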
+    template<typename V>
+    __attribute__((device)) inline constexpr
+    bool _hip_any_zero(const V& x, int n) noexcept
+    {
+        return
+            (n == -1) ? true : ((x[n] == 0) ? false : _hip_any_zero(x, n - 1));
+    }
+
+    template<typename T, unsigned int n>
+    __attribute__((device)) inline constexpr
+    bool operator==(
+        const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return _hip_any_zero(x.data == y.data, n - 1);
+    }
+    template<typename T, unsigned int n, typename U>
+    __attribute__((device)) inline constexpr
+    bool operator==(const HIP_vector_type<T, n>& x, U y) noexcept
+    {
+        return x == HIP_vector_type<T, n>{y};
+    }
+    template<typename T, unsigned int n, typename U>
+    __attribute__((device)) inline constexpr
+    bool operator==(U x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} == y;
+    }
+
+    template<typename T, unsigned int n>
+    __attribute__((device)) inline constexpr
+    bool operator!=(
+        const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return !(x == y);
+    }
+    template<typename T, unsigned int n, typename U>
+    __attribute__((device)) inline constexpr
+    bool operator!=(const HIP_vector_type<T, n>& x, U y) noexcept
+    {
+        return !(x == y);
+    }
+    template<typename T, unsigned int n, typename U>
+    __attribute__((device)) inline constexpr
+    bool operator!=(U x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return !(x == y);
+    }
+
+    template<
+        typename T,
+        unsigned int n,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator%(
+        const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} %= y;
+    }
+    template<
+        typename T,
+        unsigned int n,
+        typename U,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator%(
+        const HIP_vector_type<T, n>& x, U y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} %= HIP_vector_type<T, n>{y};
+    }
+    template<
+        typename T,
+        unsigned int n,
+        typename U,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator%(
+        U x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} %= y;
+    }
+
+    template<
+        typename T,
+        unsigned int n,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator^(
+        const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} ^= y;
+    }
+    template<
+        typename T,
+        unsigned int n,
+        typename U,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator^(
+        const HIP_vector_type<T, n>& x, U y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} ^= HIP_vector_type<T, n>{y};
+    }
+    template<
+        typename T,
+        unsigned int n,
+        typename U,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator^(
+        U x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} ^= y;
+    }
+
+    template<
+        typename T,
+        unsigned int n,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator|(
+        const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} |= y;
+    }
+    template<
+        typename T,
+        unsigned int n,
+        typename U,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator|(
+        const HIP_vector_type<T, n>& x, U y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} |= HIP_vector_type<T, n>{y};
+    }
+    template<
+        typename T,
+        unsigned int n,
+        typename U,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator|(
+        U x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} |= y;
+    }
+
+    template<
+        typename T,
+        unsigned int n,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator&(
+        const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} &= y;
+    }
+    template<
+        typename T,
+        unsigned int n,
+        typename U,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator&(
+        const HIP_vector_type<T, n>& x, U y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} &= HIP_vector_type<T, n>{y};
+    }
+    template<
+        typename T,
+        unsigned int n,
+        typename U,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator&(
+        U x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} &= y;
+    }
+
+    template<
+        typename T,
+        unsigned int n,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator>>(
+        const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} >>= y;
+    }
+    template<
+        typename T,
+        unsigned int n,
+        typename U,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator>>(
+        const HIP_vector_type<T, n>& x, U y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} >>= HIP_vector_type<T, n>{y};
+    }
+    template<
+        typename T,
+        unsigned int n,
+        typename U,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator>>(
+        U x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} >>= y;
+    }
+
+    template<
+        typename T,
+        unsigned int n,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator<<(
+        const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} <<= y;
+    }
+    template<
+        typename T,
+        unsigned int n,
+        typename U,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator<<(
+        const HIP_vector_type<T, n>& x, U y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} <<= HIP_vector_type<T, n>{y};
+    }
+    template<
+        typename T,
+        unsigned int n,
+        typename U,
+        typename std::enable_if<std::is_integral<U>::value>::type,
+        typename std::enable_if<std::is_integral<T>{}>* = nullptr>
+    __attribute__((device)) inline constexpr
+    HIP_vector_type<T, n> operator<<(
+        U x, const HIP_vector_type<T, n>& y) noexcept
+    {
+        return HIP_vector_type<T, n>{x} <<= y;
+    }
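+    // Sketch of what follows (added commentary, informal): __hipMapVector
+    // widens or narrows a vector to a different rank, keeping the leading
+    // components and zero-filling any new ones, e.g. a hypothetical call
+    //   float2 xy = __hipMapVector<float, 2>(make_float4(1.f, 2.f, 3.f, 4.f));
+    // keeps .x/.y and drops .z/.w.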
+    template<typename T, unsigned int rankT, typename U, unsigned int rankU>
+    inline __attribute__((always_inline)) __attribute__((device))
+    typename std::enable_if<(rankT == 1 && rankU >= 1),
+                            const HIP_vector_type<T, rankT>>::type
+    __hipMapVector(const HIP_vector_type<U, rankU>& u) {
+        return HIP_vector_type<T, rankT>(static_cast<T>(u.x));
+    };
+
+    template<typename T, unsigned int rankT, typename U, unsigned int rankU>
+    inline __attribute__((always_inline)) __attribute__((device))
+    typename std::enable_if<(rankT == 2 && rankU == 1),
+                            const HIP_vector_type<T, rankT>>::type
+    __hipMapVector(const HIP_vector_type<U, rankU>& u) {
+        return HIP_vector_type<T, rankT>(static_cast<T>(u.x), static_cast<T>(0));
+    };
+
+    template<typename T, unsigned int rankT, typename U, unsigned int rankU>
+    inline __attribute__((always_inline)) __attribute__((device))
+    typename std::enable_if<(rankT == 2 && rankU >= 2),
+                            const HIP_vector_type<T, rankT>>::type
+    __hipMapVector(const HIP_vector_type<U, rankU>& u) {
+        return HIP_vector_type<T, rankT>(static_cast<T>(u.x), static_cast<T>(u.y));
+    };
+
+    template<typename T, unsigned int rankT, typename U, unsigned int rankU>
+    inline __attribute__((always_inline)) __attribute__((device))
+    typename std::enable_if<(rankT == 4 && rankU == 1),
+                            const HIP_vector_type<T, rankT>>::type
+    __hipMapVector(const HIP_vector_type<U, rankU>& u) {
+        return HIP_vector_type<T, rankT>(static_cast<T>(u.x), static_cast<T>(0),
+                                         static_cast<T>(0), static_cast<T>(0));
+    };
+
+    template<typename T, unsigned int rankT, typename U, unsigned int rankU>
+    inline __attribute__((always_inline)) __attribute__((device))
+    typename std::enable_if<(rankT == 4 && rankU == 2),
+                            const HIP_vector_type<T, rankT>>::type
+    __hipMapVector(const HIP_vector_type<U, rankU>& u) {
+        return HIP_vector_type<T, rankT>(static_cast<T>(u.x), static_cast<T>(u.y),
+                                         static_cast<T>(0), static_cast<T>(0));
+    };
+
+    template<typename T, unsigned int rankT, typename U, unsigned int rankU>
+    inline __attribute__((always_inline)) __attribute__((device))
+    typename std::enable_if<(rankT == 4 && rankU == 4),
+                            const HIP_vector_type<T, rankT>>::type
+    __hipMapVector(const HIP_vector_type<U, rankU>& u) {
+        return HIP_vector_type<T, rankT>(static_cast<T>(u.x), static_cast<T>(u.y),
+                                         static_cast<T>(u.z), static_cast<T>(u.w));
+    };
+# 1135 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 3
+using uchar1 = HIP_vector_type<unsigned char, 1>; using uchar2 = HIP_vector_type<unsigned char, 2>; using uchar3 = HIP_vector_type<unsigned char, 3>; using uchar4 = HIP_vector_type<unsigned char, 4>;;
+using char1 = HIP_vector_type<char, 1>; using char2 = HIP_vector_type<char, 2>; using char3 = HIP_vector_type<char, 3>; using char4 = HIP_vector_type<char, 4>;;
+using ushort1 = HIP_vector_type<unsigned short, 1>; using ushort2 = HIP_vector_type<unsigned short, 2>; using ushort3 = HIP_vector_type<unsigned short, 3>; using ushort4 = HIP_vector_type<unsigned short, 4>;;
+using short1 = HIP_vector_type<short, 1>; using short2 = HIP_vector_type<short, 2>; using short3 = HIP_vector_type<short, 3>; using short4 = HIP_vector_type<short, 4>;;
+using uint1 = HIP_vector_type<unsigned int, 1>; using uint2 = HIP_vector_type<unsigned int, 2>; using uint3 = HIP_vector_type<unsigned int, 3>; using uint4 = HIP_vector_type<unsigned int, 4>;;
+using int1 = HIP_vector_type<int, 1>; using int2 = HIP_vector_type<int, 2>; using int3 = HIP_vector_type<int, 3>; using int4 = HIP_vector_type<int, 4>;;
+using ulong1 = HIP_vector_type<unsigned long, 1>; using ulong2 = HIP_vector_type<unsigned long, 2>; using ulong3 = HIP_vector_type<unsigned long, 3>; using ulong4 = HIP_vector_type<unsigned long, 4>;;
+using long1 = HIP_vector_type<long, 1>; using long2 = HIP_vector_type<long, 2>; using long3 = HIP_vector_type<long, 3>; using long4 = HIP_vector_type<long, 4>;;
+using ulonglong1 = HIP_vector_type<unsigned long long, 1>; using ulonglong2 = HIP_vector_type<unsigned long long, 2>; using ulonglong3 = HIP_vector_type<unsigned long long, 3>; using ulonglong4 = HIP_vector_type<unsigned long long, 4>;;
+using longlong1 = HIP_vector_type<long long, 1>; using longlong2 = HIP_vector_type<long long, 2>; using longlong3 = HIP_vector_type<long long, 3>; using longlong4 = HIP_vector_type<long long, 4>;;
+using float1 = HIP_vector_type<float, 1>; using float2 = HIP_vector_type<float, 2>; using float3 = HIP_vector_type<float, 3>; using float4 = HIP_vector_type<float, 4>;;
+using double1 = HIP_vector_type<double, 1>; using double2 = HIP_vector_type<double, 2>; using double3 = HIP_vector_type<double, 3>; using double4 = HIP_vector_type<double, 4>;;
+# 2117 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 3
+static inline __attribute__((device)) uchar1 make_uchar1(unsigned char x) { uchar1 r{x}; return r; };
+static inline __attribute__((device)) uchar2 make_uchar2(unsigned char x, unsigned char y) { uchar2 r{x, y}; return r; };
+static inline __attribute__((device)) uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z) { uchar3 r{x, y, z}; return r; };
+static inline __attribute__((device)) uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w) { uchar4 r{x, y, z, w}; return r; };
+
+static inline __attribute__((device)) char1 make_char1(signed char x) { char1 r{x}; return r; };
+static inline __attribute__((device)) char2 make_char2(signed char x, signed char y) { char2 r{x, y}; return r; };
+static inline __attribute__((device)) char3 make_char3(signed char x, signed char y, signed char z) { char3 r{x, y, z}; return r; };
+static inline __attribute__((device)) char4 make_char4(signed char x, signed char y, signed char z, signed char w) { char4 r{x, y, z, w}; return r; };
+
+static inline __attribute__((device)) ushort1 make_ushort1(unsigned short x) { ushort1 r{x}; return r; };
+static inline __attribute__((device)) ushort2
make_ushort2(unsigned short x, unsigned short y) { ushort2 r{x, y}; return r; }; +static inline __attribute__((device)) ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z) { ushort3 r{x, y, z}; return r; }; +static inline __attribute__((device)) ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w) { ushort4 r{x, y, z, w}; return r; }; + +static inline __attribute__((device)) short1 make_short1(signed short x) { short1 r{x}; return r; }; +static inline __attribute__((device)) short2 make_short2(signed short x, signed short y) { short2 r{x, y}; return r; }; +static inline __attribute__((device)) short3 make_short3(signed short x, signed short y, signed short z) { short3 r{x, y, z}; return r; }; +static inline __attribute__((device)) short4 make_short4(signed short x, signed short y, signed short z, signed short w) { short4 r{x, y, z, w}; return r; }; + +static inline __attribute__((device)) uint1 make_uint1(unsigned int x) { uint1 r{x}; return r; }; +static inline __attribute__((device)) uint2 make_uint2(unsigned int x, unsigned int y) { uint2 r{x, y}; return r; }; +static inline __attribute__((device)) uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z) { uint3 r{x, y, z}; return r; }; +static inline __attribute__((device)) uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w) { uint4 r{x, y, z, w}; return r; }; + +static inline __attribute__((device)) int1 make_int1(signed int x) { int1 r{x}; return r; }; +static inline __attribute__((device)) int2 make_int2(signed int x, signed int y) { int2 r{x, y}; return r; }; +static inline __attribute__((device)) int3 make_int3(signed int x, signed int y, signed int z) { int3 r{x, y, z}; return r; }; +static inline __attribute__((device)) int4 make_int4(signed int x, signed int y, signed int z, signed int w) { int4 r{x, y, z, w}; return r; }; + +static inline __attribute__((device)) float1 make_float1(float x) { float1 r{x}; return r; }; +static inline __attribute__((device)) float2 make_float2(float x, float y) { float2 r{x, y}; return r; }; +static inline __attribute__((device)) float3 make_float3(float x, float y, float z) { float3 r{x, y, z}; return r; }; +static inline __attribute__((device)) float4 make_float4(float x, float y, float z, float w) { float4 r{x, y, z, w}; return r; }; + +static inline __attribute__((device)) double1 make_double1(double x) { double1 r{x}; return r; }; +static inline __attribute__((device)) double2 make_double2(double x, double y) { double2 r{x, y}; return r; }; +static inline __attribute__((device)) double3 make_double3(double x, double y, double z) { double3 r{x, y, z}; return r; }; +static inline __attribute__((device)) double4 make_double4(double x, double y, double z, double w) { double4 r{x, y, z, w}; return r; }; + +static inline __attribute__((device)) ulong1 make_ulong1(unsigned long x) { ulong1 r{x}; return r; }; +static inline __attribute__((device)) ulong2 make_ulong2(unsigned long x, unsigned long y) { ulong2 r{x, y}; return r; }; +static inline __attribute__((device)) ulong3 make_ulong3(unsigned long x, unsigned long y, unsigned long z) { ulong3 r{x, y, z}; return r; }; +static inline __attribute__((device)) ulong4 make_ulong4(unsigned long x, unsigned long y, unsigned long z, unsigned long w) { ulong4 r{x, y, z, w}; return r; }; + +static inline __attribute__((device)) long1 make_long1(signed long x) { long1 r{x}; return r; }; +static inline __attribute__((device)) long2 make_long2(signed long x, 
signed long y) { long2 r{x, y}; return r; }; +static inline __attribute__((device)) long3 make_long3(signed long x, signed long y, signed long z) { long3 r{x, y, z}; return r; }; +static inline __attribute__((device)) long4 make_long4(signed long x, signed long y, signed long z, signed long w) { long4 r{x, y, z, w}; return r; }; + +static inline __attribute__((device)) ulonglong1 make_ulonglong1(unsigned long long x) { ulonglong1 r{x}; return r; }; +static inline __attribute__((device)) ulonglong2 make_ulonglong2(unsigned long long x, unsigned long long y) { ulonglong2 r{x, y}; return r; }; +static inline __attribute__((device)) ulonglong3 make_ulonglong3(unsigned long long x, unsigned long long y, unsigned long long z) { ulonglong3 r{x, y, z}; return r; }; +static inline __attribute__((device)) ulonglong4 make_ulonglong4(unsigned long long x, unsigned long long y, unsigned long long z, unsigned long long w) { ulonglong4 r{x, y, z, w}; return r; }; + +static inline __attribute__((device)) longlong1 make_longlong1(signed long long x) { longlong1 r{x}; return r; }; +static inline __attribute__((device)) longlong2 make_longlong2(signed long long x, signed long long y) { longlong2 r{x, y}; return r; }; +static inline __attribute__((device)) longlong3 make_longlong3(signed long long x, signed long long y, signed long long z) { longlong3 r{x, y, z}; return r; }; +static inline __attribute__((device)) longlong4 make_longlong4(signed long long x, signed long long y, signed long long z, signed long long w) { longlong4 r{x, y, z, w}; return r; }; +# 28 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/hip_ldg.h" 2 3 + + +__attribute__((device)) inline static char __ldg(const char* ptr) { return *ptr; } + +__attribute__((device)) inline static char2 __ldg(const char2* ptr) { return *ptr; } + +__attribute__((device)) inline static char4 __ldg(const char4* ptr) { return *ptr; } + +__attribute__((device)) inline static signed char __ldg(const signed char* ptr) { return ptr[0]; } + +__attribute__((device)) inline static unsigned char __ldg(const unsigned char* ptr) { return ptr[0]; } + + +__attribute__((device)) inline static short __ldg(const short* ptr) { return ptr[0]; } + +__attribute__((device)) inline static short2 __ldg(const short2* ptr) { return ptr[0]; } + +__attribute__((device)) inline static short4 __ldg(const short4* ptr) { return ptr[0]; } + +__attribute__((device)) inline static unsigned short __ldg(const unsigned short* ptr) { return ptr[0]; } + + +__attribute__((device)) inline static int __ldg(const int* ptr) { return ptr[0]; } + +__attribute__((device)) inline static int2 __ldg(const int2* ptr) { return ptr[0]; } + +__attribute__((device)) inline static int4 __ldg(const int4* ptr) { return ptr[0]; } + +__attribute__((device)) inline static unsigned int __ldg(const unsigned int* ptr) { return ptr[0]; } + + +__attribute__((device)) inline static long __ldg(const long* ptr) { return ptr[0]; } + +__attribute__((device)) inline static unsigned long __ldg(const unsigned long* ptr) { return ptr[0]; } + + +__attribute__((device)) inline static long long __ldg(const long long* ptr) { return ptr[0]; } + +__attribute__((device)) inline static longlong2 __ldg(const longlong2* ptr) { return ptr[0]; } + +__attribute__((device)) inline static unsigned long long __ldg(const unsigned long long* ptr) { return ptr[0]; } + + +__attribute__((device)) inline static uchar2 __ldg(const uchar2* ptr) { return ptr[0]; } + +__attribute__((device)) inline 
static uchar4 __ldg(const uchar4* ptr) { return ptr[0]; }
+
+
+__attribute__((device)) inline static ushort2 __ldg(const ushort2* ptr) { return ptr[0]; }
+
+
+__attribute__((device)) inline static uint2 __ldg(const uint2* ptr) { return ptr[0]; }
+
+__attribute__((device)) inline static uint4 __ldg(const uint4* ptr) { return ptr[0]; }
+
+
+__attribute__((device)) inline static ulonglong2 __ldg(const ulonglong2* ptr) { return ptr[0]; }
+
+
+__attribute__((device)) inline static float __ldg(const float* ptr) { return ptr[0]; }
+
+__attribute__((device)) inline static float2 __ldg(const float2* ptr) { return ptr[0]; }
+
+__attribute__((device)) inline static float4 __ldg(const float4* ptr) { return ptr[0]; }
+
+
+__attribute__((device)) inline static double __ldg(const double* ptr) { return ptr[0]; }
+
+__attribute__((device)) inline static double2 __ldg(const double2* ptr) { return ptr[0]; }
+# 125 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 2 3
+# 250 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
+extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
+extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
+extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
+extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_num_groups(unsigned int);
+struct __HIP_BlockIdx {
+  __attribute__((device))
+  std::uint32_t operator()(std::uint32_t x) const noexcept { return __ockl_get_group_id(x); }
+};
+struct __HIP_BlockDim {
+  __attribute__((device))
+  std::uint32_t operator()(std::uint32_t x) const noexcept {
+    return __ockl_get_local_size(x);
+  }
+};
+struct __HIP_GridDim {
+  __attribute__((device))
+  std::uint32_t operator()(std::uint32_t x) const noexcept {
+    return __ockl_get_num_groups(x);
+  }
+};
+struct __HIP_ThreadIdx {
+  __attribute__((device))
+  std::uint32_t operator()(std::uint32_t x) const noexcept {
+    return __ockl_get_local_id(x);
+  }
+};
+
+
+typedef struct dim3 {
+    uint32_t x;
+    uint32_t y;
+    uint32_t z;
+
+    constexpr __attribute__((device)) dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
+
+} dim3;
+
+
+extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_global_size(unsigned int);
+
+
+template <typename F> struct __HIP_Coordinates {
+  using R = decltype(F{}(0));
+
+  struct __X {
+    __attribute__((device)) operator R() const noexcept { return F{}(0); }
+    __attribute__((device)) R operator+=(const R& rhs) { return F{}(0) + rhs; }
+  };
+  struct __Y {
+    __attribute__((device)) operator R() const noexcept { return F{}(1); }
+    __attribute__((device)) R operator+=(const R& rhs) { return F{}(1) + rhs; }
+  };
+  struct __Z {
+    __attribute__((device)) operator R() const noexcept { return F{}(2); }
+    __attribute__((device)) R operator+=(const R& rhs) { return F{}(2) + rhs; }
+  };
+
+
+  __attribute__((weak))
+
+  __attribute__((device)) static constexpr __X x{};
+
+  __attribute__((weak))
+
+  __attribute__((device)) static constexpr __Y y{};
+
+  __attribute__((weak))
+
+  __attribute__((device)) static constexpr __Z z{};
+
+  __attribute__((device)) operator dim3() const { return dim3(x, y, z); }
+};
+
+template <typename F>
+constexpr typename __HIP_Coordinates<F>::__X __HIP_Coordinates<F>::x;
+template <typename F>
+constexpr typename __HIP_Coordinates<F>::__Y __HIP_Coordinates<F>::y;
+template <typename F>
+constexpr typename __HIP_Coordinates<F>::__Z __HIP_Coordinates<F>::z;
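+// Usage sketch (added commentary, illustrative only): inside a __global__
+// kernel these coordinate objects behave like the familiar CUDA builtins,
+// e.g.
+//   unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
+// and the operator* overloads below fold blockDim.x * gridDim.x (in either
+// order) into a single __ockl_get_global_size(...) call.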
+
+inline
+__attribute__((device))
+std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__X,
+                        __HIP_Coordinates<__HIP_BlockDim>::__X) noexcept {
+  return __ockl_get_global_size(0);
+}
+inline
+__attribute__((device))
+std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__X,
+                        __HIP_Coordinates<__HIP_GridDim>::__X) noexcept {
+  return __ockl_get_global_size(0);
+}
+inline
+__attribute__((device))
+std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Y,
+                        __HIP_Coordinates<__HIP_BlockDim>::__Y) noexcept {
+  return __ockl_get_global_size(1);
+}
+inline
+__attribute__((device))
+std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Y,
+                        __HIP_Coordinates<__HIP_GridDim>::__Y) noexcept {
+  return __ockl_get_global_size(1);
+}
+inline
+__attribute__((device))
+std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Z,
+                        __HIP_Coordinates<__HIP_BlockDim>::__Z) noexcept {
+  return __ockl_get_global_size(2);
+}
+inline
+__attribute__((device))
+std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Z,
+                        __HIP_Coordinates<__HIP_GridDim>::__Z) noexcept {
+  return __ockl_get_global_size(2);
+}
+
+static constexpr __HIP_Coordinates<__HIP_BlockDim> blockDim{};
+static constexpr __HIP_Coordinates<__HIP_BlockIdx> blockIdx{};
+static constexpr __HIP_Coordinates<__HIP_GridDim> gridDim{};
+static constexpr __HIP_Coordinates<__HIP_ThreadIdx> threadIdx{};
+
+
+extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
+
+
+
+
+extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
+
+
+
+
+extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
+
+
+
+
+extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_num_groups(unsigned int);
+# 63 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 2 3
+# 73 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 3
+# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_vector_types.h" 1 3
+# 74 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 2 3
+# 6 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/src/hiprtc/hip_rtc_gen/hipRTC_header.h" 2
+# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_bfloat16.h" 1 3
+# 37 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_bfloat16.h" 3
+# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 1 3
+# 55 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 3
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wshadow"
+struct hip_bfloat16
+{
+    __hip_uint16_t data;
+
+    enum truncate_t
+    {
+        truncate
+    };
+
+    __attribute__((device)) hip_bfloat16() = default;
+
+
+    explicit __attribute__((device)) hip_bfloat16(float f)
+        : data(float_to_bfloat16(f))
+    {
+    }
+
+    explicit __attribute__((device)) hip_bfloat16(float f, truncate_t)
+        : data(truncate_float_to_bfloat16(f))
+    {
+    }
+
+
+    __attribute__((device)) operator float() const
+    {
+        union
+        {
+            uint32_t int32;
+            float fp32;
+        } u = {uint32_t(data) << 16};
+        return u.fp32;
+    }
+
+    __attribute__((device)) hip_bfloat16& operator=(const float& f)
+    {
+        data = float_to_bfloat16(f);
+        return *this;
+    }
+
+    static __attribute__((device)) hip_bfloat16 round_to_bfloat16(float f)
+    {
+        hip_bfloat16 output;
+        output.data = float_to_bfloat16(f);
+        return output;
+    }
+
+    static __attribute__((device)) hip_bfloat16 round_to_bfloat16(float f, truncate_t)
+    {
+        hip_bfloat16 output;
+        output.data = truncate_float_to_bfloat16(f);
+        return output;
+    }
+
+private:
+    static __attribute__((device)) __hip_uint16_t float_to_bfloat16(float f)
+    {
+        union
+        {
+            float fp32;
+            uint32_t int32;
+        } u = {f};
+        if(~u.int32 & 0x7f800000)
+        {
+# 136 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 3
+            u.int32 += 0x7fff + ((u.int32 >> 16) & 1);
+        }
+        else if(u.int32 & 0xffff)
+        {
+# 148 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 3
+            u.int32 |= 0x10000;
+        }
+        return __hip_uint16_t(u.int32 >> 16);
+    }
+
+
+    static __attribute__((device)) __hip_uint16_t truncate_float_to_bfloat16(float f)
+    {
+        union
+        {
+            float fp32;
+            uint32_t int32;
+        } u = {f};
+        return __hip_uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
+    }
+};
+#pragma clang diagnostic pop
+
+typedef struct
+{
+    __hip_uint16_t data;
+} hip_bfloat16_public;
+
+static_assert(__hip_internal::is_standard_layout<hip_bfloat16>{},
+              "hip_bfloat16 is not a standard layout type, and thus is "
+              "incompatible with C.");
+
+static_assert(__hip_internal::is_trivial<hip_bfloat16>{},
+              "hip_bfloat16 is not a trivial type, and thus is "
+              "incompatible with C.");
+# 189 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 3
+inline __attribute__((device)) hip_bfloat16 operator+(hip_bfloat16 a)
+{
+    return a;
+}
+inline __attribute__((device)) hip_bfloat16 operator-(hip_bfloat16 a)
+{
+    a.data ^= 0x8000;
+    return a;
+}
+inline __attribute__((device)) hip_bfloat16 operator+(hip_bfloat16 a, hip_bfloat16 b)
+{
+    return hip_bfloat16(float(a) + float(b));
+}
+inline __attribute__((device)) hip_bfloat16 operator-(hip_bfloat16 a, hip_bfloat16 b)
+{
+    return hip_bfloat16(float(a) - float(b));
+}
+inline __attribute__((device)) hip_bfloat16 operator*(hip_bfloat16 a, hip_bfloat16 b)
+{
+    return hip_bfloat16(float(a) * float(b));
+}
+inline __attribute__((device)) hip_bfloat16 operator/(hip_bfloat16 a, hip_bfloat16 b)
+{
+    return hip_bfloat16(float(a) / float(b));
+}
+inline __attribute__((device)) bool operator<(hip_bfloat16 a, hip_bfloat16 b)
+{
+    return float(a) < float(b);
+}
+inline __attribute__((device)) bool operator==(hip_bfloat16 a, hip_bfloat16 b)
+{
+    return float(a) == float(b);
+}
+inline __attribute__((device)) bool operator>(hip_bfloat16 a, hip_bfloat16 b)
+{
+    return b < a;
+}
+inline __attribute__((device)) bool operator<=(hip_bfloat16 a, hip_bfloat16 b)
+{
+    return !(a > b);
+}
+inline __attribute__((device)) bool operator!=(hip_bfloat16 a, hip_bfloat16 b)
+{
+    return !(a == b);
+}
+inline __attribute__((device)) bool operator>=(hip_bfloat16 a, hip_bfloat16 b)
+{
+    return !(a < b);
+}
+inline __attribute__((device)) hip_bfloat16& operator+=(hip_bfloat16& a, hip_bfloat16 b)
+{
+    return a = a + b;
+}
+inline __attribute__((device)) hip_bfloat16& operator-=(hip_bfloat16& a, hip_bfloat16 b)
+{
+    return a = a - b;
+}
+inline __attribute__((device)) hip_bfloat16&
operator*=(hip_bfloat16& a, hip_bfloat16 b) +{ + return a = a * b; +} +inline __attribute__((device)) hip_bfloat16& operator/=(hip_bfloat16& a, hip_bfloat16 b) +{ + return a = a / b; +} +inline __attribute__((device)) hip_bfloat16& operator++(hip_bfloat16& a) +{ + return a += hip_bfloat16(1.0f); +} +inline __attribute__((device)) hip_bfloat16& operator--(hip_bfloat16& a) +{ + return a -= hip_bfloat16(1.0f); +} +inline __attribute__((device)) hip_bfloat16 operator++(hip_bfloat16& a, int) +{ + hip_bfloat16 orig = a; + ++a; + return orig; +} +inline __attribute__((device)) hip_bfloat16 operator--(hip_bfloat16& a, int) +{ + hip_bfloat16 orig = a; + --a; + return orig; +} + +namespace std +{ + constexpr __attribute__((device)) bool isinf(hip_bfloat16 a) + { + return !(~a.data & 0x7f80) && !(a.data & 0x7f); + } + constexpr __attribute__((device)) bool isnan(hip_bfloat16 a) + { + return !(~a.data & 0x7f80) && +(a.data & 0x7f); + } + constexpr __attribute__((device)) bool iszero(hip_bfloat16 a) + { + return !(a.data & 0x7fff); + } +} +# 38 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_bfloat16.h" 2 3 +# 7 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/src/hiprtc/hip_rtc_gen/hipRTC_header.h" 2 + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreserved-id-macro" +#pragma clang diagnostic ignored "-Wc++98-compat-pedantic" +#pragma clang diagnostic ignored "-Wreserved-macro-identifier" +#pragma clang diagnostic ignored "-Wundef" +#define __device__ __attribute__((device)) +#define __host__ __attribute__((host)) +#define __global__ __attribute__((global)) +#define __constant__ __attribute__((constant)) +#define __shared__ __attribute__((shared)) +#define __align__(x) __attribute__((aligned(x))) +#if !defined(__has_feature) || !__has_feature(cuda_noinline_keyword) +#define __noinline__ __attribute__((noinline)) +#endif +#define __forceinline__ inline __attribute__((always_inline)) +#if __HIP_NO_IMAGE_SUPPORT +#define __hip_img_chk__ __attribute__((unavailable("The image/texture API not supported on the device"))) +#else +#define __hip_img_chk__ +#endif +#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \ + __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock))) +#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \ + __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \ + amdgpu_waves_per_eu(minBlocksPerMultiprocessor))) +#define select_impl_(_1, _2, impl_, ...) impl_ +#define __launch_bounds__(...) \ + select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__) +#define HIP_INCLUDE_HIP_HIP_RUNTIME_H +#define _HIP_BFLOAT16_H_ +#define HIP_INCLUDE_HIP_MATH_FUNCTIONS_H +#define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H +#if !__HIP_NO_STD_DEFS__ +#if defined(__HIPRTC_PTRDIFF_T_IS_LONG_LONG__) && __HIPRTC_PTRDIFF_T_IS_LONG_LONG__==1 +typedef long long ptrdiff_t; +#else +typedef __PTRDIFF_TYPE__ ptrdiff_t; +#endif +typedef long clock_t; +namespace std { +using ::ptrdiff_t; +using ::clock_t; +} +#endif // __HIP_NO_STD_DEFS__ +#pragma clang diagnostic pop/* +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_HIP_COMMON_H +#define HIP_INCLUDE_HIP_HIP_COMMON_H + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreserved-macro-identifier" +#endif +// Common code included at start of every hip file. +// Auto enable __HIP_PLATFORM_AMD__ if compiling on AMD platform +// Other compiler (GCC,ICC,etc) need to set one of these macros explicitly +#if defined(__clang__) && defined(__HIP__) +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ +#endif +#endif // defined(__clang__) && defined(__HIP__) + +// Auto enable __HIP_PLATFORM_NVIDIA__ if compiling with NVIDIA platform +#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__) && !defined(__HIP__)) +#ifndef __HIP_PLATFORM_NVIDIA__ +#define __HIP_PLATFORM_NVIDIA__ +#endif + +#ifdef __CUDACC__ +#define __HIPCC__ +#endif + +#endif //__NVCC__ + +// Auto enable __HIP_DEVICE_COMPILE__ if compiled in HCC or NVCC device path +#if (defined(__HCC_ACCELERATOR__) && __HCC_ACCELERATOR__ != 0) || \ + (defined(__CUDA_ARCH__) && __CUDA_ARCH__ != 0) +#define __HIP_DEVICE_COMPILE__ 1 +#endif + +#ifdef __GNUC__ +#define HIP_PUBLIC_API __attribute__ ((visibility ("default"))) +#define HIP_INTERNAL_EXPORTED_API __attribute__ ((visibility ("default"))) +#else +#define HIP_PUBLIC_API +#define HIP_INTERNAL_EXPORTED_API +#endif + +#if __HIP_DEVICE_COMPILE__ == 0 +// 32-bit Atomics +#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (0) +#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (0) +#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (0) +#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (0) +#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0) + +// 64-bit Atomics +#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (0) +#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0) + +// Doubles +#define __HIP_ARCH_HAS_DOUBLES__ (0) + +// Warp cross-lane operations +#define __HIP_ARCH_HAS_WARP_VOTE__ (0) +#define __HIP_ARCH_HAS_WARP_BALLOT__ (0) +#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (0) +#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0) + +// Sync +#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0) +#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0) + +// Misc +#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0) +#define __HIP_ARCH_HAS_3DGRID__ (0) +#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0) +#endif + +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +#endif +/* +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_LIBRARY_TYPES_H +#define HIP_INCLUDE_HIP_LIBRARY_TYPES_H + +#if !defined(__HIPCC_RTC__) +#include +#endif + +#if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) + +typedef enum hipDataType { + HIP_R_32F = 0, + HIP_R_64F = 1, + HIP_R_16F = 2, + HIP_R_8I = 3, + HIP_C_32F = 4, + HIP_C_64F = 5, + HIP_C_16F = 6, + HIP_C_8I = 7, + HIP_R_8U = 8, + HIP_C_8U = 9, + HIP_R_32I = 10, + HIP_C_32I = 11, + HIP_R_32U = 12, + HIP_C_32U = 13, + HIP_R_16BF = 14, + HIP_C_16BF = 15, + HIP_R_4I = 16, + HIP_C_4I = 17, + HIP_R_4U = 18, + HIP_C_4U = 19, + HIP_R_16I = 20, + HIP_C_16I = 21, + HIP_R_16U = 22, + HIP_C_16U = 23, + HIP_R_64I = 24, + HIP_C_64I = 25, + HIP_R_64U = 26, + HIP_C_64U = 27, + // HIP specific Data Types + HIP_R_8F_E4M3_FNUZ = 1000, + HIP_R_8F_E5M2_FNUZ = 1001 +} hipDataType; + +typedef enum hipLibraryPropertyType { + HIP_LIBRARY_MAJOR_VERSION, + HIP_LIBRARY_MINOR_VERSION, + HIP_LIBRARY_PATCH_LEVEL +} hipLibraryPropertyType; + +#elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) +#include "library_types.h" +#else +#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); +#endif + +#endif +/* +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#ifndef HIP_INCLUDE_HIP_DRIVER_TYPES_H +#define HIP_INCLUDE_HIP_DRIVER_TYPES_H + +#if !defined(__HIPCC_RTC__) +#include +#endif + +#if !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) +#include "driver_types.h" +#elif defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) + +#if !defined(__HIPCC_RTC__) +#ifndef __cplusplus +#include +#endif +#endif // !defined(__HIPCC_RTC__) +typedef void* hipDeviceptr_t; +typedef enum hipChannelFormatKind { + hipChannelFormatKindSigned = 0, + hipChannelFormatKindUnsigned = 1, + hipChannelFormatKindFloat = 2, + hipChannelFormatKindNone = 3 +}hipChannelFormatKind; +typedef struct hipChannelFormatDesc { + int x; + int y; + int z; + int w; + enum hipChannelFormatKind f; +}hipChannelFormatDesc; +#define HIP_TRSA_OVERRIDE_FORMAT 0x01 +#define HIP_TRSF_READ_AS_INTEGER 0x01 +#define HIP_TRSF_NORMALIZED_COORDINATES 0x02 +#define HIP_TRSF_SRGB 0x10 + +typedef struct hipArray* hipArray_t; +typedef const struct hipArray* hipArray_const_t; +typedef enum hipArray_Format { + HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01, + HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02, + HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03, + HIP_AD_FORMAT_SIGNED_INT8 = 0x08, + HIP_AD_FORMAT_SIGNED_INT16 = 0x09, + HIP_AD_FORMAT_SIGNED_INT32 = 0x0a, + HIP_AD_FORMAT_HALF = 0x10, + HIP_AD_FORMAT_FLOAT = 0x20 +}hipArray_Format; +typedef struct HIP_ARRAY_DESCRIPTOR { + size_t Width; + size_t Height; + enum hipArray_Format Format; + unsigned int NumChannels; +}HIP_ARRAY_DESCRIPTOR; +typedef struct HIP_ARRAY3D_DESCRIPTOR { + size_t Width; + size_t Height; + size_t Depth; + enum hipArray_Format Format; + unsigned int NumChannels; + unsigned int Flags; +}HIP_ARRAY3D_DESCRIPTOR; +#if !defined(__HIPCC_RTC__) +typedef struct hip_Memcpy2D { + size_t srcXInBytes; + size_t srcY; + hipMemoryType srcMemoryType; + const void* srcHost; + hipDeviceptr_t srcDevice; + hipArray_t srcArray; + size_t srcPitch; + size_t dstXInBytes; + size_t dstY; + hipMemoryType dstMemoryType; + void* dstHost; + hipDeviceptr_t dstDevice; + hipArray_t dstArray; + size_t dstPitch; + size_t WidthInBytes; + size_t Height; +} hip_Memcpy2D; +#endif // !defined(__HIPCC_RTC__) +typedef struct hipMipmappedArray { + void* data; + struct hipChannelFormatDesc desc; + unsigned int type; + unsigned int width; + unsigned int height; + unsigned int depth; + unsigned int min_mipmap_level; + unsigned int max_mipmap_level; + unsigned int flags; + enum hipArray_Format format; + unsigned int num_channels; +} hipMipmappedArray; +typedef struct hipMipmappedArray* hipMipmappedArray_t; +typedef hipMipmappedArray_t hipmipmappedArray; +typedef const struct hipMipmappedArray* hipMipmappedArray_const_t; +/** + * hip resource types + */ +typedef enum hipResourceType { + hipResourceTypeArray = 0x00, + hipResourceTypeMipmappedArray = 0x01, + hipResourceTypeLinear = 0x02, + hipResourceTypePitch2D = 0x03 +}hipResourceType; +typedef enum HIPresourcetype_enum { + HIP_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resoure */ + HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */ + HIP_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ + HIP_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ +} HIPresourcetype, hipResourcetype; +/** + * hip address modes + */ +typedef enum HIPaddress_mode_enum { + HIP_TR_ADDRESS_MODE_WRAP = 0, + HIP_TR_ADDRESS_MODE_CLAMP = 1, + HIP_TR_ADDRESS_MODE_MIRROR = 2, + HIP_TR_ADDRESS_MODE_BORDER = 3 +} HIPaddress_mode; +/** + * hip filter modes + */ +typedef enum HIPfilter_mode_enum { + HIP_TR_FILTER_MODE_POINT = 0, + 
HIP_TR_FILTER_MODE_LINEAR = 1 +} HIPfilter_mode; +/** + * Texture descriptor + */ +typedef struct HIP_TEXTURE_DESC_st { + HIPaddress_mode addressMode[3]; /**< Address modes */ + HIPfilter_mode filterMode; /**< Filter mode */ + unsigned int flags; /**< Flags */ + unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */ + HIPfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ + float mipmapLevelBias; /**< Mipmap level bias */ + float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ + float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ + float borderColor[4]; /**< Border Color */ + int reserved[12]; +} HIP_TEXTURE_DESC; +/** + * hip texture resource view formats + */ +typedef enum hipResourceViewFormat { + hipResViewFormatNone = 0x00, + hipResViewFormatUnsignedChar1 = 0x01, + hipResViewFormatUnsignedChar2 = 0x02, + hipResViewFormatUnsignedChar4 = 0x03, + hipResViewFormatSignedChar1 = 0x04, + hipResViewFormatSignedChar2 = 0x05, + hipResViewFormatSignedChar4 = 0x06, + hipResViewFormatUnsignedShort1 = 0x07, + hipResViewFormatUnsignedShort2 = 0x08, + hipResViewFormatUnsignedShort4 = 0x09, + hipResViewFormatSignedShort1 = 0x0a, + hipResViewFormatSignedShort2 = 0x0b, + hipResViewFormatSignedShort4 = 0x0c, + hipResViewFormatUnsignedInt1 = 0x0d, + hipResViewFormatUnsignedInt2 = 0x0e, + hipResViewFormatUnsignedInt4 = 0x0f, + hipResViewFormatSignedInt1 = 0x10, + hipResViewFormatSignedInt2 = 0x11, + hipResViewFormatSignedInt4 = 0x12, + hipResViewFormatHalf1 = 0x13, + hipResViewFormatHalf2 = 0x14, + hipResViewFormatHalf4 = 0x15, + hipResViewFormatFloat1 = 0x16, + hipResViewFormatFloat2 = 0x17, + hipResViewFormatFloat4 = 0x18, + hipResViewFormatUnsignedBlockCompressed1 = 0x19, + hipResViewFormatUnsignedBlockCompressed2 = 0x1a, + hipResViewFormatUnsignedBlockCompressed3 = 0x1b, + hipResViewFormatUnsignedBlockCompressed4 = 0x1c, + hipResViewFormatSignedBlockCompressed4 = 0x1d, + hipResViewFormatUnsignedBlockCompressed5 = 0x1e, + hipResViewFormatSignedBlockCompressed5 = 0x1f, + hipResViewFormatUnsignedBlockCompressed6H = 0x20, + hipResViewFormatSignedBlockCompressed6H = 0x21, + hipResViewFormatUnsignedBlockCompressed7 = 0x22 +}hipResourceViewFormat; +typedef enum HIPresourceViewFormat_enum +{ + HIP_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ + HIP_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit 
integers */ + HIP_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ + HIP_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ + HIP_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ + HIP_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ + HIP_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ + HIP_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ + HIP_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ + HIP_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ + HIP_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ + HIP_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ +} HIPresourceViewFormat; +/** + * HIP resource descriptor + */ +typedef struct hipResourceDesc { + enum hipResourceType resType; + union { + struct { + hipArray_t array; + } array; + struct { + hipMipmappedArray_t mipmap; + } mipmap; + struct { + void* devPtr; + struct hipChannelFormatDesc desc; + size_t sizeInBytes; + } linear; + struct { + void* devPtr; + struct hipChannelFormatDesc desc; + size_t width; + size_t height; + size_t pitchInBytes; + } pitch2D; + } res; +}hipResourceDesc; +typedef struct HIP_RESOURCE_DESC_st +{ + HIPresourcetype resType; /**< Resource type */ + union { + struct { + hipArray_t hArray; /**< HIP array */ + } array; + struct { + hipMipmappedArray_t hMipmappedArray; /**< HIP mipmapped array */ + } mipmap; + struct { + hipDeviceptr_t devPtr; /**< Device pointer */ + hipArray_Format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t sizeInBytes; /**< Size in bytes */ + } linear; + struct { + hipDeviceptr_t devPtr; /**< Device pointer */ + hipArray_Format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t width; /**< Width of the array in elements */ + size_t height; /**< Height of the array in elements */ + size_t pitchInBytes; /**< Pitch between two rows in bytes */ + } pitch2D; + struct { + int reserved[32]; + } reserved; + } res; + unsigned int flags; /**< Flags (must be zero) */ +} HIP_RESOURCE_DESC; +/** + * hip resource view descriptor + */ +struct hipResourceViewDesc { + enum hipResourceViewFormat format; + size_t width; + size_t height; + size_t depth; + unsigned int firstMipmapLevel; + unsigned int lastMipmapLevel; + unsigned int firstLayer; + unsigned int lastLayer; +}; +/** + * Resource view descriptor + */ +typedef struct HIP_RESOURCE_VIEW_DESC_st +{ + HIPresourceViewFormat format; /**< Resource view format */ + size_t width; /**< Width of the resource view */ + size_t height; /**< Height of the resource view */ + size_t depth; /**< Depth of the resource view */ + unsigned int 
firstMipmapLevel; /**< First defined mipmap level */ + unsigned int lastMipmapLevel; /**< Last defined mipmap level */ + unsigned int firstLayer; /**< First layer index */ + unsigned int lastLayer; /**< Last layer index */ + unsigned int reserved[16]; +} HIP_RESOURCE_VIEW_DESC; +/** + * Memory copy types + * + */ +#if !defined(__HIPCC_RTC__) +typedef enum hipMemcpyKind { + hipMemcpyHostToHost = 0, ///< Host-to-Host Copy + hipMemcpyHostToDevice = 1, ///< Host-to-Device Copy + hipMemcpyDeviceToHost = 2, ///< Device-to-Host Copy + hipMemcpyDeviceToDevice = 3, ///< Device-to-Device Copy + hipMemcpyDefault = + 4 ///< Runtime will automatically determine copy-kind based on virtual addresses. +} hipMemcpyKind; +typedef struct hipPitchedPtr { + void* ptr; + size_t pitch; + size_t xsize; + size_t ysize; +}hipPitchedPtr; +typedef struct hipExtent { + size_t width; // Width in elements when referring to array memory, in bytes when referring to + // linear memory + size_t height; + size_t depth; +}hipExtent; +typedef struct hipPos { + size_t x; + size_t y; + size_t z; +}hipPos; +typedef struct hipMemcpy3DParms { + hipArray_t srcArray; + struct hipPos srcPos; + struct hipPitchedPtr srcPtr; + hipArray_t dstArray; + struct hipPos dstPos; + struct hipPitchedPtr dstPtr; + struct hipExtent extent; + enum hipMemcpyKind kind; +} hipMemcpy3DParms; +typedef struct HIP_MEMCPY3D { + size_t srcXInBytes; + size_t srcY; + size_t srcZ; + size_t srcLOD; + hipMemoryType srcMemoryType; + const void* srcHost; + hipDeviceptr_t srcDevice; + hipArray_t srcArray; + size_t srcPitch; + size_t srcHeight; + size_t dstXInBytes; + size_t dstY; + size_t dstZ; + size_t dstLOD; + hipMemoryType dstMemoryType; + void* dstHost; + hipDeviceptr_t dstDevice; + hipArray_t dstArray; + size_t dstPitch; + size_t dstHeight; + size_t WidthInBytes; + size_t Height; + size_t Depth; +} HIP_MEMCPY3D; +static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz, + size_t ysz) { + struct hipPitchedPtr s; + s.ptr = d; + s.pitch = p; + s.xsize = xsz; + s.ysize = ysz; + return s; +} +static inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) { + struct hipPos p; + p.x = x; + p.y = y; + p.z = z; + return p; +} +static inline struct hipExtent make_hipExtent(size_t w, size_t h, size_t d) { + struct hipExtent e; + e.width = w; + e.height = h; + e.depth = d; + return e; +} +typedef enum hipFunction_attribute { + HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES, + HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + HIP_FUNC_ATTRIBUTE_NUM_REGS, + HIP_FUNC_ATTRIBUTE_PTX_VERSION, + HIP_FUNC_ATTRIBUTE_BINARY_VERSION, + HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA, + HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, + HIP_FUNC_ATTRIBUTE_MAX +} hipFunction_attribute; + +typedef enum hipPointer_attribute { + HIP_POINTER_ATTRIBUTE_CONTEXT = 1, ///< The context on which a pointer was allocated + ///< @warning - not supported in HIP + HIP_POINTER_ATTRIBUTE_MEMORY_TYPE, ///< memory type describing location of a pointer + HIP_POINTER_ATTRIBUTE_DEVICE_POINTER,///< address at which the pointer is allocated on device + HIP_POINTER_ATTRIBUTE_HOST_POINTER, ///< address at which the pointer is allocated on host + HIP_POINTER_ATTRIBUTE_P2P_TOKENS, ///< A pair of tokens for use with linux kernel interface + ///< @warning - not supported in HIP + HIP_POINTER_ATTRIBUTE_SYNC_MEMOPS, ///< Synchronize every synchronous memory operation + ///< 
initiated on this region + HIP_POINTER_ATTRIBUTE_BUFFER_ID, ///< Unique ID for an allocated memory region + HIP_POINTER_ATTRIBUTE_IS_MANAGED, ///< Indicates if the pointer points to managed memory + HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL,///< device ordinal of a device on which a pointer + ///< was allocated or registered + HIP_POINTER_ATTRIBUTE_IS_LEGACY_HIP_IPC_CAPABLE, ///< if this pointer maps to an allocation + ///< that is suitable for hipIpcGetMemHandle + ///< @warning - not supported in HIP + HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR,///< Starting address for this requested pointer + HIP_POINTER_ATTRIBUTE_RANGE_SIZE, ///< Size of the address range for this requested pointer + HIP_POINTER_ATTRIBUTE_MAPPED, ///< tells if this pointer is in a valid address range + ///< that is mapped to a backing allocation + HIP_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES,///< Bitmask of allowed hipmemAllocationHandleType + ///< for this allocation @warning - not supported in HIP + HIP_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE, ///< returns if the memory referenced by + ///< this pointer can be used with the GPUDirect RDMA API + ///< @warning - not supported in HIP + HIP_POINTER_ATTRIBUTE_ACCESS_FLAGS, ///< Returns the access flags the device associated with + ///< for the corresponding memory referenced by the ptr + HIP_POINTER_ATTRIBUTE_MEMPOOL_HANDLE ///< Returns the mempool handle for the allocation if + ///< it was allocated from a mempool + ///< @warning - not supported in HIP +} hipPointer_attribute; + +#endif // !defined(__HIPCC_RTC__) +#else +#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); +#endif +#endif +/* +Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** + * @file surface_types.h + * @brief Defines surface types for HIP runtime. 
+ */ + +#ifndef HIP_INCLUDE_HIP_SURFACE_TYPES_H +#define HIP_INCLUDE_HIP_SURFACE_TYPES_H + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreserved-identifier" +#endif + +#if !defined(__HIPCC_RTC__) +#include +#endif + +/** + * An opaque value that represents a hip surface object + */ +struct __hip_surface; +typedef struct __hip_surface* hipSurfaceObject_t; + +/** + * hip surface reference + */ +struct surfaceReference { + hipSurfaceObject_t surfaceObject; +}; + +/** + * hip surface boundary modes + */ +enum hipSurfaceBoundaryMode { + hipBoundaryModeZero = 0, + hipBoundaryModeTrap = 1, + hipBoundaryModeClamp = 2 +}; + +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +#endif /* !HIP_INCLUDE_HIP_SURFACE_TYPES_H */ +/* +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H
+
+#if !defined(__HIPCC_RTC__)
+#include
+#include
+#include
+#endif
+
+#ifdef __cplusplus
+
+extern "C" HIP_PUBLIC_API
+hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f);
+
+static inline hipChannelFormatDesc hipCreateChannelDescHalf() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
+}
+
+static inline hipChannelFormatDesc hipCreateChannelDescHalf1() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
+}
+
+static inline hipChannelFormatDesc hipCreateChannelDescHalf2() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
+}
+
+static inline hipChannelFormatDesc hipCreateChannelDescHalf4() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
+}
+
+template <typename T>
+static inline hipChannelFormatDesc hipCreateChannelDesc() {
+    return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char>() {
+    int e = (int)sizeof(char) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<signed char>() {
+    int e = (int)sizeof(signed char) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<unsigned char>() {
+    int e = (int)sizeof(unsigned char) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uchar1>() {
+    int e = (int)sizeof(unsigned char) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char1>() {
+    int e = (int)sizeof(signed char) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uchar2>() {
+    int e = (int)sizeof(unsigned char) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char2>() {
+    int e = (int)sizeof(signed char) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
+}
+
+#ifndef __GNUC__  // vector3 is the same as vector4
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uchar3>() {
+    int e = (int)sizeof(unsigned char) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char3>() {
+    int e = (int)sizeof(signed char) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
+}
+#endif
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uchar4>() {
+    int e = (int)sizeof(unsigned char) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char4>() {
+    int e = (int)sizeof(signed char) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<unsigned short>() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<signed short>() {
+    int e = (int)sizeof(signed short) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ushort1>() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<short1>() {
+    int e = (int)sizeof(signed short) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ushort2>() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<short2>() {
+    int e = (int)sizeof(signed short) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
+}
+
+#ifndef __GNUC__
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ushort3>() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<short3>() {
+    int e = (int)sizeof(signed short) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
+}
+#endif
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ushort4>() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<short4>() {
+    int e = (int)sizeof(signed short) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<unsigned int>() {
+    int e = (int)sizeof(unsigned int) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<signed int>() {
+    int e = (int)sizeof(signed int) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uint1>() {
+    int e = (int)sizeof(unsigned int) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<int1>() {
+    int e = (int)sizeof(signed int) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uint2>() {
+    int e = (int)sizeof(unsigned int) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<int2>() {
+    int e = (int)sizeof(signed int) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
+}
+
+#ifndef __GNUC__
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uint3>() {
+    int e = (int)sizeof(unsigned int) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<int3>() {
+    int e = (int)sizeof(signed int) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
+}
+#endif
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uint4>() {
+    int e = (int)sizeof(unsigned int) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<int4>() {
+    int e = (int)sizeof(signed int) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
+}
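+
+// Illustrative sketch (editorial, not part of the generated header): these
+// specializations derive a channel layout from a C++ element type on the host.
+// hipMallocArray/hipFreeArray are the usual HIP runtime entry points:
+//
+//   hipChannelFormatDesc desc = hipCreateChannelDesc<int4>();  // 32,32,32,32 signed
+//   hipArray_t arr = nullptr;
+//   hipMallocArray(&arr, &desc, 256 /*width*/, 256 /*height*/);
+//   ...
+//   hipFreeArray(arr);
+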
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float>() {
+    int e = (int)sizeof(float) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float1>() {
+    int e = (int)sizeof(float) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float2>() {
+    int e = (int)sizeof(float) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
+}
+
+#ifndef __GNUC__
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float3>() {
+    int e = (int)sizeof(float) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindFloat);
+}
+#endif
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float4>() {
+    int e = (int)sizeof(float) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
+}
+
+#if !defined(__LP64__)
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<unsigned long>() {
+    int e = (int)sizeof(unsigned long) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<signed long>() {
+    int e = (int)sizeof(signed long) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ulong1>() {
+    int e = (int)sizeof(unsigned long) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<long1>() {
+    int e = (int)sizeof(signed long) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ulong2>() {
+    int e = (int)sizeof(unsigned long) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<long2>() {
+    int e = (int)sizeof(signed long) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
+}
+
+#ifndef __GNUC__
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ulong3>() {
+    int e = (int)sizeof(unsigned long) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<long3>() {
+    int e = (int)sizeof(signed long) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
+}
+#endif
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ulong4>() {
+    int e = (int)sizeof(unsigned long) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<long4>() {
+    int e = (int)sizeof(signed long) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
+}
+#endif /* !__LP64__ */
+
+#else
+
+struct hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
+                                                 enum hipChannelFormatKind f);
+
+#endif /* __cplusplus */
+
+#endif /* !HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H */
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_TEXTURE_TYPES_H +#define HIP_INCLUDE_HIP_TEXTURE_TYPES_H + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreserved-identifier" +#pragma clang diagnostic ignored "-Wreserved-macro-identifier" +#pragma clang diagnostic ignored "-Wc++98-compat" +#endif + +#if !defined(__HIPCC_RTC__) +#include +#endif + +#if !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) +#include "texture_types.h" +#elif defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) +/******************************************************************************* + * * + * * + * * + *******************************************************************************/ +#if !defined(__HIPCC_RTC__) +#include +#include +#include +#endif // !defined(__HIPCC_RTC__) + +#define hipTextureType1D 0x01 +#define hipTextureType2D 0x02 +#define hipTextureType3D 0x03 +#define hipTextureTypeCubemap 0x0C +#define hipTextureType1DLayered 0xF1 +#define hipTextureType2DLayered 0xF2 +#define hipTextureTypeCubemapLayered 0xFC + +/** + * Should be same as HSA_IMAGE_OBJECT_SIZE_DWORD/HSA_SAMPLER_OBJECT_SIZE_DWORD + */ +#define HIP_IMAGE_OBJECT_SIZE_DWORD 12 +#define HIP_SAMPLER_OBJECT_SIZE_DWORD 8 +#define HIP_SAMPLER_OBJECT_OFFSET_DWORD HIP_IMAGE_OBJECT_SIZE_DWORD +#define HIP_TEXTURE_OBJECT_SIZE_DWORD (HIP_IMAGE_OBJECT_SIZE_DWORD + HIP_SAMPLER_OBJECT_SIZE_DWORD) + +/** + * An opaque value that represents a hip texture object + */ +struct __hip_texture; +typedef struct __hip_texture* hipTextureObject_t; + +/** + * hip texture address modes + */ +enum hipTextureAddressMode { + hipAddressModeWrap = 0, + hipAddressModeClamp = 1, + hipAddressModeMirror = 2, + hipAddressModeBorder = 3 +}; + +/** + * hip texture filter modes + */ +enum hipTextureFilterMode { hipFilterModePoint = 0, hipFilterModeLinear = 1 }; + +/** + * hip texture read modes + */ +enum hipTextureReadMode { hipReadModeElementType = 0, hipReadModeNormalizedFloat = 1 }; + +/** + * hip texture reference + */ +typedef struct textureReference { + int normalized; + enum hipTextureReadMode readMode;// used only for driver API's + enum hipTextureFilterMode filterMode; + enum hipTextureAddressMode addressMode[3]; // Texture address mode for up to 3 dimensions + struct hipChannelFormatDesc channelDesc; + int sRGB; // Perform sRGB->linear conversion during texture read + unsigned int maxAnisotropy; // Limit to the anisotropy ratio + enum hipTextureFilterMode mipmapFilterMode; + float 
mipmapLevelBias;
+    float minMipmapLevelClamp;
+    float maxMipmapLevelClamp;
+
+    hipTextureObject_t textureObject;
+    int numChannels;
+    enum hipArray_Format format;
+}textureReference;
+
+/**
+ * hip texture descriptor
+ */
+typedef struct hipTextureDesc {
+    enum hipTextureAddressMode addressMode[3];  // Texture address mode for up to 3 dimensions
+    enum hipTextureFilterMode filterMode;
+    enum hipTextureReadMode readMode;
+    int sRGB;  // Perform sRGB->linear conversion during texture read
+    float borderColor[4];
+    int normalizedCoords;
+    unsigned int maxAnisotropy;
+    enum hipTextureFilterMode mipmapFilterMode;
+    float mipmapLevelBias;
+    float minMipmapLevelClamp;
+    float maxMipmapLevelClamp;
+}hipTextureDesc;
+
+#if __cplusplus
+
+/*******************************************************************************
+ *                                                                             *
+ *                                                                             *
+ *                                                                             *
+ *******************************************************************************/
+#if __HIP__
+#define __HIP_TEXTURE_ATTRIB __attribute__((device_builtin_texture_type))
+#else
+#define __HIP_TEXTURE_ATTRIB
+#endif
+
+typedef textureReference* hipTexRef;
+
+template <class T, int texType = hipTextureType1D,
+          enum hipTextureReadMode mode = hipReadModeElementType>
+struct __HIP_TEXTURE_ATTRIB texture : public textureReference {
+    texture(int norm = 0, enum hipTextureFilterMode fMode = hipFilterModePoint,
+            enum hipTextureAddressMode aMode = hipAddressModeClamp) {
+        normalized = norm;
+        readMode = mode;
+        filterMode = fMode;
+        addressMode[0] = aMode;
+        addressMode[1] = aMode;
+        addressMode[2] = aMode;
+        channelDesc = hipCreateChannelDesc<T>();
+        sRGB = 0;
+        textureObject = nullptr;
+        maxAnisotropy = 0;
+        mipmapLevelBias = 0;
+        minMipmapLevelClamp = 0;
+        maxMipmapLevelClamp = 0;
+    }
+
+    texture(int norm, enum hipTextureFilterMode fMode, enum hipTextureAddressMode aMode,
+            struct hipChannelFormatDesc desc) {
+        normalized = norm;
+        readMode = mode;
+        filterMode = fMode;
+        addressMode[0] = aMode;
+        addressMode[1] = aMode;
+        addressMode[2] = aMode;
+        channelDesc = desc;
+        sRGB = 0;
+        textureObject = nullptr;
+        maxAnisotropy = 0;
+        mipmapLevelBias = 0;
+        minMipmapLevelClamp = 0;
+        maxMipmapLevelClamp = 0;
+    }
+};
+
+#endif /* __cplusplus */
+
+#else
+#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
+#endif
+
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+
+#endif
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/ + +#pragma once + +#if !defined(__HIPCC_RTC__) +#include +#endif + +extern "C" { + +#define ADDRESS_SPACE_CONSTANT __attribute__((address_space(4))) + +__device__ float4::Native_vec_ __ockl_image_load_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c); + +__device__ float4::Native_vec_ __ockl_image_load_1Db(unsigned int ADDRESS_SPACE_CONSTANT*i, int c); + +__device__ float4::Native_vec_ __ockl_image_load_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_load_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_load_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_load_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_load_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f); + +__device__ float4::Native_vec_ __ockl_image_load_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f); + +__device__ float4::Native_vec_ __ockl_image_load_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l); + +__device__ float4::Native_vec_ __ockl_image_load_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l); + +__device__ float4::Native_vec_ __ockl_image_load_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l); + +__device__ float4::Native_vec_ __ockl_image_load_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l); + +__device__ float4::Native_vec_ __ockl_image_load_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l); + +__device__ float4::Native_vec_ __ockl_image_load_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l); + +__device__ float4::Native_vec_ __ockl_image_load_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l); + +__device__ void __ockl_image_store_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, float4::Native_vec_ p); + +__device__ void __ockl_image_store_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p); + +__device__ void __ockl_image_store_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p); + +__device__ void __ockl_image_store_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p); + +__device__ void __ockl_image_store_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p); + +__device__ void __ockl_image_store_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, float4::Native_vec_ p); + +__device__ void __ockl_image_store_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, float4::Native_vec_ p); + +__device__ void __ockl_image_store_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l, float4::Native_vec_ p); + +__device__ void __ockl_image_store_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p); + +__device__ void __ockl_image_store_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p); + +__device__ void __ockl_image_store_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p); + +__device__ void __ockl_image_store_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p); + +__device__ void 
__ockl_image_store_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l, float4::Native_vec_ p); + +__device__ void __ockl_image_store_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l, float4::Native_vec_ p); + +__device__ float4::Native_vec_ __ockl_image_sample_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c); + +__device__ float4::Native_vec_ __ockl_image_sample_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_sample_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_sample_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_sample_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_sample_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_sample_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_sample_grad_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float dx, float dy); + +__device__ float4::Native_vec_ __ockl_image_sample_grad_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float dx, float dy); + +__device__ float4::Native_vec_ __ockl_image_sample_grad_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy); + +__device__ float4::Native_vec_ __ockl_image_sample_grad_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy); + +__device__ float4::Native_vec_ __ockl_image_sample_grad_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float4::Native_vec_ dx, float4::Native_vec_ dy); + +__device__ float4::Native_vec_ __ockl_image_sample_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float l); + +__device__ float4::Native_vec_ __ockl_image_sample_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l); + +__device__ float4::Native_vec_ __ockl_image_sample_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l); + +__device__ float4::Native_vec_ __ockl_image_sample_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l); + +__device__ float4::Native_vec_ __ockl_image_sample_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l); + +__device__ float4::Native_vec_ __ockl_image_sample_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l); + +__device__ float4::Native_vec_ __ockl_image_sample_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l); 
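+
+/* Illustrative sketch (editorial, not part of the generated header): a HIP
+ * texture object packs an image descriptor followed by a sampler descriptor,
+ * so the C++ wrappers further down derive both pointers from one handle
+ * before calling these intrinsics. `texObj` and `coords` are assumed names:
+ *
+ *   unsigned int ADDRESS_SPACE_CONSTANT* img = (unsigned int ADDRESS_SPACE_CONSTANT*)texObj;
+ *   unsigned int ADDRESS_SPACE_CONSTANT* smp = img + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
+ *   float4::Native_vec_ texel = __ockl_image_sample_2D(img, smp, coords);
+ */
+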
+ +__device__ float4::Native_vec_ __ockl_image_gather4r_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_gather4g_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_gather4b_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_gather4a_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c); + +__device__ int __ockl_image_channel_data_type_1D(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_data_type_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_data_type_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_data_type_2D(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_data_type_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_data_type_2Dad(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_data_type_2Dd(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_data_type_3D(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_data_type_CM(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_data_type_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_order_1D(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_order_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_order_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_order_2D(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_order_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_order_2Dad(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_order_2Dd(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_order_3D(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_order_CM(unsigned int ADDRESS_SPACE_CONSTANT* i); + +__device__ int __ockl_image_channel_order_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i); + +} +/* +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if defined(__cplusplus)
+
+#if !defined(__HIPCC_RTC__)
+#include
+#include
+#include
+#include
+#endif // !defined(__HIPCC_RTC__)
+
+#define TEXTURE_PARAMETERS_INIT                                                                     \
+    unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)t.textureObject; \
+    unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
+
+template <typename T>
+struct __hip_is_tex_surf_scalar_channel_type
+{
+    static constexpr bool value =
+        std::is_same<T, char>::value ||
+        std::is_same<T, unsigned char>::value ||
+        std::is_same<T, short>::value ||
+        std::is_same<T, unsigned short>::value ||
+        std::is_same<T, int>::value ||
+        std::is_same<T, unsigned int>::value ||
+        std::is_same<T, float>::value;
+};
+
+template <typename T>
+struct __hip_is_tex_surf_channel_type
+{
+    static constexpr bool value =
+        __hip_is_tex_surf_scalar_channel_type<T>::value;
+};
+
+template<
+    typename T,
+    unsigned int rank>
+struct __hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>
+{
+    static constexpr bool value =
+        __hip_is_tex_surf_scalar_channel_type<T>::value &&
+        ((rank == 1) ||
+         (rank == 2) ||
+         (rank == 4));
+};
+
+template <typename T>
+struct __hip_is_tex_normalized_channel_type
+{
+    static constexpr bool value =
+        std::is_same<T, char>::value ||
+        std::is_same<T, unsigned char>::value ||
+        std::is_same<T, short>::value ||
+        std::is_same<T, unsigned short>::value;
+};
+
+template<
+    typename T,
+    unsigned int rank>
+struct __hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>
+{
+    static constexpr bool value =
+        __hip_is_tex_normalized_channel_type<T>::value &&
+        ((rank == 1) ||
+         (rank == 2) ||
+         (rank == 4));
+};
+
+template <
+    typename T,
+    hipTextureReadMode readMode,
+    typename Enable = void>
+struct __hip_tex_ret
+{
+    static_assert(std::is_same<T, void>::value, "Invalid channel type!");
+};
+
+/*
+ * Map from device function return U to scalar texture type T
+ */
+template <typename T, typename U>
+__forceinline__ __device__
+typename std::enable_if<
+    __hip_is_tex_surf_scalar_channel_type<T>::value, const T>::type
+__hipMapFrom(const U &u) {
+    if constexpr (sizeof(T) < sizeof(float)) {
+        union {
+            U u;
+            int i;
+        } d = { u };
+        return static_cast<T>(d.i);
+    } else { // sizeof(T) == sizeof(float)
+        union {
+            U u;
+            T t;
+        } d = { u };
+        return d.t;
+    }
+}
+
+/*
+ * Map from device function return U to vector texture type T
+ */
+template <typename T, typename U>
+__forceinline__ __device__
+typename std::enable_if<
+    __hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const T>::type
+__hipMapFrom(const U &u) {
+    if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
+        union {
+            U u;
+            int4 i4;
+        } d = { u };
+        return __hipMapVector<T>(d.i4);
+    } else { // sizeof(typename T::value_type) == sizeof(float)
+        union {
+            U u;
+            T t;
+        } d = { u };
+        return d.t;
+    }
+}
+
+/*
+ * Map from scalar texture type T to device function input U
+ */
+template <typename U, typename T>
+__forceinline__ __device__
+typename std::enable_if<
+    __hip_is_tex_surf_scalar_channel_type<T>::value, const U>::type
+__hipMapTo(const T &t) {
+    if constexpr (sizeof(T) < sizeof(float)) {
+        union {
+            U u;
+            int i;
+        } d = { 0 };
+        d.i = static_cast<int>(t);
+        return d.u;
+    } else { // sizeof(T) == sizeof(float)
+        union {
+            U u;
+            T t;
+        } d = { 0 };
+        d.t = t;
+        return d.u;
+    }
+}
+
+/*
+ * Map from vector texture type T to device function input U
+ */
+template <typename U, typename T>
+__forceinline__ __device__
+typename std::enable_if<
+    __hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const U>::type
+__hipMapTo(const T &t) {
+    if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
+        union {
+            U u;
+            int4 i4;
+        } d = { 0 };
+        d.i4 = __hipMapVector<int4>(t);
+        return d.u;
+    } else { // sizeof(typename T::value_type) == sizeof(float)
+        union {
+            U u;
+            T t;
+        } d = { 0 };
+        d.t = t;
+        return d.u;
+    }
+}
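+
+// Illustrative sketch (editorial, not part of the generated header): the
+// __hipMap* helpers above reinterpret the raw payload returned by the OCKL
+// intrinsics. Reading a `char` texel conceptually does:
+//
+//   union { float4::Native_vec_ u; int i; } d = { tmp };  // tmp from an __ockl_image_* call
+//   char texel = static_cast<char>(d.i);                  // narrow to the channel type
+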
+template <
+    typename T,
+    hipTextureReadMode readMode>
+using __hip_tex_ret_t = typename __hip_tex_ret<T, readMode>::type;
+
+template <typename T>
+struct __hip_tex_ret<
+    T,
+    hipReadModeElementType,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type>
+{
+    using type = T;
+};
+
+template<
+    typename T,
+    unsigned int rank>
+struct __hip_tex_ret<
+    HIP_vector_type<T, rank>,
+    hipReadModeElementType,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
+{
+    using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeElementType>, rank>;
+};
+
+template <typename T>
+struct __hip_tex_ret<
+    T,
+    hipReadModeNormalizedFloat,
+    typename std::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type>
+{
+    using type = float;
+};
+
+template<
+    typename T,
+    unsigned int rank>
+struct __hip_tex_ret<
+    HIP_vector_type<T, rank>,
+    hipReadModeNormalizedFloat,
+    typename std::enable_if<__hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
+{
+    using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeNormalizedFloat>, rank>;
+};
+
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1Dfetch(texture<T, hipTextureType1D, readMode> t, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_load_1Db(i, x);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1D(texture<T, hipTextureType1D, readMode> t, float x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_1D(i, s, x);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2D(texture<T, hipTextureType2D, readMode> t, float x, float y)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_2D(i, s, float2(x, y).data);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayered(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayered(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3D(texture<T, hipTextureType3D, readMode> t, float x, float y, float z)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemap(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_CM(i, s, float4(x, y, z, 0.0f).data);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
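+
+// Illustrative sketch (editorial, not part of the generated header): with the
+// texture reference API above, a kernel samples through a globally bound
+// `texture<>`. `tex`, `out`, and `k` are assumed names:
+//
+//   texture<float, hipTextureType2D, hipReadModeElementType> tex;
+//   __global__ void k(float* out, int w) {
+//       int x = threadIdx.x, y = blockIdx.x;
+//       out[y * w + x] = tex2D(tex, x + 0.5f, y + 0.5f);
+//   }
+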
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLod(texture<T, hipTextureType1D, readMode> t, float x, float level)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLod(texture<T, hipTextureType2D, readMode> t, float x, float y, float level)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredLod(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float level)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredLod(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float level)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_lod_2Da(i, s, float4(x, y, layer, 0.0f).data, level);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DLod(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float level)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLod(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float level)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_lod_CM(i, s, float4(x, y, z, 0.0f).data, level);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayered(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_CMa(i, s, float4(x, y, z, layer).data);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredLod(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float level)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_lod_CMa(i, s, float4(x, y, z, layer).data, level);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapGrad(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // TODO missing in device libs.
+    // auto tmp = __ockl_image_sample_grad_CM(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+    // return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+    return {};
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredGrad(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // TODO missing in device libs.
+    // auto tmp = __ockl_image_sample_grad_CMa(i, s, float4(x, y, z, layer).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+    // return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+    return {};
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DGrad(texture<T, hipTextureType1D, readMode> t, float x, float dPdx, float dPdy)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DGrad(texture<T, hipTextureType2D, readMode> t, float x, float y, float2 dPdx, float2 dPdy)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredGrad(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float dPdx, float dPdy)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dPdx, dPdy);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredGrad(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DGrad(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+    return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+}
+
+template <
+    typename T,
+    hipTextureReadMode readMode,
+    typename Enable = void>
+struct __hip_tex2dgather_ret
+{
+    static_assert(std::is_same<T, void>::value, "Invalid channel type!");
+};
+
+template <
+    typename T,
+    hipTextureReadMode readMode>
+using __hip_tex2dgather_ret_t = typename __hip_tex2dgather_ret<T, readMode>::type;
+
+template <typename T>
+struct __hip_tex2dgather_ret<
+    T,
+    hipReadModeElementType,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type>
+{
+    using type = HIP_vector_type<T, 4>;
+};
+
+template<
+    typename T,
+    unsigned int rank>
+struct __hip_tex2dgather_ret<
+    HIP_vector_type<T, rank>,
+    hipReadModeElementType,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
+{
+    using type = HIP_vector_type<T, 4>;
+};
+
+template <typename T>
+struct __hip_tex2dgather_ret<
+    T,
+    hipReadModeNormalizedFloat,
+    typename std::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type>
+{
+    using type = float4;
+};
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex2dgather_ret_t<T, readMode> tex2Dgather(texture<T, hipTextureType2D, readMode> t, float x, float y, int comp=0)
+{
+    TEXTURE_PARAMETERS_INIT;
+    switch (comp) {
+    case 1: {
+        auto tmp = __ockl_image_gather4g_2D(i, s, float2(x, y).data);
+        return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
+    }
+    case 2: {
+        auto tmp = __ockl_image_gather4b_2D(i, s, float2(x, y).data);
+        return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
+    }
+    case 3: {
+        auto tmp = __ockl_image_gather4a_2D(i, s, float2(x, y).data);
+        return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
+    }
+    default: {
+        auto tmp = __ockl_image_gather4r_2D(i, s, float2(x, y).data);
+        return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
+    }
+    }
+    return {};
+}
+
+#endif
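+
+// Illustrative sketch (editorial, not part of the generated header):
+// tex2Dgather returns the selected component (comp 0..3 -> r,g,b,a) of the
+// four texels covering (x, y). `tex`, `out`, and `k` are assumed names:
+//
+//   texture<uchar4, hipTextureType2D, hipReadModeElementType> tex;
+//   __global__ void k(uchar4* out) {
+//       *out = tex2Dgather(tex, 10.5f, 10.5f, /*comp=*/1);  // four green samples
+//   }
+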
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if defined(__cplusplus)
+
+#if !defined(__HIPCC_RTC__)
+#include
+#include
+#include
+#include
+#include
+#endif // !defined(__HIPCC_RTC__)
+
+#define TEXTURE_OBJECT_PARAMETERS_INIT                                                             \
+    unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)textureObject;  \
+    unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1Dfetch(hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_load_1Db(i, x);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1Dfetch(T *ptr, hipTextureObject_t textureObject, int x)
+{
+    *ptr = tex1Dfetch<T>(textureObject, x);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1D(hipTextureObject_t textureObject, float x)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_1D(i, s, x);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1D(T *ptr, hipTextureObject_t textureObject, float x)
+{
+    *ptr = tex1D<T>(textureObject, x);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, float x, float y)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_2D(i, s, float2(x, y).data);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2D(T *ptr, hipTextureObject_t textureObject, float x, float y)
+{
+    *ptr = tex2D<T>(textureObject, x, y);
+}
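+
+// Illustrative sketch (editorial, not part of the generated header): the
+// object API takes a hipTextureObject_t created at run time instead of a
+// global texture reference. Host-side setup (resDesc/texDesc values assumed):
+//
+//   hipResourceDesc resDesc = {};   // e.g. hipResourceTypeArray plus a hipArray_t
+//   hipTextureDesc texDesc = {};    // filter/address modes, normalizedCoords, ...
+//   hipTextureObject_t texObj;
+//   hipCreateTextureObject(&texObj, &resDesc, &texDesc, nullptr);
+//
+//   __global__ void k(float* out, hipTextureObject_t t) {
+//       out[0] = tex2D<float>(t, 0.5f, 0.5f);
+//   }
+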
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, float x, float y, float z)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex3D(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
+{
+    *ptr = tex3D<T>(textureObject, x, y, z);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObject, float x, int layer)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1DLayered(T *ptr, hipTextureObject_t textureObject, float x, int layer)
+{
+    *ptr = tex1DLayered<T>(textureObject, x, layer);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y, int layer)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2DLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer)
+{
+    *ptr = tex2DLayered<T>(textureObject, x, y, layer);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemap(hipTextureObject_t textureObject, float x, float y, float z)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_CM(i, s, float4(x, y, z, 0.0f).data);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemap(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
+{
+    *ptr = texCubemap<T>(textureObject, x, y, z);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t textureObject, float x, float y, float z, int layer)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_CMa(i, s, float4(x, y, z, layer).data);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemapLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer)
+{
+    *ptr = texCubemapLayered<T>(textureObject, x, y, z, layer);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject, float x, float y, int comp = 0)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    switch (comp) {
+    case 1: {
+        auto tmp = __ockl_image_gather4g_2D(i, s, float2(x, y).data);
+        return __hipMapFrom<T>(tmp);
+    }
+    case 2: {
+        auto tmp = __ockl_image_gather4b_2D(i, s, float2(x, y).data);
+        return __hipMapFrom<T>(tmp);
+    }
+    case 3: {
+        auto tmp = __ockl_image_gather4a_2D(i, s, float2(x, y).data);
+        return __hipMapFrom<T>(tmp);
+    }
+    default: {
+        auto tmp = __ockl_image_gather4r_2D(i, s, float2(x, y).data);
+        return __hipMapFrom<T>(tmp);
+    }
+    }
+    return {};
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2Dgather(T *ptr, hipTextureObject_t textureObject, float x, float y, int comp = 0)
+{
+    *ptr = tex2Dgather<T>(textureObject, x, y, comp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, float x, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1DLod(T *ptr, hipTextureObject_t textureObject, float x, float level)
+{
+    *ptr = tex1DLod<T>(textureObject, x, level);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, float x, float y, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float level)
+{
+    *ptr = tex2DLod<T>(textureObject, x, y, level);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex3DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
+{
+    *ptr = tex3DLod<T>(textureObject, x, y, z, level);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x, int layer, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, int layer, float level)
+{
+    *ptr = tex1DLayeredLod<T>(textureObject, x, layer, level);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2DLayeredLod(hipTextureObject_t textureObject, float x, float y, int layer, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_lod_2Da(i, s, float4(x, y, layer, 0.0f).data, level);
+    return __hipMapFrom<T>(tmp);
+}
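+
+// Illustrative sketch (editorial, not part of the generated header): the *Lod
+// variants select an explicit mipmap level instead of letting the sampler
+// derive one. `k`, `out`, and `t` are assumed names:
+//
+//   __global__ void k(float* out, hipTextureObject_t t) {
+//       out[0] = tex2DLod<float>(t, 0.5f, 0.5f, 2.0f);  // sample mip level 2
+//   }
+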
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float level)
+{
+    *ptr = tex2DLayeredLod<T>(textureObject, x, y, layer, level);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_lod_CM(i, s, float4(x, y, z, 0.0f).data, level);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemapLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
+{
+    *ptr = texCubemapLod<T>(textureObject, x, y, z, level);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    // TODO missing in device libs.
+    // auto tmp = __ockl_image_sample_grad_CM(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+    // return __hipMapFrom<T>(tmp);
+    return {};
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemapGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+    *ptr = texCubemapGrad<T>(textureObject, x, y, z, dPdx, dPdy);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_lod_CMa(i, s, float4(x, y, z, layer).data, level);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemapLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
+{
+    *ptr = texCubemapLayeredLod<T>(textureObject, x, y, z, layer, level);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1DGrad(T *ptr, hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
+{
+    *ptr = tex1DGrad<T>(textureObject, x, dPdx, dPdy);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
+{
+    *ptr = tex2DGrad<T>(textureObject, x, y, dPdx, dPdy);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex3DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+    *ptr = tex3DGrad<T>(textureObject, x, y, z, dPdx, dPdy);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dPdx, dPdy);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
+{
+    *ptr = tex1DLayeredGrad<T>(textureObject, x, layer, dPdx, dPdy);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2DLayeredGrad(hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
+    return __hipMapFrom<T>(tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{
+    *ptr = tex2DLayeredGrad<T>(textureObject, x, y, layer, dPdx, dPdy);
+}
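+
+// Illustrative sketch (editorial, not part of the generated header): the *Grad
+// variants pass explicit screen-space derivatives, from which the sampler
+// derives the LOD. `k`, `out`, and `t` are assumed names:
+//
+//   __global__ void k(float* out, hipTextureObject_t t) {
+//       float2 dPdx = make_float2(1.0f / 256.0f, 0.0f);
+//       float2 dPdy = make_float2(0.0f, 1.0f / 256.0f);
+//       out[0] = tex2DGrad<float>(t, 0.5f, 0.5f, dPdx, dPdy);
+//   }
+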
+ // auto tmp = __ockl_image_sample_grad_CMa(i, s, float4(x, y, z, layer).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data); + // return __hipMapFrom<T>(tmp); + return {}; +} + +template < + typename T, + typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr> +static __device__ __hip_img_chk__ void texCubemapLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy) +{ + *ptr = texCubemapLayeredGrad<T>(textureObject, x, y, z, layer, dPdx, dPdy); +} + +#endif +/* +Copyright (c) 2018 - 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_SURFACE_FUNCTIONS_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_SURFACE_FUNCTIONS_H + +#if defined(__cplusplus) + +#if !defined(__HIPCC_RTC__) +#include +#include +#include +#include +#endif + +#if defined(__HIPCC_RTC__) +#define __HOST_DEVICE__ __device__ +#else +#define __HOST_DEVICE__ __host__ __device__ +#endif + +#define __HIP_SURFACE_OBJECT_PARAMETERS_INIT \ + unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)surfObj; + +// CUDA uses byte addressing; map it to pixel addressing for HIP +static __HOST_DEVICE__ __forceinline__ int __hipGetPixelAddr(int x, int format, int order) { + /* + * use the format index below to generate the format LUT + typedef enum { + HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0, + HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 = 5, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 = 6, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010 = 7, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14, + HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT = 15 + } hsa_ext_image_channel_type_t; + */ + static const int FormatLUT[] = { 0, 1, 0, 1, 3, 1, 1, 1, 0, 1, 2, 0, 1, 2, 1, 2 }; + x = FormatLUT[format] == 3 ? 
x / FormatLUT[format] : x >> FormatLUT[format]; + + /* + * use the order index below to generate the order LUT + typedef enum { + HSA_EXT_IMAGE_CHANNEL_ORDER_A = 0, + HSA_EXT_IMAGE_CHANNEL_ORDER_R = 1, + HSA_EXT_IMAGE_CHANNEL_ORDER_RX = 2, + HSA_EXT_IMAGE_CHANNEL_ORDER_RG = 3, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGX = 4, + HSA_EXT_IMAGE_CHANNEL_ORDER_RA = 5, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGB = 6, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX = 7, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA = 8, + HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA = 9, + HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB = 10, + HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR = 11, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB = 12, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX = 13, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA = 14, + HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA = 15, + HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY = 16, + HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE = 17, + HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH = 18, + HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19 + } hsa_ext_image_channel_order_t; + */ + static const int OrderLUT[] = { 0, 0, 1, 1, 3, 1, 3, 2, 2, 2, 2, 2, 3, 2, 2, 2, 0, 0, 0, 0 }; + return x = OrderLUT[order] == 3 ? x / OrderLUT[order] : x >> OrderLUT[order]; +} + +template < + typename T, + typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr> +static __device__ __hip_img_chk__ void surf1Dread(T* data, hipSurfaceObject_t surfObj, int x, + int boundaryMode = hipBoundaryModeZero) { + __HIP_SURFACE_OBJECT_PARAMETERS_INIT + x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i)); + auto tmp = __ockl_image_load_1D(i, x); + *data = __hipMapFrom<T>(tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr> +static __device__ __hip_img_chk__ void surf1Dwrite(T data, hipSurfaceObject_t surfObj, int x) { + __HIP_SURFACE_OBJECT_PARAMETERS_INIT + x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i)); + auto tmp = __hipMapTo<float4::Native_vec_>(data); + __ockl_image_store_1D(i, x, tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr> +static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x, int y) { + __HIP_SURFACE_OBJECT_PARAMETERS_INIT + x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i)); + auto tmp = __ockl_image_load_2D(i, int2(x, y).data); + *data = __hipMapFrom<T>(tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr> +static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y) { + __HIP_SURFACE_OBJECT_PARAMETERS_INIT + x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i)); + auto tmp = __hipMapTo<float4::Native_vec_>(data); + __ockl_image_store_2D(i, int2(x, y).data, tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr> +static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t surfObj, int x, int y, int z) { + __HIP_SURFACE_OBJECT_PARAMETERS_INIT + x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i)); + auto tmp = __ockl_image_load_3D(i, int4(x, y, z, 0).data); + *data = __hipMapFrom<T>(tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr> +static __device__ __hip_img_chk__ void surf3Dwrite(T data, 
hipSurfaceObject_t surfObj, int x, int y, int z) { + __HIP_SURFACE_OBJECT_PARAMETERS_INIT + x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i)); + auto tmp = __hipMapTo<float4::Native_vec_>(data); + __ockl_image_store_3D(i, int4(x, y, z, 0).data, tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr> +static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int layer) { + __HIP_SURFACE_OBJECT_PARAMETERS_INIT + x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i)); + auto tmp = __ockl_image_load_lod_1D(i, x, layer); + *data = __hipMapFrom<T>(tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr> +static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int layer) { + __HIP_SURFACE_OBJECT_PARAMETERS_INIT + x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i)); + auto tmp = __hipMapTo<float4::Native_vec_>(data); + __ockl_image_store_lod_1D(i, x, layer, tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr> +static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int y, int layer) { + __HIP_SURFACE_OBJECT_PARAMETERS_INIT + x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i)); + auto tmp = __ockl_image_load_lod_2D(i, int2(x, y).data, layer); + *data = __hipMapFrom<T>(tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr> +static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int layer) { + __HIP_SURFACE_OBJECT_PARAMETERS_INIT + x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i)); + auto tmp = __hipMapTo<float4::Native_vec_>(data); + __ockl_image_store_lod_2D(i, int2(x, y).data, layer, tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr> +static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject_t surfObj, int x, int y, int face) { + __HIP_SURFACE_OBJECT_PARAMETERS_INIT + x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i)); + auto tmp = __ockl_image_load_CM(i, int2(x, y).data, face); + *data = __hipMapFrom<T>(tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr> +static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int face) { + __HIP_SURFACE_OBJECT_PARAMETERS_INIT + x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i)); + auto tmp = __hipMapTo<float4::Native_vec_>(data); + __ockl_image_store_CM(i, int2(x, y).data, face, tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr> +static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int y, int face, + int layer) { + __HIP_SURFACE_OBJECT_PARAMETERS_INIT + x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i)); + auto tmp = __ockl_image_load_lod_CM(i, int2(x, y).data, face, layer); + *data = __hipMapFrom<T>(tmp); +} + +template < + typename T, 
+ typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr> +static __device__ __hip_img_chk__ void surfCubemapLayeredwrite(T* data, hipSurfaceObject_t surfObj, int x, int y, int face, + int layer) { + __HIP_SURFACE_OBJECT_PARAMETERS_INIT + x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i)); + auto tmp = __hipMapTo<float4::Native_vec_>(data); + __ockl_image_store_lod_CM(i, int2(x, y).data, face, layer, tmp); +} + +#endif + +#endif +/* +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H + +#if !defined(__HIPCC_RTC__) +#include "hip/amd_detail/amd_hip_vector_types.h" +#endif + +#if defined(__HIPCC_RTC__) +#define __HOST_DEVICE__ __device__ +#else +#define __HOST_DEVICE__ __host__ __device__ +// TODO: Clang has a bug which allows device functions to call std functions +// when std functions are introduced into default namespace by using statement. +// math.h may be included after this bug is fixed. 
+#if __cplusplus +#include <cmath> +#else +#include "math.h" +#endif +#endif // !defined(__HIPCC_RTC__) + +#if __cplusplus +#define COMPLEX_NEG_OP_OVERLOAD(type) \ + __HOST_DEVICE__ static inline type operator-(const type& op) { \ + type ret; \ + ret.x = -op.x; \ + ret.y = -op.y; \ + return ret; \ + } + +#define COMPLEX_EQ_OP_OVERLOAD(type) \ + __HOST_DEVICE__ static inline bool operator==(const type& lhs, const type& rhs) { \ + return lhs.x == rhs.x && lhs.y == rhs.y; \ + } + +#define COMPLEX_NE_OP_OVERLOAD(type) \ + __HOST_DEVICE__ static inline bool operator!=(const type& lhs, const type& rhs) { \ + return !(lhs == rhs); \ + } + +#define COMPLEX_ADD_OP_OVERLOAD(type) \ + __HOST_DEVICE__ static inline type operator+(const type& lhs, const type& rhs) { \ + type ret; \ + ret.x = lhs.x + rhs.x; \ + ret.y = lhs.y + rhs.y; \ + return ret; \ + } + +#define COMPLEX_SUB_OP_OVERLOAD(type) \ + __HOST_DEVICE__ static inline type operator-(const type& lhs, const type& rhs) { \ + type ret; \ + ret.x = lhs.x - rhs.x; \ + ret.y = lhs.y - rhs.y; \ + return ret; \ + } + +#define COMPLEX_MUL_OP_OVERLOAD(type) \ + __HOST_DEVICE__ static inline type operator*(const type& lhs, const type& rhs) { \ + type ret; \ + ret.x = lhs.x * rhs.x - lhs.y * rhs.y; \ + ret.y = lhs.x * rhs.y + lhs.y * rhs.x; \ + return ret; \ + } + +#define COMPLEX_DIV_OP_OVERLOAD(type) \ + __HOST_DEVICE__ static inline type operator/(const type& lhs, const type& rhs) { \ + type ret; \ + ret.x = (lhs.x * rhs.x + lhs.y * rhs.y); \ + ret.y = (rhs.x * lhs.y - lhs.x * rhs.y); \ + ret.x = ret.x / (rhs.x * rhs.x + rhs.y * rhs.y); \ + ret.y = ret.y / (rhs.x * rhs.x + rhs.y * rhs.y); \ + return ret; \ + } + +#define COMPLEX_ADD_PREOP_OVERLOAD(type) \ + __HOST_DEVICE__ static inline type& operator+=(type& lhs, const type& rhs) { \ + lhs.x += rhs.x; \ + lhs.y += rhs.y; \ + return lhs; \ + } + +#define COMPLEX_SUB_PREOP_OVERLOAD(type) \ + __HOST_DEVICE__ static inline type& operator-=(type& lhs, const type& rhs) { \ + lhs.x -= rhs.x; \ + lhs.y -= rhs.y; \ + return lhs; \ + } + +#define COMPLEX_MUL_PREOP_OVERLOAD(type) \ + __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) { \ + type temp{lhs}; \ + lhs.x = rhs.x * temp.x - rhs.y * temp.y; \ + lhs.y = rhs.y * temp.x + rhs.x * temp.y; \ + return lhs; \ + } + +#define COMPLEX_DIV_PREOP_OVERLOAD(type) \ + __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) { \ + type temp; \ + temp.x = (lhs.x*rhs.x + lhs.y * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \ + temp.y = (lhs.y * rhs.x - lhs.x * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \ + lhs = temp; \ + return lhs; \ + } + +#define COMPLEX_SCALAR_PRODUCT(type, type1) \ + __HOST_DEVICE__ static inline type operator*(const type& lhs, type1 rhs) { \ + type ret; \ + ret.x = lhs.x * rhs; \ + ret.y = lhs.y * rhs; \ + return ret; \ + } + +#endif + +typedef float2 hipFloatComplex; + +__HOST_DEVICE__ static inline float hipCrealf(hipFloatComplex z) { return z.x; } + +__HOST_DEVICE__ static inline float hipCimagf(hipFloatComplex z) { return z.y; } + +__HOST_DEVICE__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) { + hipFloatComplex z; + z.x = a; + z.y = b; + return z; +} + +__HOST_DEVICE__ static inline hipFloatComplex hipConjf(hipFloatComplex z) { + hipFloatComplex ret; + ret.x = z.x; + ret.y = -z.y; + return ret; +} + +__HOST_DEVICE__ static inline float hipCsqabsf(hipFloatComplex z) { + return z.x * z.x + z.y * z.y; +} + +__HOST_DEVICE__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) { 
+ return make_hipFloatComplex(p.x + q.x, p.y + q.y); +} + +__HOST_DEVICE__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) { + return make_hipFloatComplex(p.x - q.x, p.y - q.y); +} + +__HOST_DEVICE__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) { + return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y); +} + +__HOST_DEVICE__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) { + float sqabs = hipCsqabsf(q); + hipFloatComplex ret; + ret.x = (p.x * q.x + p.y * q.y) / sqabs; + ret.y = (p.y * q.x - p.x * q.y) / sqabs; + return ret; +} + +__HOST_DEVICE__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); } + + +typedef double2 hipDoubleComplex; + +__HOST_DEVICE__ static inline double hipCreal(hipDoubleComplex z) { return z.x; } + +__HOST_DEVICE__ static inline double hipCimag(hipDoubleComplex z) { return z.y; } + +__HOST_DEVICE__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) { + hipDoubleComplex z; + z.x = a; + z.y = b; + return z; +} + +__HOST_DEVICE__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) { + hipDoubleComplex ret; + ret.x = z.x; + ret.y = -z.y; + return ret; +} + +__HOST_DEVICE__ static inline double hipCsqabs(hipDoubleComplex z) { + return z.x * z.x + z.y * z.y; +} + +__HOST_DEVICE__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) { + return make_hipDoubleComplex(p.x + q.x, p.y + q.y); +} + +__HOST_DEVICE__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) { + return make_hipDoubleComplex(p.x - q.x, p.y - q.y); +} + +__HOST_DEVICE__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) { + return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y); +} + +__HOST_DEVICE__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) { + double sqabs = hipCsqabs(q); + hipDoubleComplex ret; + ret.x = (p.x * q.x + p.y * q.y) / sqabs; + ret.y = (p.y * q.x - p.x * q.y) / sqabs; + return ret; +} + +__HOST_DEVICE__ static inline double hipCabs(hipDoubleComplex z) { return sqrt(hipCsqabs(z)); } + + +#if __cplusplus + +COMPLEX_NEG_OP_OVERLOAD(hipFloatComplex) +COMPLEX_EQ_OP_OVERLOAD(hipFloatComplex) +COMPLEX_NE_OP_OVERLOAD(hipFloatComplex) +COMPLEX_ADD_OP_OVERLOAD(hipFloatComplex) +COMPLEX_SUB_OP_OVERLOAD(hipFloatComplex) +COMPLEX_MUL_OP_OVERLOAD(hipFloatComplex) +COMPLEX_DIV_OP_OVERLOAD(hipFloatComplex) +COMPLEX_ADD_PREOP_OVERLOAD(hipFloatComplex) +COMPLEX_SUB_PREOP_OVERLOAD(hipFloatComplex) +COMPLEX_MUL_PREOP_OVERLOAD(hipFloatComplex) +COMPLEX_DIV_PREOP_OVERLOAD(hipFloatComplex) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned short) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed short) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned int) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed int) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, float) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned long) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed long) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, double) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed long long) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned long long) + +COMPLEX_NEG_OP_OVERLOAD(hipDoubleComplex) +COMPLEX_EQ_OP_OVERLOAD(hipDoubleComplex) +COMPLEX_NE_OP_OVERLOAD(hipDoubleComplex) +COMPLEX_ADD_OP_OVERLOAD(hipDoubleComplex) +COMPLEX_SUB_OP_OVERLOAD(hipDoubleComplex) +COMPLEX_MUL_OP_OVERLOAD(hipDoubleComplex) 
+COMPLEX_DIV_OP_OVERLOAD(hipDoubleComplex) +COMPLEX_ADD_PREOP_OVERLOAD(hipDoubleComplex) +COMPLEX_SUB_PREOP_OVERLOAD(hipDoubleComplex) +COMPLEX_MUL_PREOP_OVERLOAD(hipDoubleComplex) +COMPLEX_DIV_PREOP_OVERLOAD(hipDoubleComplex) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned short) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed short) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned int) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed int) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, float) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed long) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, double) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed long long) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long) + +#endif + + +typedef hipFloatComplex hipComplex; + +__HOST_DEVICE__ static inline hipComplex make_hipComplex(float x, float y) { + return make_hipFloatComplex(x, y); +} + +__HOST_DEVICE__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) { + return make_hipFloatComplex((float)z.x, (float)z.y); +} + +__HOST_DEVICE__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) { + return make_hipDoubleComplex((double)z.x, (double)z.y); +} + +__HOST_DEVICE__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) { + float real = (p.x * q.x) + r.x; + float imag = (q.x * p.y) + r.y; + + real = -(p.y * q.y) + real; + imag = (p.x * q.y) + imag; + + return make_hipComplex(real, imag); +} + +__HOST_DEVICE__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, + hipDoubleComplex r) { + double real = (p.x * q.x) + r.x; + double imag = (q.x * p.y) + r.y; + + real = -(p.y * q.y) + real; + imag = (p.x * q.y) + imag; + + return make_hipDoubleComplex(real, imag); +} + +#endif //HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H +/* +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#ifndef AMD_HIP_MATH_CONSTANTS_H +#define AMD_HIP_MATH_CONSTANTS_H + +// single precision constants +#define HIP_INF_F __int_as_float(0x7f800000U) +#define HIP_NAN_F __int_as_float(0x7fffffffU) +#define HIP_MIN_DENORM_F __int_as_float(0x00000001U) +#define HIP_MAX_NORMAL_F __int_as_float(0x7f7fffffU) +#define HIP_NEG_ZERO_F __int_as_float(0x80000000U) +#define HIP_ZERO_F 0.0F +#define HIP_ONE_F 1.0F +#define HIP_SQRT_HALF_F 0.707106781F +#define HIP_SQRT_HALF_HI_F 0.707106781F +#define HIP_SQRT_HALF_LO_F 1.210161749e-08F +#define HIP_SQRT_TWO_F 1.414213562F +#define HIP_THIRD_F 0.333333333F +#define HIP_PIO4_F 0.785398163F +#define HIP_PIO2_F 1.570796327F +#define HIP_3PIO4_F 2.356194490F +#define HIP_2_OVER_PI_F 0.636619772F +#define HIP_SQRT_2_OVER_PI_F 0.797884561F +#define HIP_PI_F 3.141592654F +#define HIP_L2E_F 1.442695041F +#define HIP_L2T_F 3.321928094F +#define HIP_LG2_F 0.301029996F +#define HIP_LGE_F 0.434294482F +#define HIP_LN2_F 0.693147181F +#define HIP_LNT_F 2.302585093F +#define HIP_LNPI_F 1.144729886F +#define HIP_TWO_TO_M126_F 1.175494351e-38F +#define HIP_TWO_TO_126_F 8.507059173e37F +#define HIP_NORM_HUGE_F 3.402823466e38F +#define HIP_TWO_TO_23_F 8388608.0F +#define HIP_TWO_TO_24_F 16777216.0F +#define HIP_TWO_TO_31_F 2147483648.0F +#define HIP_TWO_TO_32_F 4294967296.0F +#define HIP_REMQUO_BITS_F 3U +#define HIP_REMQUO_MASK_F (~((~0U)<<HIP_REMQUO_BITS_F)) + +__device__ +__attribute__((const)) +int __ockl_sdot2( + HIP_vector_base<short, 2>::Native_vec_, + HIP_vector_base<short, 2>::Native_vec_, + int, bool); + +__device__ +__attribute__((const)) +unsigned int __ockl_udot2( + HIP_vector_base<unsigned short, 2>::Native_vec_, + HIP_vector_base<unsigned short, 2>::Native_vec_, + unsigned int, bool); + +__device__ +__attribute__((const)) +int __ockl_sdot4( + HIP_vector_base<char, 4>::Native_vec_, + HIP_vector_base<char, 4>::Native_vec_, + int, bool); + +__device__ +__attribute__((const)) +unsigned int __ockl_udot4( + HIP_vector_base<unsigned char, 4>::Native_vec_, + HIP_vector_base<unsigned char, 4>::Native_vec_, + unsigned int, bool); + +__device__ +__attribute__((const)) +int __ockl_sdot8(int, int, int, bool); + +__device__ +__attribute__((const)) +unsigned int __ockl_udot8(unsigned int, unsigned int, unsigned int, bool); +#endif + +#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ +// BEGIN FLOAT +__device__ +__attribute__((const)) +float __ocml_acos_f32(float); +__device__ +__attribute__((pure)) +float __ocml_acosh_f32(float); +__device__ +__attribute__((const)) +float __ocml_asin_f32(float); +__device__ +__attribute__((pure)) +float __ocml_asinh_f32(float); +__device__ +__attribute__((const)) +float __ocml_atan2_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_atan_f32(float); +__device__ +__attribute__((pure)) +float __ocml_atanh_f32(float); +__device__ +__attribute__((pure)) +float __ocml_cbrt_f32(float); +__device__ +__attribute__((const)) +float __ocml_ceil_f32(float); +__device__ +__attribute__((const)) +__device__ +float __ocml_copysign_f32(float, float); +__device__ +float __ocml_cos_f32(float); +__device__ +float __ocml_native_cos_f32(float); +__device__ +__attribute__((pure)) +__device__ +float __ocml_cosh_f32(float); +__device__ +float __ocml_cospi_f32(float); +__device__ +float __ocml_i0_f32(float); +__device__ +float __ocml_i1_f32(float); +__device__ +__attribute__((pure)) +float __ocml_erfc_f32(float); +__device__ +__attribute__((pure)) +float __ocml_erfcinv_f32(float); +__device__ +__attribute__((pure)) +float __ocml_erfcx_f32(float); +__device__ +__attribute__((pure)) +float __ocml_erf_f32(float); +__device__ +__attribute__((pure)) +float __ocml_erfinv_f32(float); +__device__ +__attribute__((pure)) +float __ocml_exp10_f32(float); 
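+// Editor's note (illustrative, not part of the upstream header): the +// __ocml_native_* entry points declared alongside the regular __ocml_* +// functions are faster, reduced-accuracy hardware approximations; e.g. +// __ocml_native_exp10_f32(x) may differ from __ocml_exp10_f32(x) by a +// few ULP while executing in far fewer cycles. 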
+__device__ +__attribute__((pure)) +float __ocml_native_exp10_f32(float); +__device__ +__attribute__((pure)) +float __ocml_exp2_f32(float); +__device__ +__attribute__((pure)) +float __ocml_exp_f32(float); +__device__ +__attribute__((pure)) +float __ocml_native_exp_f32(float); +__device__ +__attribute__((pure)) +float __ocml_expm1_f32(float); +__device__ +__attribute__((const)) +float __ocml_fabs_f32(float); +__device__ +__attribute__((const)) +float __ocml_fdim_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_floor_f32(float); +__device__ +__attribute__((const)) +float __ocml_fma_f32(float, float, float); +__device__ +__attribute__((const)) +float __ocml_fmax_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_fmin_f32(float, float); +__device__ +__attribute__((const)) +__device__ +float __ocml_fmod_f32(float, float); +__device__ +float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*); +__device__ +__attribute__((const)) +float __ocml_hypot_f32(float, float); +__device__ +__attribute__((const)) +int __ocml_ilogb_f32(float); +__device__ +__attribute__((const)) +int __ocml_isfinite_f32(float); +__device__ +__attribute__((const)) +int __ocml_isinf_f32(float); +__device__ +__attribute__((const)) +int __ocml_isnan_f32(float); +__device__ +float __ocml_j0_f32(float); +__device__ +float __ocml_j1_f32(float); +__device__ +__attribute__((const)) +float __ocml_ldexp_f32(float, int); +__device__ +float __ocml_lgamma_f32(float); +__device__ +__attribute__((pure)) +float __ocml_log10_f32(float); +__device__ +__attribute__((pure)) +float __ocml_native_log10_f32(float); +__device__ +__attribute__((pure)) +float __ocml_log1p_f32(float); +__device__ +__attribute__((pure)) +float __ocml_log2_f32(float); +__device__ +__attribute__((pure)) +float __ocml_native_log2_f32(float); +__device__ +__attribute__((const)) +float __ocml_logb_f32(float); +__device__ +__attribute__((pure)) +float __ocml_log_f32(float); +__device__ +__attribute__((pure)) +float __ocml_native_log_f32(float); +__device__ +float __ocml_modf_f32(float, __attribute__((address_space(5))) float*); +__device__ +__attribute__((const)) +float __ocml_nearbyint_f32(float); +__device__ +__attribute__((const)) +float __ocml_nextafter_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_len3_f32(float, float, float); +__device__ +__attribute__((const)) +float __ocml_len4_f32(float, float, float, float); +__device__ +__attribute__((pure)) +float __ocml_ncdf_f32(float); +__device__ +__attribute__((pure)) +float __ocml_ncdfinv_f32(float); +__device__ +__attribute__((pure)) +float __ocml_pow_f32(float, float); +__device__ +__attribute__((pure)) +float __ocml_pown_f32(float, int); +__device__ +__attribute__((pure)) +float __ocml_rcbrt_f32(float); +__device__ +__attribute__((const)) +float __ocml_remainder_f32(float, float); +__device__ +float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*); +__device__ +__attribute__((const)) +float __ocml_rhypot_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_rint_f32(float); +__device__ +__attribute__((const)) +float __ocml_rlen3_f32(float, float, float); +__device__ +__attribute__((const)) +float __ocml_rlen4_f32(float, float, float, float); +__device__ +__attribute__((const)) +float __ocml_round_f32(float); +__device__ +__attribute__((pure)) +float __ocml_rsqrt_f32(float); +__device__ +__attribute__((const)) +float __ocml_scalb_f32(float, float); +__device__ +__attribute__((const)) +float 
__ocml_scalbn_f32(float, int); +__device__ +__attribute__((const)) +int __ocml_signbit_f32(float); +__device__ +float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*); +__device__ +float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*); +__device__ +float __ocml_sin_f32(float); +__device__ +float __ocml_native_sin_f32(float); +__device__ +__attribute__((pure)) +float __ocml_sinh_f32(float); +__device__ +float __ocml_sinpi_f32(float); +__device__ +__attribute__((const)) +float __ocml_sqrt_f32(float); +__device__ +__attribute__((const)) +float __ocml_native_sqrt_f32(float); +__device__ +float __ocml_tan_f32(float); +__device__ +__attribute__((pure)) +float __ocml_tanh_f32(float); +__device__ +float __ocml_tgamma_f32(float); +__device__ +__attribute__((const)) +float __ocml_trunc_f32(float); +__device__ +float __ocml_y0_f32(float); +__device__ +float __ocml_y1_f32(float); + +// BEGIN INTRINSICS +__device__ +__attribute__((const)) +float __ocml_add_rte_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_add_rtn_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_add_rtp_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_add_rtz_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_sub_rte_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_sub_rtn_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_sub_rtp_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_sub_rtz_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_mul_rte_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_mul_rtn_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_mul_rtp_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_mul_rtz_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_div_rte_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_div_rtn_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_div_rtp_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_div_rtz_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_sqrt_rte_f32(float); +__device__ +__attribute__((const)) +float __ocml_sqrt_rtn_f32(float); +__device__ +__attribute__((const)) +float __ocml_sqrt_rtp_f32(float); +__device__ +__attribute__((const)) +float __ocml_sqrt_rtz_f32(float); +__device__ +__attribute__((const)) +float __ocml_fma_rte_f32(float, float, float); +__device__ +__attribute__((const)) +float __ocml_fma_rtn_f32(float, float, float); +__device__ +__attribute__((const)) +float __ocml_fma_rtp_f32(float, float, float); +__device__ +__attribute__((const)) +float __ocml_fma_rtz_f32(float, float, float); +// END INTRINSICS +// END FLOAT + +// BEGIN DOUBLE +__device__ +__attribute__((const)) +double __ocml_acos_f64(double); +__device__ +__attribute__((pure)) +double __ocml_acosh_f64(double); +__device__ +__attribute__((const)) +double __ocml_asin_f64(double); +__device__ +__attribute__((pure)) +double __ocml_asinh_f64(double); +__device__ +__attribute__((const)) +double __ocml_atan2_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_atan_f64(double); +__device__ +__attribute__((pure)) +double __ocml_atanh_f64(double); +__device__ +__attribute__((pure)) +double __ocml_cbrt_f64(double); +__device__ +__attribute__((const)) +double __ocml_ceil_f64(double); +__device__ +__attribute__((const)) +double __ocml_copysign_f64(double, 
double); +__device__ +double __ocml_cos_f64(double); +__device__ +__attribute__((pure)) +double __ocml_cosh_f64(double); +__device__ +double __ocml_cospi_f64(double); +__device__ +double __ocml_i0_f64(double); +__device__ +double __ocml_i1_f64(double); +__device__ +__attribute__((pure)) +double __ocml_erfc_f64(double); +__device__ +__attribute__((pure)) +double __ocml_erfcinv_f64(double); +__device__ +__attribute__((pure)) +double __ocml_erfcx_f64(double); +__device__ +__attribute__((pure)) +double __ocml_erf_f64(double); +__device__ +__attribute__((pure)) +double __ocml_erfinv_f64(double); +__device__ +__attribute__((pure)) +double __ocml_exp10_f64(double); +__device__ +__attribute__((pure)) +double __ocml_exp2_f64(double); +__device__ +__attribute__((pure)) +double __ocml_exp_f64(double); +__device__ +__attribute__((pure)) +double __ocml_expm1_f64(double); +__device__ +__attribute__((const)) +double __ocml_fabs_f64(double); +__device__ +__attribute__((const)) +double __ocml_fdim_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_floor_f64(double); +__device__ +__attribute__((const)) +double __ocml_fma_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_fmax_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_fmin_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_fmod_f64(double, double); +__device__ +double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*); +__device__ +__attribute__((const)) +double __ocml_hypot_f64(double, double); +__device__ +__attribute__((const)) +int __ocml_ilogb_f64(double); +__device__ +__attribute__((const)) +int __ocml_isfinite_f64(double); +__device__ +__attribute__((const)) +int __ocml_isinf_f64(double); +__device__ +__attribute__((const)) +int __ocml_isnan_f64(double); +__device__ +double __ocml_j0_f64(double); +__device__ +double __ocml_j1_f64(double); +__device__ +__attribute__((const)) +double __ocml_ldexp_f64(double, int); +__device__ +double __ocml_lgamma_f64(double); +__device__ +__attribute__((pure)) +double __ocml_log10_f64(double); +__device__ +__attribute__((pure)) +double __ocml_log1p_f64(double); +__device__ +__attribute__((pure)) +double __ocml_log2_f64(double); +__device__ +__attribute__((const)) +double __ocml_logb_f64(double); +__device__ +__attribute__((pure)) +double __ocml_log_f64(double); +__device__ +double __ocml_modf_f64(double, __attribute__((address_space(5))) double*); +__device__ +__attribute__((const)) +double __ocml_nearbyint_f64(double); +__device__ +__attribute__((const)) +double __ocml_nextafter_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_len3_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_len4_f64(double, double, double, double); +__device__ +__attribute__((pure)) +double __ocml_ncdf_f64(double); +__device__ +__attribute__((pure)) +double __ocml_ncdfinv_f64(double); +__device__ +__attribute__((pure)) +double __ocml_pow_f64(double, double); +__device__ +__attribute__((pure)) +double __ocml_pown_f64(double, int); +__device__ +__attribute__((pure)) +double __ocml_rcbrt_f64(double); +__device__ +__attribute__((const)) +double __ocml_remainder_f64(double, double); +__device__ +double __ocml_remquo_f64( + double, double, __attribute__((address_space(5))) int*); +__device__ +__attribute__((const)) +double __ocml_rhypot_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_rint_f64(double); +__device__ +__attribute__((const)) +double 
__ocml_rlen3_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_rlen4_f64(double, double, double, double); +__device__ +__attribute__((const)) +double __ocml_round_f64(double); +__device__ +__attribute__((pure)) +double __ocml_rsqrt_f64(double); +__device__ +__attribute__((const)) +double __ocml_scalb_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_scalbn_f64(double, int); +__device__ +__attribute__((const)) +int __ocml_signbit_f64(double); +__device__ +double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*); +__device__ +double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*); +__device__ +double __ocml_sin_f64(double); +__device__ +__attribute__((pure)) +double __ocml_sinh_f64(double); +__device__ +double __ocml_sinpi_f64(double); +__device__ +__attribute__((const)) +double __ocml_sqrt_f64(double); +__device__ +double __ocml_tan_f64(double); +__device__ +__attribute__((pure)) +double __ocml_tanh_f64(double); +__device__ +double __ocml_tgamma_f64(double); +__device__ +__attribute__((const)) +double __ocml_trunc_f64(double); +__device__ +double __ocml_y0_f64(double); +__device__ +double __ocml_y1_f64(double); + +// BEGIN INTRINSICS +__device__ +__attribute__((const)) +double __ocml_add_rte_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_add_rtn_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_add_rtp_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_add_rtz_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_sub_rte_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_sub_rtn_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_sub_rtp_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_sub_rtz_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_mul_rte_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_mul_rtn_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_mul_rtp_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_mul_rtz_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_div_rte_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_div_rtn_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_div_rtp_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_div_rtz_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_sqrt_rte_f64(double); +__device__ +__attribute__((const)) +double __ocml_sqrt_rtn_f64(double); +__device__ +__attribute__((const)) +double __ocml_sqrt_rtp_f64(double); +__device__ +__attribute__((const)) +double __ocml_sqrt_rtz_f64(double); +__device__ +__attribute__((const)) +double __ocml_fma_rte_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_fma_rtn_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_fma_rtp_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_fma_rtz_f64(double, double, double); +// END INTRINSICS +// END DOUBLE + +#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ + +#if defined(__cplusplus) + } // extern "C" +#endif +/* +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** + * @file amd_detail/device_library_decls.h + * @brief Contains declarations for types and functions in device library. + * Uses int64_t and uint64_t instead of long, long long, unsigned + * long and unsigned long long types for device library API + * declarations. + */ + +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_LIBRARY_DECLS_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_LIBRARY_DECLS_H + +#if !defined(__HIPCC_RTC__) +#include "hip/amd_detail/host_defines.h" +#endif + +typedef unsigned char uchar; +typedef unsigned short ushort; +typedef unsigned int uint; +typedef unsigned long ulong; +typedef unsigned long long ullong; + +extern "C" __device__ __attribute__((const)) bool __ockl_wfany_i32(int); +extern "C" __device__ __attribute__((const)) bool __ockl_wfall_i32(int); +extern "C" __device__ uint __ockl_activelane_u32(void); + +extern "C" __device__ __attribute__((const)) uint __ockl_mul24_u32(uint, uint); +extern "C" __device__ __attribute__((const)) int __ockl_mul24_i32(int, int); +extern "C" __device__ __attribute__((const)) uint __ockl_mul_hi_u32(uint, uint); +extern "C" __device__ __attribute__((const)) int __ockl_mul_hi_i32(int, int); +extern "C" __device__ __attribute__((const)) uint __ockl_sadd_u32(uint, uint, uint); + +extern "C" __device__ __attribute__((const)) uchar __ockl_clz_u8(uchar); +extern "C" __device__ __attribute__((const)) ushort __ockl_clz_u16(ushort); +extern "C" __device__ __attribute__((const)) uint __ockl_clz_u32(uint); +extern "C" __device__ __attribute__((const)) uint64_t __ockl_clz_u64(uint64_t); + +extern "C" __device__ __attribute__((const)) float __ocml_floor_f32(float); +extern "C" __device__ __attribute__((const)) float __ocml_rint_f32(float); +extern "C" __device__ __attribute__((const)) float __ocml_ceil_f32(float); +extern "C" __device__ __attribute__((const)) float __ocml_trunc_f32(float); + +extern "C" __device__ __attribute__((const)) float __ocml_fmin_f32(float, float); +extern "C" __device__ __attribute__((const)) float __ocml_fmax_f32(float, float); + +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_f64(double); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_f64(double); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_f64(double); + +extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float); +extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float); +extern "C" __device__ __attribute__((const)) 
_Float16 __ocml_cvtrtz_f16_f32(float); + +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s32(int); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s32(int); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s32(int); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u32(uint32_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u32(uint32_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u32(uint32_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s64(int64_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s64(int64_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s64(int64_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u64(uint64_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u64(uint64_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u64(uint64_t); +extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_s64(int64_t); +extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_s64(int64_t); +extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_s64(int64_t); +extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_u64(uint64_t); +extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_u64(uint64_t); +extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_u64(uint64_t); + +extern "C" __device__ __attribute__((convergent)) void __ockl_gws_init(uint nwm1, uint rid); +extern "C" __device__ __attribute__((convergent)) void __ockl_gws_barrier(uint nwm1, uint rid); + +extern "C" __device__ __attribute__((const)) uint32_t __ockl_lane_u32(); +extern "C" __device__ __attribute__((const)) int __ockl_grid_is_valid(void); +extern "C" __device__ __attribute__((convergent)) void __ockl_grid_sync(void); +extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_num_grids(void); +extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_grid_rank(void); +extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void); +extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void); +extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void); +extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void); + +extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float); + +extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_add_i32(int a); +extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_and_i32(int a); +extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_or_i32(int a); + +extern "C" __device__ uint64_t __ockl_fprintf_stderr_begin(); +extern "C" __device__ uint64_t __ockl_fprintf_append_args(uint64_t msg_desc, uint32_t num_args, + uint64_t value0, uint64_t value1, + uint64_t value2, uint64_t value3, + uint64_t value4, uint64_t value5, + uint64_t value6, uint32_t is_last); +extern "C" __device__ uint64_t __ockl_fprintf_append_string_n(uint64_t msg_desc, const char* data, + uint64_t length, uint32_t is_last); + +// Introduce local address space +#define __local __attribute__((address_space(3))) + +#ifdef __HIP_DEVICE_COMPILE__ +__device__ inline static __local void* __to_local(unsigned x) { return (__local void*)x; } +#endif //__HIP_DEVICE_COMPILE__ + +// Using hip.amdgcn.bc - sync threads 
+#define __CLK_LOCAL_MEM_FENCE 0x01 +typedef unsigned __cl_mem_fence_flags; + +#endif +/* +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H + +#if !defined(__HIPCC_RTC__) +#include +#include "host_defines.h" +#include "math_fwd.h" +#include +#include +#include +#include +#endif // !defined(__HIPCC_RTC__) + +#if defined(__clang__) && defined(__HIP__) +extern "C" __device__ int printf(const char *fmt, ...); +#else +template <typename... All> +static inline __device__ void printf(const char* format, All... all) {} +#endif // __HIP_CLANG_ONLY__ + +extern "C" __device__ unsigned long long __ockl_steadyctr_u64(); + +/* +Integer Intrinsics +*/ + +// integer intrinsic functions __popc __clz __ffs __brev +__device__ static inline unsigned int __popc(unsigned int input) { + return __builtin_popcount(input); +} +__device__ static inline unsigned int __popcll(unsigned long long int input) { + return __builtin_popcountll(input); +} + +__device__ static inline int __clz(int input) { + return __ockl_clz_u32((uint)input); +} + +__device__ static inline int __clzll(long long int input) { + return __ockl_clz_u64((uint64_t)input); +} + +__device__ static inline unsigned int __ffs(unsigned int input) { + return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1; +} + +__device__ static inline unsigned int __ffsll(unsigned long long int input) { + return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1; +} + +__device__ static inline unsigned int __ffs(int input) { + return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1; +} + +__device__ static inline unsigned int __ffsll(long long int input) { + return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1; +} + +// Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE), +// find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position. +// If not found, return -1. 
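+// Editor's example (illustrative, not upstream text): for mask = 0b1011 +// (bits 0, 1 and 3 set), base = 0 and offset = 2, the second set bit at +// or above bit 0 is bit 1, so __fns64(mask, 0, 2) returns 1; +// __fns64(mask, 0, 3) returns 3, and __fns64(mask, 0, 4) returns -1 +// because only three bits are set. 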
+__device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) { + uint64_t temp_mask = mask; + int32_t temp_offset = offset; + + if (offset == 0) { + temp_mask &= (1ULL << base); + temp_offset = 1; + } + else if (offset < 0) { + temp_mask = __builtin_bitreverse64(mask); + base = 63 - base; + temp_offset = -offset; + } + + temp_mask = temp_mask & ((~0ULL) << base); + if (__builtin_popcountll(temp_mask) < temp_offset) + return -1; + int32_t total = 0; + for (int i = 0x20; i > 0; i >>= 1) { + uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1); + int32_t pcnt = __builtin_popcountll(temp_mask_lo); + if (pcnt < temp_offset) { + temp_mask = temp_mask >> i; + temp_offset -= pcnt; + total += i; + } + else { + temp_mask = temp_mask_lo; + } + } + if (offset < 0) + return 63 - total; + else + return total; +} + +__device__ static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) { + uint64_t temp_mask = mask; + int32_t temp_offset = offset; + if (offset == 0) { + temp_mask &= (1ULL << base); + temp_offset = 1; + } + else if (offset < 0) { + temp_mask = __builtin_bitreverse64(mask); + base = 63 - base; + temp_offset = -offset; + } + temp_mask = temp_mask & ((~0ULL) << base); + if (__builtin_popcountll(temp_mask) < temp_offset) + return -1; + int32_t total = 0; + for (int i = 0x20; i > 0; i >>= 1) { + uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1); + int32_t pcnt = __builtin_popcountll(temp_mask_lo); + if (pcnt < temp_offset) { + temp_mask = temp_mask >> i; + temp_offset -= pcnt; + total += i; + } + else { + temp_mask = temp_mask_lo; + } + } + if (offset < 0) + return 63 - total; + else + return total; +} +__device__ static inline unsigned int __brev(unsigned int input) { + return __builtin_bitreverse32(input); +} + +__device__ static inline unsigned long long int __brevll(unsigned long long int input) { + return __builtin_bitreverse64(input); +} + +__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) { + return input == 0 ? -1 : __builtin_ctzl(input); +} + +__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) { + uint32_t offset = src1 & 31; + uint32_t width = src2 & 31; + return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width); +} + +__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) { + uint64_t offset = src1 & 63; + uint64_t width = src2 & 63; + return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width); +} + +__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) { + uint32_t offset = src2 & 31; + uint32_t width = src3 & 31; + uint32_t mask = (1 << width) - 1; + return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset)); +} + +__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) { + uint64_t offset = src2 & 63; + uint64_t width = src3 & 63; + uint64_t mask = (1ULL << width) - 1; + return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset)); +} + +__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift) +{ + uint32_t mask_shift = shift & 31; + return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift); +} + +__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift) +{ + uint32_t min_shift = shift >= 32 ? 32 : shift; + return min_shift == 0 ? 
hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift); +} + +__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift) +{ + return __builtin_amdgcn_alignbit(hi, lo, shift); +} + +__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift) +{ + return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift); +} + +__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s); +__device__ static unsigned int __hadd(int x, int y); +__device__ static int __mul24(int x, int y); +__device__ static long long int __mul64hi(long long int x, long long int y); +__device__ static int __mulhi(int x, int y); +__device__ static int __rhadd(int x, int y); +__device__ static unsigned int __sad(int x, int y, unsigned int z); +__device__ static unsigned int __uhadd(unsigned int x, unsigned int y); +__device__ static int __umul24(unsigned int x, unsigned int y); +__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y); +__device__ static unsigned int __umulhi(unsigned int x, unsigned int y); +__device__ static unsigned int __urhadd(unsigned int x, unsigned int y); +__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z); + +struct ucharHolder { + union { + unsigned char c[4]; + unsigned int ui; + }; +} __attribute__((aligned(4))); + +struct uchar2Holder { + union { + unsigned int ui[2]; + unsigned char c[8]; + }; +} __attribute__((aligned(8))); + +__device__ +static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) { + struct uchar2Holder cHoldVal; + struct ucharHolder cHoldKey; + cHoldKey.ui = s; + cHoldVal.ui[0] = x; + cHoldVal.ui[1] = y; + unsigned int result; + result = cHoldVal.c[cHoldKey.c[0] & 0x07]; + result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8); + result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16); + result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24); + return result; +} + +__device__ static inline unsigned int __hadd(int x, int y) { + int z = x + y; + int sign = z & 0x80000000; + int value = z & 0x7FFFFFFF; + return ((value) >> 1 | sign); +} + +__device__ static inline int __mul24(int x, int y) { + return __ockl_mul24_i32(x, y); +} + +__device__ static inline long long __mul64hi(long long int x, long long int y) { + ulong x0 = (ulong)x & 0xffffffffUL; + long x1 = x >> 32; + ulong y0 = (ulong)y & 0xffffffffUL; + long y1 = y >> 32; + ulong z0 = x0*y0; + long t = x1*y0 + (z0 >> 32); + long z1 = t & 0xffffffffL; + long z2 = t >> 32; + z1 = x0*y1 + z1; + return x1*y1 + z2 + (z1 >> 32); +} + +__device__ static inline int __mulhi(int x, int y) { + return __ockl_mul_hi_i32(x, y); +} + +__device__ static inline int __rhadd(int x, int y) { + int z = x + y + 1; + int sign = z & 0x80000000; + int value = z & 0x7FFFFFFF; + return ((value) >> 1 | sign); +} +__device__ static inline unsigned int __sad(int x, int y, unsigned int z) { + return x > y ? 
x - y + z : y - x + z; +} +__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) { + return (x + y) >> 1; +} +__device__ static inline int __umul24(unsigned int x, unsigned int y) { + return __ockl_mul24_u32(x, y); +} + +__device__ +static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) { + ulong x0 = x & 0xffffffffUL; + ulong x1 = x >> 32; + ulong y0 = y & 0xffffffffUL; + ulong y1 = y >> 32; + ulong z0 = x0*y0; + ulong t = x1*y0 + (z0 >> 32); + ulong z1 = t & 0xffffffffUL; + ulong z2 = t >> 32; + z1 = x0*y1 + z1; + return x1*y1 + z2 + (z1 >> 32); +} + +__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) { + return __ockl_mul_hi_u32(x, y); +} +__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) { + return (x + y + 1) >> 1; +} +__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) { + return __ockl_sadd_u32(x, y, z); +} + +__device__ static inline unsigned int __lane_id() { + return __builtin_amdgcn_mbcnt_hi( + -1, __builtin_amdgcn_mbcnt_lo(-1, 0)); +} + +__device__ +static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);}; + +__device__ +static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);}; + +/* +HIP specific device functions +*/ + +#if !defined(__HIPCC_RTC__) +#include "amd_warp_functions.h" +#endif + +#define MASK1 0x00ff00ff +#define MASK2 0xff00ff00 + +__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) { + char4 out; + unsigned one1 = in1.w & MASK1; + unsigned one2 = in2.w & MASK1; + out.w = (one1 + one2) & MASK1; + one1 = in1.w & MASK2; + one2 = in2.w & MASK2; + out.w = out.w | ((one1 + one2) & MASK2); + return out; +} + +__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) { + char4 out; + unsigned one1 = in1.w & MASK1; + unsigned one2 = in2.w & MASK1; + out.w = (one1 - one2) & MASK1; + one1 = in1.w & MASK2; + one2 = in2.w & MASK2; + out.w = out.w | ((one1 - one2) & MASK2); + return out; +} + +__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) { + char4 out; + unsigned one1 = in1.w & MASK1; + unsigned one2 = in2.w & MASK1; + out.w = (one1 * one2) & MASK1; + one1 = in1.w & MASK2; + one2 = in2.w & MASK2; + out.w = out.w | ((one1 * one2) & MASK2); + return out; +} + +__device__ static inline float __double2float_rd(double x) { + return __ocml_cvtrtn_f32_f64(x); +} +__device__ static inline float __double2float_rn(double x) { return x; } +__device__ static inline float __double2float_ru(double x) { + return __ocml_cvtrtp_f32_f64(x); +} +__device__ static inline float __double2float_rz(double x) { + return __ocml_cvtrtz_f32_f64(x); +} + +__device__ static inline int __double2hiint(double x) { + static_assert(sizeof(double) == 2 * sizeof(int), ""); + + int tmp[2]; + __builtin_memcpy(tmp, &x, sizeof(tmp)); + + return tmp[1]; +} +__device__ static inline int __double2loint(double x) { + static_assert(sizeof(double) == 2 * sizeof(int), ""); + + int tmp[2]; + __builtin_memcpy(tmp, &x, sizeof(tmp)); + + return tmp[0]; +} + +__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); } +__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); } +__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); } +__device__ static inline int __double2int_rz(double x) { return (int)x; } + 
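+#if defined(HIPRTC_HEADER_USAGE_EXAMPLES)  // hypothetical opt-in guard; never defined by HIP
+// Illustrative sketch, not part of the upstream header. The _rd/_rn/_ru/_rz
+// suffixes on the conversions above select the rounding mode: round down,
+// to nearest-even, up, or toward zero.
+__global__ void rounding_demo(int* out) {
+    out[0] = __double2int_rd(-1.5);  // -2 (floor)
+    out[1] = __double2int_ru(-1.5);  // -1 (ceil)
+    out[2] = __double2int_rz(-1.5);  // -1 (truncate)
+    out[3] = __double2int_rn( 2.5);  //  2 (ties round to even)
+}
+#endif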
+__device__ static inline long long int __double2ll_rd(double x) {
+    return (long long)__ocml_floor_f64(x);
+}
+__device__ static inline long long int __double2ll_rn(double x) {
+    return (long long)__ocml_rint_f64(x);
+}
+__device__ static inline long long int __double2ll_ru(double x) {
+    return (long long)__ocml_ceil_f64(x);
+}
+__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }
+
+__device__ static inline unsigned int __double2uint_rd(double x) {
+    return (unsigned int)__ocml_floor_f64(x);
+}
+__device__ static inline unsigned int __double2uint_rn(double x) {
+    return (unsigned int)__ocml_rint_f64(x);
+}
+__device__ static inline unsigned int __double2uint_ru(double x) {
+    return (unsigned int)__ocml_ceil_f64(x);
+}
+__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
+
+__device__ static inline unsigned long long int __double2ull_rd(double x) {
+    return (unsigned long long int)__ocml_floor_f64(x);
+}
+__device__ static inline unsigned long long int __double2ull_rn(double x) {
+    return (unsigned long long int)__ocml_rint_f64(x);
+}
+__device__ static inline unsigned long long int __double2ull_ru(double x) {
+    return (unsigned long long int)__ocml_ceil_f64(x);
+}
+__device__ static inline unsigned long long int __double2ull_rz(double x) {
+    return (unsigned long long int)x;
+}
+__device__ static inline long long int __double_as_longlong(double x) {
+    static_assert(sizeof(long long) == sizeof(double), "");
+
+    long long tmp;
+    __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+    return tmp;
+}
+
+/*
+__device__ unsigned short __float2half_rn(float x);
+__device__ float __half2float(unsigned short);
+
+The above device functions are not valid: CUDA implements half as an
+unsigned short, whereas HIP does not. Use
+__device__ __half __float2half_rn(float x);
+__device__ float __half2float(__half);
+from hip_fp16.h instead.
+*/
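+
+#if defined(HIPRTC_HEADER_USAGE_EXAMPLES)  // hypothetical opt-in guard; never defined by HIP
+// Illustrative sketch, not part of the upstream header: __double2hiint /
+// __double2loint (defined earlier) expose the two 32-bit halves of the same
+// 64-bit pattern that __double_as_longlong returns.
+__global__ void double_bits_demo(long long* out) {
+    double d = 3.141592653589793;
+    long long bits = __double_as_longlong(d);
+    int hi = __double2hiint(d);
+    int lo = __double2loint(d);
+    // The two views agree, so the difference below is always 0.
+    out[0] = bits - (((long long)(unsigned int)hi << 32) | (unsigned int)lo);
+}
+#endif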
+
+__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
+__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
+__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
+__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
+
+__device__ static inline long long int __float2ll_rd(float x) {
+    return (long long int)__ocml_floor_f32(x);
+}
+__device__ static inline long long int __float2ll_rn(float x) {
+    return (long long int)__ocml_rint_f32(x);
+}
+__device__ static inline long long int __float2ll_ru(float x) {
+    return (long long int)__ocml_ceil_f32(x);
+}
+__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
+
+__device__ static inline unsigned int __float2uint_rd(float x) {
+    return (unsigned int)__ocml_floor_f32(x);
+}
+__device__ static inline unsigned int __float2uint_rn(float x) {
+    return (unsigned int)__ocml_rint_f32(x);
+}
+__device__ static inline unsigned int __float2uint_ru(float x) {
+    return (unsigned int)__ocml_ceil_f32(x);
+}
+__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
+
+__device__ static inline unsigned long long int __float2ull_rd(float x) {
+    return (unsigned long long int)__ocml_floor_f32(x);
+}
+__device__ static inline unsigned long long int __float2ull_rn(float x) {
+    return (unsigned long long int)__ocml_rint_f32(x);
+}
+__device__ static inline unsigned long long int __float2ull_ru(float x) {
+    return (unsigned long long int)__ocml_ceil_f32(x);
+}
+__device__ static inline unsigned long long int __float2ull_rz(float x) {
+    return (unsigned long long int)x;
+}
+
+__device__ static inline int __float_as_int(float x) {
+    static_assert(sizeof(int) == sizeof(float), "");
+
+    int tmp;
+    __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+    return tmp;
+}
+
+__device__ static inline unsigned int __float_as_uint(float x) {
+    static_assert(sizeof(unsigned int) == sizeof(float), "");
+
+    unsigned int tmp;
+    __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+    return tmp;
+}
+
+__device__ static inline double __hiloint2double(int hi, int lo) {
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
+    double tmp1;
+    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+
+    return tmp1;
+}
+
+__device__ static inline double __int2double_rn(int x) { return (double)x; }
+
+__device__ static inline float __int2float_rd(int x) {
+    return __ocml_cvtrtn_f32_s32(x);
+}
+__device__ static inline float __int2float_rn(int x) { return (float)x; }
+__device__ static inline float __int2float_ru(int x) {
+    return __ocml_cvtrtp_f32_s32(x);
+}
+__device__ static inline float __int2float_rz(int x) {
+    return __ocml_cvtrtz_f32_s32(x);
+}
+
+__device__ static inline float __int_as_float(int x) {
+    static_assert(sizeof(float) == sizeof(int), "");
+
+    float tmp;
+    __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+    return tmp;
+}
+
+__device__ static inline double __ll2double_rd(long long int x) {
+    return __ocml_cvtrtn_f64_s64(x);
+}
+__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
+__device__ static inline double __ll2double_ru(long long int x) {
+    return __ocml_cvtrtp_f64_s64(x);
+}
+__device__ static inline double __ll2double_rz(long long int x) {
+    return __ocml_cvtrtz_f64_s64(x);
+}
+
+__device__ static inline float __ll2float_rd(long long int x) {
+    return __ocml_cvtrtn_f32_s64(x);
+}
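+
+#if defined(HIPRTC_HEADER_USAGE_EXAMPLES)  // hypothetical opt-in guard; never defined by HIP
+// Illustrative sketch, not part of the upstream header: __float_as_int /
+// __int_as_float (defined above) reinterpret bits without value conversion,
+// which enables sign-bit tricks.
+__global__ void float_bits_demo(float* out) {
+    // Clearing the sign bit yields the absolute value: 2.0f.
+    *out = __int_as_float(__float_as_int(-2.0f) & 0x7fffffff);
+}
+#endif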
+__device__ static inline float __ll2float_rn(long long int x) { return (float)x; } +__device__ static inline float __ll2float_ru(long long int x) { + return __ocml_cvtrtp_f32_s64(x); +} +__device__ static inline float __ll2float_rz(long long int x) { + return __ocml_cvtrtz_f32_s64(x); +} + +__device__ static inline double __longlong_as_double(long long int x) { + static_assert(sizeof(double) == sizeof(long long), ""); + + double tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; +} + +__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; } + +__device__ static inline float __uint2float_rd(unsigned int x) { + return __ocml_cvtrtn_f32_u32(x); +} +__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; } +__device__ static inline float __uint2float_ru(unsigned int x) { + return __ocml_cvtrtp_f32_u32(x); +} +__device__ static inline float __uint2float_rz(unsigned int x) { + return __ocml_cvtrtz_f32_u32(x); +} + +__device__ static inline float __uint_as_float(unsigned int x) { + static_assert(sizeof(float) == sizeof(unsigned int), ""); + + float tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; +} + +__device__ static inline double __ull2double_rd(unsigned long long int x) { + return __ocml_cvtrtn_f64_u64(x); +} +__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; } +__device__ static inline double __ull2double_ru(unsigned long long int x) { + return __ocml_cvtrtp_f64_u64(x); +} +__device__ static inline double __ull2double_rz(unsigned long long int x) { + return __ocml_cvtrtz_f64_u64(x); +} + +__device__ static inline float __ull2float_rd(unsigned long long int x) { + return __ocml_cvtrtn_f32_u64(x); +} +__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; } +__device__ static inline float __ull2float_ru(unsigned long long int x) { + return __ocml_cvtrtp_f32_u64(x); +} +__device__ static inline float __ull2float_rz(unsigned long long int x) { + return __ocml_cvtrtz_f32_u64(x); +} + +#if defined(__clang__) && defined(__HIP__) + +// Clock functions +__device__ long long int __clock64(); +__device__ long long int __clock(); +__device__ long long int clock64(); +__device__ long long int clock(); +__device__ long long int wall_clock64(); +// hip.amdgcn.bc - named sync +__device__ void __named_sync(); + +#ifdef __HIP_DEVICE_COMPILE__ + +// Clock function to return GPU core cycle count. +// GPU can change its core clock frequency at runtime. The maximum frequency can be queried +// through hipDeviceAttributeClockRate attribute. +__device__ +inline __attribute((always_inline)) +long long int __clock64() { +#if __has_builtin(__builtin_amdgcn_s_memtime) + // Exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3 + return (long long int) __builtin_amdgcn_s_memtime(); +#else + // Subject to change when better solution available + return (long long int) __builtin_readcyclecounter(); +#endif +} + +__device__ +inline __attribute((always_inline)) +long long int __clock() { return __clock64(); } + +// Clock function to return wall clock count at a constant frequency that can be queried +// through hipDeviceAttributeWallClockRate attribute. 
+__device__ +inline __attribute__((always_inline)) +long long int wall_clock64() { + return (long long int) __ockl_steadyctr_u64(); +} + +__device__ +inline __attribute__((always_inline)) +long long int clock64() { return __clock64(); } + +__device__ +inline __attribute__((always_inline)) +long long int clock() { return __clock(); } + +// hip.amdgcn.bc - named sync +__device__ +inline +void __named_sync() { __builtin_amdgcn_s_barrier(); } + +#endif // __HIP_DEVICE_COMPILE__ + +// warp vote function __all __any __ballot +__device__ +inline +int __all(int predicate) { + return __ockl_wfall_i32(predicate); +} + +__device__ +inline +int __any(int predicate) { + return __ockl_wfany_i32(predicate); +} + +// XXX from llvm/include/llvm/IR/InstrTypes.h +#define ICMP_NE 33 + +__device__ +inline +unsigned long long int __ballot(int predicate) { + return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE); +} + +__device__ +inline +unsigned long long int __ballot64(int predicate) { + return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE); +} + +// hip.amdgcn.bc - lanemask +__device__ +inline +uint64_t __lanemask_gt() +{ + uint32_t lane = __ockl_lane_u32(); + if (lane == 63) + return 0; + uint64_t ballot = __ballot64(1); + uint64_t mask = (~((uint64_t)0)) << (lane + 1); + return mask & ballot; +} + +__device__ +inline +uint64_t __lanemask_lt() +{ + uint32_t lane = __ockl_lane_u32(); + int64_t ballot = __ballot64(1); + uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1; + return mask & ballot; +} + +__device__ +inline +uint64_t __lanemask_eq() +{ + uint32_t lane = __ockl_lane_u32(); + int64_t mask = ((uint64_t)1 << lane); + return mask; +} + + +__device__ inline void* __local_to_generic(void* p) { return p; } + +#ifdef __HIP_DEVICE_COMPILE__ +__device__ +inline +void* __get_dynamicgroupbaseptr() +{ + // Get group segment base pointer. + return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize())); +} +#else +__device__ +void* __get_dynamicgroupbaseptr(); +#endif // __HIP_DEVICE_COMPILE__ + +__device__ +inline +void *__amdgcn_get_dynamicgroupbaseptr() { + return __get_dynamicgroupbaseptr(); +} + +// Memory Fence Functions +__device__ +inline +static void __threadfence() +{ + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent"); +} + +__device__ +inline +static void __threadfence_block() +{ + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup"); +} + +__device__ +inline +static void __threadfence_system() +{ + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, ""); +} + +// abort +__device__ +inline +__attribute__((weak)) +void abort() { + return __builtin_trap(); +} + +// The noinline attribute helps encapsulate the printf expansion, +// which otherwise has a performance impact just by increasing the +// size of the calling function. Additionally, the weak attribute +// allows the function to exist as a global although its definition is +// included in every compilation unit. +#if defined(_WIN32) || defined(_WIN64) +extern "C" __device__ __attribute__((noinline)) __attribute__((weak)) +void _wassert(const wchar_t *_msg, const wchar_t *_file, unsigned _line) { + // FIXME: Need `wchar_t` support to generate assertion message. 
+ __builtin_trap(); +} +#else /* defined(_WIN32) || defined(_WIN64) */ +extern "C" __device__ __attribute__((noinline)) __attribute__((weak)) +void __assert_fail(const char *assertion, + const char *file, + unsigned int line, + const char *function) +{ + const char fmt[] = "%s:%u: %s: Device-side assertion `%s' failed.\n"; + + // strlen is not available as a built-in yet, so we create our own + // loop in a macro. With a string literal argument, the compiler + // usually manages to replace the loop with a constant. + // + // The macro does not check for null pointer, since all the string + // arguments are defined to be constant literals when called from + // the assert() macro. + // + // NOTE: The loop below includes the null terminator in the length + // as required by append_string_n(). +#define __hip_get_string_length(LEN, STR) \ + do { \ + const char *tmp = STR; \ + while (*tmp++); \ + LEN = tmp - STR; \ + } while (0) + + auto msg = __ockl_fprintf_stderr_begin(); + int len = 0; + __hip_get_string_length(len, fmt); + msg = __ockl_fprintf_append_string_n(msg, fmt, len, 0); + __hip_get_string_length(len, file); + msg = __ockl_fprintf_append_string_n(msg, file, len, 0); + msg = __ockl_fprintf_append_args(msg, 1, line, 0, 0, 0, 0, 0, 0, 0); + __hip_get_string_length(len, function); + msg = __ockl_fprintf_append_string_n(msg, function, len, 0); + __hip_get_string_length(len, assertion); + __ockl_fprintf_append_string_n(msg, assertion, len, /* is_last = */ 1); + +#undef __hip_get_string_length + + __builtin_trap(); +} + +extern "C" __device__ __attribute__((noinline)) __attribute__((weak)) +void __assertfail() +{ + // ignore all the args for now. + __builtin_trap(); +} +#endif /* defined(_WIN32) || defined(_WIN64) */ + +__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) { + if (flags) { + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); + __builtin_amdgcn_s_barrier(); + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); + } else { + __builtin_amdgcn_s_barrier(); + } +} + +__device__ +inline +static void __barrier(int n) +{ + __work_group_barrier((__cl_mem_fence_flags)n); +} + +__device__ +inline +__attribute__((convergent)) +void __syncthreads() +{ + __barrier(__CLK_LOCAL_MEM_FENCE); +} + +__device__ +inline +__attribute__((convergent)) +int __syncthreads_count(int predicate) +{ + return __ockl_wgred_add_i32(!!predicate); +} + +__device__ +inline +__attribute__((convergent)) +int __syncthreads_and(int predicate) +{ + return __ockl_wgred_and_i32(!!predicate); +} + +__device__ +inline +__attribute__((convergent)) +int __syncthreads_or(int predicate) +{ + return __ockl_wgred_or_i32(!!predicate); +} + +// hip.amdgcn.bc - device routine +/* + HW_ID Register bit structure for RDNA2 & RDNA3 + WAVE_ID 4:0 Wave id within the SIMD. + SIMD_ID 9:8 SIMD_ID within the WGP: [0] = row, [1] = column. + WGP_ID 13:10 Physical WGP ID. + SA_ID 16 Shader Array ID + SE_ID 20:18 Shader Engine the wave is assigned to for gfx11 + SE_ID 19:18 Shader Engine the wave is assigned to for gfx10 + DP_RATE 31:29 Number of double-precision float units per SIMD + + HW_ID Register bit structure for GCN and CDNA + WAVE_ID 3:0 Wave buffer slot number. 0-9. + SIMD_ID 5:4 SIMD which the wave is assigned to within the CU. + PIPE_ID 7:6 Pipeline from which the wave was dispatched. + CU_ID 11:8 Compute Unit the wave is assigned to. + SH_ID 12 Shader Array (within an SE) the wave is assigned to. 
+ SE_ID 15:13 Shader Engine the wave is assigned to for gfx908, gfx90a, gfx940-942 + 14:13 Shader Engine the wave is assigned to for Vega. + TG_ID 19:16 Thread-group ID + VM_ID 23:20 Virtual Memory ID + QUEUE_ID 26:24 Queue from which this wave was dispatched. + STATE_ID 29:27 State ID (graphics only, not compute). + ME_ID 31:30 Micro-engine ID. + + XCC_ID Register bit structure for gfx940 + XCC_ID 3:0 XCC the wave is assigned to. + */ + +#if (defined (__GFX10__) || defined (__GFX11__)) + #define HW_ID 23 +#else + #define HW_ID 4 +#endif + +#if (defined(__GFX10__) || defined(__GFX11__)) + #define HW_ID_WGP_ID_SIZE 4 + #define HW_ID_WGP_ID_OFFSET 10 +#else + #define HW_ID_CU_ID_SIZE 4 + #define HW_ID_CU_ID_OFFSET 8 +#endif + +#if (defined(__gfx908__) || defined(__gfx90a__) || \ + defined(__GFX11__)) + #define HW_ID_SE_ID_SIZE 3 +#else //4 SEs/XCC for gfx940-942 + #define HW_ID_SE_ID_SIZE 2 +#endif +#if (defined(__GFX10__) || defined(__GFX11__)) + #define HW_ID_SE_ID_OFFSET 18 + #define HW_ID_SA_ID_OFFSET 16 + #define HW_ID_SA_ID_SIZE 1 +#else + #define HW_ID_SE_ID_OFFSET 13 +#endif + +#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) + #define XCC_ID 20 + #define XCC_ID_XCC_ID_SIZE 4 + #define XCC_ID_XCC_ID_OFFSET 0 +#endif + +#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \ + (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))) + #define __HIP_NO_IMAGE_SUPPORT 1 +#endif + +/* + Encoding of parameter bitmask + HW_ID 5:0 HW_ID + OFFSET 10:6 Range: 0..31 + SIZE 15:11 Range: 1..32 + */ + +#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG)) + +/* + __smid returns the wave's assigned Compute Unit and Shader Engine. + The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4. + Note: the results vary over time. + SZ minus 1 since SIZE is 1-based. +*/ +__device__ +inline +unsigned __smid(void) +{ + unsigned se_id = __builtin_amdgcn_s_getreg( + GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID)); + #if (defined(__GFX10__) || defined(__GFX11__)) + unsigned wgp_id = __builtin_amdgcn_s_getreg( + GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID)); + unsigned sa_id = __builtin_amdgcn_s_getreg( + GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID)); + #else + #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) + unsigned xcc_id = __builtin_amdgcn_s_getreg( + GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID)); + #endif + unsigned cu_id = __builtin_amdgcn_s_getreg( + GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID)); + #endif + #if (defined(__GFX10__) || defined(__GFX11__)) + unsigned temp = se_id; + temp = (temp << HW_ID_SA_ID_SIZE) | sa_id; + temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id; + return temp; + //TODO : CU Mode impl + #elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) + unsigned temp = xcc_id; + temp = (temp << HW_ID_SE_ID_SIZE) | se_id; + temp = (temp << HW_ID_CU_ID_SIZE) | cu_id; + return temp; + #else + return (se_id << HW_ID_CU_ID_SIZE) + cu_id; + #endif +} + +/** + * Map HIP_DYNAMIC_SHARED to "extern __shared__" for compatibility with old HIP applications + * To be removed in a future release. 
+ */
+#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
+#define HIP_DYNAMIC_SHARED_ATTRIBUTE
+
+#endif //defined(__clang__) && defined(__HIP__)
+
+
+// loop unrolling
+static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
+    auto dstPtr = static_cast<unsigned char*>(dst);
+    auto srcPtr = static_cast<const unsigned char*>(src);
+
+    while (size >= 4u) {
+        dstPtr[0] = srcPtr[0];
+        dstPtr[1] = srcPtr[1];
+        dstPtr[2] = srcPtr[2];
+        dstPtr[3] = srcPtr[3];
+
+        size -= 4u;
+        srcPtr += 4u;
+        dstPtr += 4u;
+    }
+    switch (size) {
+        case 3:
+            dstPtr[2] = srcPtr[2];
+        case 2:
+            dstPtr[1] = srcPtr[1];
+        case 1:
+            dstPtr[0] = srcPtr[0];
+    }
+
+    return dst;
+}
+
+static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
+    auto dstPtr = static_cast<unsigned char*>(dst);
+
+    while (size >= 4u) {
+        dstPtr[0] = val;
+        dstPtr[1] = val;
+        dstPtr[2] = val;
+        dstPtr[3] = val;
+
+        size -= 4u;
+        dstPtr += 4u;
+    }
+    switch (size) {
+        case 3:
+            dstPtr[2] = val;
+        case 2:
+            dstPtr[1] = val;
+        case 1:
+            dstPtr[0] = val;
+    }
+
+    return dst;
+}
+#ifndef __OPENMP_AMDGCN__
+static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
+    return __hip_hc_memcpy(dst, src, size);
+}
+
+static inline __device__ void* memset(void* ptr, int val, size_t size) {
+    unsigned char val8 = static_cast<unsigned char>(val);
+    return __hip_hc_memset(ptr, val8, size);
+}
+#endif // !__OPENMP_AMDGCN__
+
+#endif
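+
+#if defined(HIPRTC_HEADER_USAGE_EXAMPLES)  // hypothetical opt-in guard; never defined by HIP
+// Illustrative sketch, not part of the upstream header: in device code, plain
+// memcpy()/memset() calls resolve to the byte-wise, 4x-unrolled helpers above.
+__global__ void fill_demo(unsigned char* buf, size_t n) {
+    memset(buf, 0xAB, n);             // byte fill
+    memcpy(buf, buf + n / 2, n / 4);  // byte copy between disjoint ranges
+}
+#endif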
+/*
+Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
+
+__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
+    union { int i; unsigned u; float f; } tmp; tmp.u = src;
+    tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
+    return tmp.u;
+}
+
+__device__ static inline float __hip_ds_bpermutef(int index, float src) {
+    union { int i; unsigned u; float f; } tmp; tmp.f = src;
+    tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
+    return tmp.f;
+}
+
+__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
+    union { int i; unsigned u; float f; } tmp; tmp.u = src;
+    tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
+    return tmp.u;
+}
+
+__device__ static inline float __hip_ds_permutef(int index, float src) {
+    union { int i; unsigned u; float f; } tmp; tmp.f = src;
+    tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
+    return tmp.f;
+}
+
+#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
+#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
+
+template <int pattern>
+__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
+    union { int i; unsigned u; float f; } tmp; tmp.u = src;
+    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
+    return tmp.u;
+}
+
+template <int pattern>
+__device__ static inline float __hip_ds_swizzlef_N(float src) {
+    union { int i; unsigned u; float f; } tmp; tmp.f = src;
+    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
+    return tmp.f;
+}
+
+#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
+  __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
+
+template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
+__device__ static inline int __hip_move_dpp_N(int src) {
+    return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
+                                    bound_ctrl);
+}
+
+static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
+
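+#if defined(HIPRTC_HEADER_USAGE_EXAMPLES)  // hypothetical opt-in guard; never defined by HIP
+// Illustrative sketch, not part of the upstream header. Our reading of the DS
+// swizzle bit-mask encoding: pattern 0x041F sets and_mask=0x1F, or_mask=0,
+// xor_mask=1, so each lane reads from lane ^ 1 within its group of 32.
+__global__ void swizzle_demo(float* out) {
+    float v = static_cast<float>(__lane_id());
+    out[__lane_id()] = __hip_ds_swizzlef(v, 0x041F);  // neighbor lane's value
+}
+#endif
+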
+__device__
+inline
+int __shfl(int var, int src_lane, int width = warpSize) {
+    int self = __lane_id();
+    int index = (src_lane & (width - 1)) + (self & ~(width-1));
+    return __builtin_amdgcn_ds_bpermute(index<<2, var);
+}
+__device__
+inline
+unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
+    union { int i; unsigned u; float f; } tmp; tmp.u = var;
+    tmp.i = __shfl(tmp.i, src_lane, width);
+    return tmp.u;
+}
+__device__
+inline
+float __shfl(float var, int src_lane, int width = warpSize) {
+    union { int i; unsigned u; float f; } tmp; tmp.f = var;
+    tmp.i = __shfl(tmp.i, src_lane, width);
+    return tmp.f;
+}
+__device__
+inline
+double __shfl(double var, int src_lane, int width = warpSize) {
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl(tmp[0], src_lane, width);
+    tmp[1] = __shfl(tmp[1], src_lane, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+__device__
+inline
+long __shfl(long var, int src_lane, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl(tmp[0], src_lane, width);
+    tmp[1] = __shfl(tmp[1], src_lane, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(long) == sizeof(int), "");
+    return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
+    #endif
+}
+__device__
+inline
+unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
+    #ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl(tmp[0], src_lane, width);
+    tmp[1] = __shfl(tmp[1], src_lane, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
+    #endif
+}
+__device__
+inline
+long long __shfl(long long var, int src_lane, int width = warpSize)
+{
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl(tmp[0], src_lane, width);
+    tmp[1] = __shfl(tmp[1], src_lane, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+__device__
+inline
+unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl(tmp[0], src_lane, width);
+    tmp[1] = __shfl(tmp[1], src_lane, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+
+__device__
+inline
+int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
+    int self = __lane_id();
+    int index = self - lane_delta;
+    index = (index < (self & ~(width-1)))?self:index;
+    return __builtin_amdgcn_ds_bpermute(index<<2, var);
+}
+__device__
+inline
+unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
+    union { int i; unsigned u; float f; } tmp; tmp.u = var;
+    tmp.i = __shfl_up(tmp.i, lane_delta, width);
+    return tmp.u;
+}
+__device__
+inline
+float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
+    union { int i; unsigned u; float f; } tmp; tmp.f = var;
+    tmp.i = __shfl_up(tmp.i, lane_delta, width);
+    return tmp.f;
+}
+__device__
+inline
+double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+__device__
+inline
+long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(long) == sizeof(int), "");
+    return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
+    #endif
+}
+
+__device__
+inline
+unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
+    #endif
+}
+
+__device__
+inline
+long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
+{
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+
+__device__
+inline
+unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
+{
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+
+__device__
+inline
+int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
+    int self = __lane_id();
+    int index = self + lane_delta;
+    index = (int)((self&(width-1))+lane_delta) >= width?self:index;
+    return __builtin_amdgcn_ds_bpermute(index<<2, var);
+}
+__device__
+inline
+unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
+    union { int i; unsigned u; float f; } tmp; tmp.u = var;
+    tmp.i = __shfl_down(tmp.i, lane_delta, width);
+    return tmp.u;
+}
+__device__
+inline
+float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
+    union { int i; unsigned u; float f; } tmp; tmp.f = var;
+    tmp.i = __shfl_down(tmp.i, lane_delta, width);
+    return tmp.f;
+}
+__device__
+inline
+double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+__device__
+inline
+long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(long) == sizeof(int), "");
+    return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
+    #endif
+}
+__device__
+inline
+unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
+    #endif
+}
+__device__
+inline
+long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
+{
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+__device__
+inline
+unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
+{
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+
+__device__
+inline
+int __shfl_xor(int var, int lane_mask, int width = warpSize) {
+    int self = __lane_id();
+    int index = self^lane_mask;
+    index = index >= ((self+width)&~(width-1))?self:index;
+    return __builtin_amdgcn_ds_bpermute(index<<2, var);
+}
+__device__
+inline
+unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
+    union { int i; unsigned u; float f; } tmp; tmp.u = var;
+    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
+    return tmp.u;
+}
+__device__
+inline
+float __shfl_xor(float var, int lane_mask, int width = warpSize) {
+    union { int i; unsigned u; float f; } tmp; tmp.f = var;
+    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
+    return tmp.f;
+}
+__device__
+inline
+double __shfl_xor(double var, int lane_mask, int width = warpSize) {
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
+    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+__device__
+inline
+long __shfl_xor(long var, int lane_mask, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
+    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(long) == sizeof(int), "");
+    return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
+    #endif
+}
+__device__
+inline
+unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
+    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
+    #endif
+}
+__device__
+inline
+long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
+{
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
+    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+__device__
+inline
+unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
+{
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
+    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+
+#endif
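+
+#if defined(HIPRTC_HEADER_USAGE_EXAMPLES)  // hypothetical opt-in guard; never defined by HIP
+// Illustrative sketch, not part of the upstream header: the shuffle overloads
+// above enable the classic intra-wavefront tree reduction.
+__global__ void wave_sum_demo(const float* in, float* out) {
+    float v = in[__lane_id()];
+    for (int off = warpSize / 2; off > 0; off /= 2)
+        v += __shfl_down(v, off);     // fold the upper half onto the lower half
+    if (__lane_id() == 0) *out = v;   // lane 0 now holds the wavefront sum
+}
+#endif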
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file amd_detail/hip_cooperative_groups_helper.h
+ *
+ * @brief Device side implementation of the cooperative groups feature.
+ *
+ * Defines helper constructs and APIs which aid the types and device API
+ * wrappers defined within `amd_detail/hip_cooperative_groups.h`.
+ */
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
+
+#if __cplusplus
+#if !defined(__HIPCC_RTC__)
+#include <hip/amd_detail/amd_hip_runtime.h> // threadId, blockId
+#include <type_traits>
+#endif
+#if !defined(__align__)
+#define __align__(x) __attribute__((aligned(x)))
+#endif
+
+#if !defined(__CG_QUALIFIER__)
+#define __CG_QUALIFIER__ __device__ __forceinline__
+#endif
+
+#if !defined(__CG_STATIC_QUALIFIER__)
+#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
+#endif
+
+#if !defined(_CG_STATIC_CONST_DECL_)
+#define _CG_STATIC_CONST_DECL_ static constexpr
+#endif
+
+#if __AMDGCN_WAVEFRONT_SIZE == 32
+using lane_mask = unsigned int;
+#else
+using lane_mask = unsigned long long int;
+#endif
+
+namespace cooperative_groups {
+
+/* Global scope */
+template <unsigned int size>
+using is_power_of_2 = std::integral_constant<bool, (size & (size - 1)) == 0>;
+
+template <unsigned int size>
+using is_valid_wavefront = std::integral_constant<bool, (size <= __AMDGCN_WAVEFRONT_SIZE)>;
+
+template <unsigned int size>
+using is_valid_tile_size =
+    std::integral_constant<bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;
+
+template <typename T>
+using is_valid_type =
+    std::integral_constant<bool, std::is_integral<T>::value || std::is_floating_point<T>::value>;
+
+namespace internal {
+
+/**
+ * @brief Enums representing different cooperative group types
+ * @note This enum is only applicable on Linux.
+ */
+typedef enum {
+  cg_invalid,
+  cg_multi_grid,
+  cg_grid,
+  cg_workgroup,
+  cg_tiled_group,
+  cg_coalesced_group
+} group_type;
+/**
+ * @ingroup CooperativeG
+ * @{
+ * This section describes the cooperative groups functions of the HIP runtime API.
+ *
+ * Cooperative groups provide flexible thread-parallel programming algorithms in
+ * which threads cooperate and share data to perform collective computations.
+ *
+ * @note The cooperative groups feature is implemented on Linux and under
+ * development on Windows.
+ *
+ */
+/**
+ * @brief Functionalities related to the multi-grid cooperative group type
+ * @note The following cooperative groups functions are only applicable on Linux.
+ */
+namespace multi_grid {
+
+__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
+  return static_cast<uint32_t>(__ockl_multi_grid_num_grids()); }
+
+__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
+  return static_cast<uint32_t>(__ockl_multi_grid_grid_rank()); }
+
+__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast<uint32_t>(__ockl_multi_grid_size()); }
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+  return static_cast<uint32_t>(__ockl_multi_grid_thread_rank()); }
+
+__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_multi_grid_is_valid()); }
+
+__CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
+
+} // namespace multi_grid
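+
+#if defined(HIPRTC_HEADER_USAGE_EXAMPLES)  // hypothetical opt-in guard; never defined by HIP
+// Compile-time illustration (not part of the upstream header) of the tile-size
+// traits defined at the top of this namespace: a valid tile size is a power of
+// two that is no larger than the wavefront.
+static_assert(is_valid_tile_size<16>::value, "16 is a power of two within a wavefront");
+static_assert(!is_valid_tile_size<48>::value, "48 is not a power of two");
+#endif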
+
+/**
+ * @brief Functionalities related to the grid cooperative group type
+ * @note The following cooperative groups functions are only applicable on Linux.
+ */
+namespace grid {
+
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+  return static_cast<uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
+                               (blockDim.x * gridDim.x));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+  // Compute the global id of the workgroup to which the current thread belongs.
+  uint32_t blkIdx = static_cast<uint32_t>((blockIdx.z * gridDim.y * gridDim.x) +
+                                          (blockIdx.y * gridDim.x) + (blockIdx.x));
+
+  // Compute the total number of threads in all workgroups that precede the
+  // current workgroup within the grid.
+  uint32_t num_threads_till_current_workgroup =
+      static_cast<uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));
+
+  // Compute the thread's local rank within the current workgroup.
+  uint32_t local_thread_rank = static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
+                                                     (threadIdx.y * blockDim.x) + (threadIdx.x));
+
+  return (num_threads_till_current_workgroup + local_thread_rank);
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_grid_is_valid()); }
+
+__CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }
+
+} // namespace grid
+
+/**
+ * @brief Functionalities related to the `workgroup` (thread_block in CUDA terminology)
+ * cooperative group type
+ * @note The following cooperative groups functions are only applicable on Linux.
+ */
+namespace workgroup {
+
+__CG_STATIC_QUALIFIER__ dim3 group_index() {
+  return (dim3(static_cast<uint32_t>(blockIdx.x), static_cast<uint32_t>(blockIdx.y),
+               static_cast<uint32_t>(blockIdx.z)));
+}
+
+__CG_STATIC_QUALIFIER__ dim3 thread_index() {
+  return (dim3(static_cast<uint32_t>(threadIdx.x), static_cast<uint32_t>(threadIdx.y),
+               static_cast<uint32_t>(threadIdx.z)));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+  return (static_cast<uint32_t>(blockDim.x * blockDim.y * blockDim.z));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+  return (static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
+                                (threadIdx.y * blockDim.x) + (threadIdx.x)));
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid() {
+  return true;
+}
+
+__CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }
+
+__CG_STATIC_QUALIFIER__ dim3 block_dim() {
+  return (dim3(static_cast<uint32_t>(blockDim.x), static_cast<uint32_t>(blockDim.y),
+               static_cast<uint32_t>(blockDim.z)));
+}
+
+} // namespace workgroup
+
+namespace tiled_group {
+
+// enforce ordering for memory instructions
+__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }
+
+} // namespace tiled_group
+
+namespace coalesced_group {
+
+// enforce ordering for memory instructions
+__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }
+
+// Masked bit count
+//
+// For each thread, this function returns the number of active threads which
+// have the i-th bit of x set and come before the current thread.
+__CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
+  unsigned int counter = 0;
+  #if __AMDGCN_WAVEFRONT_SIZE == 32
+  counter = __builtin_amdgcn_mbcnt_lo(x, add);
+  #else
+  counter = __builtin_amdgcn_mbcnt_lo(static_cast<unsigned int>(x), add);
+  counter = __builtin_amdgcn_mbcnt_hi(static_cast<unsigned int>(x >> 32), counter);
+  #endif
+
+  return counter;
+}
+
+} // namespace coalesced_group
+
+
+} // namespace internal
+
+} // namespace cooperative_groups
+/**
+* @}
+*/
+
+#endif // __cplusplus
+#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
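+
+#if defined(HIPRTC_HEADER_USAGE_EXAMPLES)  // hypothetical opt-in guard; never defined by HIP
+// Illustrative sketch, not part of the upstream header: masked_bit_count()
+// above is the building block for coalesced-group thread ranks. Given a member
+// mask (e.g. from __ballot64(1)), each participating lane gets the count of
+// participating lanes below it.
+__device__ unsigned int active_rank_below(lane_mask members) {
+    return cooperative_groups::internal::coalesced_group::masked_bit_count(members);
+}
+#endif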
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file amd_detail/hip_cooperative_groups.h
+ *
+ * @brief Device side implementation of the `Cooperative Group` feature.
+ *
+ * Defines new types and device API wrappers related to the `Cooperative Group`
+ * feature, which programmers can use directly in their kernels.
+ */
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
+
+#if __cplusplus
+#if !defined(__HIPCC_RTC__)
+#include <hip/amd_detail/hip_cooperative_groups_helper.h>
+#endif
+
+#define __hip_abort() \
+  { abort(); }
+#if defined(NDEBUG)
+#define __hip_assert(COND)
+#else
+#define __hip_assert(COND) \
+  {                        \
+    if (!(COND)) {         \
+      __hip_abort();       \
+    }                      \
+  }
+#endif
+
+namespace cooperative_groups {
+
+/** @brief The base type of all cooperative group types
+ *
+ * \details Holds the key properties of a constructed cooperative group type
+ * object, such as the group type, its size, etc.
+ *
+ * @note The cooperative groups feature is implemented on Linux and under
+ * development on Windows.
+ */
+class thread_group {
+ protected:
+  uint32_t _type;  // thread_group type
+  uint32_t _size;  // total number of threads in the thread_group
+  uint64_t _mask;  // Lanemask for coalesced and tiled partitioned group types,
+                   // LSB represents lane 0, and MSB represents lane 63
+
+  // Construct a thread group, and set the thread group type and other
+  // essential thread group properties. This generic thread group is directly
+  // constructed only when the group is supposed to contain just the calling
+  // thread (through the API `this_thread()`); in all other cases, this thread
+  // group object is a sub-object of some other derived thread group object.
+  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint32_t>(0),
+                                uint64_t mask = static_cast<uint64_t>(0)) {
+    _type = type;
+    _size = size;
+    _mask = mask;
+  }
+
+  struct _tiled_info {
+    bool is_tiled;
+    unsigned int size;
+    unsigned int meta_group_rank;
+    unsigned int meta_group_size;
+  };
+
+  struct _coalesced_info {
+    lane_mask member_mask;
+    unsigned int size;
+    struct _tiled_info tiled_info;
+  } coalesced_info;
+
+  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
+                                                       unsigned int tile_size);
+  friend class thread_block;
+
+ public:
+  // Total number of threads in the thread group; this serves the purpose
+  // for all derived cooperative group types since their `size` is directly
+  // saved during construction.
+  __CG_QUALIFIER__ uint32_t size() const { return _size; }
+  __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
+  // Rank of the calling thread within [0, size())
+  __CG_QUALIFIER__ uint32_t thread_rank() const;
+  // Is this cooperative group type valid?
+  __CG_QUALIFIER__ bool is_valid() const;
+  // Synchronize the threads in the thread group.
+  __CG_QUALIFIER__ void sync() const;
+};
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @defgroup CooperativeG Cooperative Groups
+ * @ingroup API
+ * @{
+ * This section describes the cooperative groups functions of the HIP runtime API.
+ *
+ * Cooperative groups provide flexible thread-parallel programming algorithms in
+ * which threads cooperate and share data to perform collective computations.
+ *
+ * @note The cooperative groups feature is implemented on Linux and under
+ * development on Windows.
+ *
+ */
+/** \brief The multi-grid cooperative group type
+ *
+ * \details Represents an inter-device cooperative group type where the
+ * participating threads within the group span across multiple
+ * devices, running the (same) kernel on these devices.
+ * @note The multi-grid cooperative group type is implemented on Linux and
+ * under development on Windows.
+ */
+class multi_grid_group : public thread_group {
+  // Only these friend functions are allowed to construct an object of this class
+  // and access its resources
+  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
+
+ protected:
+  // Construct a multi-grid thread group (through the API this_multi_grid())
+  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
+      : thread_group(internal::cg_multi_grid, size) {}
+
+ public:
+  // Number of invocations participating in this multi-grid group. In other
+  // words, the number of GPUs.
+  __CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
+  // Rank of this invocation. In other words, an ID number within the range
+  // [0, num_grids()) of the GPU this kernel is running on.
+  __CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
+  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
+  __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
+  __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
+};
+
+/** @brief User exposed API interface to construct the multi-grid cooperative
+ * group type object - `multi_grid_group`
+ *
+ * \details Users may not construct an object of type `multi_grid_group`
+ * directly; it must be constructed through this API function.
+ * @note This multi-grid cooperative API type is implemented on Linux and
+ * under development on Windows.
+ */
+__CG_QUALIFIER__ multi_grid_group this_multi_grid() {
+  return multi_grid_group(internal::multi_grid::size());
+}
+
+/** @brief The grid cooperative group type
+ *
+ * \details Represents an inter-workgroup cooperative group type where the
+ * participating threads within the group span across multiple
+ * workgroups running the (same) kernel on the same device.
+ * @note This is implemented on Linux and under development on Windows.
+ */
+class grid_group : public thread_group {
+  // Only these friend functions are allowed to construct an object of this class
+  // and access its resources
+  friend __CG_QUALIFIER__ grid_group this_grid();
+
+ protected:
+  // Construct a grid thread group (through the API this_grid())
+  explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}
+
+ public:
+  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
+  __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
+  __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
+};
+
+/** @brief User exposed API interface to construct the grid cooperative group type
+ * object - `grid_group`
+ *
+ * \details Users may not construct an object of type `grid_group` directly;
+ * it must be constructed through this API function.
+ * @note This function is implemented on Linux and under development on Windows.
+ */
+__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
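+
+#if defined(HIPRTC_HEADER_USAGE_EXAMPLES)  // hypothetical opt-in guard; never defined by HIP
+// Illustrative sketch, not part of the upstream header: a grid-wide barrier.
+// This only works for kernels launched cooperatively, e.g. via
+// hipLaunchCooperativeKernel.
+__global__ void grid_sync_demo(int* flag) {
+  cooperative_groups::grid_group g = cooperative_groups::this_grid();
+  if (g.thread_rank() == 0) *flag = 1;  // a single store for the whole grid
+  g.sync();                             // every workgroup waits here
+  // After the sync, all threads in the grid observe *flag == 1.
+}
+#endif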
+ */
+class thread_block : public thread_group {
+  // Only these friend functions are allowed to construct an object of this
+  // class and access its resources
+  friend __CG_QUALIFIER__ thread_block this_thread_block();
+  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
+                                                       unsigned int tile_size);
+  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
+                                                       unsigned int tile_size);
+ protected:
+  // Construct a workgroup thread group (through the API this_thread_block())
+  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
+      : thread_group(internal::cg_workgroup, size) {}
+
+  __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
+    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
+    // Invalid tile size, assert
+    if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
+      __hip_assert(false && "invalid tile size")
+    }
+
+    thread_group tiledGroup = thread_group(internal::cg_tiled_group, tile_size);
+    tiledGroup.coalesced_info.tiled_info.size = tile_size;
+    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
+    tiledGroup.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
+    tiledGroup.coalesced_info.tiled_info.meta_group_size = (size() + tile_size - 1) / tile_size;
+    return tiledGroup;
+  }
+
+ public:
+  // 3-dimensional block index within the grid
+  __CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
+  // 3-dimensional thread index within the block
+  __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
+  __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
+  __CG_STATIC_QUALIFIER__ uint32_t size() { return internal::workgroup::size(); }
+  __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
+  __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
+  __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
+};
+
+/** \brief User exposed API interface to construct a workgroup cooperative
+ *  group type object - `thread_block`.
+ *
+ *  \details Users are not allowed to construct an object of type
+ *           `thread_block` directly; instead, they should construct it
+ *           through this API function.
+ *  @note This function is implemented on Linux and is under development
+ *  on Windows.
+ */
+__CG_QUALIFIER__ thread_block this_thread_block() {
+  return thread_block(internal::workgroup::size());
+}
+
+/** \brief The tiled_group cooperative group type
+ *
+ *  \details Represents one tiled thread group in a wavefront.
+ *           This group type also supports sub-wave level intrinsics.
+ *  @note This is implemented on Linux and is under development
+ *  on Windows.
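+ *
+ *  A minimal usage sketch (illustrative only):
+ *  @code
+ *  namespace cg = cooperative_groups;
+ *  cg::thread_block block = cg::this_thread_block();
+ *  cg::thread_group tile = cg::tiled_partition(block, 8);  // run-time tile size
+ *  unsigned int lane = tile.thread_rank();                 // rank within the 8-wide tile
+ *  @endcode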
+ */
+
+class tiled_group : public thread_group {
+ private:
+  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
+                                                       unsigned int tile_size);
+  friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
+                                                      unsigned int tile_size);
+
+  __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
+    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
+
+    if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
+      __hip_assert(false && "invalid tile size")
+    }
+
+    if (size() <= tile_size) {
+      return *this;
+    }
+
+    tiled_group tiledGroup = tiled_group(tile_size);
+    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
+    return tiledGroup;
+  }
+
+ protected:
+  explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
+      : thread_group(internal::cg_tiled_group, tileSize) {
+    coalesced_info.tiled_info.size = tileSize;
+    coalesced_info.tiled_info.is_tiled = true;
+  }
+
+ public:
+  __CG_QUALIFIER__ unsigned int size() const { return (coalesced_info.tiled_info.size); }
+
+  __CG_QUALIFIER__ unsigned int thread_rank() const {
+    return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.size - 1));
+  }
+
+  __CG_QUALIFIER__ void sync() const {
+    internal::tiled_group::sync();
+  }
+};
+
+/** \brief The coalesced_group cooperative group type
+ *
+ *  \details Represents an active thread group in a wavefront.
+ *           This group type also supports sub-wave level intrinsics.
+ *  @note This is implemented on Linux and is under development
+ *  on Windows.
+ */
+class coalesced_group : public thread_group {
+ private:
+  friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
+  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size);
+  friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size);
+
+  __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
+    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
+
+    if (!tile_size || (tile_size > size()) || !pow2) {
+      return coalesced_group(0);
+    }
+
+    // If a tiled group is passed to be partitioned further into a coalesced_group,
+    // prepare a mask for further partitioning it so that it stays coalesced.
+    if (coalesced_info.tiled_info.is_tiled) {
+      unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
+      unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
+      lane_mask member_mask = static_cast<lane_mask>(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);
+
+      member_mask <<= (__lane_id() & ~(tile_size - 1));
+      coalesced_group coalesced_tile = coalesced_group(member_mask);
+      coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
+      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
+      coalesced_tile.coalesced_info.tiled_info.meta_group_size = size() / tile_size;
+      return coalesced_tile;
+    }
+    // Here the parent coalesced_group is not partitioned.
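+    // Build the tile mask lane by lane: first skip the active lanes that
+    // belong to the tiles preceding this one, then collect the next
+    // tile_size active lanes into member_mask.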
+    else {
+      lane_mask member_mask = 0;
+      unsigned int tile_rank = 0;
+      int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;
+
+      for (unsigned int i = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) {
+        lane_mask active = coalesced_info.member_mask & (1 << i);
+        // Make sure the lane is active
+        if (active) {
+          if (lanes_to_skip <= 0 && tile_rank < tile_size) {
+            // Prepare a member_mask that is appropriate for a tile
+            member_mask |= active;
+            tile_rank++;
+          }
+          lanes_to_skip--;
+        }
+      }
+      coalesced_group coalesced_tile = coalesced_group(member_mask);
+      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
+      coalesced_tile.coalesced_info.tiled_info.meta_group_size =
+          (size() + tile_size - 1) / tile_size;
+      return coalesced_tile;
+    }
+    return coalesced_group(0);
+  }
+
+ protected:
+  // Constructor
+  explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
+      : thread_group(internal::cg_coalesced_group) {
+    coalesced_info.member_mask = member_mask;                    // Which threads are active
+    coalesced_info.size = __popcll(coalesced_info.member_mask);  // How many threads are active
+    coalesced_info.tiled_info.is_tiled = false;                  // Not a partitioned group
+    coalesced_info.tiled_info.meta_group_rank = 0;
+    coalesced_info.tiled_info.meta_group_size = 1;
+  }
+
+ public:
+  __CG_QUALIFIER__ unsigned int size() const {
+    return coalesced_info.size;
+  }
+
+  __CG_QUALIFIER__ unsigned int thread_rank() const {
+    return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
+  }
+
+  __CG_QUALIFIER__ void sync() const {
+    internal::coalesced_group::sync();
+  }
+
+  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
+    return coalesced_info.tiled_info.meta_group_rank;
+  }
+
+  __CG_QUALIFIER__ unsigned int meta_group_size() const {
+    return coalesced_info.tiled_info.meta_group_size;
+  }
+
+  template <class T>
+  __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
+
+    srcRank = srcRank % static_cast<int>(size());
+
+    int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
+        : (__AMDGCN_WAVEFRONT_SIZE == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
+                                          : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
+
+    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
+  }
+
+  template <class T>
+  __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
+
+    // Note: The cuda implementation appears to use the remainder of lane_delta
+    // and WARP_SIZE as the shift value rather than lane_delta itself.
+    // This is not described in the documentation and is not done here.
+
+    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
+      return __shfl_down(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
+    }
+
+    int lane;
+    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
+      lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
+    }
+    else {
+      lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
+    }
+
+    if (lane == -1) {
+      lane = __lane_id();
+    }
+
+    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
+  }
+
+  template <class T>
+  __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
+
+    // Note: The cuda implementation appears to use the remainder of lane_delta
+    // and WARP_SIZE as the shift value rather than lane_delta itself.
+    // This is not described in the documentation and is not done here.
+
+    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
+      return __shfl_up(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
+    }
+
+    int lane;
+    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
+      lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
+    }
+    else if (__AMDGCN_WAVEFRONT_SIZE == 32) {
+      lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
+    }
+
+    if (lane == -1) {
+      lane = __lane_id();
+    }
+
+    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
+  }
+};
+
+/** \brief User exposed API to create coalesced groups.
+ *
+ *  \details A collective operation that groups all active lanes into a new thread group.
+ *  @note This function is implemented on Linux and is under development
+ *  on Windows.
+ */
+
+__CG_QUALIFIER__ coalesced_group coalesced_threads() {
+  return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
+}
+
+/**
+ *  Implementation of all publicly exposed base class APIs
+ *  @note This function is implemented on Linux and is under development
+ *  on Windows.
+ */
+__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid: {
+      return (static_cast<const multi_grid_group*>(this)->thread_rank());
+    }
+    case internal::cg_grid: {
+      return (static_cast<const grid_group*>(this)->thread_rank());
+    }
+    case internal::cg_workgroup: {
+      return (static_cast<const thread_block*>(this)->thread_rank());
+    }
+    case internal::cg_tiled_group: {
+      return (static_cast<const tiled_group*>(this)->thread_rank());
+    }
+    case internal::cg_coalesced_group: {
+      return (static_cast<const coalesced_group*>(this)->thread_rank());
+    }
+    default: {
+      __hip_assert(false && "invalid cooperative group type")
+      return -1;
+    }
+  }
+}
+/**
+ *  Implementation of the publicly exposed thread group is_valid() API
+ *  @note This function is implemented on Linux and is under development
+ *  on Windows.
+ */
+__CG_QUALIFIER__ bool thread_group::is_valid() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid: {
+      return (static_cast<const multi_grid_group*>(this)->is_valid());
+    }
+    case internal::cg_grid: {
+      return (static_cast<const grid_group*>(this)->is_valid());
+    }
+    case internal::cg_workgroup: {
+      return (static_cast<const thread_block*>(this)->is_valid());
+    }
+    case internal::cg_tiled_group: {
+      return (static_cast<const tiled_group*>(this)->is_valid());
+    }
+    case internal::cg_coalesced_group: {
+      return (static_cast<const coalesced_group*>(this)->is_valid());
+    }
+    default: {
+      __hip_assert(false && "invalid cooperative group type")
+      return false;
+    }
+  }
+}
+/**
+ *  Implementation of the publicly exposed thread group sync() API
+ *  @note This function is implemented on Linux and is under development
+ *  on Windows.
+ */
+__CG_QUALIFIER__ void thread_group::sync() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid: {
+      static_cast<const multi_grid_group*>(this)->sync();
+      break;
+    }
+    case internal::cg_grid: {
+      static_cast<const grid_group*>(this)->sync();
+      break;
+    }
+    case internal::cg_workgroup: {
+      static_cast<const thread_block*>(this)->sync();
+      break;
+    }
+    case internal::cg_tiled_group: {
+      static_cast<const tiled_group*>(this)->sync();
+      break;
+    }
+    case internal::cg_coalesced_group: {
+      static_cast<const coalesced_group*>(this)->sync();
+      break;
+    }
+    default: {
+      __hip_assert(false && "invalid cooperative group type")
+    }
+  }
+}
+
+/**
+ *  Implementation of the publicly exposed `wrapper` API on top of the basic
+ *  cooperative group type APIs
+ *  @note This function is implemented on Linux and is under development
+ *  on Windows.
+ */
+template <class CGTy> __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); }
+/**
+ *  Implementation of the publicly exposed `wrapper` API on top of the basic
+ *  cooperative group type APIs
+ *  @note This function is implemented on Linux and is under development
+ *  on Windows.
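+ *
+ *  A minimal usage sketch (illustrative only):
+ *  @code
+ *  namespace cg = cooperative_groups;
+ *  auto g = cg::this_thread_block();
+ *  uint32_t n = cg::group_size(g);   // equivalent to g.size()
+ *  uint32_t r = cg::thread_rank(g);  // equivalent to g.thread_rank()
+ *  @endcode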
+ */
+template <class CGTy> __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) {
+  return g.thread_rank();
+}
+/**
+ *  Implementation of the publicly exposed `wrapper` API on top of the basic
+ *  cooperative group type APIs
+ *  @note This function is implemented on Linux and is under development
+ *  on Windows.
+ */
+template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }
+/**
+ *  Implementation of the publicly exposed `wrapper` API on top of the basic
+ *  cooperative group type APIs
+ *  @note This function is implemented on Linux and is under development
+ *  on Windows.
+ */
+template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }
+/**
+ *  template class tile_base
+ *  @note This class is implemented on Linux and is under development
+ *  on Windows.
+ */
+template <unsigned int tileSize> class tile_base {
+ protected:
+  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
+
+ public:
+  // Rank of the thread within this tile
+  _CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
+    return (internal::workgroup::thread_rank() & (numThreads - 1));
+  }
+
+  // Number of threads within this tile
+  __CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
+};
+/**
+ *  template class thread_block_tile_base
+ *  @note This class is implemented on Linux and is under development
+ *  on Windows.
+ */
+template <unsigned int size> class thread_block_tile_base : public tile_base<size> {
+  static_assert(is_valid_tile_size<size>::value,
+                "Tile size is either not a power of 2 or greater than the wavefront size");
+  using tile_base<size>::numThreads;
+
+ public:
+  __CG_STATIC_QUALIFIER__ void sync() {
+    internal::tiled_group::sync();
+  }
+
+  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
+    return (__shfl(var, srcRank, numThreads));
+  }
+
+  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
+    return (__shfl_down(var, lane_delta, numThreads));
+  }
+
+  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
+    return (__shfl_up(var, lane_delta, numThreads));
+  }
+
+  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
+    return (__shfl_xor(var, laneMask, numThreads));
+  }
+};
+/** \brief User exposed API that captures the state of the parent group pre-partition
+ */
+template <unsigned int tileSize, typename ParentCGTy>
+class parent_group_info {
+ public:
+  // Returns the linear rank of the group within the set of tiles partitioned
+  // from a parent group (bounded by meta_group_size)
+  __CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
+    return ParentCGTy::thread_rank() / tileSize;
+  }
+
+  // Returns the number of groups created when the parent group was partitioned.
+  __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
+    return (ParentCGTy::size() + tileSize - 1) / tileSize;
+  }
+};
+
+/** \brief Group type - thread_block_tile
+ *
+ *  \details Represents one tile of a thread group.
+ *  @note This type is implemented on Linux and is under development
+ *  on Windows.
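+ *
+ *  A minimal usage sketch (illustrative only; `partial` is a hypothetical
+ *  per-thread value):
+ *  @code
+ *  namespace cg = cooperative_groups;
+ *  auto tile = cg::tiled_partition<16>(cg::this_thread_block());
+ *  int partial = static_cast<int>(tile.thread_rank());
+ *  partial += tile.shfl_down(partial, 8);  // sub-wave shuffle within the tile
+ *  tile.sync();
+ *  @endcode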
+ */
+template <unsigned int tileSize, class ParentCGTy>
+class thread_block_tile_type : public thread_block_tile_base<tileSize>,
+                               public tiled_group,
+                               public parent_group_info<tileSize, ParentCGTy> {
+  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
+ protected:
+  __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
+    coalesced_info.tiled_info.size = numThreads;
+    coalesced_info.tiled_info.is_tiled = true;
+  }
+};
+
+// Partial template specialization
+template <unsigned int tileSize>
+class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<tileSize>,
+                                               public tiled_group
+  {
+  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
+
+  typedef thread_block_tile_base<numThreads> tbtBase;
+
+ protected:
+
+  __CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
+      : tiled_group(numThreads) {
+    coalesced_info.tiled_info.size = numThreads;
+    coalesced_info.tiled_info.is_tiled = true;
+    coalesced_info.tiled_info.meta_group_rank = meta_group_rank;
+    coalesced_info.tiled_info.meta_group_size = meta_group_size;
+  }
+
+ public:
+  using tbtBase::size;
+  using tbtBase::sync;
+  using tbtBase::thread_rank;
+
+  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
+    return coalesced_info.tiled_info.meta_group_rank;
+  }
+
+  __CG_QUALIFIER__ unsigned int meta_group_size() const {
+    return coalesced_info.tiled_info.meta_group_size;
+  }
+// end of cooperative group
+/**
+ * @}
+ */
+};
+
+
+/** \brief User exposed API to partition groups.
+ *
+ *  \details A collective operation that partitions the parent group into a
+ *           one-dimensional, row-major tiling of subgroups.
+ */
+
+__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
+  if (parent.cg_type() == internal::cg_tiled_group) {
+    const tiled_group* cg = static_cast<const tiled_group*>(&parent);
+    return cg->new_tiled_group(tile_size);
+  }
+  else if (parent.cg_type() == internal::cg_coalesced_group) {
+    const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
+    return cg->new_tiled_group(tile_size);
+  }
+  else {
+    const thread_block* tb = static_cast<const thread_block*>(&parent);
+    return tb->new_tiled_group(tile_size);
+  }
+}
+
+// Thread block type overload
+__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
+  return (parent.new_tiled_group(tile_size));
+}
+
+__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
+  return (parent.new_tiled_group(tile_size));
+}
+
+// If a coalesced group is passed to be partitioned, it should remain coalesced
+__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size) {
+  return (parent.new_tiled_group(tile_size));
+}
+
+template <unsigned int size, class ParentCGTy> class thread_block_tile;
+
+namespace impl {
+template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;
+
+template <unsigned int size, class ParentCGTy>
+class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGTy> {
+ protected:
+  template <unsigned int tbtSize, class tbtParentT>
+  __CG_QUALIFIER__ thread_block_tile_internal(
+      const thread_block_tile_internal<tbtSize, tbtParentT>& g)
+      : thread_block_tile_type<size, ParentCGTy>(g.meta_group_rank(), g.meta_group_size()) {}
+
+  __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
+      : thread_block_tile_type<size, ParentCGTy>() {}
+};
+}  // namespace impl
+
+template <unsigned int size, class ParentCGTy>
+class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
+ protected:
+  __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
+      : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}
+
+ public:
+  __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
+    return thread_block_tile<size, void>(*this);
+  }
+};
+
+
+template <unsigned int size>
+class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
+  template <unsigned int, class ParentCGTy> friend class thread_block_tile;
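+  // This <size, void> specialization erases the parent group type, so tiles of
+  // the same width can be stored and passed around uniformly; the converting
+  // constructor below accepts a tile created from any parent group type.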
+
+ protected:
+ public:
+  template <class ParentCGTy>
+  __CG_QUALIFIER__ thread_block_tile(const thread_block_tile<size, ParentCGTy>& g)
+      : impl::thread_block_tile_internal<size, void>(g) {}
+};
+
+template <unsigned int size, class ParentCGTy = void> class thread_block_tile;
+
+namespace impl {
+template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;
+
+template <unsigned int size>
+struct tiled_partition_internal<size, thread_block> : public thread_block_tile<size, thread_block> {
+  __CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
+      : thread_block_tile<size, thread_block>(g) {}
+};
+
+}  // namespace impl
+
+/** \brief User exposed API to partition groups.
+ *
+ *  \details This constructs a templated class derived from thread_group.
+ *           The template defines the tile size of the new thread group at compile time.
+ */
+template <unsigned int size, class ParentCGTy>
+__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
+  static_assert(is_valid_tile_size<size>::value,
+                "Tiled partition with size > wavefront size. Currently not supported ");
+  return impl::tiled_partition_internal<size, ParentCGTy>(g);
+}
+}  // namespace cooperative_groups
+
+#endif  // __cplusplus
+#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
+/*
+Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#ifdef __cplusplus
+
+/**
+ * @brief Unsafe floating point rmw atomic add.
+ *
+ * Performs a relaxed read-modify-write floating point atomic add with
+ * device memory scope. The original value at \p addr is returned and
+ * the value of \p addr is updated to have the original value plus \p value
+ *
+ * @note This operation currently only performs different operations for
+ * the gfx90a target. Other devices continue to use safe atomics.
+ *
+ * It can be used to generate code that uses fast hardware floating point atomic
+ * operations which may handle rounding and subnormal values differently than
+ * non-atomic floating point operations.
+ *
+ * The operation is not always safe and can have undefined behavior unless
+ * the following conditions are met:
+ *
+ * - \p addr is at least 4 bytes aligned
+ * - If \p addr is a global segment address, it is in a coarse grain allocation.
+ *   Passing in global segment addresses in fine grain allocations will result in
+ *   undefined behavior and is not supported.
+ *
+ * @param [in,out] addr Pointer to value to be incremented by \p value.
+ * @param [in] value Value by which \p addr is to be incremented.
+ * @return Original value contained in \p addr.
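+ *
+ *  A minimal usage sketch (illustrative only; `bins`, `idx`, `w` and `n` are
+ *  hypothetical kernel arguments, and `bins` must satisfy the conditions above):
+ *  @code
+ *  __global__ void accumulate(float* bins, const int* idx, const float* w, int n) {
+ *    int i = blockIdx.x * blockDim.x + threadIdx.x;
+ *    if (i < n) unsafeAtomicAdd(&bins[idx[i]], w[i]);  // returns the old value
+ *  }
+ *  @endcode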
+ */ +__device__ inline float unsafeAtomicAdd(float* addr, float value) { +#if defined(__gfx90a__) && \ + __has_builtin(__builtin_amdgcn_is_shared) && \ + __has_builtin(__builtin_amdgcn_is_private) && \ + __has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) && \ + __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32) + if (__builtin_amdgcn_is_shared( + (const __attribute__((address_space(0))) void*)addr)) + return __builtin_amdgcn_ds_atomic_fadd_f32(addr, value); + else if (__builtin_amdgcn_is_private( + (const __attribute__((address_space(0))) void*)addr)) { + float temp = *addr; + *addr = temp + value; + return temp; + } + else + return __builtin_amdgcn_global_atomic_fadd_f32(addr, value); +#elif __has_builtin(__hip_atomic_fetch_add) + return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else + return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED); +#endif +} + +/** + * @brief Unsafe floating point rmw atomic max. + * + * Performs a relaxed read-modify-write floating point atomic max with + * device memory scope. The original value at \p addr is returned and + * the value at \p addr is replaced by \p val if greater. + * + * @note This operation is currently identical to that performed by + * atomicMax and is included for completeness. + * + * @param [in,out] addr Pointer to value to be updated + * @param [in] val Value used to update the value at \p addr. + * @return Original value contained in \p addr. + */ +__device__ inline float unsafeAtomicMax(float* addr, float val) { + #if __has_builtin(__hip_atomic_load) && \ + __has_builtin(__hip_atomic_compare_exchange_strong) + float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + bool done = false; + while (!done && value < val) { + done = __hip_atomic_compare_exchange_strong(addr, &value, val, + __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } + return value; + #else + unsigned int *uaddr = (unsigned int *)addr; + unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED); + bool done = false; + while (!done && __uint_as_float(value) < val) { + done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, + __ATOMIC_RELAXED, __ATOMIC_RELAXED); + } + return __uint_as_float(value); + #endif +} + +/** + * @brief Unsafe floating point rmw atomic min. + * + * Performs a relaxed read-modify-write floating point atomic min with + * device memory scope. The original value at \p addr is returned and + * the value at \p addr is replaced by \p val if lesser. + * + * @note This operation is currently identical to that performed by + * atomicMin and is included for completeness. + * + * @param [in,out] addr Pointer to value to be updated + * @param [in] val Value used to update the value at \p addr. + * @return Original value contained in \p addr. 
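+ *
+ *  A minimal usage sketch (illustrative only; `global_min` is a hypothetical
+ *  float in device memory):
+ *  @code
+ *  float seen = unsafeAtomicMin(&global_min, candidate);  // value before the min
+ *  @endcode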
+ */
+__device__ inline float unsafeAtomicMin(float* addr, float val) {
+  #if __has_builtin(__hip_atomic_load) && \
+      __has_builtin(__hip_atomic_compare_exchange_strong)
+  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  bool done = false;
+  while (!done && value > val) {
+    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+  return value;
+  #else
+  unsigned int *uaddr = (unsigned int *)addr;
+  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+  bool done = false;
+  while (!done && __uint_as_float(value) > val) {
+    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+  }
+  return __uint_as_float(value);
+  #endif
+}
+
+/**
+ * @brief Unsafe double precision rmw atomic add.
+ *
+ * Performs a relaxed read-modify-write double precision atomic add with
+ * device memory scope. The original value at \p addr is returned and
+ * the value of \p addr is updated to have the original value plus \p value
+ *
+ * @note This operation currently only performs different operations for
+ * the gfx90a target. Other devices continue to use safe atomics.
+ *
+ * It can be used to generate code that uses fast hardware floating point atomic
+ * operations which may handle rounding and subnormal values differently than
+ * non-atomic floating point operations.
+ *
+ * The operation is not always safe and can have undefined behavior unless
+ * the following conditions are met:
+ *
+ * - \p addr is at least 8 bytes aligned
+ * - If \p addr is a global segment address, it is in a coarse grain allocation.
+ *   Passing in global segment addresses in fine grain allocations will result in
+ *   undefined behavior and is not supported.
+ *
+ * @param [in,out] addr Pointer to value to be updated.
+ * @param [in] value Value by which \p addr is to be incremented.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline double unsafeAtomicAdd(double* addr, double value) {
+#if defined(__gfx90a__) && __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64)
+  return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value);
+#elif __has_builtin(__hip_atomic_fetch_add)
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else
+  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
+#endif
+}
+
+/**
+ * @brief Unsafe double precision rmw atomic max.
+ *
+ * Performs a relaxed read-modify-write double precision atomic max with
+ * device memory scope. The original value at \p addr is returned and
+ * the value of \p addr is updated with \p val if greater.
+ *
+ * @note This operation currently only performs different operations for
+ * the gfx90a target. Other devices continue to use safe atomics.
+ *
+ * It can be used to generate code that uses fast hardware floating point atomic
+ * operations which may handle rounding and subnormal values differently than
+ * non-atomic floating point operations.
+ *
+ * The operation is not always safe and can have undefined behavior unless
+ * the following conditions are met:
+ *
+ * - \p addr is at least 8 bytes aligned
+ * - If \p addr is a global segment address, it is in a coarse grain allocation.
+ *   Passing in global segment addresses in fine grain allocations will result in
+ *   undefined behavior and is not supported.
+ *
+ * @param [in,out] addr Pointer to value to be updated.
+ * @param [in] val Value used to update the contents at \p addr
+ * @return Original value contained at \p addr.
+ */
+__device__ inline double unsafeAtomicMax(double* addr, double val) {
+#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
+    __has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64)
+  return __builtin_amdgcn_flat_atomic_fmax_f64(addr, val);
+#else
+  #if __has_builtin(__hip_atomic_load) && \
+      __has_builtin(__hip_atomic_compare_exchange_strong)
+  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  bool done = false;
+  while (!done && value < val) {
+    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+  return value;
+  #else
+  unsigned long long *uaddr = (unsigned long long *)addr;
+  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+  bool done = false;
+  while (!done && __longlong_as_double(value) < val) {
+    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+  }
+  return __longlong_as_double(value);
+  #endif
+#endif
+}
+
+/**
+ * @brief Unsafe double precision rmw atomic min.
+ *
+ * Performs a relaxed read-modify-write double precision atomic min with
+ * device memory scope. The original value at \p addr is returned and
+ * the value of \p addr is updated with \p val if lesser.
+ *
+ * @note This operation currently only performs different operations for
+ * the gfx90a target. Other devices continue to use safe atomics.
+ *
+ * It can be used to generate code that uses fast hardware floating point atomic
+ * operations which may handle rounding and subnormal values differently than
+ * non-atomic floating point operations.
+ *
+ * The operation is not always safe and can have undefined behavior unless
+ * the following conditions are met:
+ *
+ * - \p addr is at least 8 bytes aligned
+ * - If \p addr is a global segment address, it is in a coarse grain allocation.
+ *   Passing in global segment addresses in fine grain allocations will result in
+ *   undefined behavior and is not supported.
+ *
+ * @param [in,out] addr Pointer to value to be updated.
+ * @param [in] val Value used to update the contents at \p addr
+ * @return Original value contained at \p addr.
+ */
+__device__ inline double unsafeAtomicMin(double* addr, double val) {
+#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
+    __has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64)
+  return __builtin_amdgcn_flat_atomic_fmin_f64(addr, val);
+#else
+  #if __has_builtin(__hip_atomic_load) && \
+      __has_builtin(__hip_atomic_compare_exchange_strong)
+  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  bool done = false;
+  while (!done && value > val) {
+    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+  return value;
+  #else
+  unsigned long long *uaddr = (unsigned long long *)addr;
+  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+  bool done = false;
+  while (!done && __longlong_as_double(value) > val) {
+    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+  }
+  return __longlong_as_double(value);
+  #endif
+#endif
+}
+
+/**
+ * @brief Safe floating point rmw atomic add.
+ *
+ * Performs a relaxed read-modify-write floating point atomic add with
+ * device memory scope. The original value at \p addr is returned and
+ * the value of \p addr is updated to have the original value plus \p value
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to value to be incremented by \p value.
+ * @param [in] value Value by which \p addr is to be incremented.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline float safeAtomicAdd(float* addr, float value) {
+#if defined(__gfx908__) || defined(__gfx941__) \
+    || ((defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx942__)) \
+        && !__has_builtin(__hip_atomic_fetch_add))
+  // On gfx908, we can generate unsafe FP32 atomic add that does not follow all
+  // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
+  // On gfx941, we can generate unsafe FP32 atomic add that may not always happen atomically,
+  // so we need to force a CAS loop emulation to ensure safety.
+  // On gfx90a, gfx940 and gfx942 if we do not have the __hip_atomic_fetch_add builtin, we
+  // need to force a CAS loop here.
+  float old_val;
+#if __has_builtin(__hip_atomic_load)
+  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else  // !__has_builtin(__hip_atomic_load)
+  old_val = __uint_as_float(__atomic_load_n(reinterpret_cast<unsigned int*>(addr), __ATOMIC_RELAXED));
+#endif  // __has_builtin(__hip_atomic_load)
+  float expected, temp;
+  do {
+    temp = expected = old_val;
+#if __has_builtin(__hip_atomic_compare_exchange_strong)
+    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
+                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else  // !__has_builtin(__hip_atomic_compare_exchange_strong)
+    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
+                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+#endif  // __has_builtin(__hip_atomic_compare_exchange_strong)
+    old_val = expected;
+  } while (__float_as_uint(temp) != __float_as_uint(old_val));
+  return old_val;
+#elif defined(__gfx90a__)
+  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
+  // atomics will produce safe CAS loops, but are otherwise not different than
+  // agent-scope atomics. This logic is only applicable for gfx90a, and should
+  // not be assumed on other architectures.
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+#elif __has_builtin(__hip_atomic_fetch_add)
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else
+  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
+#endif
+}
+
+/**
+ * @brief Safe floating point rmw atomic max.
+ *
+ * Performs a relaxed read-modify-write floating point atomic max with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is replaced by \p val if greater.
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to value to be updated
+ * @param [in] val Value used to update the value at \p addr.
+ * @return Original value contained in \p addr.
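+ *
+ *  A minimal usage sketch (illustrative only; `global_max` is a hypothetical
+ *  float in device memory):
+ *  @code
+ *  float prev = safeAtomicMax(&global_max, my_val);  // IEEE-conformant on all targets
+ *  @endcode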
+ */
+__device__ inline float safeAtomicMax(float* addr, float val) {
+  #if __has_builtin(__hip_atomic_load) && \
+      __has_builtin(__hip_atomic_compare_exchange_strong)
+  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  bool done = false;
+  while (!done && value < val) {
+    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+  return value;
+  #else
+  unsigned int *uaddr = (unsigned int *)addr;
+  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+  bool done = false;
+  while (!done && __uint_as_float(value) < val) {
+    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+  }
+  return __uint_as_float(value);
+  #endif
+}
+
+/**
+ * @brief Safe floating point rmw atomic min.
+ *
+ * Performs a relaxed read-modify-write floating point atomic min with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is replaced by \p val if lesser.
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to value to be updated
+ * @param [in] val Value used to update the value at \p addr.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline float safeAtomicMin(float* addr, float val) {
+  #if __has_builtin(__hip_atomic_load) && \
+      __has_builtin(__hip_atomic_compare_exchange_strong)
+  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  bool done = false;
+  while (!done && value > val) {
+    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+  return value;
+  #else
+  unsigned int *uaddr = (unsigned int *)addr;
+  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+  bool done = false;
+  while (!done && __uint_as_float(value) > val) {
+    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+  }
+  return __uint_as_float(value);
+  #endif
+}
+
+/**
+ * @brief Safe double precision rmw atomic add.
+ *
+ * Performs a relaxed read-modify-write double precision atomic add with
+ * device memory scope. The original value at \p addr is returned and
+ * the value of \p addr is updated to have the original value plus \p value
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to value to be incremented by \p value.
+ * @param [in] value Value by which \p addr is to be incremented.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline double safeAtomicAdd(double* addr, double value) {
+#if defined(__gfx90a__) && __has_builtin(__hip_atomic_fetch_add)
+  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
+  // atomics will produce safe CAS loops, but are otherwise not different than
+  // agent-scope atomics. This logic is only applicable for gfx90a, and should
+  // not be assumed on other architectures.
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+#elif defined(__gfx90a__)
+  // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to
+  // force a CAS loop here.
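+  // The loop below snapshots the current value, computes old + value, and
+  // retries the compare-exchange until no other thread has raced in between.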
+  double old_val;
+#if __has_builtin(__hip_atomic_load)
+  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else  // !__has_builtin(__hip_atomic_load)
+  old_val = __longlong_as_double(__atomic_load_n(reinterpret_cast<unsigned long long*>(addr), __ATOMIC_RELAXED));
+#endif  // __has_builtin(__hip_atomic_load)
+  double expected, temp;
+  do {
+    temp = expected = old_val;
+#if __has_builtin(__hip_atomic_compare_exchange_strong)
+    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
+                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else  // !__has_builtin(__hip_atomic_compare_exchange_strong)
+    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
+                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+#endif  // __has_builtin(__hip_atomic_compare_exchange_strong)
+    old_val = expected;
+  } while (__double_as_longlong(temp) != __double_as_longlong(old_val));
+  return old_val;
+#else  // !defined(__gfx90a__)
+#if __has_builtin(__hip_atomic_fetch_add)
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else  // !__has_builtin(__hip_atomic_fetch_add)
+  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
+#endif  // __has_builtin(__hip_atomic_fetch_add)
+#endif
+}
+
+/**
+ * @brief Safe double precision rmw atomic max.
+ *
+ * Performs a relaxed read-modify-write double precision atomic max with
+ * device memory scope. The original value at \p addr is returned and
+ * the value of \p addr is updated with \p val if greater.
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to value to be updated.
+ * @param [in] val Value used to update the contents at \p addr
+ * @return Original value contained at \p addr.
+ */
+__device__ inline double safeAtomicMax(double* addr, double val) {
+  #if __has_builtin(__builtin_amdgcn_is_private)
+  if (__builtin_amdgcn_is_private(
+          (const __attribute__((address_space(0))) void*)addr)) {
+    double old = *addr;
+    *addr = __builtin_fmax(old, val);
+    return old;
+  } else {
+  #endif
+  #if __has_builtin(__hip_atomic_load) && \
+      __has_builtin(__hip_atomic_compare_exchange_strong)
+  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  bool done = false;
+  while (!done && value < val) {
+    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+  return value;
+  #else
+  unsigned long long *uaddr = (unsigned long long *)addr;
+  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+  bool done = false;
+  while (!done && __longlong_as_double(value) < val) {
+    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+  }
+  return __longlong_as_double(value);
+  #endif
+  #if __has_builtin(__builtin_amdgcn_is_private)
+  }
+  #endif
+}
+
+/**
+ * @brief Safe double precision rmw atomic min.
+ *
+ * Performs a relaxed read-modify-write double precision atomic min with
+ * device memory scope. The original value at \p addr is returned and
+ * the value of \p addr is updated with \p val if lesser.
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to value to be updated.
+ * @param [in] val Value used to update the contents at \p addr
+ * @return Original value contained at \p addr.
+ */
+__device__ inline double safeAtomicMin(double* addr, double val) {
+  #if __has_builtin(__builtin_amdgcn_is_private)
+  if (__builtin_amdgcn_is_private(
+          (const __attribute__((address_space(0))) void*)addr)) {
+    double old = *addr;
+    *addr = __builtin_fmin(old, val);
+    return old;
+  } else {
+  #endif
+  #if __has_builtin(__hip_atomic_load) && \
+      __has_builtin(__hip_atomic_compare_exchange_strong)
+  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  bool done = false;
+  while (!done && value > val) {
+    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+  return value;
+  #else
+  unsigned long long *uaddr = (unsigned long long *)addr;
+  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+  bool done = false;
+  while (!done && __longlong_as_double(value) > val) {
+    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+  }
+  return __longlong_as_double(value);
+  #endif
+  #if __has_builtin(__builtin_amdgcn_is_private)
+  }
+  #endif
+}
+
+#endif
+/*
+Copyright (c) 2015 - Present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if !defined(__HIPCC_RTC__)
+#include "amd_device_functions.h"
+#endif
+
+#if __has_builtin(__hip_atomic_compare_exchange_strong)
+
+template<bool cond, typename T, typename F> struct Cond_t;
+
+template<typename T, typename F> struct Cond_t<true, T, F> { using type = T; };
+template<typename T, typename F> struct Cond_t<false, T, F> { using type = F; };
+
+#if !__HIP_DEVICE_COMPILE__
+//TODO: Remove this after compiler pre-defines the following Macros.
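+// Fallback definitions of the memory-scope constants consumed by the
+// __hip_atomic_* builtins; the numeric values must match the compiler's.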
+#define __HIP_MEMORY_SCOPE_SINGLETHREAD 1
+#define __HIP_MEMORY_SCOPE_WAVEFRONT 2
+#define __HIP_MEMORY_SCOPE_WORKGROUP 3
+#define __HIP_MEMORY_SCOPE_AGENT 4
+#define __HIP_MEMORY_SCOPE_SYSTEM 5
+#endif
+
+#if !defined(__HIPCC_RTC__)
+#include "amd_hip_unsafe_atomics.h"
+#endif
+
+// Atomic expanders
+template<
+    int mem_order = __ATOMIC_SEQ_CST,
+    int mem_scope = __HIP_MEMORY_SCOPE_SYSTEM,
+    typename T,
+    typename Op,
+    typename F>
+inline
+__attribute__((always_inline, device))
+T hip_cas_expander(T* p, T x, Op op, F f) noexcept
+{
+    using FP = __attribute__((address_space(0))) const void*;
+
+    __device__
+    extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");
+
+    if (is_shared_workaround((FP)p))
+        return f();
+
+    using U = typename Cond_t<
+        sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;
+
+    auto q = reinterpret_cast<U*>(p);
+
+    U tmp0{__hip_atomic_load(q, mem_order, mem_scope)};
+    U tmp1;
+    do {
+        tmp1 = tmp0;
+
+        op(reinterpret_cast<T&>(tmp1), x);
+    } while (!__hip_atomic_compare_exchange_strong(q, &tmp0, tmp1, mem_order,
+                                                   mem_order, mem_scope));
+
+    return reinterpret_cast<T&>(tmp0);
+}
+
+template<
+    int mem_order = __ATOMIC_SEQ_CST,
+    int mem_scope = __HIP_MEMORY_SCOPE_SYSTEM,
+    typename T,
+    typename Cmp,
+    typename F>
+inline
+__attribute__((always_inline, device))
+T hip_cas_extrema_expander(T* p, T x, Cmp cmp, F f) noexcept
+{
+    using FP = __attribute__((address_space(0))) const void*;
+
+    __device__
+    extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");
+
+    if (is_shared_workaround((FP)p))
+        return f();
+
+    using U = typename Cond_t<
+        sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;
+
+    auto q = reinterpret_cast<U*>(p);
+
+    U tmp{__hip_atomic_load(q, mem_order, mem_scope)};
+    while (cmp(x, reinterpret_cast<T&>(tmp)) &&
+           !__hip_atomic_compare_exchange_strong(q, &tmp, x, mem_order, mem_order,
+                                                 mem_scope));
+
+    return reinterpret_cast<T&>(tmp);
+}
+
+__device__
+inline
+int atomicCAS(int* address, int compare, int val) {
+  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_AGENT);
+  return compare;
+}
+
+__device__
+inline
+int atomicCAS_system(int* address, int compare, int val) {
+  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_SYSTEM);
+  return compare;
+}
+
+__device__
+inline
+unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val) {
+  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_AGENT);
+  return compare;
+}
+
+__device__
+inline
+unsigned int atomicCAS_system(unsigned int* address, unsigned int compare, unsigned int val) {
+  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_SYSTEM);
+  return compare;
+}
+
+__device__
+inline
+unsigned long atomicCAS(unsigned long* address, unsigned long compare, unsigned long val) {
+  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_AGENT);
+  return compare;
+}
+
+__device__
+inline
+unsigned long atomicCAS_system(unsigned long* address, unsigned long compare, unsigned long val) {
+  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                       __HIP_MEMORY_SCOPE_SYSTEM);
+  return compare;
+}
+
+__device__
+inline
+unsigned long long atomicCAS(unsigned long long* address,
unsigned long long compare, + unsigned long long val) { + __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + return compare; +} + +__device__ +inline +unsigned long long atomicCAS_system(unsigned long long* address, unsigned long long compare, + unsigned long long val) { + __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + return compare; +} + +__device__ +inline +float atomicCAS(float* address, float compare, float val) { + __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + return compare; +} + +__device__ +inline +float atomicCAS_system(float* address, float compare, float val) { + __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + return compare; +} + +__device__ +inline +double atomicCAS(double* address, double compare, double val) { + __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + return compare; +} + +__device__ +inline +double atomicCAS_system(double* address, double compare, double val) { + __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + return compare; +} + +__device__ +inline +int atomicAdd(int* address, int val) { + return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +int atomicAdd_system(int* address, int val) { + return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned int atomicAdd(unsigned int* address, unsigned int val) { + return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned int atomicAdd_system(unsigned int* address, unsigned int val) { + return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned long atomicAdd(unsigned long* address, unsigned long val) { + return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned long atomicAdd_system(unsigned long* address, unsigned long val) { + return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned long long atomicAdd(unsigned long long* address, unsigned long long val) { + return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned long long atomicAdd_system(unsigned long long* address, unsigned long long val) { + return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +float atomicAdd(float* address, float val) { +#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) + return unsafeAtomicAdd(address, val); +#else + return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif +} + +__device__ +inline +float atomicAdd_system(float* address, float val) { + return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +#if !defined(__HIPCC_RTC__) +DEPRECATED("use atomicAdd instead") +#endif // !defined(__HIPCC_RTC__) +__device__ +inline +void atomicAddNoRet(float* address, float val) +{ + 
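+    // Forwards to the OCKL no-return add; not returning the old value can let
+    // the hardware use a cheaper atomic on some targets.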
__ockl_atomic_add_noret_f32(address, val); +} + +__device__ +inline +double atomicAdd(double* address, double val) { +#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) + return unsafeAtomicAdd(address, val); +#else + return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif +} + +__device__ +inline +double atomicAdd_system(double* address, double val) { + return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +int atomicSub(int* address, int val) { + return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +int atomicSub_system(int* address, int val) { + return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned int atomicSub(unsigned int* address, unsigned int val) { + return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned int atomicSub_system(unsigned int* address, unsigned int val) { + return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned long atomicSub(unsigned long* address, unsigned long val) { + return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned long atomicSub_system(unsigned long* address, unsigned long val) { + return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned long long atomicSub(unsigned long long* address, unsigned long long val) { + return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned long long atomicSub_system(unsigned long long* address, unsigned long long val) { + return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +float atomicSub(float* address, float val) { +#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) + return unsafeAtomicAdd(address, -val); +#else + return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif +} + +__device__ +inline +float atomicSub_system(float* address, float val) { + return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +double atomicSub(double* address, double val) { +#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) + return unsafeAtomicAdd(address, -val); +#else + return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif +} + +__device__ +inline +double atomicSub_system(double* address, double val) { + return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +int atomicExch(int* address, int val) { + return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +int atomicExch_system(int* address, int val) { + return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned int atomicExch(unsigned int* address, unsigned int val) { + return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned int atomicExch_system(unsigned int* address, unsigned int val) { + return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ 
+inline +unsigned long atomicExch(unsigned long* address, unsigned long val) { + return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned long atomicExch_system(unsigned long* address, unsigned long val) { + return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned long long atomicExch(unsigned long long* address, unsigned long long val) { + return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned long long atomicExch_system(unsigned long long* address, unsigned long long val) { + return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +float atomicExch(float* address, float val) { + return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +float atomicExch_system(float* address, float val) { + return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +double atomicExch(double* address, double val) { + return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +double atomicExch_system(double* address, double val) { + return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +int atomicMin(int* address, int val) { +#if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, val, [](int x, int y) { return x < y; }, [=]() { + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +int atomicMin_system(int* address, int val) { +#if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, val, [](int x, int y) { return x < y; }, [=]() { + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +unsigned int atomicMin(unsigned int* address, unsigned int val) { +#if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, val, [](unsigned int x, unsigned int y) { return x < y; }, [=]() { + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ + +} + +__device__ +inline +unsigned int atomicMin_system(unsigned int* address, unsigned int val) { +#if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, val, [](unsigned int x, unsigned int y) { return x < y; }, [=]() { + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long long atomicMin(unsigned long* address, unsigned long val) { +#if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + 
address, + val, + [](unsigned long x, unsigned long y) { return x < y; }, + [=]() { + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long atomicMin_system(unsigned long* address, unsigned long val) { +#if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, + val, + [](unsigned long x, unsigned long y) { return x < y; }, + [=]() { + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long long atomicMin(unsigned long long* address, unsigned long long val) { +#if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, + val, + [](unsigned long long x, unsigned long long y) { return x < y; }, + [=]() { + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long long atomicMin_system(unsigned long long* address, unsigned long long val) { +#if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, + val, + [](unsigned long long x, unsigned long long y) { return x < y; }, + [=]() { + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +long long atomicMin(long long* address, long long val) { +#if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, val, [](long long x, long long y) { return x < y; }, + [=]() { + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +long long atomicMin_system(long long* address, long long val) { +#if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, val, [](long long x, long long y) { return x < y; }, + [=]() { + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +float atomicMin(float* addr, float val) { +#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) + return unsafeAtomicMin(addr, val); +#else + #if __has_builtin(__hip_atomic_load) && \ + __has_builtin(__hip_atomic_compare_exchange_strong) + float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + bool done = false; + while (!done && value > val) { + done = __hip_atomic_compare_exchange_strong(addr, &value, val, + __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } + return value; + #else + unsigned int *uaddr = (unsigned int *)addr; + unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED); + bool done = false; + while (!done && 
__uint_as_float(value) > val) {
+      done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
+                                         __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+    }
+    return __uint_as_float(value);
+  #endif
+#endif
+}
+
+__device__
+inline
+float atomicMin_system(float* address, float val) {
+  unsigned int* uaddr { reinterpret_cast<unsigned int*>(address) };
+  #if __has_builtin(__hip_atomic_load)
+  unsigned int tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
+  #else
+  unsigned int tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
+  #endif
+  float value = __uint_as_float(tmp);
+
+  while (val < value) {
+    value = atomicCAS_system(address, value, val);
+  }
+
+  return value;
+}
+
+__device__
+inline
+double atomicMin(double* addr, double val) {
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
+  return unsafeAtomicMin(addr, val);
+#else
+  #if __has_builtin(__hip_atomic_load) && \
+      __has_builtin(__hip_atomic_compare_exchange_strong)
+    double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    bool done = false;
+    while (!done && value > val) {
+      done = __hip_atomic_compare_exchange_strong(addr, &value, val,
+              __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    }
+    return value;
+  #else
+    unsigned long long *uaddr = (unsigned long long *)addr;
+    unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+    bool done = false;
+    while (!done && __longlong_as_double(value) > val) {
+      done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
+                                         __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+    }
+    return __longlong_as_double(value);
+  #endif
+#endif
+}
+
+__device__
+inline
+double atomicMin_system(double* address, double val) {
+  unsigned long long* uaddr { reinterpret_cast<unsigned long long*>(address) };
+  #if __has_builtin(__hip_atomic_load)
+  unsigned long long tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
+  #else
+  unsigned long long tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
+  #endif
+  double value = __longlong_as_double(tmp);
+
+  while (val < value) {
+    value = atomicCAS_system(address, value, val);
+  }
+
+  return value;
+}
+
+__device__
+inline
+int atomicMax(int* address, int val) {
+#if defined(__gfx941__)
+  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
+      address, val, [](int x, int y) { return y < x; }, [=]() {
+        return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
+                                      __HIP_MEMORY_SCOPE_AGENT);
+      });
+#else
+  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#endif // __gfx941__
+}
+
+__device__
+inline
+int atomicMax_system(int* address, int val) {
+#if defined(__gfx941__)
+  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
+      address, val, [](int x, int y) { return y < x; }, [=]() {
+        return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
+                                      __HIP_MEMORY_SCOPE_SYSTEM);
+      });
+#else
+  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+#endif // __gfx941__
+}
+
+__device__
+inline
+unsigned int atomicMax(unsigned int* address, unsigned int val) {
+#if defined(__gfx941__)
+  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
+      address, val, [](unsigned int x, unsigned int y) { return y < x; }, [=]() {
+        return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
+                                      __HIP_MEMORY_SCOPE_AGENT);
+      });
+#else
+  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#endif // __gfx941__
+}
+
+__device__
+inline
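+// NOTE (illustrative sketch, not part of the generated header): the
+// floating-point atomicMin/atomicMax fallbacks in this section retry a
+// compare-exchange on the value's bit pattern until no smaller (resp.
+// larger) value remains, then return the last observed value. A minimal
+// standalone rendering of the same loop (hypothetical helper name):
+#if 0
+__device__ float atomic_min_float_sketch(float* addr, float val) {
+  unsigned int* u = reinterpret_cast<unsigned int*>(addr);
+  unsigned int old = __atomic_load_n(u, __ATOMIC_RELAXED);
+  // stop once the stored value is already <= val, or our CAS wins;
+  // a failed CAS refreshes `old` with the current contents
+  while (__uint_as_float(old) > val &&
+         !__atomic_compare_exchange_n(u, &old, __float_as_uint(val), false,
+                                      __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
+  }
+  return __uint_as_float(old); // value seen before the final exchange
+}
+#endif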
+unsigned int atomicMax_system(unsigned int* address, unsigned int val) { +#if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, val, [](unsigned int x, unsigned int y) { return y < x; }, [=]() { + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long atomicMax(unsigned long* address, unsigned long val) { +#if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, + val, + [](unsigned long x, unsigned long y) { return y < x; }, + [=]() { + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long atomicMax_system(unsigned long* address, unsigned long val) { +#if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, + val, + [](unsigned long x, unsigned long y) { return y < x; }, + [=]() { + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long long atomicMax(unsigned long long* address, unsigned long long val) { +#if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, + val, + [](unsigned long long x, unsigned long long y) { return y < x; }, + [=]() { + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long long atomicMax_system(unsigned long long* address, unsigned long long val) { +#if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, + val, + [](unsigned long long x, unsigned long long y) { return y < x; }, + [=]() { + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +long long atomicMax(long long* address, long long val) { + #if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, val, [](long long x, long long y) { return y < x; }, + [=]() { + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +long long atomicMax_system(long long* address, long long val) { +#if defined(__gfx941__) + return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, val, [](long long x, long long y) { return y < x; }, + [=]() { + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ 
+inline
+float atomicMax(float* addr, float val) {
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
+  return unsafeAtomicMax(addr, val);
+#else
+  #if __has_builtin(__hip_atomic_load) && \
+      __has_builtin(__hip_atomic_compare_exchange_strong)
+    float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    bool done = false;
+    while (!done && value < val) {
+      done = __hip_atomic_compare_exchange_strong(addr, &value, val,
+              __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    }
+    return value;
+  #else
+    unsigned int *uaddr = (unsigned int *)addr;
+    unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+    bool done = false;
+    while (!done && __uint_as_float(value) < val) {
+      done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
+                                         __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+    }
+    return __uint_as_float(value);
+  #endif
+#endif
+}
+
+__device__
+inline
+float atomicMax_system(float* address, float val) {
+  unsigned int* uaddr { reinterpret_cast<unsigned int*>(address) };
+  #if __has_builtin(__hip_atomic_load)
+  unsigned int tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
+  #else
+  unsigned int tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
+  #endif
+  float value = __uint_as_float(tmp);
+
+  while (value < val) {
+    value = atomicCAS_system(address, value, val);
+  }
+
+  return value;
+}
+
+__device__
+inline
+double atomicMax(double* addr, double val) {
+#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
+  return unsafeAtomicMax(addr, val);
+#else
+  #if __has_builtin(__hip_atomic_load) && \
+      __has_builtin(__hip_atomic_compare_exchange_strong)
+    double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    bool done = false;
+    while (!done && value < val) {
+      done = __hip_atomic_compare_exchange_strong(addr, &value, val,
+              __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    }
+    return value;
+  #else
+    unsigned long long *uaddr = (unsigned long long *)addr;
+    unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+    bool done = false;
+    while (!done && __longlong_as_double(value) < val) {
+      done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
+                                         __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+    }
+    return __longlong_as_double(value);
+  #endif
+#endif
+}
+
+__device__
+inline
+double atomicMax_system(double* address, double val) {
+  unsigned long long* uaddr { reinterpret_cast<unsigned long long*>(address) };
+  #if __has_builtin(__hip_atomic_load)
+  unsigned long long tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
+  #else
+  unsigned long long tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
+  #endif
+  double value = __longlong_as_double(tmp);
+
+  while (value < val) {
+    value = atomicCAS_system(address, value, val);
+  }
+
+  return value;
+}
+
+__device__
+inline
+unsigned int atomicInc(unsigned int* address, unsigned int val)
+{
+#if defined(__gfx941__)
+  __device__
+  extern
+  unsigned int __builtin_amdgcn_atomic_inc(
+      unsigned int*,
+      unsigned int,
+      unsigned int,
+      unsigned int,
+      bool) __asm("llvm.amdgcn.atomic.inc.i32.p0i32");
+
+  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
+      address,
+      val,
+      [](unsigned int& x, unsigned int y) { x = (x >= y) ?
0 : (x + 1); }, + [=]() { + return + __builtin_amdgcn_atomic_inc(address, val, __ATOMIC_RELAXED, 1, false); + }); +#else + return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent"); +#endif // __gfx941__ + +} + +__device__ +inline +unsigned int atomicDec(unsigned int* address, unsigned int val) +{ +#if defined(__gfx941__) + __device__ + extern + unsigned int __builtin_amdgcn_atomic_dec( + unsigned int*, + unsigned int, + unsigned int, + unsigned int, + bool) __asm("llvm.amdgcn.atomic.dec.i32.p0i32"); + + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, + val, + [](unsigned int& x, unsigned int y) { x = (!x || x > y) ? y : (x - 1); }, + [=]() { + return + __builtin_amdgcn_atomic_dec(address, val, __ATOMIC_RELAXED, 1, false); + }); +#else + return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent"); +#endif // __gfx941__ + +} + +__device__ +inline +int atomicAnd(int* address, int val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, val, [](int& x, int y) { x &= y; }, [=]() { + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +int atomicAnd_system(int* address, int val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, val, [](int& x, int y) { x &= y; }, [=]() { + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +unsigned int atomicAnd(unsigned int* address, unsigned int val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, val, [](unsigned int& x, unsigned int y) { x &= y; }, [=]() { + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +unsigned int atomicAnd_system(unsigned int* address, unsigned int val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, val, [](unsigned int& x, unsigned int y) { x &= y; }, [=]() { + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long atomicAnd(unsigned long* address, unsigned long val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, val, [](unsigned long& x, unsigned long y) { x &= y; }, [=]() { + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long atomicAnd_system(unsigned long* address, unsigned long val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, val, [](unsigned long& x, unsigned long y) { x &= y; }, [=]() { + return __hip_atomic_fetch_and(address, 
val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long long atomicAnd(unsigned long long* address, unsigned long long val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, + val, + [](unsigned long long& x, unsigned long long y) { x &= y; }, + [=]() { + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long long atomicAnd_system(unsigned long long* address, unsigned long long val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, + val, + [](unsigned long long& x, unsigned long long y) { x &= y; }, + [=]() { + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +int atomicOr(int* address, int val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, val, [](int& x, int y) { x |= y; }, [=]() { + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +int atomicOr_system(int* address, int val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, val, [](int& x, int y) { x |= y; }, [=]() { + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +unsigned int atomicOr(unsigned int* address, unsigned int val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, val, [](unsigned int& x, unsigned int y) { x |= y; }, [=]() { + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +unsigned int atomicOr_system(unsigned int* address, unsigned int val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, val, [](unsigned int& x, unsigned int y) { x |= y; }, [=]() { + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long atomicOr(unsigned long* address, unsigned long val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, val, [](unsigned long& x, unsigned long y) { x |= y; }, [=]() { + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + 
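+// NOTE (illustrative sketch, not part of the generated header): on gfx941
+// the integer atomics above funnel through hip_cas_expander, which applies
+// the update lambda inside a compare-exchange retry loop rather than
+// issuing the native fetch-op. A generic sketch of that expansion, with a
+// hypothetical name and agent scope assumed:
+#if 0
+template<typename T, typename Op>
+__device__ T cas_expand_sketch(T* address, T val, Op op) {
+  T expected = __hip_atomic_load(address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  T desired;
+  do {
+    desired = expected;
+    op(desired, val); // e.g. [](T& x, T y) { x |= y; } for atomicOr
+  } while (!__hip_atomic_compare_exchange_strong(
+      address, &expected, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+      __HIP_MEMORY_SCOPE_AGENT));
+  return expected; // previous value, matching the atomic*() return convention
+}
+#endif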
+__device__ +inline +unsigned long atomicOr_system(unsigned long* address, unsigned long val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, val, [](unsigned long& x, unsigned long y) { x |= y; }, [=]() { + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long long atomicOr(unsigned long long* address, unsigned long long val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, + val, + [](unsigned long long& x, unsigned long long y) { x |= y; }, + [=]() { + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long long atomicOr_system(unsigned long long* address, unsigned long long val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, + val, + [](unsigned long long& x, unsigned long long y) { x |= y; }, + [=]() { + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +int atomicXor(int* address, int val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, val, [](int& x, int y) { x ^= y; }, [=]() { + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +int atomicXor_system(int* address, int val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, val, [](int& x, int y) { x ^= y; }, [=]() { + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +unsigned int atomicXor(unsigned int* address, unsigned int val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, val, [](unsigned int& x, unsigned int y) { x ^= y; }, [=]() { + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +unsigned int atomicXor_system(unsigned int* address, unsigned int val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, val, [](unsigned int& x, unsigned int y) { x ^= y; }, [=]() { + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long atomicXor(unsigned long* address, unsigned long val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, 
__HIP_MEMORY_SCOPE_AGENT>( + address, val, [](unsigned long& x, unsigned long y) { x ^= y; }, [=]() { + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long atomicXor_system(unsigned long* address, unsigned long val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>( + address, val, [](unsigned long& x, unsigned long y) { x ^= y; }, [=]() { + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_SYSTEM); + }); +#else + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long long atomicXor(unsigned long long* address, unsigned long long val) { +#if defined(__gfx941__) + return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( + address, + val, + [](unsigned long long& x, unsigned long long y) { x ^= y; }, + [=]() { + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT); + }); +#else + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#endif // __gfx941__ +} + +__device__ +inline +unsigned long long atomicXor_system(unsigned long long* address, unsigned long long val) { + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +#else // __hip_atomic_compare_exchange_strong + +__device__ +inline +int atomicCAS(int* address, int compare, int val) +{ + __atomic_compare_exchange_n( + address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); + + return compare; +} +__device__ +inline +unsigned int atomicCAS( + unsigned int* address, unsigned int compare, unsigned int val) +{ + __atomic_compare_exchange_n( + address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); + + return compare; +} +__device__ +inline +unsigned long long atomicCAS( + unsigned long long* address, + unsigned long long compare, + unsigned long long val) +{ + __atomic_compare_exchange_n( + address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); + + return compare; +} + +__device__ +inline +int atomicAdd(int* address, int val) +{ + return __atomic_fetch_add(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned int atomicAdd(unsigned int* address, unsigned int val) +{ + return __atomic_fetch_add(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned long long atomicAdd( + unsigned long long* address, unsigned long long val) +{ + return __atomic_fetch_add(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +float atomicAdd(float* address, float val) +{ +#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) + return unsafeAtomicAdd(address, val); +#else + return __atomic_fetch_add(address, val, __ATOMIC_RELAXED); +#endif +} + +#if !defined(__HIPCC_RTC__) +DEPRECATED("use atomicAdd instead") +#endif // !defined(__HIPCC_RTC__) +__device__ +inline +void atomicAddNoRet(float* address, float val) +{ + __ockl_atomic_add_noret_f32(address, val); +} + +__device__ +inline +double atomicAdd(double* address, double val) +{ +#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) + return unsafeAtomicAdd(address, val); +#else + return __atomic_fetch_add(address, val, __ATOMIC_RELAXED); +#endif +} + +__device__ +inline +int atomicSub(int* address, int val) +{ + return __atomic_fetch_sub(address, val, 
__ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned int atomicSub(unsigned int* address, unsigned int val)
+{
+    return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
+}
+
+__device__
+inline
+int atomicExch(int* address, int val)
+{
+    return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned int atomicExch(unsigned int* address, unsigned int val)
+{
+    return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned long long atomicExch(unsigned long long* address, unsigned long long val)
+{
+    return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+float atomicExch(float* address, float val)
+{
+    return __uint_as_float(__atomic_exchange_n(
+        reinterpret_cast<unsigned int*>(address),
+        __float_as_uint(val),
+        __ATOMIC_RELAXED));
+}
+
+__device__
+inline
+int atomicMin(int* address, int val)
+{
+    return __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned int atomicMin(unsigned int* address, unsigned int val)
+{
+    return __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned long long atomicMin(
+    unsigned long long* address, unsigned long long val)
+{
+    unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
+    while (val < tmp) {
+        const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
+
+        if (tmp1 != tmp) { tmp = tmp1; continue; }
+
+        tmp = atomicCAS(address, tmp, val);
+    }
+
+    return tmp;
+}
+__device__ inline long long atomicMin(long long* address, long long val) {
+    long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
+    while (val < tmp) {
+        const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
+
+        if (tmp1 != tmp) {
+            tmp = tmp1;
+            continue;
+        }
+
+        tmp = atomicCAS(address, tmp, val);
+    }
+    return tmp;
+}
+
+__device__
+inline
+int atomicMax(int* address, int val)
+{
+    return __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned int atomicMax(unsigned int* address, unsigned int val)
+{
+    return __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned long long atomicMax(
+    unsigned long long* address, unsigned long long val)
+{
+    unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
+    while (tmp < val) {
+        const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
+
+        if (tmp1 != tmp) { tmp = tmp1; continue; }
+
+        tmp = atomicCAS(address, tmp, val);
+    }
+
+    return tmp;
+}
+__device__ inline long long atomicMax(long long* address, long long val) {
+    long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
+    while (tmp < val) {
+        const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
+
+        if (tmp1 != tmp) {
+            tmp = tmp1;
+            continue;
+        }
+
+        tmp = atomicCAS(address, tmp, val);
+    }
+    return tmp;
+}
+
+__device__
+inline
+unsigned int atomicInc(unsigned int* address, unsigned int val)
+{
+    return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
+}
+
+__device__
+inline
+unsigned int atomicDec(unsigned int* address, unsigned int val)
+{
+    return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
+}
+
+__device__
+inline
+int atomicAnd(int* address, int val)
+{
+    return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned int atomicAnd(unsigned int* address, unsigned int val)
+{
+    return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned long long atomicAnd(
+    unsigned long long* address, unsigned long long val)
+{
+    return
__atomic_fetch_and(address, val, __ATOMIC_RELAXED); +} + +__device__ +inline +int atomicOr(int* address, int val) +{ + return __atomic_fetch_or(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned int atomicOr(unsigned int* address, unsigned int val) +{ + return __atomic_fetch_or(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned long long atomicOr( + unsigned long long* address, unsigned long long val) +{ + return __atomic_fetch_or(address, val, __ATOMIC_RELAXED); +} + +__device__ +inline +int atomicXor(int* address, int val) +{ + return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned int atomicXor(unsigned int* address, unsigned int val) +{ + return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned long long atomicXor( + unsigned long long* address, unsigned long long val) +{ + return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED); +} + +#endif // __hip_atomic_compare_exchange_strong +/* +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+
+#pragma once
+
+#if !defined(__HIPCC_RTC__)
+#include "host_defines.h"
+#include "amd_hip_vector_types.h" // For Native_vec_
+#endif
+
+#if defined(__cplusplus)
+    extern "C" {
+#endif
+
+// DOT FUNCTIONS
+#if defined(__clang__) && defined(__HIP__)
+__device__
+__attribute__((const))
+int __ockl_sdot2(
+    HIP_vector_base<short, 2>::Native_vec_,
+    HIP_vector_base<short, 2>::Native_vec_,
+    int, bool);
+
+__device__
+__attribute__((const))
+unsigned int __ockl_udot2(
+    HIP_vector_base<unsigned short, 2>::Native_vec_,
+    HIP_vector_base<unsigned short, 2>::Native_vec_,
+    unsigned int, bool);
+
+__device__
+__attribute__((const))
+int __ockl_sdot4(
+    HIP_vector_base<char, 4>::Native_vec_,
+    HIP_vector_base<char, 4>::Native_vec_,
+    int, bool);
+
+__device__
+__attribute__((const))
+unsigned int __ockl_udot4(
+    HIP_vector_base<unsigned char, 4>::Native_vec_,
+    HIP_vector_base<unsigned char, 4>::Native_vec_,
+    unsigned int, bool);
+
+__device__
+__attribute__((const))
+int __ockl_sdot8(int, int, int, bool);
+
+__device__
+__attribute__((const))
+unsigned int __ockl_udot8(unsigned int, unsigned int, unsigned int, bool);
+#endif
+
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+// BEGIN FLOAT
+__device__
+__attribute__((const))
+float __ocml_acos_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_acosh_f32(float);
+__device__
+__attribute__((const))
+float __ocml_asin_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_asinh_f32(float);
+__device__
+__attribute__((const))
+float __ocml_atan2_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_atan_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_atanh_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_cbrt_f32(float);
+__device__
+__attribute__((const))
+float __ocml_ceil_f32(float);
+__device__
+__attribute__((const))
+__device__
+float __ocml_copysign_f32(float, float);
+__device__
+float __ocml_cos_f32(float);
+__device__
+float __ocml_native_cos_f32(float);
+__device__
+__attribute__((pure))
+__device__
+float __ocml_cosh_f32(float);
+__device__
+float __ocml_cospi_f32(float);
+__device__
+float __ocml_i0_f32(float);
+__device__
+float __ocml_i1_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erfc_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erfcinv_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erfcx_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erf_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erfinv_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_exp10_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_native_exp10_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_exp2_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_exp_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_native_exp_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_expm1_f32(float);
+__device__
+__attribute__((const))
+float __ocml_fabs_f32(float);
+__device__
+__attribute__((const))
+float __ocml_fdim_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_floor_f32(float);
+__device__
+__attribute__((const))
+float __ocml_fma_f32(float, float, float);
+__device__
+__attribute__((const))
+float __ocml_fmax_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_fmin_f32(float, float);
+__device__
+__attribute__((const))
+__device__
+float __ocml_fmod_f32(float, float);
+__device__
+float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);
+__device__
+__attribute__((const))
+float __ocml_hypot_f32(float, float);
+__device__
+__attribute__((const))
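+// NOTE (illustrative sketch, not part of the generated header): the
+// __ockl_sdot4 declaration above computes a packed 8-bit dot product with
+// accumulator, roughly acc + sum_k(a[k] * b[k]); the trailing bool selects
+// the saturating variant. Hypothetical usage through HIP's char4 wrapper:
+#if 0
+__device__ int dot4_sketch(char4 a, char4 b, int acc) {
+  return __ockl_sdot4(a.data, b.data, acc, false); // non-saturating
+}
+#endif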
+int __ocml_ilogb_f32(float); +__device__ +__attribute__((const)) +int __ocml_isfinite_f32(float); +__device__ +__attribute__((const)) +int __ocml_isinf_f32(float); +__device__ +__attribute__((const)) +int __ocml_isnan_f32(float); +__device__ +float __ocml_j0_f32(float); +__device__ +float __ocml_j1_f32(float); +__device__ +__attribute__((const)) +float __ocml_ldexp_f32(float, int); +__device__ +float __ocml_lgamma_f32(float); +__device__ +__attribute__((pure)) +float __ocml_log10_f32(float); +__device__ +__attribute__((pure)) +float __ocml_native_log10_f32(float); +__device__ +__attribute__((pure)) +float __ocml_log1p_f32(float); +__device__ +__attribute__((pure)) +float __ocml_log2_f32(float); +__device__ +__attribute__((pure)) +float __ocml_native_log2_f32(float); +__device__ +__attribute__((const)) +float __ocml_logb_f32(float); +__device__ +__attribute__((pure)) +float __ocml_log_f32(float); +__device__ +__attribute__((pure)) +float __ocml_native_log_f32(float); +__device__ +float __ocml_modf_f32(float, __attribute__((address_space(5))) float*); +__device__ +__attribute__((const)) +float __ocml_nearbyint_f32(float); +__device__ +__attribute__((const)) +float __ocml_nextafter_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_len3_f32(float, float, float); +__device__ +__attribute__((const)) +float __ocml_len4_f32(float, float, float, float); +__device__ +__attribute__((pure)) +float __ocml_ncdf_f32(float); +__device__ +__attribute__((pure)) +float __ocml_ncdfinv_f32(float); +__device__ +__attribute__((pure)) +float __ocml_pow_f32(float, float); +__device__ +__attribute__((pure)) +float __ocml_pown_f32(float, int); +__device__ +__attribute__((pure)) +float __ocml_rcbrt_f32(float); +__device__ +__attribute__((const)) +float __ocml_remainder_f32(float, float); +__device__ +float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*); +__device__ +__attribute__((const)) +float __ocml_rhypot_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_rint_f32(float); +__device__ +__attribute__((const)) +float __ocml_rlen3_f32(float, float, float); +__device__ +__attribute__((const)) +float __ocml_rlen4_f32(float, float, float, float); +__device__ +__attribute__((const)) +float __ocml_round_f32(float); +__device__ +__attribute__((pure)) +float __ocml_rsqrt_f32(float); +__device__ +__attribute__((const)) +float __ocml_scalb_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_scalbn_f32(float, int); +__device__ +__attribute__((const)) +int __ocml_signbit_f32(float); +__device__ +float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*); +__device__ +float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*); +__device__ +float __ocml_sin_f32(float); +__device__ +float __ocml_native_sin_f32(float); +__device__ +__attribute__((pure)) +float __ocml_sinh_f32(float); +__device__ +float __ocml_sinpi_f32(float); +__device__ +__attribute__((const)) +float __ocml_sqrt_f32(float); +__device__ +__attribute__((const)) +float __ocml_native_sqrt_f32(float); +__device__ +float __ocml_tan_f32(float); +__device__ +__attribute__((pure)) +float __ocml_tanh_f32(float); +__device__ +float __ocml_tgamma_f32(float); +__device__ +__attribute__((const)) +float __ocml_trunc_f32(float); +__device__ +float __ocml_y0_f32(float); +__device__ +float __ocml_y1_f32(float); + +// BEGIN INTRINSICS +__device__ +__attribute__((const)) +float __ocml_add_rte_f32(float, float); +__device__ +__attribute__((const)) +float 
__ocml_add_rtn_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_add_rtp_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_add_rtz_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_sub_rte_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_sub_rtn_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_sub_rtp_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_sub_rtz_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_mul_rte_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_mul_rtn_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_mul_rtp_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_mul_rtz_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_div_rte_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_div_rtn_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_div_rtp_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_div_rtz_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_sqrt_rte_f32(float); +__device__ +__attribute__((const)) +float __ocml_sqrt_rtn_f32(float); +__device__ +__attribute__((const)) +float __ocml_sqrt_rtp_f32(float); +__device__ +__attribute__((const)) +float __ocml_sqrt_rtz_f32(float); +__device__ +__attribute__((const)) +float __ocml_fma_rte_f32(float, float, float); +__device__ +__attribute__((const)) +float __ocml_fma_rtn_f32(float, float, float); +__device__ +__attribute__((const)) +float __ocml_fma_rtp_f32(float, float, float); +__device__ +__attribute__((const)) +float __ocml_fma_rtz_f32(float, float, float); +// END INTRINSICS +// END FLOAT + +// BEGIN DOUBLE +__device__ +__attribute__((const)) +double __ocml_acos_f64(double); +__device__ +__attribute__((pure)) +double __ocml_acosh_f64(double); +__device__ +__attribute__((const)) +double __ocml_asin_f64(double); +__device__ +__attribute__((pure)) +double __ocml_asinh_f64(double); +__device__ +__attribute__((const)) +double __ocml_atan2_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_atan_f64(double); +__device__ +__attribute__((pure)) +double __ocml_atanh_f64(double); +__device__ +__attribute__((pure)) +double __ocml_cbrt_f64(double); +__device__ +__attribute__((const)) +double __ocml_ceil_f64(double); +__device__ +__attribute__((const)) +double __ocml_copysign_f64(double, double); +__device__ +double __ocml_cos_f64(double); +__device__ +__attribute__((pure)) +double __ocml_cosh_f64(double); +__device__ +double __ocml_cospi_f64(double); +__device__ +double __ocml_i0_f64(double); +__device__ +double __ocml_i1_f64(double); +__device__ +__attribute__((pure)) +double __ocml_erfc_f64(double); +__device__ +__attribute__((pure)) +double __ocml_erfcinv_f64(double); +__device__ +__attribute__((pure)) +double __ocml_erfcx_f64(double); +__device__ +__attribute__((pure)) +double __ocml_erf_f64(double); +__device__ +__attribute__((pure)) +double __ocml_erfinv_f64(double); +__device__ +__attribute__((pure)) +double __ocml_exp10_f64(double); +__device__ +__attribute__((pure)) +double __ocml_exp2_f64(double); +__device__ +__attribute__((pure)) +double __ocml_exp_f64(double); +__device__ +__attribute__((pure)) +double __ocml_expm1_f64(double); +__device__ +__attribute__((const)) +double __ocml_fabs_f64(double); +__device__ +__attribute__((const)) +double __ocml_fdim_f64(double, double); +__device__ +__attribute__((const)) +double 
__ocml_floor_f64(double); +__device__ +__attribute__((const)) +double __ocml_fma_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_fmax_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_fmin_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_fmod_f64(double, double); +__device__ +double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*); +__device__ +__attribute__((const)) +double __ocml_hypot_f64(double, double); +__device__ +__attribute__((const)) +int __ocml_ilogb_f64(double); +__device__ +__attribute__((const)) +int __ocml_isfinite_f64(double); +__device__ +__attribute__((const)) +int __ocml_isinf_f64(double); +__device__ +__attribute__((const)) +int __ocml_isnan_f64(double); +__device__ +double __ocml_j0_f64(double); +__device__ +double __ocml_j1_f64(double); +__device__ +__attribute__((const)) +double __ocml_ldexp_f64(double, int); +__device__ +double __ocml_lgamma_f64(double); +__device__ +__attribute__((pure)) +double __ocml_log10_f64(double); +__device__ +__attribute__((pure)) +double __ocml_log1p_f64(double); +__device__ +__attribute__((pure)) +double __ocml_log2_f64(double); +__device__ +__attribute__((const)) +double __ocml_logb_f64(double); +__device__ +__attribute__((pure)) +double __ocml_log_f64(double); +__device__ +double __ocml_modf_f64(double, __attribute__((address_space(5))) double*); +__device__ +__attribute__((const)) +double __ocml_nearbyint_f64(double); +__device__ +__attribute__((const)) +double __ocml_nextafter_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_len3_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_len4_f64(double, double, double, double); +__device__ +__attribute__((pure)) +double __ocml_ncdf_f64(double); +__device__ +__attribute__((pure)) +double __ocml_ncdfinv_f64(double); +__device__ +__attribute__((pure)) +double __ocml_pow_f64(double, double); +__device__ +__attribute__((pure)) +double __ocml_pown_f64(double, int); +__device__ +__attribute__((pure)) +double __ocml_rcbrt_f64(double); +__device__ +__attribute__((const)) +double __ocml_remainder_f64(double, double); +__device__ +double __ocml_remquo_f64( + double, double, __attribute__((address_space(5))) int*); +__device__ +__attribute__((const)) +double __ocml_rhypot_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_rint_f64(double); +__device__ +__attribute__((const)) +double __ocml_rlen3_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_rlen4_f64(double, double, double, double); +__device__ +__attribute__((const)) +double __ocml_round_f64(double); +__device__ +__attribute__((pure)) +double __ocml_rsqrt_f64(double); +__device__ +__attribute__((const)) +double __ocml_scalb_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_scalbn_f64(double, int); +__device__ +__attribute__((const)) +int __ocml_signbit_f64(double); +__device__ +double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*); +__device__ +double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*); +__device__ +double __ocml_sin_f64(double); +__device__ +__attribute__((pure)) +double __ocml_sinh_f64(double); +__device__ +double __ocml_sinpi_f64(double); +__device__ +__attribute__((const)) +double __ocml_sqrt_f64(double); +__device__ +double __ocml_tan_f64(double); +__device__ +__attribute__((pure)) +double __ocml_tanh_f64(double); +__device__ +double __ocml_tgamma_f64(double); +__device__ 
+__attribute__((const)) +double __ocml_trunc_f64(double); +__device__ +double __ocml_y0_f64(double); +__device__ +double __ocml_y1_f64(double); + +// BEGIN INTRINSICS +__device__ +__attribute__((const)) +double __ocml_add_rte_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_add_rtn_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_add_rtp_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_add_rtz_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_sub_rte_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_sub_rtn_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_sub_rtp_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_sub_rtz_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_mul_rte_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_mul_rtn_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_mul_rtp_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_mul_rtz_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_div_rte_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_div_rtn_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_div_rtp_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_div_rtz_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_sqrt_rte_f64(double); +__device__ +__attribute__((const)) +double __ocml_sqrt_rtn_f64(double); +__device__ +__attribute__((const)) +double __ocml_sqrt_rtp_f64(double); +__device__ +__attribute__((const)) +double __ocml_sqrt_rtz_f64(double); +__device__ +__attribute__((const)) +double __ocml_fma_rte_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_fma_rtn_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_fma_rtp_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_fma_rtz_f64(double, double, double); +// END INTRINSICS +// END DOUBLE + +#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ + +#if defined(__cplusplus) + } // extern "C" +#endif +/* +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#pragma once + +// /* +// Half Math Functions +// */ +#if !defined(__HIPCC_RTC__) +#include "host_defines.h" +#endif +#ifndef __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ +extern "C" +{ + __device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16); + __device__ _Float16 __ocml_cos_f16(_Float16); + __device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16); + __device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16); + __device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16); + __device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16); + __device__ __attribute__((const)) + _Float16 __ocml_fma_f16(_Float16, _Float16, _Float16); + __device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16); + __device__ __attribute__((const)) int __ocml_isinf_f16(_Float16); + __device__ __attribute__((const)) int __ocml_isnan_f16(_Float16); + __device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16); + __device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16); + __device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16); + __device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int); + __device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16); + __device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16); + __device__ _Float16 __ocml_sin_f16(_Float16); + __device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16); + __device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16); + __device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16); + __device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16); + + typedef _Float16 __2f16 __attribute__((ext_vector_type(2))); + typedef short __2i16 __attribute__((ext_vector_type(2))); + + #if defined(__clang__) && defined(__HIP__) + __device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b, float c, bool s); + #endif + + __device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16); + __device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16); + __device__ __2f16 __ocml_cos_2f16(__2f16); + __device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16); + __device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16); + __device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16); + __device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16); + __device__ __attribute__((const)) __2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16); + __device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16); + __device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16); + __device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16); + __device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16); + __device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16); + __device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16); + __device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16); + __device__ __2f16 __ocml_sin_2f16(__2f16); + __device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16); + __device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16); + + __device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float); + __device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float); + __device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float); + +} +#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ +//TODO: remove these after they get into clang header __clang_hip_libdevice_declares.h' +extern "C" { 
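+// NOTE (illustrative sketch, not part of the generated header): the
+// re-declarations below mirror OCML half-precision intrinsics that have not
+// yet landed in clang's __clang_hip_libdevice_declares.h, per the TODO
+// above. A hypothetical device-side use:
+#if 0
+__device__ _Float16 clamped_sqrt_f16_sketch(_Float16 x) {
+  // clamp to zero first so the square-root argument is never negative
+  return __ocml_sqrt_f16(__ocml_fmax_f16(x, (_Float16)0));
+}
+#endif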
+    __device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
+    __device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
+    __device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
+    __device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
+    __device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
+}
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
+
+#if defined(__HIPCC_RTC__)
+    #define __HOST_DEVICE__ __device__
+#else
+    #define __HOST_DEVICE__ __host__ __device__
+    #include
+    #include "hip/amd_detail/host_defines.h"
+    #include
+    #if defined(__cplusplus)
+        #include
+        #include
+        #include
+#endif
+#endif // !defined(__HIPCC_RTC__)
+
+#if defined(__clang__) && defined(__HIP__)
+    typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2)));
+
+    struct __half_raw {
+        union {
+            static_assert(sizeof(_Float16) == sizeof(unsigned short), "");
+
+            _Float16 data;
+            unsigned short x;
+        };
+    };
+
+    struct __half2_raw {
+        union {
+            static_assert(sizeof(_Float16_2) == sizeof(unsigned short[2]), "");
+
+            struct {
+                __half_raw x;
+                __half_raw y;
+            };
+            _Float16_2 data;
+        };
+    };
+
+    #if defined(__cplusplus)
+        #if !defined(__HIPCC_RTC__)
+            #include "hip_fp16_math_fwd.h"
+            #include "amd_hip_vector_types.h"
+            #include "host_defines.h"
+            #include "amd_device_functions.h"
+            #include "amd_warp_functions.h"
+        #endif
+        namespace std
+        {
+            template<> struct is_floating_point<_Float16> : std::true_type {};
+        }
+
+        template<bool cond, typename T = void>
+        using Enable_if_t = typename std::enable_if<cond, T>::type;
+
+        // BEGIN STRUCT __HALF
+        struct __half {
+        protected:
+            union {
+                static_assert(sizeof(_Float16) == sizeof(unsigned short), "");
+
+                _Float16 data;
+                unsigned short __x;
+            };
+        public:
+            // CREATORS
+            __HOST_DEVICE__
+            __half() = default;
+            __HOST_DEVICE__
+            __half(const __half_raw& x) : data{x.data} {}
+            #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+                __HOST_DEVICE__
+                __half(decltype(data) x) : data{x} {}
+                template<
+                    typename T,
+                    Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
+                __HOST_DEVICE__
+                __half(T x) : data{static_cast<_Float16>(x)} {}
+            #endif
+            __HOST_DEVICE__
+            __half(const __half&) = default;
+            __HOST_DEVICE__
+            __half(__half&&) = default;
+            __HOST_DEVICE__
+            ~__half() = default;
+
+            // CREATORS - DEVICE ONLY
+            #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+                template<
+                    typename T,
+              Enable_if_t<std::is_integral<T>{}>* = nullptr>
+          __HOST_DEVICE__
+          __half(T x) : data{static_cast<_Float16>(x)} {}
+        #endif
+
+        // MANIPULATORS
+        __HOST_DEVICE__
+        __half& operator=(const __half&) = default;
+        __HOST_DEVICE__
+        __half& operator=(__half&&) = default;
+        __HOST_DEVICE__
+        __half& operator=(const __half_raw& x)
+        {
+            data = x.data;
+            return *this;
+        }
+        __HOST_DEVICE__
+        volatile __half& operator=(const __half_raw& x) volatile
+        {
+            data = x.data;
+            return *this;
+        }
+        volatile __half& operator=(const volatile __half_raw& x) volatile
+        {
+            data = x.data;
+            return *this;
+        }
+        __half& operator=(__half_raw&& x)
+        {
+            data = x.data;
+            return *this;
+        }
+        volatile __half& operator=(__half_raw&& x) volatile
+        {
+            data = x.data;
+            return *this;
+        }
+        volatile __half& operator=(volatile __half_raw&& x) volatile
+        {
+            data = x.data;
+            return *this;
+        }
+        #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+          template<
+              typename T,
+              Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
+          __HOST_DEVICE__
+          __half& operator=(T x)
+          {
+              data = static_cast<_Float16>(x);
+              return *this;
+          }
+        #endif
+
+        // MANIPULATORS - DEVICE ONLY
+        #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+          template<
+              typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
+          __device__
+          __half& operator=(T x)
+          {
+              data = static_cast<_Float16>(x);
+              return *this;
+          }
+        #endif
+
+        #if !defined(__HIP_NO_HALF_OPERATORS__)
+          __device__
+          __half& operator+=(const __half& x)
+          {
+              data += x.data;
+              return *this;
+          }
+          __device__
+          __half& operator-=(const __half& x)
+          {
+              data -= x.data;
+              return *this;
+          }
+          __device__
+          __half& operator*=(const __half& x)
+          {
+              data *= x.data;
+              return *this;
+          }
+          __device__
+          __half& operator/=(const __half& x)
+          {
+              data /= x.data;
+              return *this;
+          }
+          __device__
+          __half& operator++() { ++data; return *this; }
+          __device__
+          __half operator++(int)
+          {
+              __half tmp{*this};
+              ++*this;
+              return tmp;
+          }
+          __device__
+          __half& operator--() { --data; return *this; }
+          __device__
+          __half operator--(int)
+          {
+              __half tmp{*this};
+              --*this;
+              return tmp;
+          }
+        #endif
+
+        // ACCESSORS
+        #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+          template<
+              typename T,
+              Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
+          __HOST_DEVICE__
+          operator T() const { return data; }
+        #endif
+        __HOST_DEVICE__
+        operator __half_raw() const { return __half_raw{data}; }
+        __HOST_DEVICE__
+        operator __half_raw() const volatile
+        {
+            return __half_raw{data};
+        }
+
+        #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+          template<
+              typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
+          __HOST_DEVICE__
+          operator T() const { return data; }
+        #endif
+
+        #if !defined(__HIP_NO_HALF_OPERATORS__)
+          __device__
+          __half operator+() const { return *this; }
+          __device__
+          __half operator-() const
+          {
+              __half tmp{*this};
+              tmp.data = -tmp.data;
+              return tmp;
+          }
+        #endif
+
+        // FRIENDS
+        #if !defined(__HIP_NO_HALF_OPERATORS__)
+          friend
+          inline
+          __device__
+          __half operator+(const __half& x, const __half& y)
+          {
+              return __half{x} += y;
+          }
+          friend
+          inline
+          __device__
+          __half operator-(const __half& x, const __half& y)
+          {
+              return __half{x} -= y;
+          }
+          friend
+          inline
+          __device__
+          __half operator*(const __half& x, const __half& y)
+          {
+              return __half{x} *= y;
+          }
+          friend
+          inline
+          __device__
+          __half operator/(const __half& x, const __half& y)
+          {
+              return __half{x} /= y;
+          }
+          friend
+          inline
+          __device__
+          bool operator==(const __half& x, const __half& y)
+          {
+              return x.data == y.data;
+          }
+          friend
+          inline
+          __device__
+          bool operator!=(const __half& x, const __half& y)
+          {
+              return !(x == y);
+          }
+          friend
+          inline
+          __device__
+          bool
operator<(const __half& x, const __half& y) + { + return x.data < y.data; + } + friend + inline + __device__ + bool operator>(const __half& x, const __half& y) + { + return y.data < x.data; + } + friend + inline + __device__ + bool operator<=(const __half& x, const __half& y) + { + return !(y < x); + } + friend + inline + __device__ + bool operator>=(const __half& x, const __half& y) + { + return !(x < y); + } + #endif // !defined(__HIP_NO_HALF_OPERATORS__) + }; + // END STRUCT __HALF + + // BEGIN STRUCT __HALF2 + struct __half2 { + public: + union { + static_assert( + sizeof(_Float16_2) == sizeof(unsigned short[2]), ""); + + struct { + __half x; + __half y; + }; + _Float16_2 data; + }; + + // CREATORS + __HOST_DEVICE__ + __half2() = default; + __HOST_DEVICE__ + __half2(const __half2_raw& xx) : data{xx.data} {} + __HOST_DEVICE__ + __half2(decltype(data) xx) : data{xx} {} + __HOST_DEVICE__ + __half2(const __half& xx, const __half& yy) + : + data{static_cast<__half_raw>(xx).data, + static_cast<__half_raw>(yy).data} + {} + __HOST_DEVICE__ + __half2(const __half2&) = default; + __HOST_DEVICE__ + __half2(__half2&&) = default; + __HOST_DEVICE__ + ~__half2() = default; + + // MANIPULATORS + __HOST_DEVICE__ + __half2& operator=(const __half2&) = default; + __HOST_DEVICE__ + __half2& operator=(__half2&&) = default; + __HOST_DEVICE__ + __half2& operator=(const __half2_raw& xx) + { + data = xx.data; + return *this; + } + + // MANIPULATORS - DEVICE ONLY + #if !defined(__HIP_NO_HALF_OPERATORS__) + __device__ + __half2& operator+=(const __half2& xx) + { + data += xx.data; + return *this; + } + __device__ + __half2& operator-=(const __half2& xx) + { + data -= xx.data; + return *this; + } + __device__ + __half2& operator*=(const __half2& xx) + { + data *= xx.data; + return *this; + } + __device__ + __half2& operator/=(const __half2& xx) + { + data /= xx.data; + return *this; + } + __device__ + __half2& operator++() { return *this += _Float16_2{1, 1}; } + __device__ + __half2 operator++(int) + { + __half2 tmp{*this}; + ++*this; + return tmp; + } + __device__ + __half2& operator--() { return *this -= _Float16_2{1, 1}; } + __device__ + __half2 operator--(int) + { + __half2 tmp{*this}; + --*this; + return tmp; + } + #endif + + // ACCESSORS + __HOST_DEVICE__ + operator decltype(data)() const { return data; } + __HOST_DEVICE__ + operator __half2_raw() const { + __half2_raw r; + r.data = data; + return r; + } + + // ACCESSORS - DEVICE ONLY + #if !defined(__HIP_NO_HALF_OPERATORS__) + __device__ + __half2 operator+() const { return *this; } + __device__ + __half2 operator-() const + { + __half2 tmp{*this}; + tmp.data = -tmp.data; + return tmp; + } + #endif + + // FRIENDS + #if !defined(__HIP_NO_HALF_OPERATORS__) + friend + inline + __device__ + __half2 operator+(const __half2& xx, const __half2& yy) + { + return __half2{xx} += yy; + } + friend + inline + __device__ + __half2 operator-(const __half2& xx, const __half2& yy) + { + return __half2{xx} -= yy; + } + friend + inline + __device__ + __half2 operator*(const __half2& xx, const __half2& yy) + { + return __half2{xx} *= yy; + } + friend + inline + __device__ + __half2 operator/(const __half2& xx, const __half2& yy) + { + return __half2{xx} /= yy; + } + friend + inline + __device__ + bool operator==(const __half2& xx, const __half2& yy) + { + auto r = xx.data == yy.data; + return r.x != 0 && r.y != 0; + } + friend + inline + __device__ + bool operator!=(const __half2& xx, const __half2& yy) + { + return !(xx == yy); + } + friend + inline + __device__ + bool 
operator<(const __half2& xx, const __half2& yy) + { + auto r = xx.data < yy.data; + return r.x != 0 && r.y != 0; + } + friend + inline + __device__ + bool operator>(const __half2& xx, const __half2& yy) + { + return yy < xx; + } + friend + inline + __device__ + bool operator<=(const __half2& xx, const __half2& yy) + { + return !(yy < xx); + } + friend + inline + __device__ + bool operator>=(const __half2& xx, const __half2& yy) + { + return !(xx < yy); + } + #endif // !defined(__HIP_NO_HALF_OPERATORS__) + }; + // END STRUCT __HALF2 + + namespace + { + inline + __HOST_DEVICE__ + __half2 make_half2(__half x, __half y) + { + return __half2{x, y}; + } + + inline + __HOST_DEVICE__ + __half __low2half(__half2 x) + { + return __half{__half_raw{static_cast<__half2_raw>(x).data.x}}; + } + + inline + __HOST_DEVICE__ + __half __high2half(__half2 x) + { + return __half{__half_raw{static_cast<__half2_raw>(x).data.y}}; + } + + inline + __HOST_DEVICE__ + __half2 __half2half2(__half x) + { + return __half2{x, x}; + } + + inline + __HOST_DEVICE__ + __half2 __halves2half2(__half x, __half y) + { + return __half2{x, y}; + } + + inline + __HOST_DEVICE__ + __half2 __low2half2(__half2 x) + { + return __half2{ + _Float16_2{ + static_cast<__half2_raw>(x).data.x, + static_cast<__half2_raw>(x).data.x}}; + } + + inline + __HOST_DEVICE__ + __half2 __high2half2(__half2 x) + { + return __half2{ + _Float16_2{ + static_cast<__half2_raw>(x).data.y, + static_cast<__half2_raw>(x).data.y}}; + } + + inline + __HOST_DEVICE__ + __half2 __lows2half2(__half2 x, __half2 y) + { + return __half2{ + _Float16_2{ + static_cast<__half2_raw>(x).data.x, + static_cast<__half2_raw>(y).data.x}}; + } + + inline + __HOST_DEVICE__ + __half2 __highs2half2(__half2 x, __half2 y) + { + return __half2{ + _Float16_2{ + static_cast<__half2_raw>(x).data.y, + static_cast<__half2_raw>(y).data.y}}; + } + + inline + __HOST_DEVICE__ + __half2 __lowhigh2highlow(__half2 x) + { + return __half2{ + _Float16_2{ + static_cast<__half2_raw>(x).data.y, + static_cast<__half2_raw>(x).data.x}}; + } + + // Bitcasts + inline + __device__ + short __half_as_short(__half x) + { + return static_cast<__half_raw>(x).x; + } + + inline + __device__ + unsigned short __half_as_ushort(__half x) + { + return static_cast<__half_raw>(x).x; + } + + inline + __device__ + __half __short_as_half(short x) + { + __half_raw r; r.x = x; + return r; + } + + inline + __device__ + __half __ushort_as_half(unsigned short x) + { + __half_raw r; r.x = x; + return r; + } + + // float -> half | half2 + inline + __HOST_DEVICE__ + __half __float2half(float x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __HOST_DEVICE__ + __half __float2half_rn(float x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + #if !defined(__HIPCC_RTC__) + // TODO: rounding behaviour is not correct for host functions. 
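+            // (All three __host__ variants below lower to static_cast<_Float16>,
+            //  which rounds to nearest-even; the _rz/_rd/_ru suffixes are only
+            //  honoured by the __device__ overloads further down, which call the
+            //  correctly-rounded __ocml_cvtrtz/cvtrtn/cvtrtp builtins.)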
+ inline + __host__ + __half __float2half_rz(float x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __host__ + __half __float2half_rd(float x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __host__ + __half __float2half_ru(float x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + #endif + inline + __device__ + __half __float2half_rz(float x) + { + return __half_raw{__ocml_cvtrtz_f16_f32(x)}; + } + inline + __device__ + __half __float2half_rd(float x) + { + return __half_raw{__ocml_cvtrtn_f16_f32(x)}; + } + inline + __device__ + __half __float2half_ru(float x) + { + return __half_raw{__ocml_cvtrtp_f16_f32(x)}; + } + inline + __HOST_DEVICE__ + __half2 __float2half2_rn(float x) + { + return __half2{ + _Float16_2{ + static_cast<_Float16>(x), static_cast<_Float16>(x)}}; + } + inline + __HOST_DEVICE__ + __half2 __floats2half2_rn(float x, float y) + { + return __half2{_Float16_2{ + static_cast<_Float16>(x), static_cast<_Float16>(y)}}; + } + inline + __HOST_DEVICE__ + __half2 __float22half2_rn(float2 x) + { + return __floats2half2_rn(x.x, x.y); + } + + // half | half2 -> float + inline + __HOST_DEVICE__ + float __half2float(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __HOST_DEVICE__ + float __low2float(__half2 x) + { + return static_cast<__half2_raw>(x).data.x; + } + inline + __HOST_DEVICE__ + float __high2float(__half2 x) + { + return static_cast<__half2_raw>(x).data.y; + } + inline + __HOST_DEVICE__ + float2 __half22float2(__half2 x) + { + return make_float2( + static_cast<__half2_raw>(x).data.x, + static_cast<__half2_raw>(x).data.y); + } + + // half -> int + inline + __device__ + int __half2int_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + int __half2int_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + int __half2int_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + int __half2int_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // int -> half + inline + __device__ + __half __int2half_rn(int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __int2half_rz(int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __int2half_rd(int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __int2half_ru(int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + + // half -> short + inline + __device__ + short __half2short_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + short __half2short_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + short __half2short_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + short __half2short_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // short -> half + inline + __device__ + __half __short2half_rn(short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __short2half_rz(short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __short2half_rd(short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __short2half_ru(short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + + // half -> long long + inline + __device__ + long long __half2ll_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + 
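+            // (The rounding-suffixed integer conversions in this header all share
+            //  one implementation: a plain C++ conversion. half -> integer
+            //  therefore always truncates toward zero, so only the _rz variants
+            //  match their nominal rounding mode, while integer -> half always
+            //  rounds to nearest-even.)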
inline + __device__ + long long __half2ll_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + long long __half2ll_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + long long __half2ll_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // long long -> half + inline + __device__ + __half __ll2half_rn(long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ll2half_rz(long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ll2half_rd(long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ll2half_ru(long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + + // half -> unsigned int + inline + __device__ + unsigned int __half2uint_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned int __half2uint_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned int __half2uint_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned int __half2uint_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // unsigned int -> half + inline + __device__ + __half __uint2half_rn(unsigned int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __uint2half_rz(unsigned int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __uint2half_rd(unsigned int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __uint2half_ru(unsigned int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + + // half -> unsigned short + inline + __device__ + unsigned short __half2ushort_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned short __half2ushort_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned short __half2ushort_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned short __half2ushort_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // unsigned short -> half + inline + __device__ + __half __ushort2half_rn(unsigned short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ushort2half_rz(unsigned short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ushort2half_rd(unsigned short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ushort2half_ru(unsigned short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + + // half -> unsigned long long + inline + __device__ + unsigned long long __half2ull_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned long long __half2ull_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned long long __half2ull_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned long long __half2ull_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // unsigned long long -> half + inline + __device__ + __half __ull2half_rn(unsigned long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ull2half_rz(unsigned long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + 
inline + __device__ + __half __ull2half_rd(unsigned long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ull2half_ru(unsigned long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + + // Load primitives + inline + __device__ + __half __ldg(const __half* ptr) { return *ptr; } + inline + __device__ + __half __ldcg(const __half* ptr) { return *ptr; } + inline + __device__ + __half __ldca(const __half* ptr) { return *ptr; } + inline + __device__ + __half __ldcs(const __half* ptr) { return *ptr; } + + inline + __HOST_DEVICE__ + __half2 __ldg(const __half2* ptr) { return *ptr; } + inline + __HOST_DEVICE__ + __half2 __ldcg(const __half2* ptr) { return *ptr; } + inline + __HOST_DEVICE__ + __half2 __ldca(const __half2* ptr) { return *ptr; } + inline + __HOST_DEVICE__ + __half2 __ldcs(const __half2* ptr) { return *ptr; } + + // Relations + inline + __device__ + bool __heq(__half x, __half y) + { + return static_cast<__half_raw>(x).data == + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hne(__half x, __half y) + { + return static_cast<__half_raw>(x).data != + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hle(__half x, __half y) + { + return static_cast<__half_raw>(x).data <= + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hge(__half x, __half y) + { + return static_cast<__half_raw>(x).data >= + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hlt(__half x, __half y) + { + return static_cast<__half_raw>(x).data < + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hgt(__half x, __half y) + { + return static_cast<__half_raw>(x).data > + static_cast<__half_raw>(y).data; + } + inline __device__ + bool __hequ(__half x, __half y) { + return !(static_cast<__half_raw>(x).data < static_cast<__half_raw>(y).data) && + !(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data); + } + inline __device__ + bool __hneu(__half x, __half y) { + return !(static_cast<__half_raw>(x).data == static_cast<__half_raw>(y).data); + } + inline __device__ + bool __hleu(__half x, __half y) { + return !(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data); + } + inline + __device__ + bool __hgeu(__half x, __half y) { + return !(static_cast<__half_raw>(x).data < static_cast<__half_raw>(y).data); + } + inline + __device__ + bool __hltu(__half x, __half y) { + return !(static_cast<__half_raw>(x).data >= static_cast<__half_raw>(y).data); + } + inline + __device__ + bool __hgtu(__half x, __half y) { + return !(static_cast<__half_raw>(x).data <= static_cast<__half_raw>(y).data); + } + + inline + __HOST_DEVICE__ + __half2 __heq2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data == + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __HOST_DEVICE__ + __half2 __hne2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data != + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __HOST_DEVICE__ + __half2 __hle2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data <= + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __HOST_DEVICE__ + __half2 __hge2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data >= + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __HOST_DEVICE__ + __half2 __hlt2(__half2 x, 
__half2 y) + { + auto r = static_cast<__half2_raw>(x).data < + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __HOST_DEVICE__ + __half2 __hgt2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data > + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline __HOST_DEVICE__ + __half2 __hequ2(__half2 x, __half2 y) { + auto r = !(static_cast<__half2_raw>(x).data < static_cast<__half2_raw>(y).data) && + !(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data); + return __builtin_convertvector(-r, _Float16_2); + } + inline + __HOST_DEVICE__ + __half2 __hneu2(__half2 x, __half2 y) { + auto r = !(static_cast<__half2_raw>(x).data == static_cast<__half2_raw>(y).data); + return __builtin_convertvector(-r, _Float16_2); + } + inline + __HOST_DEVICE__ + __half2 __hleu2(__half2 x, __half2 y) { + auto r = !(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data); + return __builtin_convertvector(-r, _Float16_2); + } + inline + __HOST_DEVICE__ + __half2 __hgeu2(__half2 x, __half2 y) { + auto r = !(static_cast<__half2_raw>(x).data < static_cast<__half2_raw>(y).data); + return __builtin_convertvector(-r, _Float16_2); + } + inline + __HOST_DEVICE__ + __half2 __hltu2(__half2 x, __half2 y) { + auto r = !(static_cast<__half2_raw>(x).data >= static_cast<__half2_raw>(y).data); + return __builtin_convertvector(-r, _Float16_2); + } + inline + __HOST_DEVICE__ + __half2 __hgtu2(__half2 x, __half2 y) { + auto r = !(static_cast<__half2_raw>(x).data <= static_cast<__half2_raw>(y).data); + return __builtin_convertvector(-r, _Float16_2); + } + + inline + __HOST_DEVICE__ + bool __hbeq2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__heq2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __HOST_DEVICE__ + bool __hbne2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hne2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __HOST_DEVICE__ + bool __hble2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hle2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __HOST_DEVICE__ + bool __hbge2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hge2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __HOST_DEVICE__ + bool __hblt2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hlt2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __HOST_DEVICE__ + bool __hbgt2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hgt2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __HOST_DEVICE__ + bool __hbequ2(__half2 x, __half2 y) { return __hbeq2(x, y); } + inline + __HOST_DEVICE__ + bool __hbneu2(__half2 x, __half2 y) { return __hbne2(x, y); } + inline + __HOST_DEVICE__ + bool __hbleu2(__half2 x, __half2 y) { return __hble2(x, y); } + inline + __HOST_DEVICE__ + bool __hbgeu2(__half2 x, __half2 y) { return __hbge2(x, y); } + inline + __HOST_DEVICE__ + bool __hbltu2(__half2 x, __half2 y) { return __hblt2(x, y); } + inline + __HOST_DEVICE__ + bool __hbgtu2(__half2 x, __half2 y) { return __hbgt2(x, y); } + inline + __device__ + __half __hmax(const __half x, const __half y) { + return __half_raw{__ocml_fmax_f16(static_cast<__half_raw>(x).data, + static_cast<__half_raw>(y).data)}; + } + inline + __device__ + __half __hmax_nan(const __half x, const __half y) { + if(__ocml_isnan_f16(static_cast<__half_raw>(x).data)) { + return x; + } else if 
(__ocml_isnan_f16(static_cast<__half_raw>(y).data)) { + return y; + } + return __hmax(x, y); + } + inline + __device__ + __half __hmin(const __half x, const __half y) { + return __half_raw{__ocml_fmin_f16(static_cast<__half_raw>(x).data, + static_cast<__half_raw>(y).data)}; + } + inline + __device__ + __half __hmin_nan(const __half x, const __half y) { + if(__ocml_isnan_f16(static_cast<__half_raw>(x).data)) { + return x; + } else if (__ocml_isnan_f16(static_cast<__half_raw>(y).data)) { + return y; + } + return __hmin(x, y); + } + + // Arithmetic + inline + __device__ + __half __clamp_01(__half x) + { + auto r = static_cast<__half_raw>(x); + + if (__hlt(x, __half_raw{0})) return __half_raw{0}; + if (__hlt(__half_raw{1}, x)) return __half_raw{1}; + return r; + } + + inline + __device__ + __half __hadd(__half x, __half y) + { + return __half_raw{ + static_cast<__half_raw>(x).data + + static_cast<__half_raw>(y).data}; + } + inline + __device__ + __half __habs(__half x) + { + return __half_raw{ + __ocml_fabs_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half __hsub(__half x, __half y) + { + return __half_raw{ + static_cast<__half_raw>(x).data - + static_cast<__half_raw>(y).data}; + } + inline + __device__ + __half __hmul(__half x, __half y) + { + return __half_raw{ + static_cast<__half_raw>(x).data * + static_cast<__half_raw>(y).data}; + } + inline + __device__ + __half __hadd_sat(__half x, __half y) + { + return __clamp_01(__hadd(x, y)); + } + inline + __device__ + __half __hsub_sat(__half x, __half y) + { + return __clamp_01(__hsub(x, y)); + } + inline + __device__ + __half __hmul_sat(__half x, __half y) + { + return __clamp_01(__hmul(x, y)); + } + inline + __device__ + __half __hfma(__half x, __half y, __half z) + { + return __half_raw{__ocml_fma_f16( + static_cast<__half_raw>(x).data, + static_cast<__half_raw>(y).data, + static_cast<__half_raw>(z).data)}; + } + inline + __device__ + __half __hfma_sat(__half x, __half y, __half z) + { + return __clamp_01(__hfma(x, y, z)); + } + inline + __device__ + __half __hdiv(__half x, __half y) + { + return __half_raw{ + static_cast<__half_raw>(x).data / + static_cast<__half_raw>(y).data}; + } + + inline + __HOST_DEVICE__ + __half2 __hadd2(__half2 x, __half2 y) + { + return __half2{ + static_cast<__half2_raw>(x).data + + static_cast<__half2_raw>(y).data}; + } + inline + __HOST_DEVICE__ + __half2 __habs2(__half2 x) + { + return __half2{ + __ocml_fabs_2f16(static_cast<__half2_raw>(x).data)}; + } + inline + __HOST_DEVICE__ + __half2 __hsub2(__half2 x, __half2 y) + { + return __half2{ + static_cast<__half2_raw>(x).data - + static_cast<__half2_raw>(y).data}; + } + inline + __HOST_DEVICE__ + __half2 __hmul2(__half2 x, __half2 y) + { + return __half2{ + static_cast<__half2_raw>(x).data * + static_cast<__half2_raw>(y).data}; + } + inline + __HOST_DEVICE__ + __half2 __hadd2_sat(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hadd2(x, y)); + return __half2{ + __clamp_01(__half_raw{r.data.x}), + __clamp_01(__half_raw{r.data.y})}; + } + inline + __HOST_DEVICE__ + __half2 __hsub2_sat(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hsub2(x, y)); + return __half2{ + __clamp_01(__half_raw{r.data.x}), + __clamp_01(__half_raw{r.data.y})}; + } + inline + __HOST_DEVICE__ + __half2 __hmul2_sat(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hmul2(x, y)); + return __half2{ + __clamp_01(__half_raw{r.data.x}), + __clamp_01(__half_raw{r.data.y})}; + } + inline + __HOST_DEVICE__ + __half2 __hfma2(__half2 x, 
__half2 y, __half2 z) + { + return __half2{__ocml_fma_2f16(x, y, z)}; + } + inline + __HOST_DEVICE__ + __half2 __hfma2_sat(__half2 x, __half2 y, __half2 z) + { + auto r = static_cast<__half2_raw>(__hfma2(x, y, z)); + return __half2{ + __clamp_01(__half_raw{r.data.x}), + __clamp_01(__half_raw{r.data.y})}; + } + inline + __HOST_DEVICE__ + __half2 __h2div(__half2 x, __half2 y) + { + return __half2{ + static_cast<__half2_raw>(x).data / + static_cast<__half2_raw>(y).data}; + } + + // Math functions + #if defined(__clang__) && defined(__HIP__) + inline + __device__ + float amd_mixed_dot(__half2 a, __half2 b, float c, bool saturate) { + return __ockl_fdot2(static_cast<__half2_raw>(a).data, + static_cast<__half2_raw>(b).data, + c, saturate); + } + #endif + inline + __device__ + __half htrunc(__half x) + { + return __half_raw{ + __ocml_trunc_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hceil(__half x) + { + return __half_raw{ + __ocml_ceil_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hfloor(__half x) + { + return __half_raw{ + __ocml_floor_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hrint(__half x) + { + return __half_raw{ + __ocml_rint_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hsin(__half x) + { + return __half_raw{ + __ocml_sin_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hcos(__half x) + { + return __half_raw{ + __ocml_cos_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hexp(__half x) + { + return __half_raw{ + __ocml_exp_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hexp2(__half x) + { + return __half_raw{ + __ocml_exp2_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hexp10(__half x) + { + return __half_raw{ + __ocml_exp10_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hlog2(__half x) + { + return __half_raw{ + __ocml_log2_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hlog(__half x) + { + return __half_raw{ + __ocml_log_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hlog10(__half x) + { + return __half_raw{ + __ocml_log10_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hrcp(__half x) + { + return __half_raw{ + static_cast<_Float16>(1.0f) /static_cast<__half_raw>(x).data}; + } + inline + __device__ + __half hrsqrt(__half x) + { + return __half_raw{ + __ocml_rsqrt_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hsqrt(__half x) + { + return __half_raw{ + __ocml_sqrt_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + bool __hisinf(__half x) + { + return __ocml_isinf_f16(static_cast<__half_raw>(x).data); + } + inline + __device__ + bool __hisnan(__half x) + { + return __ocml_isnan_f16(static_cast<__half_raw>(x).data); + } + inline + __device__ + __half __hneg(__half x) + { + return __half_raw{-static_cast<__half_raw>(x).data}; + } + + inline + __HOST_DEVICE__ + __half2 h2trunc(__half2 x) + { + return __half2{__ocml_trunc_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2ceil(__half2 x) + { + return __half2{__ocml_ceil_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2floor(__half2 x) + { + return __half2{__ocml_floor_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2rint(__half2 x) + { + return __half2{__ocml_rint_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2sin(__half2 x) + { + return 
__half2{__ocml_sin_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2cos(__half2 x) + { + return __half2{__ocml_cos_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2exp(__half2 x) + { + return __half2{__ocml_exp_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2exp2(__half2 x) + { + return __half2{__ocml_exp2_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2exp10(__half2 x) + { + return __half2{__ocml_exp10_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2log2(__half2 x) + { + return __half2{__ocml_log2_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2log(__half2 x) { return __ocml_log_2f16(x); } + inline + __HOST_DEVICE__ + __half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); } + inline + __HOST_DEVICE__ + __half2 h2rcp(__half2 x) { + return _Float16_2{ + _Float16_2{static_cast<_Float16>(1.0f), static_cast<_Float16>(1.0f)} / x.data}; + } + inline + __HOST_DEVICE__ + __half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); } + inline + __HOST_DEVICE__ + __half2 h2sqrt(__half2 x) { return __ocml_sqrt_2f16(x); } + inline + __HOST_DEVICE__ + __half2 __hisinf2(__half2 x) + { + auto r = __ocml_isinf_2f16(x); + return __half2{_Float16_2{ + static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}}; + } + inline + __HOST_DEVICE__ + __half2 __hisnan2(__half2 x) + { + auto r = __ocml_isnan_2f16(x); + return __half2{_Float16_2{ + static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}}; + } + inline + __HOST_DEVICE__ + __half2 __hneg2(__half2 x) + { + return __half2{-static_cast<__half2_raw>(x).data}; + } + } // Anonymous namespace. + + #if !defined(HIP_NO_HALF) + using half = __half; + using half2 = __half2; + #endif + __device__ + inline + __half __shfl(__half var, int src_lane, int width = warpSize) { + union { int i; __half h; } tmp; tmp.h = var; + tmp.i = __shfl(tmp.i, src_lane, width); + return tmp.h; + } + __device__ + inline + __half2 __shfl(__half2 var, int src_lane, int width = warpSize) { + union { int i; __half2 h; } tmp; tmp.h = var; + tmp.i = __shfl(tmp.i, src_lane, width); + return tmp.h; + } + __device__ + inline + __half __shfl_up(__half var, unsigned int lane_delta, int width = warpSize) { + union { int i; __half h; } tmp; tmp.h = var; + tmp.i = __shfl_up(tmp.i, lane_delta, width); + return tmp.h; + } + __device__ + inline + __half2 __shfl_up(__half2 var, unsigned int lane_delta, int width = warpSize) { + union { int i; __half2 h; } tmp; tmp.h = var; + tmp.i = __shfl_up(tmp.i, lane_delta, width); + return tmp.h; + } + __device__ + inline + __half __shfl_down(__half var, unsigned int lane_delta, int width = warpSize) { + union { int i; __half h; } tmp; tmp.h = var; + tmp.i = __shfl_down(tmp.i, lane_delta, width); + return tmp.h; + } + __device__ + inline + __half2 __shfl_down(__half2 var, unsigned int lane_delta, int width = warpSize) { + union { int i; __half2 h; } tmp; tmp.h = var; + tmp.i = __shfl_down(tmp.i, lane_delta, width); + return tmp.h; + } + __device__ + inline + __half __shfl_xor(__half var, int lane_mask, int width = warpSize) { + union { int i; __half h; } tmp; tmp.h = var; + tmp.i = __shfl_xor(tmp.i, lane_mask, width); + return tmp.h; + } + __device__ + inline + __half2 __shfl_xor(__half2 var, int lane_mask, int width = warpSize) { + union { int i; __half2 h; } tmp; tmp.h = var; + tmp.i = __shfl_xor(tmp.i, lane_mask, width); + return tmp.h; + } + #endif // defined(__cplusplus) +#elif defined(__GNUC__) + #if !defined(__HIPCC_RTC__) + #include "hip_fp16_gcc.h" + #endif +#endif // !defined(__clang__) && defined(__GNUC__) + +#endif // 
HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H +/* +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#if !defined(__HIPCC_RTC__) +#include "hip_fp16_math_fwd.h" +#include "amd_hip_vector_types.h" +#include "math_fwd.h" + +#include + +#include +// assert.h is only for the host version of assert. +// The device version of assert is implemented in hip/amd_detail/hip_runtime.h. +// Users should include hip_runtime.h for the device version of assert. +#if !__HIP_DEVICE_COMPILE__ +#include +#endif +#include +#include +#include +#endif // !defined(__HIPCC_RTC__) + +#if _LIBCPP_VERSION && __HIP__ +namespace std { +template <> +struct __numeric_type<_Float16> +{ + static _Float16 __test(_Float16); + + typedef _Float16 type; + static const bool value = true; +}; +} +#endif // _LIBCPP_VERSION + +#pragma push_macro("__DEVICE__") +#pragma push_macro("__RETURN_TYPE") + +#define __DEVICE__ static __device__ +#define __RETURN_TYPE bool + +// DOT FUNCTIONS +#if __HIP_CLANG_ONLY__ +__DEVICE__ +inline +int amd_mixed_dot(short2 a, short2 b, int c, bool saturate) { + return __ockl_sdot2(a.data, b.data, c, saturate); +} +__DEVICE__ +inline +uint amd_mixed_dot(ushort2 a, ushort2 b, uint c, bool saturate) { + return __ockl_udot2(a.data, b.data, c, saturate); +} +__DEVICE__ +inline +int amd_mixed_dot(char4 a, char4 b, int c, bool saturate) { + return __ockl_sdot4(a.data, b.data, c, saturate); +} +__DEVICE__ +inline +uint amd_mixed_dot(uchar4 a, uchar4 b, uint c, bool saturate) { + return __ockl_udot4(a.data, b.data, c, saturate); +} +__DEVICE__ +inline +int amd_mixed_dot(int a, int b, int c, bool saturate) { + return __ockl_sdot8(a, b, c, saturate); +} +__DEVICE__ +inline +uint amd_mixed_dot(uint a, uint b, uint c, bool saturate) { + return __ockl_udot8(a, b, c, saturate); +} +#endif + +#pragma pop_macro("__DEVICE__") +#pragma pop_macro("__RETURN_TYPE") +// For backward compatibility. +// There are HIP applications e.g. TensorFlow, expecting __HIP_ARCH_* macros +// defined after including math_functions.h. 
+#if !defined(__HIPCC_RTC__) +#include +#endif diff --git a/setup.py b/setup.py index bdab81dab6..380d7f96ec 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ setup(name='tinygrad', long_description=long_description, long_description_content_type='text/markdown', packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', - 'tinygrad.runtime', 'tinygrad.runtime.graph', 'tinygrad.shape', 'tinygrad.features'], + 'tinygrad.runtime', 'tinygrad.runtime.compiler', 'tinygrad.runtime.graph', 'tinygrad.shape', 'tinygrad.features'], classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License" diff --git a/test/test_device_speed.py b/test/test_device_speed.py index afcad10ee9..5084e1c708 100644 --- a/test/test_device_speed.py +++ b/test/test_device_speed.py @@ -14,6 +14,11 @@ class TestDeviceSpeed(unittest.TestCase): with Timing("compiler "): self.dev.compiler(self.empty) + def test_empty_compile_twice(self): + self.dev.compiler(self.empty) + with Timing("compiler "): + self.dev.compiler(self.empty) + def test_launch_speed(self): prg_bin = self.dev.compiler(self.empty) prg = self.dev.runtime("test", prg_bin) diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py index 5cca8964a4..bd6cb542ec 100644 --- a/tinygrad/renderer/cstyle.py +++ b/tinygrad/renderer/cstyle.py @@ -223,30 +223,66 @@ class CUDALanguage(CStyleLanguage): code_for_workitem = {"g": lambda x: f"blockIdx.{chr(120+x)}", "l": lambda x: f"threadIdx.{chr(120+x)}", "i": lambda x: f"(blockIdx.{chr(120+x)}*blockDim.{chr(120+x)}+threadIdx.{chr(120+x)})"} code_for_op = {**CStyleLanguage().code_for_op, **code_for_op_half} - half_prekernel ="#include \n"+"#include \n"+""" + half_prekernel = "#include \n"+"#include \n"+""" struct half4 { half x, y, z, w; }; __device__ half4 make_half4(half x, half y, half z, half w) { half4 ret; ret.x = x; ret.y = y; ret.z = z; ret.w = w; return ret; } """ type_map = {dtypes.bfloat16: "nv_bfloat16"} CUDARenderer = functools.partial(uops_to_cstyle, CUDALanguage()) -class HIPLanguage(CUDALanguage): +code_for_op_hip = { + BinaryOps.MAX: lambda a,b,dtype: f"__ocml_fmax_f32({a},{b})" if dtype != dtypes.half else f"__ocml_fmax_f16({a},{b})", + UnaryOps.SQRT: lambda x,dtype: f"__ocml_sqrt_f32({x})" if dtype != dtypes.half else f"__ocml_sqrt_f16({x})", + UnaryOps.SIN: lambda x,dtype: f"__ocml_sin_f32({x})" if dtype != dtypes.half else f"__ocml_sin_f16({x})", + UnaryOps.LOG2: lambda x,dtype: f"__ocml_log2_f32({x})" if dtype != dtypes.half else f"__ocml_log2_f16({x})", + UnaryOps.EXP2: lambda x,dtype: f"__ocml_exp2_f32({x})" if dtype != dtypes.half else f"__ocml_exp2_f16({x})", +} + +def _make_hip_dtype(base_type, name, cnt): + nms = "xyzwabcdefghijkl"[:cnt] + return f"typedef {base_type} {name}{cnt} __attribute__((ext_vector_type({cnt})));\n" + \ + f"static inline __attribute__((device)) {name}{cnt} make_{name}{cnt}(" + ', '.join([f"{base_type} {x}" for x in nms]) + \ + ") { return {" + ', '.join(nms) + "}; }" + +class HIPLanguage(CStyleLanguage): kernel_prefix = "#include \n#define INFINITY (__builtin_inff())\n#define NAN (__builtin_nanf(\"\"))" + """ - typedef float float8 __attribute__((ext_vector_type(8))); - __device__ float8 make_float8(float x, float y, float z, float w, float a, float b, float c, float d) { return {x, y, z, w, a, b, c, d}; } - extern "C" __global__ - """ + #define launch_bounds_impl0(requiredMaxThreadsPerBlock) \ + __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock))) + #define 
launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \ + __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), amdgpu_waves_per_eu(minBlocksPerMultiprocessor))) + #define select_impl_(_1, _2, impl_, ...) impl_ + #define __launch_bounds__(...) select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__) + typedef long unsigned int size_t; + #define half _Float16 + struct hip_bfloat16 { unsigned short data; }; + + extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int); + extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_group_id(unsigned int); + extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_size(unsigned int); + + extern "C" { + __attribute__((device)) __attribute__((const)) float __ocml_fmax_f32(float, float); + __attribute__((device)) __attribute__((pure)) float __ocml_exp2_f32(float); + __attribute__((device)) __attribute__((pure)) float __ocml_log2_f32(float); + __attribute__((device)) float __ocml_sin_f32(float); + __attribute__((device)) __attribute__((const)) float __ocml_sqrt_f32(float); + __attribute__((device)) __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16); + __attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16); + __attribute__((device)) __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16); + __attribute__((device)) _Float16 __ocml_sin_f16(_Float16); + __attribute__((device)) __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16); + }\n""" + '\n'.join([_make_hip_dtype(*x) for x in [("signed int", "int", 2), + ("_Float16", "half", 2), ("_Float16", "half", 4), ("_Float16", "half", 8), ("_Float16", "half", 16), + ("float", "float", 2), ("float", "float", 4), ("float", "float", 8)]]) + \ + 'extern "C" __attribute__((global))' + code_for_workitem = {"g": lambda x: f"__ockl_get_group_id({x})", "l": lambda x: f"__ockl_get_local_id({x})", + "i": lambda x: f"(__ockl_get_group_id({x})*__ockl_get_local_size({x})+__ockl_get_local_id({x}))"} + code_for_op = {**CStyleLanguage().code_for_op, **code_for_op_hip} + smem_prefix = "__attribute__((shared))" + barrier = '__builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");' + '__builtin_amdgcn_s_barrier();' + \ + '__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");' + float4 = "make_float4" launch_bounds = True uses_ptr_arithmetic = True - half_prekernel = "#include \n" + """ -typedef union { struct { half x, y, z, w; } __attribute__((aligned(8))); half data[4]; } half4; -__device__ half4 make_half4(half x, half y, half z, half w) { return {x, y, z, w}; } -typedef union { struct { half x, y, z, w, a, b, c, d; } __attribute__((aligned(16))); half data[8]; } half8; -__device__ half8 make_half8(half x, half y, half z, half w, half a, half b, half c, half d) { return {x, y, z, w, a, b, c, d}; } - typedef _Float16 half16 __attribute__((ext_vector_type(16))); -__device__ half16 make_half16(half x, half y, half z, half w, half a, half b, half c, half d, - half e, half f, half g, half h, half i, half j, half k, half l) { - return {x, y, z, w, a, b, c, d, e, f, g, h, i, j, k, l}; } - """ type_map = {dtypes.bfloat16: "hip_bfloat16"} -HIPRenderer = functools.partial(uops_to_cstyle, HIPLanguage()) +HIPRenderer = functools.partial(uops_to_cstyle, HIPLanguage()) \ No newline at end of file diff --git a/tinygrad/runtime/compiler/hip_comgr.py b/tinygrad/runtime/compiler/hip_comgr.py new file mode 100644 index 0000000000..45b093f05c --- 
/dev/null
+++ b/tinygrad/runtime/compiler/hip_comgr.py
@@ -0,0 +1,50 @@
+import ctypes
+import tinygrad.runtime.autogen.comgr as comgr
+
+def check(status):
+  if status != 0:
+    comgr.amd_comgr_status_string(status, ctypes.byref(status_str := ctypes.POINTER(ctypes.c_char)()))
+    raise RuntimeError(f"comgr fail {status}, {ctypes.string_at(status_str).decode()}")
+
+def _get_comgr_data(data_set, data_type):
+  check(comgr.amd_comgr_action_data_get_data(data_set, data_type, 0, ctypes.byref(data_exec := comgr.amd_comgr_data_t())))
+  check(comgr.amd_comgr_get_data(data_exec, ctypes.byref(sz := ctypes.c_uint64()), None))
+  check(comgr.amd_comgr_get_data(data_exec, ctypes.byref(sz), (dat := ctypes.create_string_buffer(sz.value))))
+  check(comgr.amd_comgr_release_data(data_exec))
+  return bytes(dat)
+
+# To debug comgr itself, set AMD_COMGR_SAVE_TEMPS=1 AMD_COMGR_REDIRECT_LOGS=stdout AMD_COMGR_EMIT_VERBOSE_LOGS=1
+def compile_hip(prg:str, arch="gfx1100") -> bytes:
+  check(comgr.amd_comgr_create_action_info(ctypes.byref(action_info := comgr.amd_comgr_action_info_t())))
+  check(comgr.amd_comgr_action_info_set_language(action_info, comgr.AMD_COMGR_LANGUAGE_HIP))
+  check(comgr.amd_comgr_action_info_set_isa_name(action_info, b"amdgcn-amd-amdhsa--" + arch.encode()))
+  check(comgr.amd_comgr_action_info_set_logging(action_info, True))
+
+  check(comgr.amd_comgr_create_data_set(ctypes.byref(data_set_src := comgr.amd_comgr_data_set_t())))
+  check(comgr.amd_comgr_create_data_set(ctypes.byref(data_set_bc := comgr.amd_comgr_data_set_t())))
+  check(comgr.amd_comgr_create_data_set(ctypes.byref(data_set_reloc := comgr.amd_comgr_data_set_t())))
+  check(comgr.amd_comgr_create_data_set(ctypes.byref(data_set_exec := comgr.amd_comgr_data_set_t())))
+
+  check(comgr.amd_comgr_create_data(comgr.AMD_COMGR_DATA_KIND_SOURCE, ctypes.byref(data_src := comgr.amd_comgr_data_t())))
+  check(comgr.amd_comgr_set_data(data_src, len(rprg := prg.encode()), rprg))
+  check(comgr.amd_comgr_set_data_name(data_src, b""))
+
+  check(comgr.amd_comgr_data_set_add(data_set_src, data_src))
+  # note: no "-include hiprtc_runtime.h" here anymore; the renderer's kernel prefix declares everything the generated source needs
+  check(comgr.amd_comgr_action_info_set_options(action_info, f"-O3 -mcumode --hip-version=6.0.32830 -DHIP_VERSION_MAJOR=6 -DHIP_VERSION_MINOR=0 -DHIP_VERSION_PATCH=32830 -D__HIPCC_RTC__ -std=c++14 -nogpuinc -Wno-gnu-line-marker -Wno-missing-prototypes --offload-arch={arch} -I/opt/rocm/include -Xclang -disable-llvm-passes".encode()))  # noqa: E501
+  # stage 1: HIP source (+ device libs) -> LLVM bitcode
+  status = comgr.amd_comgr_do_action(comgr.AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC, action_info, data_set_src, data_set_bc)
+  if status != 0:
+    print(_get_comgr_data(data_set_bc, comgr.AMD_COMGR_DATA_KIND_LOG).decode())
+    raise RuntimeError("compile failed")
+  # stage 2: bitcode -> relocatable object
+  check(comgr.amd_comgr_action_info_set_options(action_info, b"-O3 -mllvm -amdgpu-internalize-symbols"))
+  check(comgr.amd_comgr_do_action(comgr.AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE, action_info, data_set_bc, data_set_reloc))
+  # stage 3: link the relocatable into the executable (hsaco) bytes that HIPProgram loads
+  check(comgr.amd_comgr_action_info_set_options(action_info, b""))
+  check(comgr.amd_comgr_do_action(comgr.AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, action_info, data_set_reloc, data_set_exec))
+  ret = _get_comgr_data(data_set_exec, comgr.AMD_COMGR_DATA_KIND_EXECUTABLE)
+  check(comgr.amd_comgr_release_data(data_src))
+  for x in [data_set_src, data_set_bc, data_set_reloc, data_set_exec]: check(comgr.amd_comgr_destroy_data_set(x))
+  check(comgr.amd_comgr_destroy_action_info(action_info))
+  return ret
diff --git a/tinygrad/runtime/ops_hip.py
index 55cb35d01c..ba12dcc6d8 100644
---
a/tinygrad/runtime/ops_hip.py +++ b/tinygrad/runtime/ops_hip.py @@ -3,10 +3,11 @@ import ctypes, functools, subprocess, io from typing import Tuple, TypeVar, List, Any, cast, Set import tinygrad.runtime.autogen.hip as hip from tinygrad.helpers import DEBUG, getenv, init_c_var -from tinygrad.helpers import from_mv, round_up, to_mv, colored, init_c_struct_t, to_char_p_p, get_bytes +from tinygrad.helpers import from_mv, round_up, to_mv, colored, init_c_struct_t from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions, JITRunner, Device, Buffer, update_stats from tinygrad.renderer.cstyle import HIPRenderer from tinygrad.codegen.kernel import LinearizerOptions +from tinygrad.runtime.compiler.hip_comgr import compile_hip # The default HIP stream is used for everything. MOCKHIP = getenv("MOCKHIP") # for CI. don't run kernels, only check if they compile @@ -21,13 +22,6 @@ def hip_set_device(d:int): def check(status): if status != 0: raise RuntimeError(f"HIP Error {status}, {ctypes.string_at(hip.hipGetErrorString(status)).decode()}") -def compile_hip(prg:str, arch="gfx1100") -> bytes: - check(hip.hiprtcCreateProgram(ctypes.byref(prog := hip.hiprtcProgram()), prg.encode(), "".encode(), 0, None, None)) - compile_options = [f'--offload-arch={arch}', '-I/opt/rocm/include'] - status = hip.hiprtcCompileProgram(prog, len(compile_options), to_char_p_p([o.encode() for o in compile_options])) - if status != 0: raise RuntimeError(f"compile failed: {get_bytes(prog, hip.hiprtcGetProgramLogSize, hip.hiprtcGetProgramLog, check).decode()}") - return get_bytes(prog, hip.hiprtcGetCodeSize, hip.hiprtcGetCode, check) - class HIPProgram: def __init__(self, device:int, name:str, lib:bytes): self.device, self.name, self.lib = device, name, lib
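
For orientation, a minimal sketch of how the new compile path is exercised end to end. This is not part of the patch: the source string and assertions below are hypothetical, and it assumes a working ROCm install whose comgr library the autogen bindings can load, plus the same gfx1100 default target used above.

    # hypothetical smoke test for the comgr-based compiler path
    from tinygrad.runtime.compiler.hip_comgr import compile_hip

    # the renderer prepends 'extern "C" __attribute__((global))' to kernels, so an
    # attribute-annotated empty kernel is the smallest program comgr will link
    src = 'extern "C" __attribute__((global)) void test() {}'
    lib = compile_hip(src, arch="gfx1100")
    assert isinstance(lib, bytes) and len(lib) > 0  # linked executable, ready for HIPProgram

Compared with the old hiprtc path this drops the runtime dependency on hiprtc, and together with the self-contained kernel prefix in cstyle.py it avoids pulling the large HIP headers into every compile, which is presumably what the new test_empty_compile_twice benchmark is meant to measure.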