diff --git a/include/device.h b/include/device.h index e656a925a..9e5bc0fb3 100644 --- a/include/device.h +++ b/include/device.h @@ -1,5 +1,13 @@ +#ifndef DEVICE_H +#define DEVICE_H + +#pragma once + #include <cstdint> #include <cuda_runtime.h> +#include <cstdio> +#include <cstdlib> +#include <cstring> extern "C" { cudaStream_t *cuda_create_stream(uint32_t gpu_index); @@ -35,4 +43,17 @@ int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index); int cuda_get_max_shared_memory(uint32_t gpu_index); int cuda_synchronize_stream(void *v_stream); + +#define check_cuda_error(ans) \ + { cuda_error((ans), __FILE__, __LINE__); } +inline void cuda_error(cudaError_t code, const char *file, int line, + bool abort = true) { + if (code != cudaSuccess) { + fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), file, + line); + if (abort) + exit(code); + } } +} +#endif \ No newline at end of file diff --git a/include/helper_cuda.h b/include/helper_cuda.h deleted file mode 100644 index 2b38732cf..000000000 --- a/include/helper_cuda.h +++ /dev/null @@ -1,1154 +0,0 @@ -/** - * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. - * - * Please refer to the NVIDIA end user license agreement (EULA) associated - * with this source code for terms and conditions that govern your use of - * this software. Any use, reproduction, disclosure, or distribution of - * this software and related documentation outside the terms of the EULA - * is strictly prohibited. 
- * - */ - -//////////////////////////////////////////////////////////////////////////////// -// These are CUDA Helper functions for initialization and error checking - -#ifndef HELPER_CUDA_H -#define HELPER_CUDA_H - -#pragma once - -#include -#include -#include - -#include "helper_string.h" - -#ifndef EXIT_WAIVED -#define EXIT_WAIVED 2 -#endif - -// Note, it is required that your SDK sample to include the proper header files, -// please refer the CUDA examples for examples of the needed CUDA headers, which -// may change depending on which CUDA functions are used. - -// CUDA Runtime error messages -#ifdef __DRIVER_TYPES_H__ -static const char *_cudaGetErrorEnum(cudaError_t error) { - switch (error) { - case cudaSuccess: - return "cudaSuccess"; - - case cudaErrorMissingConfiguration: - return "cudaErrorMissingConfiguration"; - - case cudaErrorMemoryAllocation: - return "cudaErrorMemoryAllocation"; - - case cudaErrorInitializationError: - return "cudaErrorInitializationError"; - - case cudaErrorLaunchFailure: - return "cudaErrorLaunchFailure"; - - case cudaErrorPriorLaunchFailure: - return "cudaErrorPriorLaunchFailure"; - - case cudaErrorLaunchTimeout: - return "cudaErrorLaunchTimeout"; - - case cudaErrorLaunchOutOfResources: - return "cudaErrorLaunchOutOfResources"; - - case cudaErrorInvalidDeviceFunction: - return "cudaErrorInvalidDeviceFunction"; - - case cudaErrorInvalidConfiguration: - return "cudaErrorInvalidConfiguration"; - - case cudaErrorInvalidDevice: - return "cudaErrorInvalidDevice"; - - case cudaErrorInvalidValue: - return "cudaErrorInvalidValue"; - - case cudaErrorInvalidPitchValue: - return "cudaErrorInvalidPitchValue"; - - case cudaErrorInvalidSymbol: - return "cudaErrorInvalidSymbol"; - - case cudaErrorMapBufferObjectFailed: - return "cudaErrorMapBufferObjectFailed"; - - case cudaErrorUnmapBufferObjectFailed: - return "cudaErrorUnmapBufferObjectFailed"; - - case cudaErrorInvalidHostPointer: - return "cudaErrorInvalidHostPointer"; - - case 
cudaErrorInvalidDevicePointer: - return "cudaErrorInvalidDevicePointer"; - - case cudaErrorInvalidTexture: - return "cudaErrorInvalidTexture"; - - case cudaErrorInvalidTextureBinding: - return "cudaErrorInvalidTextureBinding"; - - case cudaErrorInvalidChannelDescriptor: - return "cudaErrorInvalidChannelDescriptor"; - - case cudaErrorInvalidMemcpyDirection: - return "cudaErrorInvalidMemcpyDirection"; - - case cudaErrorAddressOfConstant: - return "cudaErrorAddressOfConstant"; - - case cudaErrorTextureFetchFailed: - return "cudaErrorTextureFetchFailed"; - - case cudaErrorTextureNotBound: - return "cudaErrorTextureNotBound"; - - case cudaErrorSynchronizationError: - return "cudaErrorSynchronizationError"; - - case cudaErrorInvalidFilterSetting: - return "cudaErrorInvalidFilterSetting"; - - case cudaErrorInvalidNormSetting: - return "cudaErrorInvalidNormSetting"; - - case cudaErrorMixedDeviceExecution: - return "cudaErrorMixedDeviceExecution"; - - case cudaErrorCudartUnloading: - return "cudaErrorCudartUnloading"; - - case cudaErrorUnknown: - return "cudaErrorUnknown"; - - case cudaErrorNotYetImplemented: - return "cudaErrorNotYetImplemented"; - - case cudaErrorMemoryValueTooLarge: - return "cudaErrorMemoryValueTooLarge"; - - case cudaErrorInvalidResourceHandle: - return "cudaErrorInvalidResourceHandle"; - - case cudaErrorNotReady: - return "cudaErrorNotReady"; - - case cudaErrorInsufficientDriver: - return "cudaErrorInsufficientDriver"; - - case cudaErrorSetOnActiveProcess: - return "cudaErrorSetOnActiveProcess"; - - case cudaErrorInvalidSurface: - return "cudaErrorInvalidSurface"; - - case cudaErrorNoDevice: - return "cudaErrorNoDevice"; - - case cudaErrorECCUncorrectable: - return "cudaErrorECCUncorrectable"; - - case cudaErrorSharedObjectSymbolNotFound: - return "cudaErrorSharedObjectSymbolNotFound"; - - case cudaErrorSharedObjectInitFailed: - return "cudaErrorSharedObjectInitFailed"; - - case cudaErrorUnsupportedLimit: - return "cudaErrorUnsupportedLimit"; - - case 
cudaErrorDuplicateVariableName: - return "cudaErrorDuplicateVariableName"; - - case cudaErrorDuplicateTextureName: - return "cudaErrorDuplicateTextureName"; - - case cudaErrorDuplicateSurfaceName: - return "cudaErrorDuplicateSurfaceName"; - - case cudaErrorDevicesUnavailable: - return "cudaErrorDevicesUnavailable"; - - case cudaErrorInvalidKernelImage: - return "cudaErrorInvalidKernelImage"; - - case cudaErrorNoKernelImageForDevice: - return "cudaErrorNoKernelImageForDevice"; - - case cudaErrorIncompatibleDriverContext: - return "cudaErrorIncompatibleDriverContext"; - - case cudaErrorPeerAccessAlreadyEnabled: - return "cudaErrorPeerAccessAlreadyEnabled"; - - case cudaErrorPeerAccessNotEnabled: - return "cudaErrorPeerAccessNotEnabled"; - - case cudaErrorDeviceAlreadyInUse: - return "cudaErrorDeviceAlreadyInUse"; - - case cudaErrorProfilerDisabled: - return "cudaErrorProfilerDisabled"; - - case cudaErrorProfilerNotInitialized: - return "cudaErrorProfilerNotInitialized"; - - case cudaErrorProfilerAlreadyStarted: - return "cudaErrorProfilerAlreadyStarted"; - - case cudaErrorProfilerAlreadyStopped: - return "cudaErrorProfilerAlreadyStopped"; - - /* Since CUDA 4.0*/ - case cudaErrorAssert: - return "cudaErrorAssert"; - - case cudaErrorTooManyPeers: - return "cudaErrorTooManyPeers"; - - case cudaErrorHostMemoryAlreadyRegistered: - return "cudaErrorHostMemoryAlreadyRegistered"; - - case cudaErrorHostMemoryNotRegistered: - return "cudaErrorHostMemoryNotRegistered"; - - /* Since CUDA 5.0 */ - case cudaErrorOperatingSystem: - return "cudaErrorOperatingSystem"; - - case cudaErrorPeerAccessUnsupported: - return "cudaErrorPeerAccessUnsupported"; - - case cudaErrorLaunchMaxDepthExceeded: - return "cudaErrorLaunchMaxDepthExceeded"; - - case cudaErrorLaunchFileScopedTex: - return "cudaErrorLaunchFileScopedTex"; - - case cudaErrorLaunchFileScopedSurf: - return "cudaErrorLaunchFileScopedSurf"; - - case cudaErrorSyncDepthExceeded: - return "cudaErrorSyncDepthExceeded"; - - case 
cudaErrorLaunchPendingCountExceeded: - return "cudaErrorLaunchPendingCountExceeded"; - - case cudaErrorNotPermitted: - return "cudaErrorNotPermitted"; - - case cudaErrorNotSupported: - return "cudaErrorNotSupported"; - - /* Since CUDA 6.0 */ - case cudaErrorHardwareStackError: - return "cudaErrorHardwareStackError"; - - case cudaErrorIllegalInstruction: - return "cudaErrorIllegalInstruction"; - - case cudaErrorMisalignedAddress: - return "cudaErrorMisalignedAddress"; - - case cudaErrorInvalidAddressSpace: - return "cudaErrorInvalidAddressSpace"; - - case cudaErrorInvalidPc: - return "cudaErrorInvalidPc"; - - case cudaErrorIllegalAddress: - return "cudaErrorIllegalAddress"; - - /* Since CUDA 6.5*/ - case cudaErrorInvalidPtx: - return "cudaErrorInvalidPtx"; - - case cudaErrorInvalidGraphicsContext: - return "cudaErrorInvalidGraphicsContext"; - - case cudaErrorStartupFailure: - return "cudaErrorStartupFailure"; - - case cudaErrorApiFailureBase: - return "cudaErrorApiFailureBase"; - } - - return ""; -} -#endif - -#ifdef __cuda_cuda_h__ -// CUDA Driver API errors -static const char *_cudaGetErrorEnum(CUresult error) { - switch (error) { - case CUDA_SUCCESS: - return "CUDA_SUCCESS"; - - case CUDA_ERROR_INVALID_VALUE: - return "CUDA_ERROR_INVALID_VALUE"; - - case CUDA_ERROR_OUT_OF_MEMORY: - return "CUDA_ERROR_OUT_OF_MEMORY"; - - case CUDA_ERROR_NOT_INITIALIZED: - return "CUDA_ERROR_NOT_INITIALIZED"; - - case CUDA_ERROR_DEINITIALIZED: - return "CUDA_ERROR_DEINITIALIZED"; - - case CUDA_ERROR_PROFILER_DISABLED: - return "CUDA_ERROR_PROFILER_DISABLED"; - - case CUDA_ERROR_PROFILER_NOT_INITIALIZED: - return "CUDA_ERROR_PROFILER_NOT_INITIALIZED"; - - case CUDA_ERROR_PROFILER_ALREADY_STARTED: - return "CUDA_ERROR_PROFILER_ALREADY_STARTED"; - - case CUDA_ERROR_PROFILER_ALREADY_STOPPED: - return "CUDA_ERROR_PROFILER_ALREADY_STOPPED"; - - case CUDA_ERROR_NO_DEVICE: - return "CUDA_ERROR_NO_DEVICE"; - - case CUDA_ERROR_INVALID_DEVICE: - return "CUDA_ERROR_INVALID_DEVICE"; - - case 
CUDA_ERROR_INVALID_IMAGE: - return "CUDA_ERROR_INVALID_IMAGE"; - - case CUDA_ERROR_INVALID_CONTEXT: - return "CUDA_ERROR_INVALID_CONTEXT"; - - case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: - return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; - - case CUDA_ERROR_MAP_FAILED: - return "CUDA_ERROR_MAP_FAILED"; - - case CUDA_ERROR_UNMAP_FAILED: - return "CUDA_ERROR_UNMAP_FAILED"; - - case CUDA_ERROR_ARRAY_IS_MAPPED: - return "CUDA_ERROR_ARRAY_IS_MAPPED"; - - case CUDA_ERROR_ALREADY_MAPPED: - return "CUDA_ERROR_ALREADY_MAPPED"; - - case CUDA_ERROR_NO_BINARY_FOR_GPU: - return "CUDA_ERROR_NO_BINARY_FOR_GPU"; - - case CUDA_ERROR_ALREADY_ACQUIRED: - return "CUDA_ERROR_ALREADY_ACQUIRED"; - - case CUDA_ERROR_NOT_MAPPED: - return "CUDA_ERROR_NOT_MAPPED"; - - case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: - return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; - - case CUDA_ERROR_NOT_MAPPED_AS_POINTER: - return "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; - - case CUDA_ERROR_ECC_UNCORRECTABLE: - return "CUDA_ERROR_ECC_UNCORRECTABLE"; - - case CUDA_ERROR_UNSUPPORTED_LIMIT: - return "CUDA_ERROR_UNSUPPORTED_LIMIT"; - - case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: - return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"; - - case CUDA_ERROR_INVALID_SOURCE: - return "CUDA_ERROR_INVALID_SOURCE"; - - case CUDA_ERROR_FILE_NOT_FOUND: - return "CUDA_ERROR_FILE_NOT_FOUND"; - - case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: - return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; - - case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: - return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; - - case CUDA_ERROR_OPERATING_SYSTEM: - return "CUDA_ERROR_OPERATING_SYSTEM"; - - case CUDA_ERROR_INVALID_HANDLE: - return "CUDA_ERROR_INVALID_HANDLE"; - - case CUDA_ERROR_NOT_FOUND: - return "CUDA_ERROR_NOT_FOUND"; - - case CUDA_ERROR_NOT_READY: - return "CUDA_ERROR_NOT_READY"; - - case CUDA_ERROR_LAUNCH_FAILED: - return "CUDA_ERROR_LAUNCH_FAILED"; - - case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: - return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; - - case CUDA_ERROR_LAUNCH_TIMEOUT: - 
return "CUDA_ERROR_LAUNCH_TIMEOUT"; - - case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: - return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; - - case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: - return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; - - case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: - return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; - - case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: - return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; - - case CUDA_ERROR_CONTEXT_IS_DESTROYED: - return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; - - case CUDA_ERROR_ASSERT: - return "CUDA_ERROR_ASSERT"; - - case CUDA_ERROR_TOO_MANY_PEERS: - return "CUDA_ERROR_TOO_MANY_PEERS"; - - case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: - return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; - - case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: - return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; - - case CUDA_ERROR_UNKNOWN: - return "CUDA_ERROR_UNKNOWN"; - } - - return ""; -} -#endif - -#ifdef CUBLAS_API_H_ -// cuBLAS API errors -static const char *_cudaGetErrorEnum(cublasStatus_t error) { - switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; - } - - return ""; -} -#endif - -#ifdef _CUFFT_H_ -// cuFFT API errors -static const char *_cudaGetErrorEnum(cufftResult error) { - switch (error) { - case CUFFT_SUCCESS: - return "CUFFT_SUCCESS"; - - case CUFFT_INVALID_PLAN: - return "CUFFT_INVALID_PLAN"; - - case CUFFT_ALLOC_FAILED: - return "CUFFT_ALLOC_FAILED"; - 
- case CUFFT_INVALID_TYPE: - return "CUFFT_INVALID_TYPE"; - - case CUFFT_INVALID_VALUE: - return "CUFFT_INVALID_VALUE"; - - case CUFFT_INTERNAL_ERROR: - return "CUFFT_INTERNAL_ERROR"; - - case CUFFT_EXEC_FAILED: - return "CUFFT_EXEC_FAILED"; - - case CUFFT_SETUP_FAILED: - return "CUFFT_SETUP_FAILED"; - - case CUFFT_INVALID_SIZE: - return "CUFFT_INVALID_SIZE"; - - case CUFFT_UNALIGNED_DATA: - return "CUFFT_UNALIGNED_DATA"; - - case CUFFT_INCOMPLETE_PARAMETER_LIST: - return "CUFFT_INCOMPLETE_PARAMETER_LIST"; - - case CUFFT_INVALID_DEVICE: - return "CUFFT_INVALID_DEVICE"; - - case CUFFT_PARSE_ERROR: - return "CUFFT_PARSE_ERROR"; - - case CUFFT_NO_WORKSPACE: - return "CUFFT_NO_WORKSPACE"; - - case CUFFT_NOT_IMPLEMENTED: - return "CUFFT_NOT_IMPLEMENTED"; - - case CUFFT_LICENSE_ERROR: - return "CUFFT_LICENSE_ERROR"; - } - - return ""; -} -#endif - -#ifdef CUSPARSEAPI -// cuSPARSE API errors -static const char *_cudaGetErrorEnum(cusparseStatus_t error) { - switch (error) { - case CUSPARSE_STATUS_SUCCESS: - return "CUSPARSE_STATUS_SUCCESS"; - - case CUSPARSE_STATUS_NOT_INITIALIZED: - return "CUSPARSE_STATUS_NOT_INITIALIZED"; - - case CUSPARSE_STATUS_ALLOC_FAILED: - return "CUSPARSE_STATUS_ALLOC_FAILED"; - - case CUSPARSE_STATUS_INVALID_VALUE: - return "CUSPARSE_STATUS_INVALID_VALUE"; - - case CUSPARSE_STATUS_ARCH_MISMATCH: - return "CUSPARSE_STATUS_ARCH_MISMATCH"; - - case CUSPARSE_STATUS_MAPPING_ERROR: - return "CUSPARSE_STATUS_MAPPING_ERROR"; - - case CUSPARSE_STATUS_EXECUTION_FAILED: - return "CUSPARSE_STATUS_EXECUTION_FAILED"; - - case CUSPARSE_STATUS_INTERNAL_ERROR: - return "CUSPARSE_STATUS_INTERNAL_ERROR"; - - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - } - - return ""; -} -#endif - -#ifdef CURAND_H_ -// cuRAND API errors -static const char *_cudaGetErrorEnum(curandStatus_t error) { - switch (error) { - case CURAND_STATUS_SUCCESS: - return "CURAND_STATUS_SUCCESS"; - - case CURAND_STATUS_VERSION_MISMATCH: - 
return "CURAND_STATUS_VERSION_MISMATCH"; - - case CURAND_STATUS_NOT_INITIALIZED: - return "CURAND_STATUS_NOT_INITIALIZED"; - - case CURAND_STATUS_ALLOCATION_FAILED: - return "CURAND_STATUS_ALLOCATION_FAILED"; - - case CURAND_STATUS_TYPE_ERROR: - return "CURAND_STATUS_TYPE_ERROR"; - - case CURAND_STATUS_OUT_OF_RANGE: - return "CURAND_STATUS_OUT_OF_RANGE"; - - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - - case CURAND_STATUS_LAUNCH_FAILURE: - return "CURAND_STATUS_LAUNCH_FAILURE"; - - case CURAND_STATUS_PREEXISTING_FAILURE: - return "CURAND_STATUS_PREEXISTING_FAILURE"; - - case CURAND_STATUS_INITIALIZATION_FAILED: - return "CURAND_STATUS_INITIALIZATION_FAILED"; - - case CURAND_STATUS_ARCH_MISMATCH: - return "CURAND_STATUS_ARCH_MISMATCH"; - - case CURAND_STATUS_INTERNAL_ERROR: - return "CURAND_STATUS_INTERNAL_ERROR"; - } - - return ""; -} -#endif - -#ifdef NV_NPPIDEFS_H -// NPP API errors -static const char *_cudaGetErrorEnum(NppStatus error) { - switch (error) { - case NPP_NOT_SUPPORTED_MODE_ERROR: - return "NPP_NOT_SUPPORTED_MODE_ERROR"; - - case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: - return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; - - case NPP_RESIZE_NO_OPERATION_ERROR: - return "NPP_RESIZE_NO_OPERATION_ERROR"; - - case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: - return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; - -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 - - case NPP_BAD_ARG_ERROR: - return "NPP_BAD_ARGUMENT_ERROR"; - - case NPP_COEFF_ERROR: - return "NPP_COEFFICIENT_ERROR"; - - case NPP_RECT_ERROR: - return "NPP_RECTANGLE_ERROR"; - - case NPP_QUAD_ERROR: - return "NPP_QUADRANGLE_ERROR"; - - case NPP_MEM_ALLOC_ERR: - return "NPP_MEMORY_ALLOCATION_ERROR"; - - case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: - return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; - - case NPP_INVALID_INPUT: - return "NPP_INVALID_INPUT"; - 
- case NPP_POINTER_ERROR: - return "NPP_POINTER_ERROR"; - - case NPP_WARNING: - return "NPP_WARNING"; - - case NPP_ODD_ROI_WARNING: - return "NPP_ODD_ROI_WARNING"; -#else - - // These are for CUDA 5.5 or higher - case NPP_BAD_ARGUMENT_ERROR: - return "NPP_BAD_ARGUMENT_ERROR"; - - case NPP_COEFFICIENT_ERROR: - return "NPP_COEFFICIENT_ERROR"; - - case NPP_RECTANGLE_ERROR: - return "NPP_RECTANGLE_ERROR"; - - case NPP_QUADRANGLE_ERROR: - return "NPP_QUADRANGLE_ERROR"; - - case NPP_MEMORY_ALLOCATION_ERR: - return "NPP_MEMORY_ALLOCATION_ERROR"; - - case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: - return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; - - case NPP_INVALID_HOST_POINTER_ERROR: - return "NPP_INVALID_HOST_POINTER_ERROR"; - - case NPP_INVALID_DEVICE_POINTER_ERROR: - return "NPP_INVALID_DEVICE_POINTER_ERROR"; -#endif - - case NPP_LUT_NUMBER_OF_LEVELS_ERROR: - return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; - - case NPP_TEXTURE_BIND_ERROR: - return "NPP_TEXTURE_BIND_ERROR"; - - case NPP_WRONG_INTERSECTION_ROI_ERROR: - return "NPP_WRONG_INTERSECTION_ROI_ERROR"; - - case NPP_NOT_EVEN_STEP_ERROR: - return "NPP_NOT_EVEN_STEP_ERROR"; - - case NPP_INTERPOLATION_ERROR: - return "NPP_INTERPOLATION_ERROR"; - - case NPP_RESIZE_FACTOR_ERROR: - return "NPP_RESIZE_FACTOR_ERROR"; - - case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: - return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; - -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 - - case NPP_MEMFREE_ERR: - return "NPP_MEMFREE_ERR"; - - case NPP_MEMSET_ERR: - return "NPP_MEMSET_ERR"; - - case NPP_MEMCPY_ERR: - return "NPP_MEMCPY_ERROR"; - - case NPP_MIRROR_FLIP_ERR: - return "NPP_MIRROR_FLIP_ERR"; -#else - - case NPP_MEMFREE_ERROR: - return "NPP_MEMFREE_ERROR"; - - case NPP_MEMSET_ERROR: - return "NPP_MEMSET_ERROR"; - - case NPP_MEMCPY_ERROR: - return "NPP_MEMCPY_ERROR"; - - case NPP_MIRROR_FLIP_ERROR: - return "NPP_MIRROR_FLIP_ERROR"; -#endif - - case NPP_ALIGNMENT_ERROR: - return "NPP_ALIGNMENT_ERROR"; - - case 
NPP_STEP_ERROR: - return "NPP_STEP_ERROR"; - - case NPP_SIZE_ERROR: - return "NPP_SIZE_ERROR"; - - case NPP_NULL_POINTER_ERROR: - return "NPP_NULL_POINTER_ERROR"; - - case NPP_CUDA_KERNEL_EXECUTION_ERROR: - return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; - - case NPP_NOT_IMPLEMENTED_ERROR: - return "NPP_NOT_IMPLEMENTED_ERROR"; - - case NPP_ERROR: - return "NPP_ERROR"; - - case NPP_SUCCESS: - return "NPP_SUCCESS"; - - case NPP_WRONG_INTERSECTION_QUAD_WARNING: - return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; - - case NPP_MISALIGNED_DST_ROI_WARNING: - return "NPP_MISALIGNED_DST_ROI_WARNING"; - - case NPP_AFFINE_QUAD_INCORRECT_WARNING: - return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; - - case NPP_DOUBLE_SIZE_WARNING: - return "NPP_DOUBLE_SIZE_WARNING"; - - case NPP_WRONG_INTERSECTION_ROI_WARNING: - return "NPP_WRONG_INTERSECTION_ROI_WARNING"; - -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 - /* These are 6.0 or higher */ - case NPP_LUT_PALETTE_BITSIZE_ERROR: - return "NPP_LUT_PALETTE_BITSIZE_ERROR"; - - case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: - return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; - - case NPP_QUALITY_INDEX_ERROR: - return "NPP_QUALITY_INDEX_ERROR"; - - case NPP_CHANNEL_ORDER_ERROR: - return "NPP_CHANNEL_ORDER_ERROR"; - - case NPP_ZERO_MASK_VALUE_ERROR: - return "NPP_ZERO_MASK_VALUE_ERROR"; - - case NPP_NUMBER_OF_CHANNELS_ERROR: - return "NPP_NUMBER_OF_CHANNELS_ERROR"; - - case NPP_COI_ERROR: - return "NPP_COI_ERROR"; - - case NPP_DIVISOR_ERROR: - return "NPP_DIVISOR_ERROR"; - - case NPP_CHANNEL_ERROR: - return "NPP_CHANNEL_ERROR"; - - case NPP_STRIDE_ERROR: - return "NPP_STRIDE_ERROR"; - - case NPP_ANCHOR_ERROR: - return "NPP_ANCHOR_ERROR"; - - case NPP_MASK_SIZE_ERROR: - return "NPP_MASK_SIZE_ERROR"; - - case NPP_MOMENT_00_ZERO_ERROR: - return "NPP_MOMENT_00_ZERO_ERROR"; - - case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: - return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; - - case NPP_THRESHOLD_ERROR: - return "NPP_THRESHOLD_ERROR"; - - case 
NPP_CONTEXT_MATCH_ERROR: - return "NPP_CONTEXT_MATCH_ERROR"; - - case NPP_FFT_FLAG_ERROR: - return "NPP_FFT_FLAG_ERROR"; - - case NPP_FFT_ORDER_ERROR: - return "NPP_FFT_ORDER_ERROR"; - - case NPP_SCALE_RANGE_ERROR: - return "NPP_SCALE_RANGE_ERROR"; - - case NPP_DATA_TYPE_ERROR: - return "NPP_DATA_TYPE_ERROR"; - - case NPP_OUT_OFF_RANGE_ERROR: - return "NPP_OUT_OFF_RANGE_ERROR"; - - case NPP_DIVIDE_BY_ZERO_ERROR: - return "NPP_DIVIDE_BY_ZERO_ERROR"; - - case NPP_RANGE_ERROR: - return "NPP_RANGE_ERROR"; - - case NPP_NO_MEMORY_ERROR: - return "NPP_NO_MEMORY_ERROR"; - - case NPP_ERROR_RESERVED: - return "NPP_ERROR_RESERVED"; - - case NPP_NO_OPERATION_WARNING: - return "NPP_NO_OPERATION_WARNING"; - - case NPP_DIVIDE_BY_ZERO_WARNING: - return "NPP_DIVIDE_BY_ZERO_WARNING"; -#endif - } - - return ""; -} -#endif - -#ifdef __DRIVER_TYPES_H__ -#ifndef DEVICE_RESET -#define DEVICE_RESET cudaDeviceReset(); -#endif -#else -#ifndef DEVICE_RESET -#define DEVICE_RESET -#endif -#endif - -template -void check(T result, char const *const func, const char *const file, - int const line) { - if (result) { - fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, - static_cast(result), _cudaGetErrorEnum(result), func); - DEVICE_RESET - // Make sure we call CUDA Device Reset before exiting - exit(EXIT_FAILURE); - } -} - -#ifdef __DRIVER_TYPES_H__ -// This will output the proper CUDA error strings in the event that a CUDA host -// call returns an error -#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) - -// This will output the proper error string when calling cudaGetLastError -#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) - -inline void __getLastCudaError(const char *errorMessage, const char *file, - const int line) { - cudaError_t err = cudaGetLastError(); - - if (cudaSuccess != err) { - fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", - file, line, errorMessage, (int)err, cudaGetErrorString(err)); 
- DEVICE_RESET - exit(EXIT_FAILURE); - } -} -#endif - -#ifndef MAX -#define MAX(a, b) (a > b ? a : b) -#endif - -// Float To Int conversion -inline int ftoi(float value) { - return (value >= 0 ? (int)(value + 0.5) : (int)(value - 0.5)); -} - -// Beginning of GPU Architecture definitions -inline int _ConvertSMVer2Cores(int major, int minor) { - // Defines for GPU Architecture types (using the SM version to determine the # - // of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM - // minor version - int Cores; - } sSMtoCores; - - sSMtoCores nGpuArchCoresPerSM[] = { - {0x20, 32}, // Fermi Generation (SM 2.0) GF100 class - {0x21, 48}, // Fermi Generation (SM 2.1) GF10x class - {0x30, 192}, // Kepler Generation (SM 3.0) GK10x class - {0x32, 192}, // Kepler Generation (SM 3.2) GK10x class - {0x35, 192}, // Kepler Generation (SM 3.5) GK11x class - {0x37, 192}, // Kepler Generation (SM 3.7) GK21x class - {0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class - {0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class - {-1, -1}}; - - int index = 0; - - while (nGpuArchCoresPerSM[index].SM != -1) { - if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { - return nGpuArchCoresPerSM[index].Cores; - } - - index++; - } - - // If we don't find the values, we default use the previous one to run - // properly - printf( - "MapSMtoCores for SM %d.%d is undefined. 
Default to use %d Cores/SM\n", - major, minor, nGpuArchCoresPerSM[index - 1].Cores); - return nGpuArchCoresPerSM[index - 1].Cores; -} -// end of GPU Architecture definitions - -#ifdef __CUDA_RUNTIME_H__ -// General GPU Device CUDA Initialization -inline int gpuDeviceInit(int devID) { - int device_count; - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) { - fprintf(stderr, - "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - if (devID < 0) { - devID = 0; - } - - if (devID > device_count - 1) { - fprintf(stderr, "\n"); - fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", - device_count); - fprintf(stderr, - ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", - devID); - fprintf(stderr, "\n"); - return -devID; - } - - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - - if (deviceProp.computeMode == cudaComputeModeProhibited) { - fprintf(stderr, "Error: device is running in , no " - "threads can use ::cudaSetDevice().\n"); - return -1; - } - - if (deviceProp.major < 1) { - fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); - exit(EXIT_FAILURE); - } - - checkCudaErrors(cudaSetDevice(devID)); - printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name); - - return devID; -} - -// This function returns the best GPU (with maximum GFLOPS) -inline int gpuGetMaxGflopsDeviceId() { - int current_device = 0, sm_per_multiproc = 0; - int max_perf_device = 0; - int device_count = 0, best_SM_arch = 0; - int devices_prohibited = 0; - - unsigned long long max_compute_perf = 0; - cudaDeviceProp deviceProp; - cudaGetDeviceCount(&device_count); - - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) { - fprintf( - stderr, - "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - // Find the best major SM Architecture GPU device - while 
(current_device < device_count) { - cudaGetDeviceProperties(&deviceProp, current_device); - - // If this GPU is not running on Compute Mode prohibited, then we can add it - // to the list - if (deviceProp.computeMode != cudaComputeModeProhibited) { - if (deviceProp.major > 0 && deviceProp.major < 9999) { - best_SM_arch = MAX(best_SM_arch, deviceProp.major); - } - } else { - devices_prohibited++; - } - - current_device++; - } - - if (devices_prohibited == device_count) { - fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have " - "compute mode prohibited.\n"); - exit(EXIT_FAILURE); - } - - // Find the best CUDA capable GPU device - current_device = 0; - - while (current_device < device_count) { - cudaGetDeviceProperties(&deviceProp, current_device); - - // If this GPU is not running on Compute Mode prohibited, then we can add it - // to the list - if (deviceProp.computeMode != cudaComputeModeProhibited) { - if (deviceProp.major == 9999 && deviceProp.minor == 9999) { - sm_per_multiproc = 1; - } else { - sm_per_multiproc = - _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); - } - - unsigned long long compute_perf = - (unsigned long long)deviceProp.multiProcessorCount * - sm_per_multiproc * deviceProp.clockRate; - - if (compute_perf > max_compute_perf) { - // If we find GPU with SM major > 2, search only these - if (best_SM_arch > 2) { - // If our device==dest_SM_arch, choose this, or else pass - if (deviceProp.major == best_SM_arch) { - max_compute_perf = compute_perf; - max_perf_device = current_device; - } - } else { - max_compute_perf = compute_perf; - max_perf_device = current_device; - } - } - } - - ++current_device; - } - - return max_perf_device; -} - -// Initialization code to find the best CUDA Device -inline int findCudaDevice(int argc, const char **argv) { - return 0; - /* - cudaDeviceProp deviceProp; - int devID = 0; - - // If the command-line has a device number specified, use it - if (checkCmdLineFlag(argc, argv, "device")) - { - 
devID = getCmdLineArgumentInt(argc, argv, "device="); - - if (devID < 0) - { - printf("Invalid command line parameter\n "); - exit(EXIT_FAILURE); - } - else - { - devID = gpuDeviceInit(devID); - - if (devID < 0) - { - printf("exiting...\n"); - exit(EXIT_FAILURE); - } - } - } - else - { - // Otherwise pick the device with highest Gflops/s - devID = gpuGetMaxGflopsDeviceId(); - checkCudaErrors(cudaSetDevice(devID)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", - devID, deviceProp.name, deviceProp.major, deviceProp.minor); - } - - return devID; */ -} - -// General check for CUDA GPU SM Capabilities -inline bool checkCudaCapabilities(int major_version, int minor_version) { - cudaDeviceProp deviceProp; - deviceProp.major = 0; - deviceProp.minor = 0; - int dev; - - checkCudaErrors(cudaGetDevice(&dev)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); - - if ((deviceProp.major > major_version) || - (deviceProp.major == major_version && - deviceProp.minor >= minor_version)) { - printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, - deviceProp.name, deviceProp.major, deviceProp.minor); - return true; - } else { - printf(" No GPU device was found that can support CUDA compute capability " - "%d.%d.\n", - major_version, minor_version); - return false; - } -} -#endif - -// end of CUDA Helper Functions - -#endif diff --git a/include/helper_string.h b/include/helper_string.h deleted file mode 100644 index 562e06755..000000000 --- a/include/helper_string.h +++ /dev/null @@ -1,664 +0,0 @@ -/** - * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. - * - * Please refer to the NVIDIA end user license agreement (EULA) associated - * with this source code for terms and conditions that govern your use of - * this software. 
Any use, reproduction, disclosure, or distribution of - * this software and related documentation outside the terms of the EULA - * is strictly prohibited. - * - */ - -// These are helper functions for the SDK samples (string parsing, timers, etc) -#ifndef STRING_HELPER_H -#define STRING_HELPER_H - -#include -#include -#include -#include - -#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) -#ifndef _CRT_SECURE_NO_DEPRECATE -#define _CRT_SECURE_NO_DEPRECATE -#endif -#ifndef STRCASECMP -#define STRCASECMP _stricmp -#endif -#ifndef STRNCASECMP -#define STRNCASECMP _strnicmp -#endif -#ifndef STRCPY -#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) -#endif - -#ifndef FOPEN -#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) -#endif -#ifndef FOPEN_FAIL -#define FOPEN_FAIL(result) (result != 0) -#endif -#ifndef SSCANF -#define SSCANF sscanf_s -#endif -#ifndef SPRINTF -#define SPRINTF sprintf_s -#endif -#else // Linux Includes -#include -#include - -#ifndef STRCASECMP -#define STRCASECMP strcasecmp -#endif -#ifndef STRNCASECMP -#define STRNCASECMP strncasecmp -#endif -#ifndef STRCPY -#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) -#endif - -#ifndef FOPEN -#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) -#endif -#ifndef FOPEN_FAIL -#define FOPEN_FAIL(result) (result == NULL) -#endif -#ifndef SSCANF -#define SSCANF sscanf -#endif -#ifndef SPRINTF -#define SPRINTF sprintf -#endif -#endif - -#ifndef EXIT_WAIVED -#define EXIT_WAIVED 2 -#endif - -// CUDA Utility Helper Functions -inline int stringRemoveDelimiter(char delimiter, const char *string) { - int string_start = 0; - - while (string[string_start] == delimiter) { - string_start++; - } - - if (string_start >= (int)strlen(string) - 1) { - return 0; - } - - return string_start; -} - -inline int getFileExtension(char *filename, char **extension) { - int string_length = (int)strlen(filename); - - while 
(filename[string_length--] != '.') { - if (string_length == 0) - break; - } - - if (string_length > 0) - string_length += 2; - - if (string_length == 0) - *extension = NULL; - else - *extension = &filename[string_length]; - - return string_length; -} - -inline bool checkCmdLineFlag(const int argc, const char **argv, - const char *string_ref) { - bool bFound = false; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - - const char *equal_pos = strchr(string_argv, '='); - int argv_length = - (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); - - int length = (int)strlen(string_ref); - - if (length == argv_length && - !STRNCASECMP(string_argv, string_ref, length)) { - bFound = true; - continue; - } - } - } - - return bFound; -} - -// This function wraps the CUDA Driver API into a template function -template -inline bool getCmdLineArgumentValue(const int argc, const char **argv, - const char *string_ref, T *value) { - bool bFound = false; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = (int)strlen(string_ref); - - if (!STRNCASECMP(string_argv, string_ref, length)) { - if (length + 1 <= (int)strlen(string_argv)) { - int auto_inc = (string_argv[length] == '=') ? 
1 : 0; - *value = (T)atoi(&string_argv[length + auto_inc]); - } - - bFound = true; - i = argc; - } - } - } - - return bFound; -} - -inline int getCmdLineArgumentInt(const int argc, const char **argv, - const char *string_ref) { - bool bFound = false; - int value = -1; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = (int)strlen(string_ref); - - if (!STRNCASECMP(string_argv, string_ref, length)) { - if (length + 1 <= (int)strlen(string_argv)) { - int auto_inc = (string_argv[length] == '=') ? 1 : 0; - value = atoi(&string_argv[length + auto_inc]); - } else { - value = 0; - } - - bFound = true; - continue; - } - } - } - - if (bFound) { - return value; - } else { - return 0; - } -} - -inline float getCmdLineArgumentFloat(const int argc, const char **argv, - const char *string_ref) { - bool bFound = false; - float value = -1; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = (int)strlen(string_ref); - - if (!STRNCASECMP(string_argv, string_ref, length)) { - if (length + 1 <= (int)strlen(string_argv)) { - int auto_inc = (string_argv[length] == '=') ? 
1 : 0; - value = (float)atof(&string_argv[length + auto_inc]); - } else { - value = 0.f; - } - - bFound = true; - continue; - } - } - } - - if (bFound) { - return value; - } else { - return 0; - } -} - -inline bool getCmdLineArgumentString(const int argc, const char **argv, - const char *string_ref, - char **string_retval) { - bool bFound = false; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - char *string_argv = (char *)&argv[i][string_start]; - int length = (int)strlen(string_ref); - - if (!STRNCASECMP(string_argv, string_ref, length)) { - *string_retval = &string_argv[length + 1]; - bFound = true; - continue; - } - } - } - - if (!bFound) { - *string_retval = NULL; - } - - return bFound; -} - -////////////////////////////////////////////////////////////////////////////// -//! Find the path for a file assuming that -//! files are found in the searchPath. -//! -//! @return the path if succeeded, otherwise 0 -//! @param filename name of the file -//! @param executable_path optional absolute path of the executable -////////////////////////////////////////////////////////////////////////////// -inline char *sdkFindFilePath(const char *filename, - const char *executable_path) { - // defines a variable that is replaced with the name of the - // executable - - // Typical relative search paths to locate needed companion files (e.g. 
sample - // input data, or JIT source files) The origin for the relative search may be - // the .exe file, a .bat file launching an .exe, a browser .exe launching the - // .exe or .bat, etc - const char *searchPath[] = { - "./", // same dir - "./common/", // "/common/" subdir - "./common/data/", // "/common/data/" subdir - "./data/", // "/data/" subdir - "./src/", // "/src/" subdir - "./src//data/", // "/src//data/" subdir - "./inc/", // "/inc/" subdir - "./0_Simple/", // "/0_Simple/" subdir - "./1_Utilities/", // "/1_Utilities/" subdir - "./2_Graphics/", // "/2_Graphics/" subdir - "./3_Imaging/", // "/3_Imaging/" subdir - "./4_Finance/", // "/4_Finance/" subdir - "./5_Simulations/", // "/5_Simulations/" subdir - "./6_Advanced/", // "/6_Advanced/" subdir - "./7_CUDALibraries/", // "/7_CUDALibraries/" subdir - "./8_Android/", // "/8_Android/" subdir - "./samples/", // "/samples/" subdir - - "../", // up 1 in tree - "../common/", // up 1 in tree, "/common/" subdir - "../common/data/", // up 1 in tree, "/common/data/" subdir - "../data/", // up 1 in tree, "/data/" subdir - "../src/", // up 1 in tree, "/src/" subdir - "../inc/", // up 1 in tree, "/inc/" subdir - - "../0_Simple//data/", // up 1 in tree, - // "/0_Simple//" - // subdir - "../1_Utilities//data/", // up 1 in tree, - // "/1_Utilities//" - // subdir - "../2_Graphics//data/", // up 1 in tree, - // "/2_Graphics//" - // subdir - "../3_Imaging//data/", // up 1 in tree, - // "/3_Imaging//" - // subdir - "../4_Finance//data/", // up 1 in tree, - // "/4_Finance//" - // subdir - "../5_Simulations//data/", // up 1 in tree, - // "/5_Simulations//" - // subdir - "../6_Advanced//data/", // up 1 in tree, - // "/6_Advanced//" - // subdir - "../7_CUDALibraries//data/", // up 1 in tree, - // "/7_CUDALibraries//" - // subdir - "../8_Android//data/", // up 1 in tree, - // "/8_Android//" - // subdir - "../samples//data/", // up 1 in tree, - // "/samples//" - // subdir - "../../", // up 2 in tree - "../../common/", // up 2 in 
tree, "/common/" subdir - "../../common/data/", // up 2 in tree, "/common/data/" subdir - "../../data/", // up 2 in tree, "/data/" subdir - "../../src/", // up 2 in tree, "/src/" subdir - "../../inc/", // up 2 in tree, "/inc/" subdir - "../../sandbox//data/", // up 2 in tree, - // "/sandbox//" - // subdir - "../../0_Simple//data/", // up 2 in tree, - // "/0_Simple//" - // subdir - "../../1_Utilities//data/", // up 2 in tree, - // "/1_Utilities//" - // subdir - "../../2_Graphics//data/", // up 2 in tree, - // "/2_Graphics//" - // subdir - "../../3_Imaging//data/", // up 2 in tree, - // "/3_Imaging//" - // subdir - "../../4_Finance//data/", // up 2 in tree, - // "/4_Finance//" - // subdir - "../../5_Simulations//data/", // up 2 in tree, - // "/5_Simulations//" - // subdir - "../../6_Advanced//data/", // up 2 in tree, - // "/6_Advanced//" - // subdir - "../../7_CUDALibraries//data/", // up 2 in tree, - // "/7_CUDALibraries//" - // subdir - "../../8_Android//data/", // up 2 in tree, - // "/8_Android//" - // subdir - "../../samples//data/", // up 2 in tree, - // "/samples//" - // subdir - "../../../", // up 3 in tree - "../../../src//", // up 3 in tree, - // "/src//" subdir - "../../../src//data/", // up 3 in tree, - // "/src//data/" - // subdir - "../../../src//src/", // up 3 in tree, - // "/src//src/" - // subdir - "../../../src//inc/", // up 3 in tree, - // "/src//inc/" - // subdir - "../../../sandbox//", // up 3 in tree, - // "/sandbox//" - // subdir - "../../../sandbox//data/", // up 3 in tree, - // "/sandbox//data/" - // subdir - "../../../sandbox//src/", // up 3 in tree, - // "/sandbox//src/" - // subdir - "../../../sandbox//inc/", // up 3 in tree, - // "/sandbox//inc/" - // subdir - "../../../0_Simple//data/", // up 3 in tree, - // "/0_Simple//" - // subdir - "../../../1_Utilities//data/", // up 3 in tree, - // "/1_Utilities//" - // subdir - "../../../2_Graphics//data/", // up 3 in tree, - // "/2_Graphics//" - // subdir - "../../../3_Imaging//data/", // up 3 in 
tree, - // "/3_Imaging//" - // subdir - "../../../4_Finance//data/", // up 3 in tree, - // "/4_Finance//" - // subdir - "../../../5_Simulations//data/", // up 3 in tree, - // "/5_Simulations//" - // subdir - "../../../6_Advanced//data/", // up 3 in tree, - // "/6_Advanced//" - // subdir - "../../../7_CUDALibraries//data/", // up 3 in tree, - // "/7_CUDALibraries//" - // subdir - "../../../8_Android//data/", // up 3 in tree, - // "/8_Android//" - // subdir - "../../../0_Simple//", // up 3 in tree, - // "/0_Simple//" - // subdir - "../../../1_Utilities//", // up 3 in tree, - // "/1_Utilities//" - // subdir - "../../../2_Graphics//", // up 3 in tree, - // "/2_Graphics//" - // subdir - "../../../3_Imaging//", // up 3 in tree, - // "/3_Imaging//" - // subdir - "../../../4_Finance//", // up 3 in tree, - // "/4_Finance//" - // subdir - "../../../5_Simulations//", // up 3 in tree, - // "/5_Simulations//" - // subdir - "../../../6_Advanced//", // up 3 in tree, - // "/6_Advanced//" - // subdir - "../../../7_CUDALibraries//", // up 3 in tree, - // "/7_CUDALibraries//" - // subdir - "../../../8_Android//", // up 3 in tree, - // "/8_Android//" - // subdir - "../../../samples//data/", // up 3 in tree, - // "/samples//" - // subdir - "../../../common/", // up 3 in tree, "../../../common/" subdir - "../../../common/data/", // up 3 in tree, "../../../common/data/" subdir - "../../../data/", // up 3 in tree, "../../../data/" subdir - "../../../../", // up 4 in tree - "../../../../src//", // up 4 in tree, - // "/src//" subdir - "../../../../src//data/", // up 4 in tree, - // "/src//data/" - // subdir - "../../../../src//src/", // up 4 in tree, - // "/src//src/" - // subdir - "../../../../src//inc/", // up 4 in tree, - // "/src//inc/" - // subdir - "../../../../sandbox//", // up 4 in tree, - // "/sandbox//" - // subdir - "../../../../sandbox//data/", // up 4 in tree, - // "/sandbox//data/" - // subdir - "../../../../sandbox//src/", // up 4 in tree, - // "/sandbox//src/" - // subdir - 
"../../../../sandbox//inc/", // up 4 in tree, - // "/sandbox//inc/" - // subdir - "../../../../0_Simple//data/", // up 4 in tree, - // "/0_Simple//" - // subdir - "../../../../1_Utilities//data/", // up 4 in tree, - // "/1_Utilities//" - // subdir - "../../../../2_Graphics//data/", // up 4 in tree, - // "/2_Graphics//" - // subdir - "../../../../3_Imaging//data/", // up 4 in tree, - // "/3_Imaging//" - // subdir - "../../../../4_Finance//data/", // up 4 in tree, - // "/4_Finance//" - // subdir - "../../../../5_Simulations//data/", // up 4 in tree, - // "/5_Simulations//" - // subdir - "../../../../6_Advanced//data/", // up 4 in tree, - // "/6_Advanced//" - // subdir - "../../../../7_CUDALibraries//data/", // up 4 in tree, - // "/7_CUDALibraries//" - // subdir - "../../../../8_Android//data/", // up 4 in tree, - // "/8_Android//" - // subdir - "../../../../0_Simple//", // up 4 in tree, - // "/0_Simple//" - // subdir - "../../../../1_Utilities//", // up 4 in tree, - // "/1_Utilities//" - // subdir - "../../../../2_Graphics//", // up 4 in tree, - // "/2_Graphics//" - // subdir - "../../../../3_Imaging//", // up 4 in tree, - // "/3_Imaging//" - // subdir - "../../../../4_Finance//", // up 4 in tree, - // "/4_Finance//" - // subdir - "../../../../5_Simulations//", // up 4 in tree, - // "/5_Simulations//" - // subdir - "../../../../6_Advanced//", // up 4 in tree, - // "/6_Advanced//" - // subdir - "../../../../7_CUDALibraries//", // up 4 in tree, - // "/7_CUDALibraries//" - // subdir - "../../../../8_Android//", // up 4 in tree, - // "/8_Android//" - // subdir - "../../../../samples//data/", // up 4 in tree, - // "/samples//" - // subdir - "../../../../common/", // up 4 in tree, "../../../common/" subdir - "../../../../common/data/", // up 4 in tree, "../../../common/data/" - // subdir - "../../../../data/", // up 4 in tree, "../../../data/" subdir - "../../../../../", // up 5 in tree - "../../../../../src//", // up 5 in tree, - // "/src//" - // subdir - 
"../../../../../src//data/", // up 5 in tree, - // "/src//data/" - // subdir - "../../../../../src//src/", // up 5 in tree, - // "/src//src/" - // subdir - "../../../../../src//inc/", // up 5 in tree, - // "/src//inc/" - // subdir - "../../../../../sandbox//", // up 5 in tree, - // "/sandbox//" - // subdir - "../../../../../sandbox//data/", // up 5 in tree, - // "/sandbox//data/" - // subdir - "../../../../../sandbox//src/", // up 5 in tree, - // "/sandbox//src/" - // subdir - "../../../../../sandbox//inc/", // up 5 in tree, - // "/sandbox//inc/" - // subdir - "../../../../../0_Simple//data/", // up 5 in tree, - // "/0_Simple//" - // subdir - "../../../../../1_Utilities//data/", // up 5 in tree, - // "/1_Utilities//" - // subdir - "../../../../../2_Graphics//data/", // up 5 in tree, - // "/2_Graphics//" - // subdir - "../../../../../3_Imaging//data/", // up 5 in tree, - // "/3_Imaging//" - // subdir - "../../../../../4_Finance//data/", // up 5 in tree, - // "/4_Finance//" - // subdir - "../../../../../5_Simulations//data/", // up 5 in tree, - // "/5_Simulations//" - // subdir - "../../../../../6_Advanced//data/", // up 5 in tree, - // "/6_Advanced//" - // subdir - "../../../../../7_CUDALibraries//data/", // up 5 in tree, - // "/7_CUDALibraries//" - // subdir - "../../../../../8_Android//data/", // up 5 in tree, - // "/8_Android//" - // subdir - "../../../../../samples//data/", // up 5 in tree, - // "/samples//" - // subdir - "../../../../../common/", // up 5 in tree, "../../../common/" subdir - "../../../../../common/data/", // up 5 in tree, "../../../common/data/" - // subdir - }; - - // Extract the executable name - std::string executable_name; - - if (executable_path != 0) { - executable_name = std::string(executable_path); - -#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - // Windows path delimiter - size_t delimiter_pos = executable_name.find_last_of('\\'); - executable_name.erase(0, delimiter_pos + 1); - - if 
(executable_name.rfind(".exe") != std::string::npos) { - // we strip .exe, only if the .exe is found - executable_name.resize(executable_name.size() - 4); - } - -#else - // Linux & OSX path delimiter - size_t delimiter_pos = executable_name.find_last_of('/'); - executable_name.erase(0, delimiter_pos + 1); -#endif - } - - // Loop over all search paths and return the first hit - for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) { - std::string path(searchPath[i]); - size_t executable_name_pos = path.find(""); - - // If there is executable_name variable in the searchPath - // replace it with the value - if (executable_name_pos != std::string::npos) { - if (executable_path != 0) { - path.replace(executable_name_pos, strlen(""), - executable_name); - } else { - // Skip this path entry if no executable argument is given - continue; - } - } - -#ifdef _DEBUG - printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); -#endif - - // Test if the file exists - path.append(filename); - FILE *fp; - FOPEN(fp, path.c_str(), "rb"); - - if (fp != NULL) { - fclose(fp); - // File found - // returning an allocated array here for backwards compatibility reasons - char *file_path = (char *)malloc(path.length() + 1); - STRCPY(file_path, path.length() + 1, path.c_str()); - return file_path; - } - - if (fp) { - fclose(fp); - } - } - - // File not found - return 0; -} - -#endif diff --git a/src/addition.cuh b/src/addition.cuh index 6bd698cc8..7c19d131a 100644 --- a/src/addition.cuh +++ b/src/addition.cuh @@ -4,10 +4,9 @@ #ifdef __CDT_PARSER__ #undef __CUDA_RUNTIME_H__ #include -#include #endif -#include "../include/helper_cuda.h" +#include "device.h" #include "linear_algebra.h" #include "utils/kernel_dimensions.cuh" #include @@ -58,7 +57,7 @@ __host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output, auto stream = static_cast(v_stream); addition<<>>(output, input_1, input_2, num_entries); - checkCudaErrors(cudaGetLastError()); + 
check_cuda_error(cudaGetLastError()); } template @@ -77,12 +76,12 @@ __host__ void host_addition_plaintext(void *v_stream, uint32_t gpu_index, auto stream = static_cast(v_stream); - checkCudaErrors(cudaMemcpyAsync(output, lwe_input, - (input_lwe_dimension + 1) * - input_lwe_ciphertext_count * sizeof(T), - cudaMemcpyDeviceToDevice, *stream)); + check_cuda_error(cudaMemcpyAsync(output, lwe_input, + (input_lwe_dimension + 1) * + input_lwe_ciphertext_count * sizeof(T), + cudaMemcpyDeviceToDevice, *stream)); plaintext_addition<<>>( output, lwe_input, plaintext_input, input_lwe_dimension, num_entries); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); } #endif // CUDA_ADD_H diff --git a/src/bit_extraction.cuh b/src/bit_extraction.cuh index b26fa65fa..f68dfab3b 100644 --- a/src/bit_extraction.cuh +++ b/src/bit_extraction.cuh @@ -3,7 +3,6 @@ #include "cooperative_groups.h" -#include "../include/helper_cuda.h" #include "bootstrap.h" #include "bootstrap_low_latency.cuh" #include "device.h" @@ -156,7 +155,7 @@ __host__ void host_extract_bits( copy_and_shift_lwe<<>>( lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_in, 1ll << (ciphertext_n_bits - delta_log - 1)); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) { cuda_keyswitch_lwe_ciphertext_vector( @@ -167,7 +166,7 @@ __host__ void host_extract_bits( copy_small_lwe<<<1, 256, 0, *stream>>>( list_lwe_array_out, lwe_array_out_ks_buffer, lwe_dimension_out + 1, number_of_bits, number_of_bits - bit_idx - 1); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); if (bit_idx == number_of_bits - 1) { break; @@ -177,7 +176,7 @@ __host__ void host_extract_bits( add_to_body<<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer, lwe_dimension_out, 1ll << (ciphertext_n_bits - 2)); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); // Fill lut for the current bit 
(equivalent to trivial encryption as mask is // 0s) The LUT is filled with -alpha in each coefficient where alpha = @@ -185,7 +184,7 @@ __host__ void host_extract_bits( fill_lut_body_for_current_bit <<>>( lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx)); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); host_bootstrap_low_latency( v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_pbs, @@ -199,7 +198,7 @@ __host__ void host_extract_bits( lwe_array_in_shifted_buffer, lwe_array_in_buffer, lwe_array_out_pbs_buffer, 1ll << (delta_log - 1 + bit_idx), 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2)); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); } } diff --git a/src/boolean_gates.cu b/src/boolean_gates.cu index 02b3612cc..d906065f3 100644 --- a/src/boolean_gates.cu +++ b/src/boolean_gates.cu @@ -3,7 +3,6 @@ #include "bootstrap.h" #include "device.h" -#include "helper_cuda.h" #include "keyswitch.h" #include "linear_algebra.h" @@ -48,7 +47,7 @@ extern "C" void cuda_boolean_and_32( cuda_memcpy_async_to_gpu(false_plaintext_array, h_false_plaintext_array, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async( (input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), @@ -76,7 +75,7 @@ extern "C" void cuda_boolean_and_32( (glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *h_pbs_lut_indexes = (uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t)); for (uint index = 0; index < input_lwe_ciphertext_count; index++) { @@ -87,19 +86,19 @@ extern "C" void cuda_boolean_and_32( cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - 
checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async( (glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_bootstrap_amortized_lwe_ciphertext_vector_32( v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes, lwe_buffer_2, bootstrapping_key, input_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count, input_lwe_ciphertext_count, 1, 0, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(lwe_buffer_2, stream, gpu_index); cuda_drop_async(pbs_lut, stream, gpu_index); @@ -151,7 +150,7 @@ extern "C" void cuda_boolean_nand_32( cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async( (input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), @@ -179,7 +178,7 @@ extern "C" void cuda_boolean_nand_32( (glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *h_pbs_lut_indexes = (uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t)); for (uint index = 0; index < input_lwe_ciphertext_count; index++) { @@ -190,19 +189,19 @@ extern "C" void cuda_boolean_nand_32( cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async( (glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), stream, 
gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_bootstrap_amortized_lwe_ciphertext_vector_32( v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes, lwe_buffer_3, bootstrapping_key, input_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count, input_lwe_ciphertext_count, 1, 0, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(lwe_buffer_3, stream, gpu_index); cuda_drop_async(pbs_lut, stream, gpu_index); @@ -254,7 +253,7 @@ extern "C" void cuda_boolean_nor_32( cuda_memcpy_async_to_gpu(false_plaintext_array, h_false_plaintext_array, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async( (input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), @@ -282,7 +281,7 @@ extern "C" void cuda_boolean_nor_32( (glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *h_pbs_lut_indexes = (uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t)); for (uint index = 0; index < input_lwe_ciphertext_count; index++) { @@ -293,19 +292,19 @@ extern "C" void cuda_boolean_nor_32( cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async( (glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_bootstrap_amortized_lwe_ciphertext_vector_32( v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes, lwe_buffer_3, bootstrapping_key, 
input_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count, input_lwe_ciphertext_count, 1, 0, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(lwe_buffer_3, stream, gpu_index); cuda_drop_async(pbs_lut, stream, gpu_index); @@ -349,7 +348,7 @@ extern "C" void cuda_boolean_or_32( cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async( (input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), @@ -377,7 +376,7 @@ extern "C" void cuda_boolean_or_32( (glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *h_pbs_lut_indexes = (uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t)); for (uint index = 0; index < input_lwe_ciphertext_count; index++) { @@ -388,19 +387,19 @@ extern "C" void cuda_boolean_or_32( cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async( (glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_bootstrap_amortized_lwe_ciphertext_vector_32( v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes, lwe_buffer_2, bootstrapping_key, input_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count, input_lwe_ciphertext_count, 1, 0, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); 
cuda_drop_async(lwe_buffer_2, stream, gpu_index); cuda_drop_async(pbs_lut, stream, gpu_index); @@ -444,7 +443,7 @@ extern "C" void cuda_boolean_xor_32( cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async( (input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), @@ -468,7 +467,7 @@ extern "C" void cuda_boolean_xor_32( cuda_memcpy_async_to_gpu(cleartext_array, h_cleartext_array, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async( (input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), @@ -493,7 +492,7 @@ extern "C" void cuda_boolean_xor_32( (glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *h_pbs_lut_indexes = (uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t)); for (uint index = 0; index < input_lwe_ciphertext_count; index++) { @@ -504,19 +503,19 @@ extern "C" void cuda_boolean_xor_32( cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async( (glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_bootstrap_amortized_lwe_ciphertext_vector_32( v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes, lwe_buffer_3, bootstrapping_key, input_lwe_dimension, glwe_dimension, polynomial_size, 
pbs_base_log, pbs_level_count, input_lwe_ciphertext_count, 1, 0, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(lwe_buffer_3, stream, gpu_index); cuda_drop_async(pbs_lut, stream, gpu_index); @@ -560,7 +559,7 @@ extern "C" void cuda_boolean_xnor_32( cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async( (input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), @@ -591,7 +590,7 @@ extern "C" void cuda_boolean_xnor_32( cuda_memcpy_async_to_gpu(cleartext_array, h_cleartext_array, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_buffer_4 = (uint32_t *)cuda_malloc_async( (input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), @@ -616,7 +615,7 @@ extern "C" void cuda_boolean_xnor_32( (glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *h_pbs_lut_indexes = (uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t)); for (uint index = 0; index < input_lwe_ciphertext_count; index++) { @@ -627,19 +626,19 @@ extern "C" void cuda_boolean_xnor_32( cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async( (glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); 
cuda_bootstrap_amortized_lwe_ciphertext_vector_32( v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes, lwe_buffer_4, bootstrapping_key, input_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count, input_lwe_ciphertext_count, 1, 0, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(lwe_buffer_4, stream, gpu_index); cuda_drop_async(pbs_lut, stream, gpu_index); diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh index 0135e35b2..c937f3c32 100644 --- a/src/bootstrap_amortized.cuh +++ b/src/bootstrap_amortized.cuh @@ -1,7 +1,6 @@ #ifdef __CDT_PARSER__ #undef __CUDA_RUNTIME_H__ #include -#include #endif #ifndef CNCRT_AMORTIZED_PBS_H @@ -15,9 +14,7 @@ #include "crypto/torus.cuh" #include "device.h" #include "fft/bnsmfft.cuh" -#include "fft/smfft.cuh" #include "fft/twiddles.cuh" -#include "helper_cuda.h" #include "polynomial/functions.cuh" #include "polynomial/parameters.cuh" #include "polynomial/polynomial.cuh" @@ -327,10 +324,10 @@ __host__ void host_bootstrap_amortized( // device then has to be allocated dynamically. 
// For lower compute capabilities, this call // just does nothing and the amount of shared memory used is 48 KB - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( device_bootstrap_amortized, cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL)); - checkCudaErrors(cudaFuncSetCacheConfig( + check_cuda_error(cudaFuncSetCacheConfig( device_bootstrap_amortized, cudaFuncCachePreferShared)); d_mem = (char *)cuda_malloc_async(0, stream, gpu_index); @@ -341,7 +338,7 @@ __host__ void host_bootstrap_amortized( bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size, base_log, level_count, lwe_idx, 0); } - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(d_mem, stream, gpu_index); } diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh index 33ec3d005..877950b1f 100644 --- a/src/bootstrap_low_latency.cuh +++ b/src/bootstrap_low_latency.cuh @@ -1,7 +1,6 @@ #ifdef __CDT_PARSER__ #undef __CUDA_RUNTIME_H__ #include -#include #endif #ifndef LOWLAT_PBS_H @@ -15,9 +14,7 @@ #include "crypto/torus.cuh" #include "device.h" #include "fft/bnsmfft.cuh" -#include "fft/smfft.cuh" #include "fft/twiddles.cuh" -#include "helper_cuda.h" #include "polynomial/parameters.cuh" #include "polynomial/polynomial.cuh" #include "polynomial/polynomial_math.cuh" @@ -299,12 +296,12 @@ __host__ void host_bootstrap_low_latency( if (max_shared_memory < SM_PART) { kernel_args[11] = &DM_FULL; - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count * level_count * 2, stream, gpu_index); - checkCudaErrors(cudaGetLastError()); - checkCudaErrors(cudaLaunchCooperativeKernel( + check_cuda_error(cudaGetLastError()); + check_cuda_error(cudaLaunchCooperativeKernel( (void *)device_bootstrap_low_latency, grid, thds, (void **)kernel_args, 0, *stream)); } else if (max_shared_memory < SM_FULL) { @@ -312,14 +309,14 @@ __host__ void 
host_bootstrap_low_latency( d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count * level_count * 2, stream, gpu_index); - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( device_bootstrap_low_latency, cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART)); cudaFuncSetCacheConfig( device_bootstrap_low_latency, cudaFuncCachePreferShared); - checkCudaErrors(cudaGetLastError()); - checkCudaErrors(cudaLaunchCooperativeKernel( + check_cuda_error(cudaGetLastError()); + check_cuda_error(cudaLaunchCooperativeKernel( (void *)device_bootstrap_low_latency, grid, thds, (void **)kernel_args, SM_PART, *stream)); @@ -327,17 +324,17 @@ __host__ void host_bootstrap_low_latency( int DM_NONE = 0; kernel_args[11] = &DM_NONE; d_mem = (char *)cuda_malloc_async(0, stream, gpu_index); - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( device_bootstrap_low_latency, cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL)); cudaFuncSetCacheConfig(device_bootstrap_low_latency, cudaFuncCachePreferShared); - checkCudaErrors(cudaLaunchCooperativeKernel( + check_cuda_error(cudaLaunchCooperativeKernel( (void *)device_bootstrap_low_latency, grid, thds, (void **)kernel_args, SM_FULL, *stream)); } - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); // Synchronize the streams before copying the result to lwe_array_out at the // right place cuda_drop_async(mask_buffer_fft, stream, gpu_index); diff --git a/src/circuit_bootstrap.cuh b/src/circuit_bootstrap.cuh index af3194bf3..dbca600a2 100644 --- a/src/circuit_bootstrap.cuh +++ b/src/circuit_bootstrap.cuh @@ -5,7 +5,6 @@ #include "bootstrap.h" #include "bootstrap_amortized.cuh" #include "device.h" -#include "helper_cuda.h" #include "keyswitch.cuh" #include "polynomial/parameters.cuh" #include "utils/timer.cuh" diff --git a/src/crypto/bootstrapping_key.cuh b/src/crypto/bootstrapping_key.cuh index 3631648c8..71e40f7ce 100644 --- 
a/src/crypto/bootstrapping_key.cuh +++ b/src/crypto/bootstrapping_key.cuh @@ -85,10 +85,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, case 512: if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - checkCudaErrors(cudaFuncSetCacheConfig( + check_cuda_error(cudaFuncSetCacheConfig( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> @@ -104,10 +104,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, case 1024: if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - checkCudaErrors(cudaFuncSetCacheConfig( + check_cuda_error(cudaFuncSetCacheConfig( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> @@ -123,10 +123,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, case 2048: if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - checkCudaErrors(cudaFuncSetCacheConfig( + check_cuda_error(cudaFuncSetCacheConfig( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> @@ -142,10 +142,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, case 4096: 
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - checkCudaErrors(cudaFuncSetCacheConfig( + check_cuda_error(cudaFuncSetCacheConfig( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> @@ -161,10 +161,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, case 8192: if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - checkCudaErrors(cudaFuncSetCacheConfig( + check_cuda_error(cudaFuncSetCacheConfig( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> diff --git a/src/crypto/ggsw.cuh b/src/crypto/ggsw.cuh index 8ab73ed99..5db8bc399 100644 --- a/src/crypto/ggsw.cuh +++ b/src/crypto/ggsw.cuh @@ -64,13 +64,13 @@ void batch_fft_ggsw_vector(cudaStream_t *stream, double2 *dest, T *src, d_mem = (char *)cuda_malloc_async(shared_memory_size, stream, gpu_index); device_batch_fft_ggsw_vector <<>>(dest, src, d_mem); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(d_mem, stream, gpu_index); } else { device_batch_fft_ggsw_vector <<>>(dest, src, d_mem); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); } } diff --git a/src/device.cu b/src/device.cu index c31e33d63..b9dab7594 100644 --- a/src/device.cu +++ b/src/device.cu @@ -2,7 +2,6 @@ #include #include #include -#include /// Unsafe function to create a CUDA stream, must check first that GPU exists cudaStream_t 
*cuda_create_stream(uint32_t gpu_index) { @@ -25,7 +24,8 @@ int cuda_destroy_stream(cudaStream_t *stream, uint32_t gpu_index) { void *cuda_malloc(uint64_t size, uint32_t gpu_index) { cudaSetDevice(gpu_index); void *ptr; - checkCudaErrors(cudaMalloc((void **)&ptr, size)); + cudaMalloc((void **)&ptr, size); + check_cuda_error(cudaGetLastError()); return ptr; } @@ -37,13 +37,14 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t *stream, void *ptr; int support_async_alloc; - checkCudaErrors(cudaDeviceGetAttribute( + check_cuda_error(cudaDeviceGetAttribute( &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index)); - if (support_async_alloc) - checkCudaErrors(cudaMallocAsync((void **)&ptr, size, *stream)); - else - checkCudaErrors(cudaMalloc((void **)&ptr, size)); + if (support_async_alloc) { + check_cuda_error(cudaMallocAsync((void **)&ptr, size, *stream)); + } else { + check_cuda_error(cudaMalloc((void **)&ptr, size)); + } return ptr; } @@ -91,7 +92,7 @@ int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size, } cudaSetDevice(gpu_index); - checkCudaErrors( + check_cuda_error( cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, *stream)); return 0; } @@ -133,7 +134,7 @@ int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size, } cudaSetDevice(gpu_index); - checkCudaErrors( + check_cuda_error( cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, *stream)); return 0; } @@ -152,7 +153,7 @@ int cuda_drop(void *ptr, uint32_t gpu_index) { return -2; } cudaSetDevice(gpu_index); - checkCudaErrors(cudaFree(ptr)); + check_cuda_error(cudaFree(ptr)); return 0; } @@ -160,13 +161,14 @@ int cuda_drop(void *ptr, uint32_t gpu_index) { int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index) { int support_async_alloc; - checkCudaErrors(cudaDeviceGetAttribute( + check_cuda_error(cudaDeviceGetAttribute( &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index)); - if (support_async_alloc) - 
checkCudaErrors(cudaFreeAsync(ptr, *stream)); - else - checkCudaErrors(cudaFree(ptr)); + if (support_async_alloc) { + check_cuda_error(cudaFreeAsync(ptr, *stream)); + } else { + check_cuda_error(cudaFree(ptr)); + } return 0; } diff --git a/src/fft/bnsmfft.cuh b/src/fft/bnsmfft.cuh index 23cd9a05e..9b68b8065 100644 --- a/src/fft/bnsmfft.cuh +++ b/src/fft/bnsmfft.cuh @@ -1,5 +1,5 @@ -#ifndef GPU_BOOTSTRAP_FFT_1024_CUH -#define GPU_BOOTSTRAP_FFT_1024_CUH +#ifndef GPU_BOOTSTRAP_FFT_CUH +#define GPU_BOOTSTRAP_FFT_CUH #include "complex/operations.cuh" #include "polynomial/functions.cuh" @@ -21,9 +21,6 @@ * w_j,k = exp(-i pi j/2^k) * is replaced with: * \zeta_j,k = exp(-i pi (2j-1)/2^k) - * - this technique also implies a correction of the - * complex obtained after the FFT, which is done in the - * forward_negacyclic_fft_inplace function of bootstrap.cuh */ template __device__ void NSMFFT_direct(double2 *A) { @@ -118,7 +115,7 @@ template __device__ void NSMFFT_inverse(double2 *A) { // none of the twiddles have equal real and imag part, so // complete complex multiplication has to be done - // here we have more than one twiddles + // here we have more than one twiddle while (m > 1) { tid = threadIdx.x; m >>= 1; @@ -145,7 +142,7 @@ template __device__ void NSMFFT_inverse(double2 *A) { /* * global batch fft * does fft in half size - * unrolling halfsize fft result in half size + 1 eleemnts + * unrolling half size fft result in half size + 1 elements * this function must be called with actual degree * function takes as input already compressed input */ @@ -174,4 +171,4 @@ __global__ void batch_NSMFFT(double2 *d_input, double2 *d_output, } } -#endif // GPU_BOOTSTRAP_FFT_1024_CUH +#endif // GPU_BOOTSTRAP_FFT_CUH diff --git a/src/fft/smfft.cuh b/src/fft/smfft.cuh deleted file mode 100644 index bc31d42a3..000000000 --- a/src/fft/smfft.cuh +++ /dev/null @@ -1,402 +0,0 @@ -/* -#ifndef GPU_BOOTSTRAP_SMFFT_CUH -#define GPU_BOOTSTRAP_SMFFT_CUH - -#include "../complex/operations.cuh" 
-#include "twiddles.cuh" - -__device__ __inline__ double2 Get_W_value_inverse(int index) { - double2 ctemp = _gTwiddles[index]; - ctemp.y = -ctemp.y; - return (ctemp); -} -template -__device__ double2 Get_after_inverse_fft_twiddle(int index) { - double2 ctemp; - switch (params::degree) { - case 512: - ctemp = INVERSE_TWIDDLES_512[index]; - break; - case 1024: - ctemp = gTwiddles1024[index]; - ctemp.x /= params::degree; - ctemp.y /= -params::degree; - break; - default: - break; - } - - return ctemp; -} - -__device__ __inline__ double shfl(double *value, int par) { -#if (CUDART_VERSION >= 9000) - return (__shfl_sync(0xffffffff, (*value), par)); -#else - return (__shfl((*value), par)); -#endif -} - -__device__ __inline__ double shfl_xor(double *value, int par) { -#if (CUDART_VERSION >= 9000) - return (__shfl_xor_sync(0xffffffff, (*value), par)); -#else - return (__shfl_xor((*value), par)); -#endif -} - -__device__ __inline__ double shfl_down(double *value, int par) { -#if (CUDART_VERSION >= 9000) - return (__shfl_down_sync(0xffffffff, (*value), par)); -#else - return (__shfl_down((*value), par)); -#endif -} - -__device__ __inline__ void -reorder_16_register(double2 *A_DFT_value, double2 *B_DFT_value, - double2 *C_DFT_value, double2 *D_DFT_value, int *local_id) { - double2 Af2temp, Bf2temp, Cf2temp, Df2temp; - unsigned int target = (((unsigned int)__brev(((*local_id) & 15))) >> (28)) + - 16 * ((*local_id) >> 4); - Af2temp.x = shfl(&(A_DFT_value->x), target); - Af2temp.y = shfl(&(A_DFT_value->y), target); - Bf2temp.x = shfl(&(B_DFT_value->x), target); - Bf2temp.y = shfl(&(B_DFT_value->y), target); - Cf2temp.x = shfl(&(C_DFT_value->x), target); - Cf2temp.y = shfl(&(C_DFT_value->y), target); - Df2temp.x = shfl(&(D_DFT_value->x), target); - Df2temp.y = shfl(&(D_DFT_value->y), target); - __syncwarp(); - (*A_DFT_value) = Af2temp; - (*B_DFT_value) = Bf2temp; - (*C_DFT_value) = Cf2temp; - (*D_DFT_value) = Df2temp; -} - -__device__ __inline__ void reorder_32_register(double2 
*A_DFT_value, - double2 *B_DFT_value, - double2 *C_DFT_value, - double2 *D_DFT_value) { - double2 Af2temp, Bf2temp, Cf2temp, Df2temp; - unsigned int target = ((unsigned int)__brev(threadIdx.x)) >> (27); - Af2temp.x = shfl(&(A_DFT_value->x), target); - Af2temp.y = shfl(&(A_DFT_value->y), target); - Bf2temp.x = shfl(&(B_DFT_value->x), target); - Bf2temp.y = shfl(&(B_DFT_value->y), target); - Cf2temp.x = shfl(&(C_DFT_value->x), target); - Cf2temp.y = shfl(&(C_DFT_value->y), target); - Df2temp.x = shfl(&(D_DFT_value->x), target); - Df2temp.y = shfl(&(D_DFT_value->y), target); - __syncwarp(); - (*A_DFT_value) = Af2temp; - (*B_DFT_value) = Bf2temp; - (*C_DFT_value) = Cf2temp; - (*D_DFT_value) = Df2temp; -} - -template -__device__ __inline__ void -reorder_512(double2 *s_input, double2 *A_DFT_value, double2 *B_DFT_value, - double2 *C_DFT_value, double2 *D_DFT_value) { - int local_id = threadIdx.x & (params::warp - 1); - int warp_id = threadIdx.x / params::warp; - - // reorder elements within warp so we can save them in semi-transposed manner - // into shared memory - reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value); - - // reorder elements within warp so we can save them in semi-transposed manner - // into shared memory - __syncthreads(); - unsigned int sm_store_pos = - (local_id >> 1) + 16 * (local_id & 1) + warp_id * 132; - s_input[sm_store_pos] = *A_DFT_value; - s_input[sm_store_pos + 33] = *B_DFT_value; - s_input[66 + sm_store_pos] = *C_DFT_value; - s_input[66 + sm_store_pos + 33] = *D_DFT_value; - - __syncthreads(); - - // Read shared memory to get reordered input - unsigned int sm_read_pos = (local_id & 15) * 32 + local_id + warp_id * 4; - __syncthreads(); - *A_DFT_value = s_input[sm_read_pos + 0]; - *B_DFT_value = s_input[sm_read_pos + 1]; - *C_DFT_value = s_input[sm_read_pos + 2]; - *D_DFT_value = s_input[sm_read_pos + 3]; - - __syncthreads(); - reorder_16_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value, - &local_id); - - 
__syncthreads(); -} - -template -__device__ __inline__ void -reorder_1024(double2 *s_input, double2 *A_DFT_value, double2 *B_DFT_value, - double2 *C_DFT_value, double2 *D_DFT_value) { - int local_id = threadIdx.x & (params::warp - 1); - int warp_id = threadIdx.x / params::warp; - - // reorder elements within params::warp so we can save them in semi-transposed - // manner into shared memory - reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value); - - // reorder elements within params::warp so we can save them in semi-transposed - // manner into shared memory - __syncthreads(); - unsigned int sm_store_pos = - (local_id >> 0) + 32 * (local_id & 0) + warp_id * 132; - s_input[sm_store_pos] = *A_DFT_value; - s_input[sm_store_pos + 33] = *B_DFT_value; - s_input[66 + sm_store_pos] = *C_DFT_value; - s_input[66 + sm_store_pos + 33] = *D_DFT_value; - - // Read shared memory to get reordered input - unsigned int sm_read_pos = (local_id & 31) * 32 + local_id + warp_id * 4; - __syncthreads(); - *A_DFT_value = s_input[sm_read_pos + 0]; - *B_DFT_value = s_input[sm_read_pos + 1]; - *C_DFT_value = s_input[sm_read_pos + 2]; - *D_DFT_value = s_input[sm_read_pos + 3]; - - __syncthreads(); - reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value); -} - -__device__ bool printOnce = true; - -template __device__ void do_SMFFT_CT_DIT(double2 *s_input) { - double2 A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value; - double2 W; - double2 Aftemp, Bftemp, Cftemp, Dftemp; - - int j, m_param; - int parity, itemp; - int A_read_index, B_read_index, C_read_index, D_read_index; - int PoT, PoTp1, q; - - int local_id = threadIdx.x & (params::warp - 1); - int warp_id = threadIdx.x / params::warp; - A_DFT_value = s_input[local_id + (warp_id << 2) * params::warp]; - B_DFT_value = - s_input[local_id + (warp_id << 2) * params::warp + params::warp]; - C_DFT_value = - s_input[local_id + (warp_id << 2) * params::warp + 2 * params::warp]; - D_DFT_value = - s_input[local_id + 
(warp_id << 2) * params::warp + 3 * params::warp]; - - switch (params::log2_degree) { - case 9: - reorder_512(s_input, &A_DFT_value, &B_DFT_value, &C_DFT_value, - &D_DFT_value); - break; - case 10: - reorder_1024(s_input, &A_DFT_value, &B_DFT_value, &C_DFT_value, - &D_DFT_value); - break; - // case 11: - // reorder_2048(s_input, &A_DFT_value, &B_DFT_value, - //&C_DFT_value, &D_DFT_value); break; - default: - break; - } - - //----> FFT - PoT = 1; - PoTp1 = 2; - - //--> First iteration - itemp = local_id & 1; - parity = (1 - itemp * 2); - - A_DFT_value.x = parity * A_DFT_value.x + shfl_xor(&A_DFT_value.x, 1); - A_DFT_value.y = parity * A_DFT_value.y + shfl_xor(&A_DFT_value.y, 1); - B_DFT_value.x = parity * B_DFT_value.x + shfl_xor(&B_DFT_value.x, 1); - B_DFT_value.y = parity * B_DFT_value.y + shfl_xor(&B_DFT_value.y, 1); - C_DFT_value.x = parity * C_DFT_value.x + shfl_xor(&C_DFT_value.x, 1); - C_DFT_value.y = parity * C_DFT_value.y + shfl_xor(&C_DFT_value.y, 1); - D_DFT_value.x = parity * D_DFT_value.x + shfl_xor(&D_DFT_value.x, 1); - D_DFT_value.y = parity * D_DFT_value.y + shfl_xor(&D_DFT_value.y, 1); - - //--> Second through Fifth iteration (no synchronization) - PoT = 2; - PoTp1 = 4; - for (q = 1; q < 5; q++) { - m_param = (local_id & (PoTp1 - 1)); - itemp = m_param >> q; - parity = ((itemp << 1) - 1); - if (params::fft_direction) - W = Get_W_value_inverse((q - 1) * 257 + itemp * m_param); - else - W = _gTwiddles[(q - 1) * 257 + itemp * m_param]; - Aftemp.x = W.x * A_DFT_value.x - W.y * A_DFT_value.y; - Aftemp.y = W.x * A_DFT_value.y + W.y * A_DFT_value.x; - Bftemp.x = W.x * B_DFT_value.x - W.y * B_DFT_value.y; - Bftemp.y = W.x * B_DFT_value.y + W.y * B_DFT_value.x; - Cftemp.x = W.x * C_DFT_value.x - W.y * C_DFT_value.y; - Cftemp.y = W.x * C_DFT_value.y + W.y * C_DFT_value.x; - Dftemp.x = W.x * D_DFT_value.x - W.y * D_DFT_value.y; - Dftemp.y = W.x * D_DFT_value.y + W.y * D_DFT_value.x; - - A_DFT_value.x = Aftemp.x + parity * shfl_xor(&Aftemp.x, PoT); - 
A_DFT_value.y = Aftemp.y + parity * shfl_xor(&Aftemp.y, PoT); - B_DFT_value.x = Bftemp.x + parity * shfl_xor(&Bftemp.x, PoT); - B_DFT_value.y = Bftemp.y + parity * shfl_xor(&Bftemp.y, PoT); - C_DFT_value.x = Cftemp.x + parity * shfl_xor(&Cftemp.x, PoT); - C_DFT_value.y = Cftemp.y + parity * shfl_xor(&Cftemp.y, PoT); - D_DFT_value.x = Dftemp.x + parity * shfl_xor(&Dftemp.x, PoT); - D_DFT_value.y = Dftemp.y + parity * shfl_xor(&Dftemp.y, PoT); - - PoT = PoT << 1; - PoTp1 = PoTp1 << 1; - } - - itemp = local_id + (warp_id << 2) * params::warp; - s_input[itemp] = A_DFT_value; - s_input[itemp + params::warp] = B_DFT_value; - s_input[itemp + 2 * params::warp] = C_DFT_value; - s_input[itemp + 3 * params::warp] = D_DFT_value; - - for (q = 5; q < (params::log2_degree - 1); q++) { - __syncthreads(); - m_param = threadIdx.x & (PoT - 1); - j = threadIdx.x >> q; - - if (params::fft_direction) - W = Get_W_value_inverse((q - 1) * 257 + m_param); - else - W = _gTwiddles[(q - 1) * 257 + m_param]; - - A_read_index = j * (PoTp1 << 1) + m_param; - B_read_index = j * (PoTp1 << 1) + m_param + PoT; - C_read_index = j * (PoTp1 << 1) + m_param + PoTp1; - D_read_index = j * (PoTp1 << 1) + m_param + 3 * PoT; - - Aftemp = s_input[A_read_index]; - Bftemp = s_input[B_read_index]; - A_DFT_value.x = Aftemp.x + W.x * Bftemp.x - W.y * Bftemp.y; - A_DFT_value.y = Aftemp.y + W.x * Bftemp.y + W.y * Bftemp.x; - B_DFT_value.x = Aftemp.x - W.x * Bftemp.x + W.y * Bftemp.y; - B_DFT_value.y = Aftemp.y - W.x * Bftemp.y - W.y * Bftemp.x; - - Cftemp = s_input[C_read_index]; - Dftemp = s_input[D_read_index]; - C_DFT_value.x = Cftemp.x + W.x * Dftemp.x - W.y * Dftemp.y; - C_DFT_value.y = Cftemp.y + W.x * Dftemp.y + W.y * Dftemp.x; - D_DFT_value.x = Cftemp.x - W.x * Dftemp.x + W.y * Dftemp.y; - D_DFT_value.y = Cftemp.y - W.x * Dftemp.y - W.y * Dftemp.x; - - s_input[A_read_index] = A_DFT_value; - s_input[B_read_index] = B_DFT_value; - s_input[C_read_index] = C_DFT_value; - s_input[D_read_index] = D_DFT_value; - - 
PoT = PoT << 1; - PoTp1 = PoTp1 << 1; - } - - // last iteration - if (params::log2_degree > 6) { - __syncthreads(); - m_param = threadIdx.x; - - if (params::fft_direction) - W = Get_W_value_inverse((q - 1) * 257 + m_param); - else - W = _gTwiddles[(q - 1) * 257 + m_param]; - - A_read_index = m_param; - B_read_index = m_param + PoT; - C_read_index = m_param + (PoT >> 1); - D_read_index = m_param + 3 * (PoT >> 1); - - Aftemp = s_input[A_read_index]; - Bftemp = s_input[B_read_index]; - A_DFT_value.x = Aftemp.x + W.x * Bftemp.x - W.y * Bftemp.y; - A_DFT_value.y = Aftemp.y + W.x * Bftemp.y + W.y * Bftemp.x; - B_DFT_value.x = Aftemp.x - W.x * Bftemp.x + W.y * Bftemp.y; - B_DFT_value.y = Aftemp.y - W.x * Bftemp.y - W.y * Bftemp.x; - - Cftemp = s_input[C_read_index]; - Dftemp = s_input[D_read_index]; - C_DFT_value.x = Cftemp.x + W.y * Dftemp.x + W.x * Dftemp.y; - C_DFT_value.y = Cftemp.y + W.y * Dftemp.y - W.x * Dftemp.x; - D_DFT_value.x = Cftemp.x - W.y * Dftemp.x - W.x * Dftemp.y; - D_DFT_value.y = Cftemp.y - W.y * Dftemp.y + W.x * Dftemp.x; - - s_input[A_read_index] = A_DFT_value; - s_input[B_read_index] = B_DFT_value; - s_input[C_read_index] = C_DFT_value; - s_input[D_read_index] = D_DFT_value; - } -} - -template -__global__ void SMFFT_DIT_external(double2 *d_input, double2 *d_output) { - __syncthreads(); - - extern __shared__ double2 sharedmemBSK[]; - - double2 *s_input = sharedmemBSK; - - int cTid = threadIdx.x * params::opt; -#pragma unroll - for (int i = 0; i < params::opt; i++) { - double2 tmp; - switch (params::degree) { - case 512: - tmp = INVERSE_TWIDDLES_512[cTid]; - tmp.x *= params::degree; - tmp.y *= -params::degree; - break; - case 1024: - tmp = gTwiddles1024[cTid]; - break; - default: - break; - } - - d_input[blockIdx.x * params::degree + cTid] *= tmp; - cTid++; - } - - __syncthreads(); - - s_input[threadIdx.x] = d_input[threadIdx.x + blockIdx.x * params::degree]; - s_input[threadIdx.x + params::quarter] = - d_input[threadIdx.x + blockIdx.x * 
params::degree + params::quarter]; - s_input[threadIdx.x + params::half] = - d_input[threadIdx.x + blockIdx.x * params::degree + params::half]; - s_input[threadIdx.x + params::three_quarters] = - d_input[threadIdx.x + blockIdx.x * params::degree + - params::three_quarters]; - - __syncthreads(); - - do_SMFFT_CT_DIT(s_input); - if (threadIdx.x == 0 && blockIdx.x == 0) { - for (int i = 0; i < 1024; i++) - printf("smfft[%u] %.10f %.10f\n", i, s_input[i].x, s_input[i].y); - } - __syncthreads(); - - - - __syncthreads(); - d_output[threadIdx.x + blockIdx.x * params::degree] = s_input[threadIdx.x]; - d_output[threadIdx.x + blockIdx.x * params::degree + params::quarter] = - s_input[threadIdx.x + params::quarter]; - d_output[threadIdx.x + blockIdx.x * params::degree + params::half] = - s_input[threadIdx.x + params::half]; - d_output[threadIdx.x + blockIdx.x * params::degree + params::three_quarters] = - s_input[threadIdx.x + params::three_quarters]; - - __syncthreads(); -} - -#endif // GPU_BOOTSTRAP_SMFFT_CUH - -*/ \ No newline at end of file diff --git a/src/keyswitch.cuh b/src/keyswitch.cuh index c384c72c7..7dd339c51 100644 --- a/src/keyswitch.cuh +++ b/src/keyswitch.cuh @@ -203,7 +203,7 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector( keyswitch<<>>( lwe_array_out, lwe_array_in, ksk, lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower, lwe_upper, cutoff); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); } template diff --git a/src/multiplication.cuh b/src/multiplication.cuh index 3f770966e..0fa7cc662 100644 --- a/src/multiplication.cuh +++ b/src/multiplication.cuh @@ -4,10 +4,9 @@ #ifdef __CDT_PARSER__ #undef __CUDA_RUNTIME_H__ #include -#include #endif -#include "../include/helper_cuda.h" +#include "device.h" #include "linear_algebra.h" #include "utils/kernel_dimensions.cuh" @@ -46,7 +45,7 @@ host_cleartext_multiplication(void *v_stream, uint32_t gpu_index, T *output, auto stream = static_cast(v_stream); 
cleartext_multiplication<<>>( output, lwe_input, cleartext_input, input_lwe_dimension, num_entries); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); } #endif // CUDA_MULT_H diff --git a/src/negation.cuh b/src/negation.cuh index b6536f7be..e02ab3e24 100644 --- a/src/negation.cuh +++ b/src/negation.cuh @@ -4,10 +4,9 @@ #ifdef __CDT_PARSER__ #undef __CUDA_RUNTIME_H__ #include -#include #endif -#include "../include/helper_cuda.h" +#include "device.h" #include "linear_algebra.h" #include "utils/kernel_dimensions.cuh" @@ -40,7 +39,7 @@ __host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output, auto stream = static_cast(v_stream); negation<<>>(output, input, num_entries); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); } #endif // CUDA_NEGATE_H diff --git a/src/polynomial/functions.cuh b/src/polynomial/functions.cuh index e40773820..6cc1e49aa 100644 --- a/src/polynomial/functions.cuh +++ b/src/polynomial/functions.cuh @@ -1,6 +1,6 @@ #ifndef GPU_POLYNOMIAL_FUNCTIONS #define GPU_POLYNOMIAL_FUNCTIONS -#include "helper_cuda.h" +#include "device.h" #include "utils/timer.cuh" /* diff --git a/src/polynomial/polynomial.cuh b/src/polynomial/polynomial.cuh index e5022e21d..3c2c7d1c6 100644 --- a/src/polynomial/polynomial.cuh +++ b/src/polynomial/polynomial.cuh @@ -3,9 +3,8 @@ #include "complex/operations.cuh" #include "crypto/torus.cuh" +#include "device.h" #include "fft/bnsmfft.cuh" -#include "fft/smfft.cuh" -#include "helper_cuda.h" #include "parameters.cuh" #include "utils/timer.cuh" #include diff --git a/src/vertical_packing.cuh b/src/vertical_packing.cuh index d3ef42446..917273ecd 100644 --- a/src/vertical_packing.cuh +++ b/src/vertical_packing.cuh @@ -8,9 +8,7 @@ #include "crypto/torus.cuh" #include "device.h" #include "fft/bnsmfft.cuh" -#include "fft/smfft.cuh" #include "fft/twiddles.cuh" -#include "helper_cuda.h" #include "polynomial/functions.cuh" #include "polynomial/parameters.cuh" #include 
"polynomial/polynomial.cuh" @@ -181,12 +179,12 @@ template __host__ void add_padding_to_lut_async(Torus *lut_out, Torus *lut_in, uint32_t glwe_dimension, uint32_t num_lut, cudaStream_t *stream) { - checkCudaErrors(cudaMemsetAsync(lut_out, 0, - num_lut * (glwe_dimension + 1) * - params::degree * sizeof(Torus), - *stream)); + check_cuda_error(cudaMemsetAsync(lut_out, 0, + num_lut * (glwe_dimension + 1) * + params::degree * sizeof(Torus), + *stream)); for (int i = 0; i < num_lut; i++) - checkCudaErrors(cudaMemcpyAsync( + check_cuda_error(cudaMemcpyAsync( lut_out + (2 * i + 1) * params::degree, lut_in + i * params::degree, params::degree * sizeof(Torus), cudaMemcpyDeviceToDevice, *stream)); } @@ -304,10 +302,10 @@ __host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index, d_mem = (char *)cuda_malloc_async( memory_needed_per_block * (1 << (r - 1)) * tau, stream, gpu_index); } else { - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( device_batch_cmux, cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block)); - checkCudaErrors( + check_cuda_error( cudaFuncSetCacheConfig(device_batch_cmux, cudaFuncCachePreferShared)); } @@ -349,11 +347,11 @@ __host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index, polynomial_size, base_log, level_count, layer_idx, // r num_lut); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); } for (int i = 0; i < tau; i++) - checkCudaErrors(cudaMemcpyAsync( + check_cuda_error(cudaMemcpyAsync( glwe_array_out + i * glwe_size, output + i * num_lut * glwe_size, glwe_size * sizeof(Torus), cudaMemcpyDeviceToDevice, *stream)); @@ -480,11 +478,11 @@ __host__ void host_blind_rotate_and_sample_extraction( d_mem = (char *)cuda_malloc_async(memory_needed_per_block * tau, stream, gpu_index); else { - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( device_blind_rotation_and_sample_extraction, cudaFuncAttributeMaxDynamicSharedMemorySize, 
memory_needed_per_block)); - checkCudaErrors(cudaFuncSetCacheConfig( + check_cuda_error(cudaFuncSetCacheConfig( device_blind_rotation_and_sample_extraction, cudaFuncCachePreferShared)); @@ -499,7 +497,7 @@ __host__ void host_blind_rotate_and_sample_extraction( batch_fft_ggsw_vector( stream, d_ggsw_fft_in, ggsw_in, mbr_size, glwe_dimension, polynomial_size, l_gadget, gpu_index, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); // dim3 thds(polynomial_size / params::opt, 1, 1); @@ -519,7 +517,7 @@ __host__ void host_blind_rotate_and_sample_extraction( glwe_dimension, // k polynomial_size, base_log, l_gadget, memory_needed_per_block, d_mem); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); // cuda_drop_async(d_ggsw_fft_in, stream, gpu_index); diff --git a/src/wop_bootstrap.cuh b/src/wop_bootstrap.cuh index de5bced06..9a8bb9acb 100644 --- a/src/wop_bootstrap.cuh +++ b/src/wop_bootstrap.cuh @@ -6,7 +6,7 @@ #include "bit_extraction.cuh" #include "bootstrap.h" #include "circuit_bootstrap.cuh" -#include "helper_cuda.h" +#include "device.h" #include "utils/kernel_dimensions.cuh" #include "utils/timer.cuh" #include "vertical_packing.cuh" @@ -77,7 +77,7 @@ __host__ void host_circuit_bootstrap_vertical_packing( cuda_memcpy_async_to_gpu( lut_vector_indexes, h_lut_vector_indexes, number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t bits = sizeof(Torus) * 8; uint32_t delta_log = (bits - 1); @@ -89,7 +89,7 @@ __host__ void host_circuit_bootstrap_vertical_packing( polynomial_size, glwe_dimension, lwe_dimension, level_count_bsk, base_log_bsk, level_count_pksk, base_log_pksk, level_count_cbs, base_log_cbs, number_of_inputs, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); // Free memory cuda_drop_async(lwe_array_in_fp_ks_buffer, stream, gpu_index); 
@@ -112,7 +112,7 @@ __host__ void host_circuit_bootstrap_vertical_packing( v_stream, gpu_index, glwe_array_out, ggsw_out, lut_vector, glwe_dimension, polynomial_size, base_log_cbs, level_count_cbs, r, tau, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); // Blind rotation + sample extraction // mbr = tau * p - r = log2(N) @@ -151,7 +151,7 @@ __host__ void host_wop_pbs( (uint32_t *)cuda_malloc_async(sizeof(uint32_t), stream, gpu_index); cuda_memcpy_async_to_gpu(lut_vector_indexes, h_lut_vector_indexes, sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); Torus *lut_pbs = (Torus *)cuda_malloc_async( (2 * polynomial_size) * sizeof(Torus), stream, gpu_index); Torus *lwe_array_in_buffer = (Torus *)cuda_malloc_async( @@ -176,7 +176,7 @@ __host__ void host_wop_pbs( number_of_bits_to_extract, delta_log, polynomial_size, lwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, number_of_inputs, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(lut_pbs, stream, gpu_index); cuda_drop_async(lut_vector_indexes, stream, gpu_index); cuda_drop_async(lwe_array_in_buffer, stream, gpu_index); @@ -192,7 +192,7 @@ __host__ void host_wop_pbs( number_of_inputs * number_of_bits_to_extract, number_of_inputs, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(lwe_array_out_bit_extract, stream, gpu_index); } #endif // WOP_PBS_H