diff --git a/include/device.h b/include/device.h index e656a925a..9e5bc0fb3 100644 --- a/include/device.h +++ b/include/device.h @@ -1,5 +1,13 @@ +#ifndef DEVICE_H +#define DEVICE_H + +#pragma once + #include <cstdint> #include <cuda_runtime.h> +#include <cstdio> +#include <cstdlib> +#include <cstring> extern "C" { cudaStream_t *cuda_create_stream(uint32_t gpu_index); @@ -35,4 +43,17 @@ int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index); int cuda_get_max_shared_memory(uint32_t gpu_index); int cuda_synchronize_stream(void *v_stream); + +#define check_cuda_error(ans) \ + { cuda_error((ans), __FILE__, __LINE__); } +inline void cuda_error(cudaError_t code, const char *file, int line, + bool abort = true) { + if (code != cudaSuccess) { + fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), file, + line); + if (abort) + exit(code); + } } +} +#endif \ No newline at end of file diff --git a/include/helper_cuda.h b/include/helper_cuda.h deleted file mode 100644 index 2b38732cf..000000000 --- a/include/helper_cuda.h +++ /dev/null @@ -1,1154 +0,0 @@ -/** - * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. - * - * Please refer to the NVIDIA end user license agreement (EULA) associated - * with this source code for terms and conditions that govern your use of - * this software. Any use, reproduction, disclosure, or distribution of - * this software and related documentation outside the terms of the EULA - * is strictly prohibited. 
- * - */ - -//////////////////////////////////////////////////////////////////////////////// -// These are CUDA Helper functions for initialization and error checking - -#ifndef HELPER_CUDA_H -#define HELPER_CUDA_H - -#pragma once - -#include -#include -#include - -#include "helper_string.h" - -#ifndef EXIT_WAIVED -#define EXIT_WAIVED 2 -#endif - -// Note, it is required that your SDK sample to include the proper header files, -// please refer the CUDA examples for examples of the needed CUDA headers, which -// may change depending on which CUDA functions are used. - -// CUDA Runtime error messages -#ifdef __DRIVER_TYPES_H__ -static const char *_cudaGetErrorEnum(cudaError_t error) { - switch (error) { - case cudaSuccess: - return "cudaSuccess"; - - case cudaErrorMissingConfiguration: - return "cudaErrorMissingConfiguration"; - - case cudaErrorMemoryAllocation: - return "cudaErrorMemoryAllocation"; - - case cudaErrorInitializationError: - return "cudaErrorInitializationError"; - - case cudaErrorLaunchFailure: - return "cudaErrorLaunchFailure"; - - case cudaErrorPriorLaunchFailure: - return "cudaErrorPriorLaunchFailure"; - - case cudaErrorLaunchTimeout: - return "cudaErrorLaunchTimeout"; - - case cudaErrorLaunchOutOfResources: - return "cudaErrorLaunchOutOfResources"; - - case cudaErrorInvalidDeviceFunction: - return "cudaErrorInvalidDeviceFunction"; - - case cudaErrorInvalidConfiguration: - return "cudaErrorInvalidConfiguration"; - - case cudaErrorInvalidDevice: - return "cudaErrorInvalidDevice"; - - case cudaErrorInvalidValue: - return "cudaErrorInvalidValue"; - - case cudaErrorInvalidPitchValue: - return "cudaErrorInvalidPitchValue"; - - case cudaErrorInvalidSymbol: - return "cudaErrorInvalidSymbol"; - - case cudaErrorMapBufferObjectFailed: - return "cudaErrorMapBufferObjectFailed"; - - case cudaErrorUnmapBufferObjectFailed: - return "cudaErrorUnmapBufferObjectFailed"; - - case cudaErrorInvalidHostPointer: - return "cudaErrorInvalidHostPointer"; - - case 
cudaErrorInvalidDevicePointer: - return "cudaErrorInvalidDevicePointer"; - - case cudaErrorInvalidTexture: - return "cudaErrorInvalidTexture"; - - case cudaErrorInvalidTextureBinding: - return "cudaErrorInvalidTextureBinding"; - - case cudaErrorInvalidChannelDescriptor: - return "cudaErrorInvalidChannelDescriptor"; - - case cudaErrorInvalidMemcpyDirection: - return "cudaErrorInvalidMemcpyDirection"; - - case cudaErrorAddressOfConstant: - return "cudaErrorAddressOfConstant"; - - case cudaErrorTextureFetchFailed: - return "cudaErrorTextureFetchFailed"; - - case cudaErrorTextureNotBound: - return "cudaErrorTextureNotBound"; - - case cudaErrorSynchronizationError: - return "cudaErrorSynchronizationError"; - - case cudaErrorInvalidFilterSetting: - return "cudaErrorInvalidFilterSetting"; - - case cudaErrorInvalidNormSetting: - return "cudaErrorInvalidNormSetting"; - - case cudaErrorMixedDeviceExecution: - return "cudaErrorMixedDeviceExecution"; - - case cudaErrorCudartUnloading: - return "cudaErrorCudartUnloading"; - - case cudaErrorUnknown: - return "cudaErrorUnknown"; - - case cudaErrorNotYetImplemented: - return "cudaErrorNotYetImplemented"; - - case cudaErrorMemoryValueTooLarge: - return "cudaErrorMemoryValueTooLarge"; - - case cudaErrorInvalidResourceHandle: - return "cudaErrorInvalidResourceHandle"; - - case cudaErrorNotReady: - return "cudaErrorNotReady"; - - case cudaErrorInsufficientDriver: - return "cudaErrorInsufficientDriver"; - - case cudaErrorSetOnActiveProcess: - return "cudaErrorSetOnActiveProcess"; - - case cudaErrorInvalidSurface: - return "cudaErrorInvalidSurface"; - - case cudaErrorNoDevice: - return "cudaErrorNoDevice"; - - case cudaErrorECCUncorrectable: - return "cudaErrorECCUncorrectable"; - - case cudaErrorSharedObjectSymbolNotFound: - return "cudaErrorSharedObjectSymbolNotFound"; - - case cudaErrorSharedObjectInitFailed: - return "cudaErrorSharedObjectInitFailed"; - - case cudaErrorUnsupportedLimit: - return "cudaErrorUnsupportedLimit"; - - case 
cudaErrorDuplicateVariableName: - return "cudaErrorDuplicateVariableName"; - - case cudaErrorDuplicateTextureName: - return "cudaErrorDuplicateTextureName"; - - case cudaErrorDuplicateSurfaceName: - return "cudaErrorDuplicateSurfaceName"; - - case cudaErrorDevicesUnavailable: - return "cudaErrorDevicesUnavailable"; - - case cudaErrorInvalidKernelImage: - return "cudaErrorInvalidKernelImage"; - - case cudaErrorNoKernelImageForDevice: - return "cudaErrorNoKernelImageForDevice"; - - case cudaErrorIncompatibleDriverContext: - return "cudaErrorIncompatibleDriverContext"; - - case cudaErrorPeerAccessAlreadyEnabled: - return "cudaErrorPeerAccessAlreadyEnabled"; - - case cudaErrorPeerAccessNotEnabled: - return "cudaErrorPeerAccessNotEnabled"; - - case cudaErrorDeviceAlreadyInUse: - return "cudaErrorDeviceAlreadyInUse"; - - case cudaErrorProfilerDisabled: - return "cudaErrorProfilerDisabled"; - - case cudaErrorProfilerNotInitialized: - return "cudaErrorProfilerNotInitialized"; - - case cudaErrorProfilerAlreadyStarted: - return "cudaErrorProfilerAlreadyStarted"; - - case cudaErrorProfilerAlreadyStopped: - return "cudaErrorProfilerAlreadyStopped"; - - /* Since CUDA 4.0*/ - case cudaErrorAssert: - return "cudaErrorAssert"; - - case cudaErrorTooManyPeers: - return "cudaErrorTooManyPeers"; - - case cudaErrorHostMemoryAlreadyRegistered: - return "cudaErrorHostMemoryAlreadyRegistered"; - - case cudaErrorHostMemoryNotRegistered: - return "cudaErrorHostMemoryNotRegistered"; - - /* Since CUDA 5.0 */ - case cudaErrorOperatingSystem: - return "cudaErrorOperatingSystem"; - - case cudaErrorPeerAccessUnsupported: - return "cudaErrorPeerAccessUnsupported"; - - case cudaErrorLaunchMaxDepthExceeded: - return "cudaErrorLaunchMaxDepthExceeded"; - - case cudaErrorLaunchFileScopedTex: - return "cudaErrorLaunchFileScopedTex"; - - case cudaErrorLaunchFileScopedSurf: - return "cudaErrorLaunchFileScopedSurf"; - - case cudaErrorSyncDepthExceeded: - return "cudaErrorSyncDepthExceeded"; - - case 
cudaErrorLaunchPendingCountExceeded: - return "cudaErrorLaunchPendingCountExceeded"; - - case cudaErrorNotPermitted: - return "cudaErrorNotPermitted"; - - case cudaErrorNotSupported: - return "cudaErrorNotSupported"; - - /* Since CUDA 6.0 */ - case cudaErrorHardwareStackError: - return "cudaErrorHardwareStackError"; - - case cudaErrorIllegalInstruction: - return "cudaErrorIllegalInstruction"; - - case cudaErrorMisalignedAddress: - return "cudaErrorMisalignedAddress"; - - case cudaErrorInvalidAddressSpace: - return "cudaErrorInvalidAddressSpace"; - - case cudaErrorInvalidPc: - return "cudaErrorInvalidPc"; - - case cudaErrorIllegalAddress: - return "cudaErrorIllegalAddress"; - - /* Since CUDA 6.5*/ - case cudaErrorInvalidPtx: - return "cudaErrorInvalidPtx"; - - case cudaErrorInvalidGraphicsContext: - return "cudaErrorInvalidGraphicsContext"; - - case cudaErrorStartupFailure: - return "cudaErrorStartupFailure"; - - case cudaErrorApiFailureBase: - return "cudaErrorApiFailureBase"; - } - - return ""; -} -#endif - -#ifdef __cuda_cuda_h__ -// CUDA Driver API errors -static const char *_cudaGetErrorEnum(CUresult error) { - switch (error) { - case CUDA_SUCCESS: - return "CUDA_SUCCESS"; - - case CUDA_ERROR_INVALID_VALUE: - return "CUDA_ERROR_INVALID_VALUE"; - - case CUDA_ERROR_OUT_OF_MEMORY: - return "CUDA_ERROR_OUT_OF_MEMORY"; - - case CUDA_ERROR_NOT_INITIALIZED: - return "CUDA_ERROR_NOT_INITIALIZED"; - - case CUDA_ERROR_DEINITIALIZED: - return "CUDA_ERROR_DEINITIALIZED"; - - case CUDA_ERROR_PROFILER_DISABLED: - return "CUDA_ERROR_PROFILER_DISABLED"; - - case CUDA_ERROR_PROFILER_NOT_INITIALIZED: - return "CUDA_ERROR_PROFILER_NOT_INITIALIZED"; - - case CUDA_ERROR_PROFILER_ALREADY_STARTED: - return "CUDA_ERROR_PROFILER_ALREADY_STARTED"; - - case CUDA_ERROR_PROFILER_ALREADY_STOPPED: - return "CUDA_ERROR_PROFILER_ALREADY_STOPPED"; - - case CUDA_ERROR_NO_DEVICE: - return "CUDA_ERROR_NO_DEVICE"; - - case CUDA_ERROR_INVALID_DEVICE: - return "CUDA_ERROR_INVALID_DEVICE"; - - case 
CUDA_ERROR_INVALID_IMAGE: - return "CUDA_ERROR_INVALID_IMAGE"; - - case CUDA_ERROR_INVALID_CONTEXT: - return "CUDA_ERROR_INVALID_CONTEXT"; - - case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: - return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; - - case CUDA_ERROR_MAP_FAILED: - return "CUDA_ERROR_MAP_FAILED"; - - case CUDA_ERROR_UNMAP_FAILED: - return "CUDA_ERROR_UNMAP_FAILED"; - - case CUDA_ERROR_ARRAY_IS_MAPPED: - return "CUDA_ERROR_ARRAY_IS_MAPPED"; - - case CUDA_ERROR_ALREADY_MAPPED: - return "CUDA_ERROR_ALREADY_MAPPED"; - - case CUDA_ERROR_NO_BINARY_FOR_GPU: - return "CUDA_ERROR_NO_BINARY_FOR_GPU"; - - case CUDA_ERROR_ALREADY_ACQUIRED: - return "CUDA_ERROR_ALREADY_ACQUIRED"; - - case CUDA_ERROR_NOT_MAPPED: - return "CUDA_ERROR_NOT_MAPPED"; - - case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: - return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; - - case CUDA_ERROR_NOT_MAPPED_AS_POINTER: - return "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; - - case CUDA_ERROR_ECC_UNCORRECTABLE: - return "CUDA_ERROR_ECC_UNCORRECTABLE"; - - case CUDA_ERROR_UNSUPPORTED_LIMIT: - return "CUDA_ERROR_UNSUPPORTED_LIMIT"; - - case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: - return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"; - - case CUDA_ERROR_INVALID_SOURCE: - return "CUDA_ERROR_INVALID_SOURCE"; - - case CUDA_ERROR_FILE_NOT_FOUND: - return "CUDA_ERROR_FILE_NOT_FOUND"; - - case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: - return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; - - case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: - return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; - - case CUDA_ERROR_OPERATING_SYSTEM: - return "CUDA_ERROR_OPERATING_SYSTEM"; - - case CUDA_ERROR_INVALID_HANDLE: - return "CUDA_ERROR_INVALID_HANDLE"; - - case CUDA_ERROR_NOT_FOUND: - return "CUDA_ERROR_NOT_FOUND"; - - case CUDA_ERROR_NOT_READY: - return "CUDA_ERROR_NOT_READY"; - - case CUDA_ERROR_LAUNCH_FAILED: - return "CUDA_ERROR_LAUNCH_FAILED"; - - case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: - return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; - - case CUDA_ERROR_LAUNCH_TIMEOUT: - 
return "CUDA_ERROR_LAUNCH_TIMEOUT"; - - case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: - return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; - - case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: - return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; - - case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: - return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; - - case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: - return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; - - case CUDA_ERROR_CONTEXT_IS_DESTROYED: - return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; - - case CUDA_ERROR_ASSERT: - return "CUDA_ERROR_ASSERT"; - - case CUDA_ERROR_TOO_MANY_PEERS: - return "CUDA_ERROR_TOO_MANY_PEERS"; - - case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: - return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; - - case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: - return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; - - case CUDA_ERROR_UNKNOWN: - return "CUDA_ERROR_UNKNOWN"; - } - - return ""; -} -#endif - -#ifdef CUBLAS_API_H_ -// cuBLAS API errors -static const char *_cudaGetErrorEnum(cublasStatus_t error) { - switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; - } - - return ""; -} -#endif - -#ifdef _CUFFT_H_ -// cuFFT API errors -static const char *_cudaGetErrorEnum(cufftResult error) { - switch (error) { - case CUFFT_SUCCESS: - return "CUFFT_SUCCESS"; - - case CUFFT_INVALID_PLAN: - return "CUFFT_INVALID_PLAN"; - - case CUFFT_ALLOC_FAILED: - return "CUFFT_ALLOC_FAILED"; - 
- case CUFFT_INVALID_TYPE: - return "CUFFT_INVALID_TYPE"; - - case CUFFT_INVALID_VALUE: - return "CUFFT_INVALID_VALUE"; - - case CUFFT_INTERNAL_ERROR: - return "CUFFT_INTERNAL_ERROR"; - - case CUFFT_EXEC_FAILED: - return "CUFFT_EXEC_FAILED"; - - case CUFFT_SETUP_FAILED: - return "CUFFT_SETUP_FAILED"; - - case CUFFT_INVALID_SIZE: - return "CUFFT_INVALID_SIZE"; - - case CUFFT_UNALIGNED_DATA: - return "CUFFT_UNALIGNED_DATA"; - - case CUFFT_INCOMPLETE_PARAMETER_LIST: - return "CUFFT_INCOMPLETE_PARAMETER_LIST"; - - case CUFFT_INVALID_DEVICE: - return "CUFFT_INVALID_DEVICE"; - - case CUFFT_PARSE_ERROR: - return "CUFFT_PARSE_ERROR"; - - case CUFFT_NO_WORKSPACE: - return "CUFFT_NO_WORKSPACE"; - - case CUFFT_NOT_IMPLEMENTED: - return "CUFFT_NOT_IMPLEMENTED"; - - case CUFFT_LICENSE_ERROR: - return "CUFFT_LICENSE_ERROR"; - } - - return ""; -} -#endif - -#ifdef CUSPARSEAPI -// cuSPARSE API errors -static const char *_cudaGetErrorEnum(cusparseStatus_t error) { - switch (error) { - case CUSPARSE_STATUS_SUCCESS: - return "CUSPARSE_STATUS_SUCCESS"; - - case CUSPARSE_STATUS_NOT_INITIALIZED: - return "CUSPARSE_STATUS_NOT_INITIALIZED"; - - case CUSPARSE_STATUS_ALLOC_FAILED: - return "CUSPARSE_STATUS_ALLOC_FAILED"; - - case CUSPARSE_STATUS_INVALID_VALUE: - return "CUSPARSE_STATUS_INVALID_VALUE"; - - case CUSPARSE_STATUS_ARCH_MISMATCH: - return "CUSPARSE_STATUS_ARCH_MISMATCH"; - - case CUSPARSE_STATUS_MAPPING_ERROR: - return "CUSPARSE_STATUS_MAPPING_ERROR"; - - case CUSPARSE_STATUS_EXECUTION_FAILED: - return "CUSPARSE_STATUS_EXECUTION_FAILED"; - - case CUSPARSE_STATUS_INTERNAL_ERROR: - return "CUSPARSE_STATUS_INTERNAL_ERROR"; - - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - } - - return ""; -} -#endif - -#ifdef CURAND_H_ -// cuRAND API errors -static const char *_cudaGetErrorEnum(curandStatus_t error) { - switch (error) { - case CURAND_STATUS_SUCCESS: - return "CURAND_STATUS_SUCCESS"; - - case CURAND_STATUS_VERSION_MISMATCH: - 
return "CURAND_STATUS_VERSION_MISMATCH"; - - case CURAND_STATUS_NOT_INITIALIZED: - return "CURAND_STATUS_NOT_INITIALIZED"; - - case CURAND_STATUS_ALLOCATION_FAILED: - return "CURAND_STATUS_ALLOCATION_FAILED"; - - case CURAND_STATUS_TYPE_ERROR: - return "CURAND_STATUS_TYPE_ERROR"; - - case CURAND_STATUS_OUT_OF_RANGE: - return "CURAND_STATUS_OUT_OF_RANGE"; - - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - - case CURAND_STATUS_LAUNCH_FAILURE: - return "CURAND_STATUS_LAUNCH_FAILURE"; - - case CURAND_STATUS_PREEXISTING_FAILURE: - return "CURAND_STATUS_PREEXISTING_FAILURE"; - - case CURAND_STATUS_INITIALIZATION_FAILED: - return "CURAND_STATUS_INITIALIZATION_FAILED"; - - case CURAND_STATUS_ARCH_MISMATCH: - return "CURAND_STATUS_ARCH_MISMATCH"; - - case CURAND_STATUS_INTERNAL_ERROR: - return "CURAND_STATUS_INTERNAL_ERROR"; - } - - return ""; -} -#endif - -#ifdef NV_NPPIDEFS_H -// NPP API errors -static const char *_cudaGetErrorEnum(NppStatus error) { - switch (error) { - case NPP_NOT_SUPPORTED_MODE_ERROR: - return "NPP_NOT_SUPPORTED_MODE_ERROR"; - - case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: - return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; - - case NPP_RESIZE_NO_OPERATION_ERROR: - return "NPP_RESIZE_NO_OPERATION_ERROR"; - - case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: - return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; - -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 - - case NPP_BAD_ARG_ERROR: - return "NPP_BAD_ARGUMENT_ERROR"; - - case NPP_COEFF_ERROR: - return "NPP_COEFFICIENT_ERROR"; - - case NPP_RECT_ERROR: - return "NPP_RECTANGLE_ERROR"; - - case NPP_QUAD_ERROR: - return "NPP_QUADRANGLE_ERROR"; - - case NPP_MEM_ALLOC_ERR: - return "NPP_MEMORY_ALLOCATION_ERROR"; - - case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: - return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; - - case NPP_INVALID_INPUT: - return "NPP_INVALID_INPUT"; - 
- case NPP_POINTER_ERROR: - return "NPP_POINTER_ERROR"; - - case NPP_WARNING: - return "NPP_WARNING"; - - case NPP_ODD_ROI_WARNING: - return "NPP_ODD_ROI_WARNING"; -#else - - // These are for CUDA 5.5 or higher - case NPP_BAD_ARGUMENT_ERROR: - return "NPP_BAD_ARGUMENT_ERROR"; - - case NPP_COEFFICIENT_ERROR: - return "NPP_COEFFICIENT_ERROR"; - - case NPP_RECTANGLE_ERROR: - return "NPP_RECTANGLE_ERROR"; - - case NPP_QUADRANGLE_ERROR: - return "NPP_QUADRANGLE_ERROR"; - - case NPP_MEMORY_ALLOCATION_ERR: - return "NPP_MEMORY_ALLOCATION_ERROR"; - - case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: - return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; - - case NPP_INVALID_HOST_POINTER_ERROR: - return "NPP_INVALID_HOST_POINTER_ERROR"; - - case NPP_INVALID_DEVICE_POINTER_ERROR: - return "NPP_INVALID_DEVICE_POINTER_ERROR"; -#endif - - case NPP_LUT_NUMBER_OF_LEVELS_ERROR: - return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; - - case NPP_TEXTURE_BIND_ERROR: - return "NPP_TEXTURE_BIND_ERROR"; - - case NPP_WRONG_INTERSECTION_ROI_ERROR: - return "NPP_WRONG_INTERSECTION_ROI_ERROR"; - - case NPP_NOT_EVEN_STEP_ERROR: - return "NPP_NOT_EVEN_STEP_ERROR"; - - case NPP_INTERPOLATION_ERROR: - return "NPP_INTERPOLATION_ERROR"; - - case NPP_RESIZE_FACTOR_ERROR: - return "NPP_RESIZE_FACTOR_ERROR"; - - case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: - return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; - -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 - - case NPP_MEMFREE_ERR: - return "NPP_MEMFREE_ERR"; - - case NPP_MEMSET_ERR: - return "NPP_MEMSET_ERR"; - - case NPP_MEMCPY_ERR: - return "NPP_MEMCPY_ERROR"; - - case NPP_MIRROR_FLIP_ERR: - return "NPP_MIRROR_FLIP_ERR"; -#else - - case NPP_MEMFREE_ERROR: - return "NPP_MEMFREE_ERROR"; - - case NPP_MEMSET_ERROR: - return "NPP_MEMSET_ERROR"; - - case NPP_MEMCPY_ERROR: - return "NPP_MEMCPY_ERROR"; - - case NPP_MIRROR_FLIP_ERROR: - return "NPP_MIRROR_FLIP_ERROR"; -#endif - - case NPP_ALIGNMENT_ERROR: - return "NPP_ALIGNMENT_ERROR"; - - case 
NPP_STEP_ERROR: - return "NPP_STEP_ERROR"; - - case NPP_SIZE_ERROR: - return "NPP_SIZE_ERROR"; - - case NPP_NULL_POINTER_ERROR: - return "NPP_NULL_POINTER_ERROR"; - - case NPP_CUDA_KERNEL_EXECUTION_ERROR: - return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; - - case NPP_NOT_IMPLEMENTED_ERROR: - return "NPP_NOT_IMPLEMENTED_ERROR"; - - case NPP_ERROR: - return "NPP_ERROR"; - - case NPP_SUCCESS: - return "NPP_SUCCESS"; - - case NPP_WRONG_INTERSECTION_QUAD_WARNING: - return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; - - case NPP_MISALIGNED_DST_ROI_WARNING: - return "NPP_MISALIGNED_DST_ROI_WARNING"; - - case NPP_AFFINE_QUAD_INCORRECT_WARNING: - return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; - - case NPP_DOUBLE_SIZE_WARNING: - return "NPP_DOUBLE_SIZE_WARNING"; - - case NPP_WRONG_INTERSECTION_ROI_WARNING: - return "NPP_WRONG_INTERSECTION_ROI_WARNING"; - -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 - /* These are 6.0 or higher */ - case NPP_LUT_PALETTE_BITSIZE_ERROR: - return "NPP_LUT_PALETTE_BITSIZE_ERROR"; - - case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: - return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; - - case NPP_QUALITY_INDEX_ERROR: - return "NPP_QUALITY_INDEX_ERROR"; - - case NPP_CHANNEL_ORDER_ERROR: - return "NPP_CHANNEL_ORDER_ERROR"; - - case NPP_ZERO_MASK_VALUE_ERROR: - return "NPP_ZERO_MASK_VALUE_ERROR"; - - case NPP_NUMBER_OF_CHANNELS_ERROR: - return "NPP_NUMBER_OF_CHANNELS_ERROR"; - - case NPP_COI_ERROR: - return "NPP_COI_ERROR"; - - case NPP_DIVISOR_ERROR: - return "NPP_DIVISOR_ERROR"; - - case NPP_CHANNEL_ERROR: - return "NPP_CHANNEL_ERROR"; - - case NPP_STRIDE_ERROR: - return "NPP_STRIDE_ERROR"; - - case NPP_ANCHOR_ERROR: - return "NPP_ANCHOR_ERROR"; - - case NPP_MASK_SIZE_ERROR: - return "NPP_MASK_SIZE_ERROR"; - - case NPP_MOMENT_00_ZERO_ERROR: - return "NPP_MOMENT_00_ZERO_ERROR"; - - case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: - return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; - - case NPP_THRESHOLD_ERROR: - return "NPP_THRESHOLD_ERROR"; - - case 
NPP_CONTEXT_MATCH_ERROR: - return "NPP_CONTEXT_MATCH_ERROR"; - - case NPP_FFT_FLAG_ERROR: - return "NPP_FFT_FLAG_ERROR"; - - case NPP_FFT_ORDER_ERROR: - return "NPP_FFT_ORDER_ERROR"; - - case NPP_SCALE_RANGE_ERROR: - return "NPP_SCALE_RANGE_ERROR"; - - case NPP_DATA_TYPE_ERROR: - return "NPP_DATA_TYPE_ERROR"; - - case NPP_OUT_OFF_RANGE_ERROR: - return "NPP_OUT_OFF_RANGE_ERROR"; - - case NPP_DIVIDE_BY_ZERO_ERROR: - return "NPP_DIVIDE_BY_ZERO_ERROR"; - - case NPP_RANGE_ERROR: - return "NPP_RANGE_ERROR"; - - case NPP_NO_MEMORY_ERROR: - return "NPP_NO_MEMORY_ERROR"; - - case NPP_ERROR_RESERVED: - return "NPP_ERROR_RESERVED"; - - case NPP_NO_OPERATION_WARNING: - return "NPP_NO_OPERATION_WARNING"; - - case NPP_DIVIDE_BY_ZERO_WARNING: - return "NPP_DIVIDE_BY_ZERO_WARNING"; -#endif - } - - return ""; -} -#endif - -#ifdef __DRIVER_TYPES_H__ -#ifndef DEVICE_RESET -#define DEVICE_RESET cudaDeviceReset(); -#endif -#else -#ifndef DEVICE_RESET -#define DEVICE_RESET -#endif -#endif - -template -void check(T result, char const *const func, const char *const file, - int const line) { - if (result) { - fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, - static_cast(result), _cudaGetErrorEnum(result), func); - DEVICE_RESET - // Make sure we call CUDA Device Reset before exiting - exit(EXIT_FAILURE); - } -} - -#ifdef __DRIVER_TYPES_H__ -// This will output the proper CUDA error strings in the event that a CUDA host -// call returns an error -#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) - -// This will output the proper error string when calling cudaGetLastError -#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) - -inline void __getLastCudaError(const char *errorMessage, const char *file, - const int line) { - cudaError_t err = cudaGetLastError(); - - if (cudaSuccess != err) { - fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", - file, line, errorMessage, (int)err, cudaGetErrorString(err)); 
- DEVICE_RESET - exit(EXIT_FAILURE); - } -} -#endif - -#ifndef MAX -#define MAX(a, b) (a > b ? a : b) -#endif - -// Float To Int conversion -inline int ftoi(float value) { - return (value >= 0 ? (int)(value + 0.5) : (int)(value - 0.5)); -} - -// Beginning of GPU Architecture definitions -inline int _ConvertSMVer2Cores(int major, int minor) { - // Defines for GPU Architecture types (using the SM version to determine the # - // of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM - // minor version - int Cores; - } sSMtoCores; - - sSMtoCores nGpuArchCoresPerSM[] = { - {0x20, 32}, // Fermi Generation (SM 2.0) GF100 class - {0x21, 48}, // Fermi Generation (SM 2.1) GF10x class - {0x30, 192}, // Kepler Generation (SM 3.0) GK10x class - {0x32, 192}, // Kepler Generation (SM 3.2) GK10x class - {0x35, 192}, // Kepler Generation (SM 3.5) GK11x class - {0x37, 192}, // Kepler Generation (SM 3.7) GK21x class - {0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class - {0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class - {-1, -1}}; - - int index = 0; - - while (nGpuArchCoresPerSM[index].SM != -1) { - if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { - return nGpuArchCoresPerSM[index].Cores; - } - - index++; - } - - // If we don't find the values, we default use the previous one to run - // properly - printf( - "MapSMtoCores for SM %d.%d is undefined. 
Default to use %d Cores/SM\n", - major, minor, nGpuArchCoresPerSM[index - 1].Cores); - return nGpuArchCoresPerSM[index - 1].Cores; -} -// end of GPU Architecture definitions - -#ifdef __CUDA_RUNTIME_H__ -// General GPU Device CUDA Initialization -inline int gpuDeviceInit(int devID) { - int device_count; - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) { - fprintf(stderr, - "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - if (devID < 0) { - devID = 0; - } - - if (devID > device_count - 1) { - fprintf(stderr, "\n"); - fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", - device_count); - fprintf(stderr, - ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", - devID); - fprintf(stderr, "\n"); - return -devID; - } - - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - - if (deviceProp.computeMode == cudaComputeModeProhibited) { - fprintf(stderr, "Error: device is running in , no " - "threads can use ::cudaSetDevice().\n"); - return -1; - } - - if (deviceProp.major < 1) { - fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); - exit(EXIT_FAILURE); - } - - checkCudaErrors(cudaSetDevice(devID)); - printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name); - - return devID; -} - -// This function returns the best GPU (with maximum GFLOPS) -inline int gpuGetMaxGflopsDeviceId() { - int current_device = 0, sm_per_multiproc = 0; - int max_perf_device = 0; - int device_count = 0, best_SM_arch = 0; - int devices_prohibited = 0; - - unsigned long long max_compute_perf = 0; - cudaDeviceProp deviceProp; - cudaGetDeviceCount(&device_count); - - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) { - fprintf( - stderr, - "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - // Find the best major SM Architecture GPU device - while 
(current_device < device_count) { - cudaGetDeviceProperties(&deviceProp, current_device); - - // If this GPU is not running on Compute Mode prohibited, then we can add it - // to the list - if (deviceProp.computeMode != cudaComputeModeProhibited) { - if (deviceProp.major > 0 && deviceProp.major < 9999) { - best_SM_arch = MAX(best_SM_arch, deviceProp.major); - } - } else { - devices_prohibited++; - } - - current_device++; - } - - if (devices_prohibited == device_count) { - fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have " - "compute mode prohibited.\n"); - exit(EXIT_FAILURE); - } - - // Find the best CUDA capable GPU device - current_device = 0; - - while (current_device < device_count) { - cudaGetDeviceProperties(&deviceProp, current_device); - - // If this GPU is not running on Compute Mode prohibited, then we can add it - // to the list - if (deviceProp.computeMode != cudaComputeModeProhibited) { - if (deviceProp.major == 9999 && deviceProp.minor == 9999) { - sm_per_multiproc = 1; - } else { - sm_per_multiproc = - _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); - } - - unsigned long long compute_perf = - (unsigned long long)deviceProp.multiProcessorCount * - sm_per_multiproc * deviceProp.clockRate; - - if (compute_perf > max_compute_perf) { - // If we find GPU with SM major > 2, search only these - if (best_SM_arch > 2) { - // If our device==dest_SM_arch, choose this, or else pass - if (deviceProp.major == best_SM_arch) { - max_compute_perf = compute_perf; - max_perf_device = current_device; - } - } else { - max_compute_perf = compute_perf; - max_perf_device = current_device; - } - } - } - - ++current_device; - } - - return max_perf_device; -} - -// Initialization code to find the best CUDA Device -inline int findCudaDevice(int argc, const char **argv) { - return 0; - /* - cudaDeviceProp deviceProp; - int devID = 0; - - // If the command-line has a device number specified, use it - if (checkCmdLineFlag(argc, argv, "device")) - { - 
devID = getCmdLineArgumentInt(argc, argv, "device="); - - if (devID < 0) - { - printf("Invalid command line parameter\n "); - exit(EXIT_FAILURE); - } - else - { - devID = gpuDeviceInit(devID); - - if (devID < 0) - { - printf("exiting...\n"); - exit(EXIT_FAILURE); - } - } - } - else - { - // Otherwise pick the device with highest Gflops/s - devID = gpuGetMaxGflopsDeviceId(); - checkCudaErrors(cudaSetDevice(devID)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", - devID, deviceProp.name, deviceProp.major, deviceProp.minor); - } - - return devID; */ -} - -// General check for CUDA GPU SM Capabilities -inline bool checkCudaCapabilities(int major_version, int minor_version) { - cudaDeviceProp deviceProp; - deviceProp.major = 0; - deviceProp.minor = 0; - int dev; - - checkCudaErrors(cudaGetDevice(&dev)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); - - if ((deviceProp.major > major_version) || - (deviceProp.major == major_version && - deviceProp.minor >= minor_version)) { - printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, - deviceProp.name, deviceProp.major, deviceProp.minor); - return true; - } else { - printf(" No GPU device was found that can support CUDA compute capability " - "%d.%d.\n", - major_version, minor_version); - return false; - } -} -#endif - -// end of CUDA Helper Functions - -#endif diff --git a/include/helper_string.h b/include/helper_string.h deleted file mode 100644 index 562e06755..000000000 --- a/include/helper_string.h +++ /dev/null @@ -1,664 +0,0 @@ -/** - * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. - * - * Please refer to the NVIDIA end user license agreement (EULA) associated - * with this source code for terms and conditions that govern your use of - * this software. 
Any use, reproduction, disclosure, or distribution of - * this software and related documentation outside the terms of the EULA - * is strictly prohibited. - * - */ - -// These are helper functions for the SDK samples (string parsing, timers, etc) -#ifndef STRING_HELPER_H -#define STRING_HELPER_H - -#include -#include -#include -#include - -#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) -#ifndef _CRT_SECURE_NO_DEPRECATE -#define _CRT_SECURE_NO_DEPRECATE -#endif -#ifndef STRCASECMP -#define STRCASECMP _stricmp -#endif -#ifndef STRNCASECMP -#define STRNCASECMP _strnicmp -#endif -#ifndef STRCPY -#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) -#endif - -#ifndef FOPEN -#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) -#endif -#ifndef FOPEN_FAIL -#define FOPEN_FAIL(result) (result != 0) -#endif -#ifndef SSCANF -#define SSCANF sscanf_s -#endif -#ifndef SPRINTF -#define SPRINTF sprintf_s -#endif -#else // Linux Includes -#include -#include - -#ifndef STRCASECMP -#define STRCASECMP strcasecmp -#endif -#ifndef STRNCASECMP -#define STRNCASECMP strncasecmp -#endif -#ifndef STRCPY -#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) -#endif - -#ifndef FOPEN -#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) -#endif -#ifndef FOPEN_FAIL -#define FOPEN_FAIL(result) (result == NULL) -#endif -#ifndef SSCANF -#define SSCANF sscanf -#endif -#ifndef SPRINTF -#define SPRINTF sprintf -#endif -#endif - -#ifndef EXIT_WAIVED -#define EXIT_WAIVED 2 -#endif - -// CUDA Utility Helper Functions -inline int stringRemoveDelimiter(char delimiter, const char *string) { - int string_start = 0; - - while (string[string_start] == delimiter) { - string_start++; - } - - if (string_start >= (int)strlen(string) - 1) { - return 0; - } - - return string_start; -} - -inline int getFileExtension(char *filename, char **extension) { - int string_length = (int)strlen(filename); - - while 
(filename[string_length--] != '.') { - if (string_length == 0) - break; - } - - if (string_length > 0) - string_length += 2; - - if (string_length == 0) - *extension = NULL; - else - *extension = &filename[string_length]; - - return string_length; -} - -inline bool checkCmdLineFlag(const int argc, const char **argv, - const char *string_ref) { - bool bFound = false; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - - const char *equal_pos = strchr(string_argv, '='); - int argv_length = - (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); - - int length = (int)strlen(string_ref); - - if (length == argv_length && - !STRNCASECMP(string_argv, string_ref, length)) { - bFound = true; - continue; - } - } - } - - return bFound; -} - -// This function wraps the CUDA Driver API into a template function -template -inline bool getCmdLineArgumentValue(const int argc, const char **argv, - const char *string_ref, T *value) { - bool bFound = false; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = (int)strlen(string_ref); - - if (!STRNCASECMP(string_argv, string_ref, length)) { - if (length + 1 <= (int)strlen(string_argv)) { - int auto_inc = (string_argv[length] == '=') ? 
1 : 0; - *value = (T)atoi(&string_argv[length + auto_inc]); - } - - bFound = true; - i = argc; - } - } - } - - return bFound; -} - -inline int getCmdLineArgumentInt(const int argc, const char **argv, - const char *string_ref) { - bool bFound = false; - int value = -1; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = (int)strlen(string_ref); - - if (!STRNCASECMP(string_argv, string_ref, length)) { - if (length + 1 <= (int)strlen(string_argv)) { - int auto_inc = (string_argv[length] == '=') ? 1 : 0; - value = atoi(&string_argv[length + auto_inc]); - } else { - value = 0; - } - - bFound = true; - continue; - } - } - } - - if (bFound) { - return value; - } else { - return 0; - } -} - -inline float getCmdLineArgumentFloat(const int argc, const char **argv, - const char *string_ref) { - bool bFound = false; - float value = -1; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = (int)strlen(string_ref); - - if (!STRNCASECMP(string_argv, string_ref, length)) { - if (length + 1 <= (int)strlen(string_argv)) { - int auto_inc = (string_argv[length] == '=') ? 
1 : 0; - value = (float)atof(&string_argv[length + auto_inc]); - } else { - value = 0.f; - } - - bFound = true; - continue; - } - } - } - - if (bFound) { - return value; - } else { - return 0; - } -} - -inline bool getCmdLineArgumentString(const int argc, const char **argv, - const char *string_ref, - char **string_retval) { - bool bFound = false; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - char *string_argv = (char *)&argv[i][string_start]; - int length = (int)strlen(string_ref); - - if (!STRNCASECMP(string_argv, string_ref, length)) { - *string_retval = &string_argv[length + 1]; - bFound = true; - continue; - } - } - } - - if (!bFound) { - *string_retval = NULL; - } - - return bFound; -} - -////////////////////////////////////////////////////////////////////////////// -//! Find the path for a file assuming that -//! files are found in the searchPath. -//! -//! @return the path if succeeded, otherwise 0 -//! @param filename name of the file -//! @param executable_path optional absolute path of the executable -////////////////////////////////////////////////////////////////////////////// -inline char *sdkFindFilePath(const char *filename, - const char *executable_path) { - // defines a variable that is replaced with the name of the - // executable - - // Typical relative search paths to locate needed companion files (e.g. 
sample - // input data, or JIT source files) The origin for the relative search may be - // the .exe file, a .bat file launching an .exe, a browser .exe launching the - // .exe or .bat, etc - const char *searchPath[] = { - "./", // same dir - "./common/", // "/common/" subdir - "./common/data/", // "/common/data/" subdir - "./data/", // "/data/" subdir - "./src/", // "/src/" subdir - "./src//data/", // "/src//data/" subdir - "./inc/", // "/inc/" subdir - "./0_Simple/", // "/0_Simple/" subdir - "./1_Utilities/", // "/1_Utilities/" subdir - "./2_Graphics/", // "/2_Graphics/" subdir - "./3_Imaging/", // "/3_Imaging/" subdir - "./4_Finance/", // "/4_Finance/" subdir - "./5_Simulations/", // "/5_Simulations/" subdir - "./6_Advanced/", // "/6_Advanced/" subdir - "./7_CUDALibraries/", // "/7_CUDALibraries/" subdir - "./8_Android/", // "/8_Android/" subdir - "./samples/", // "/samples/" subdir - - "../", // up 1 in tree - "../common/", // up 1 in tree, "/common/" subdir - "../common/data/", // up 1 in tree, "/common/data/" subdir - "../data/", // up 1 in tree, "/data/" subdir - "../src/", // up 1 in tree, "/src/" subdir - "../inc/", // up 1 in tree, "/inc/" subdir - - "../0_Simple//data/", // up 1 in tree, - // "/0_Simple//" - // subdir - "../1_Utilities//data/", // up 1 in tree, - // "/1_Utilities//" - // subdir - "../2_Graphics//data/", // up 1 in tree, - // "/2_Graphics//" - // subdir - "../3_Imaging//data/", // up 1 in tree, - // "/3_Imaging//" - // subdir - "../4_Finance//data/", // up 1 in tree, - // "/4_Finance//" - // subdir - "../5_Simulations//data/", // up 1 in tree, - // "/5_Simulations//" - // subdir - "../6_Advanced//data/", // up 1 in tree, - // "/6_Advanced//" - // subdir - "../7_CUDALibraries//data/", // up 1 in tree, - // "/7_CUDALibraries//" - // subdir - "../8_Android//data/", // up 1 in tree, - // "/8_Android//" - // subdir - "../samples//data/", // up 1 in tree, - // "/samples//" - // subdir - "../../", // up 2 in tree - "../../common/", // up 2 in 
tree, "/common/" subdir - "../../common/data/", // up 2 in tree, "/common/data/" subdir - "../../data/", // up 2 in tree, "/data/" subdir - "../../src/", // up 2 in tree, "/src/" subdir - "../../inc/", // up 2 in tree, "/inc/" subdir - "../../sandbox//data/", // up 2 in tree, - // "/sandbox//" - // subdir - "../../0_Simple//data/", // up 2 in tree, - // "/0_Simple//" - // subdir - "../../1_Utilities//data/", // up 2 in tree, - // "/1_Utilities//" - // subdir - "../../2_Graphics//data/", // up 2 in tree, - // "/2_Graphics//" - // subdir - "../../3_Imaging//data/", // up 2 in tree, - // "/3_Imaging//" - // subdir - "../../4_Finance//data/", // up 2 in tree, - // "/4_Finance//" - // subdir - "../../5_Simulations//data/", // up 2 in tree, - // "/5_Simulations//" - // subdir - "../../6_Advanced//data/", // up 2 in tree, - // "/6_Advanced//" - // subdir - "../../7_CUDALibraries//data/", // up 2 in tree, - // "/7_CUDALibraries//" - // subdir - "../../8_Android//data/", // up 2 in tree, - // "/8_Android//" - // subdir - "../../samples//data/", // up 2 in tree, - // "/samples//" - // subdir - "../../../", // up 3 in tree - "../../../src//", // up 3 in tree, - // "/src//" subdir - "../../../src//data/", // up 3 in tree, - // "/src//data/" - // subdir - "../../../src//src/", // up 3 in tree, - // "/src//src/" - // subdir - "../../../src//inc/", // up 3 in tree, - // "/src//inc/" - // subdir - "../../../sandbox//", // up 3 in tree, - // "/sandbox//" - // subdir - "../../../sandbox//data/", // up 3 in tree, - // "/sandbox//data/" - // subdir - "../../../sandbox//src/", // up 3 in tree, - // "/sandbox//src/" - // subdir - "../../../sandbox//inc/", // up 3 in tree, - // "/sandbox//inc/" - // subdir - "../../../0_Simple//data/", // up 3 in tree, - // "/0_Simple//" - // subdir - "../../../1_Utilities//data/", // up 3 in tree, - // "/1_Utilities//" - // subdir - "../../../2_Graphics//data/", // up 3 in tree, - // "/2_Graphics//" - // subdir - "../../../3_Imaging//data/", // up 3 in 
tree, - // "/3_Imaging//" - // subdir - "../../../4_Finance//data/", // up 3 in tree, - // "/4_Finance//" - // subdir - "../../../5_Simulations//data/", // up 3 in tree, - // "/5_Simulations//" - // subdir - "../../../6_Advanced//data/", // up 3 in tree, - // "/6_Advanced//" - // subdir - "../../../7_CUDALibraries//data/", // up 3 in tree, - // "/7_CUDALibraries//" - // subdir - "../../../8_Android//data/", // up 3 in tree, - // "/8_Android//" - // subdir - "../../../0_Simple//", // up 3 in tree, - // "/0_Simple//" - // subdir - "../../../1_Utilities//", // up 3 in tree, - // "/1_Utilities//" - // subdir - "../../../2_Graphics//", // up 3 in tree, - // "/2_Graphics//" - // subdir - "../../../3_Imaging//", // up 3 in tree, - // "/3_Imaging//" - // subdir - "../../../4_Finance//", // up 3 in tree, - // "/4_Finance//" - // subdir - "../../../5_Simulations//", // up 3 in tree, - // "/5_Simulations//" - // subdir - "../../../6_Advanced//", // up 3 in tree, - // "/6_Advanced//" - // subdir - "../../../7_CUDALibraries//", // up 3 in tree, - // "/7_CUDALibraries//" - // subdir - "../../../8_Android//", // up 3 in tree, - // "/8_Android//" - // subdir - "../../../samples//data/", // up 3 in tree, - // "/samples//" - // subdir - "../../../common/", // up 3 in tree, "../../../common/" subdir - "../../../common/data/", // up 3 in tree, "../../../common/data/" subdir - "../../../data/", // up 3 in tree, "../../../data/" subdir - "../../../../", // up 4 in tree - "../../../../src//", // up 4 in tree, - // "/src//" subdir - "../../../../src//data/", // up 4 in tree, - // "/src//data/" - // subdir - "../../../../src//src/", // up 4 in tree, - // "/src//src/" - // subdir - "../../../../src//inc/", // up 4 in tree, - // "/src//inc/" - // subdir - "../../../../sandbox//", // up 4 in tree, - // "/sandbox//" - // subdir - "../../../../sandbox//data/", // up 4 in tree, - // "/sandbox//data/" - // subdir - "../../../../sandbox//src/", // up 4 in tree, - // "/sandbox//src/" - // subdir - 
"../../../../sandbox//inc/", // up 4 in tree, - // "/sandbox//inc/" - // subdir - "../../../../0_Simple//data/", // up 4 in tree, - // "/0_Simple//" - // subdir - "../../../../1_Utilities//data/", // up 4 in tree, - // "/1_Utilities//" - // subdir - "../../../../2_Graphics//data/", // up 4 in tree, - // "/2_Graphics//" - // subdir - "../../../../3_Imaging//data/", // up 4 in tree, - // "/3_Imaging//" - // subdir - "../../../../4_Finance//data/", // up 4 in tree, - // "/4_Finance//" - // subdir - "../../../../5_Simulations//data/", // up 4 in tree, - // "/5_Simulations//" - // subdir - "../../../../6_Advanced//data/", // up 4 in tree, - // "/6_Advanced//" - // subdir - "../../../../7_CUDALibraries//data/", // up 4 in tree, - // "/7_CUDALibraries//" - // subdir - "../../../../8_Android//data/", // up 4 in tree, - // "/8_Android//" - // subdir - "../../../../0_Simple//", // up 4 in tree, - // "/0_Simple//" - // subdir - "../../../../1_Utilities//", // up 4 in tree, - // "/1_Utilities//" - // subdir - "../../../../2_Graphics//", // up 4 in tree, - // "/2_Graphics//" - // subdir - "../../../../3_Imaging//", // up 4 in tree, - // "/3_Imaging//" - // subdir - "../../../../4_Finance//", // up 4 in tree, - // "/4_Finance//" - // subdir - "../../../../5_Simulations//", // up 4 in tree, - // "/5_Simulations//" - // subdir - "../../../../6_Advanced//", // up 4 in tree, - // "/6_Advanced//" - // subdir - "../../../../7_CUDALibraries//", // up 4 in tree, - // "/7_CUDALibraries//" - // subdir - "../../../../8_Android//", // up 4 in tree, - // "/8_Android//" - // subdir - "../../../../samples//data/", // up 4 in tree, - // "/samples//" - // subdir - "../../../../common/", // up 4 in tree, "../../../common/" subdir - "../../../../common/data/", // up 4 in tree, "../../../common/data/" - // subdir - "../../../../data/", // up 4 in tree, "../../../data/" subdir - "../../../../../", // up 5 in tree - "../../../../../src//", // up 5 in tree, - // "/src//" - // subdir - 
"../../../../../src//data/", // up 5 in tree, - // "/src//data/" - // subdir - "../../../../../src//src/", // up 5 in tree, - // "/src//src/" - // subdir - "../../../../../src//inc/", // up 5 in tree, - // "/src//inc/" - // subdir - "../../../../../sandbox//", // up 5 in tree, - // "/sandbox//" - // subdir - "../../../../../sandbox//data/", // up 5 in tree, - // "/sandbox//data/" - // subdir - "../../../../../sandbox//src/", // up 5 in tree, - // "/sandbox//src/" - // subdir - "../../../../../sandbox//inc/", // up 5 in tree, - // "/sandbox//inc/" - // subdir - "../../../../../0_Simple//data/", // up 5 in tree, - // "/0_Simple//" - // subdir - "../../../../../1_Utilities//data/", // up 5 in tree, - // "/1_Utilities//" - // subdir - "../../../../../2_Graphics//data/", // up 5 in tree, - // "/2_Graphics//" - // subdir - "../../../../../3_Imaging//data/", // up 5 in tree, - // "/3_Imaging//" - // subdir - "../../../../../4_Finance//data/", // up 5 in tree, - // "/4_Finance//" - // subdir - "../../../../../5_Simulations//data/", // up 5 in tree, - // "/5_Simulations//" - // subdir - "../../../../../6_Advanced//data/", // up 5 in tree, - // "/6_Advanced//" - // subdir - "../../../../../7_CUDALibraries//data/", // up 5 in tree, - // "/7_CUDALibraries//" - // subdir - "../../../../../8_Android//data/", // up 5 in tree, - // "/8_Android//" - // subdir - "../../../../../samples//data/", // up 5 in tree, - // "/samples//" - // subdir - "../../../../../common/", // up 5 in tree, "../../../common/" subdir - "../../../../../common/data/", // up 5 in tree, "../../../common/data/" - // subdir - }; - - // Extract the executable name - std::string executable_name; - - if (executable_path != 0) { - executable_name = std::string(executable_path); - -#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - // Windows path delimiter - size_t delimiter_pos = executable_name.find_last_of('\\'); - executable_name.erase(0, delimiter_pos + 1); - - if 
(executable_name.rfind(".exe") != std::string::npos) { - // we strip .exe, only if the .exe is found - executable_name.resize(executable_name.size() - 4); - } - -#else - // Linux & OSX path delimiter - size_t delimiter_pos = executable_name.find_last_of('/'); - executable_name.erase(0, delimiter_pos + 1); -#endif - } - - // Loop over all search paths and return the first hit - for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) { - std::string path(searchPath[i]); - size_t executable_name_pos = path.find(""); - - // If there is executable_name variable in the searchPath - // replace it with the value - if (executable_name_pos != std::string::npos) { - if (executable_path != 0) { - path.replace(executable_name_pos, strlen(""), - executable_name); - } else { - // Skip this path entry if no executable argument is given - continue; - } - } - -#ifdef _DEBUG - printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); -#endif - - // Test if the file exists - path.append(filename); - FILE *fp; - FOPEN(fp, path.c_str(), "rb"); - - if (fp != NULL) { - fclose(fp); - // File found - // returning an allocated array here for backwards compatibility reasons - char *file_path = (char *)malloc(path.length() + 1); - STRCPY(file_path, path.length() + 1, path.c_str()); - return file_path; - } - - if (fp) { - fclose(fp); - } - } - - // File not found - return 0; -} - -#endif diff --git a/src/addition.cuh b/src/addition.cuh index 6bd698cc8..7c19d131a 100644 --- a/src/addition.cuh +++ b/src/addition.cuh @@ -4,10 +4,9 @@ #ifdef __CDT_PARSER__ #undef __CUDA_RUNTIME_H__ #include -#include #endif -#include "../include/helper_cuda.h" +#include "device.h" #include "linear_algebra.h" #include "utils/kernel_dimensions.cuh" #include @@ -58,7 +57,7 @@ __host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output, auto stream = static_cast(v_stream); addition<<>>(output, input_1, input_2, num_entries); - checkCudaErrors(cudaGetLastError()); + 
check_cuda_error(cudaGetLastError()); } template @@ -77,12 +76,12 @@ __host__ void host_addition_plaintext(void *v_stream, uint32_t gpu_index, auto stream = static_cast(v_stream); - checkCudaErrors(cudaMemcpyAsync(output, lwe_input, - (input_lwe_dimension + 1) * - input_lwe_ciphertext_count * sizeof(T), - cudaMemcpyDeviceToDevice, *stream)); + check_cuda_error(cudaMemcpyAsync(output, lwe_input, + (input_lwe_dimension + 1) * + input_lwe_ciphertext_count * sizeof(T), + cudaMemcpyDeviceToDevice, *stream)); plaintext_addition<<>>( output, lwe_input, plaintext_input, input_lwe_dimension, num_entries); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); } #endif // CUDA_ADD_H diff --git a/src/bit_extraction.cuh b/src/bit_extraction.cuh index b26fa65fa..f68dfab3b 100644 --- a/src/bit_extraction.cuh +++ b/src/bit_extraction.cuh @@ -3,7 +3,6 @@ #include "cooperative_groups.h" -#include "../include/helper_cuda.h" #include "bootstrap.h" #include "bootstrap_low_latency.cuh" #include "device.h" @@ -156,7 +155,7 @@ __host__ void host_extract_bits( copy_and_shift_lwe<<>>( lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_in, 1ll << (ciphertext_n_bits - delta_log - 1)); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) { cuda_keyswitch_lwe_ciphertext_vector( @@ -167,7 +166,7 @@ __host__ void host_extract_bits( copy_small_lwe<<<1, 256, 0, *stream>>>( list_lwe_array_out, lwe_array_out_ks_buffer, lwe_dimension_out + 1, number_of_bits, number_of_bits - bit_idx - 1); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); if (bit_idx == number_of_bits - 1) { break; @@ -177,7 +176,7 @@ __host__ void host_extract_bits( add_to_body<<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer, lwe_dimension_out, 1ll << (ciphertext_n_bits - 2)); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); // Fill lut for the current bit 
(equivalent to trivial encryption as mask is // 0s) The LUT is filled with -alpha in each coefficient where alpha = @@ -185,7 +184,7 @@ __host__ void host_extract_bits( fill_lut_body_for_current_bit <<>>( lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx)); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); host_bootstrap_low_latency( v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_pbs, @@ -199,7 +198,7 @@ __host__ void host_extract_bits( lwe_array_in_shifted_buffer, lwe_array_in_buffer, lwe_array_out_pbs_buffer, 1ll << (delta_log - 1 + bit_idx), 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2)); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); } } diff --git a/src/boolean_gates.cu b/src/boolean_gates.cu index 02b3612cc..d906065f3 100644 --- a/src/boolean_gates.cu +++ b/src/boolean_gates.cu @@ -3,7 +3,6 @@ #include "bootstrap.h" #include "device.h" -#include "helper_cuda.h" #include "keyswitch.h" #include "linear_algebra.h" @@ -48,7 +47,7 @@ extern "C" void cuda_boolean_and_32( cuda_memcpy_async_to_gpu(false_plaintext_array, h_false_plaintext_array, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async( (input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), @@ -76,7 +75,7 @@ extern "C" void cuda_boolean_and_32( (glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *h_pbs_lut_indexes = (uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t)); for (uint index = 0; index < input_lwe_ciphertext_count; index++) { @@ -87,19 +86,19 @@ extern "C" void cuda_boolean_and_32( cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - 
checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async( (glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_bootstrap_amortized_lwe_ciphertext_vector_32( v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes, lwe_buffer_2, bootstrapping_key, input_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count, input_lwe_ciphertext_count, 1, 0, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(lwe_buffer_2, stream, gpu_index); cuda_drop_async(pbs_lut, stream, gpu_index); @@ -151,7 +150,7 @@ extern "C" void cuda_boolean_nand_32( cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async( (input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), @@ -179,7 +178,7 @@ extern "C" void cuda_boolean_nand_32( (glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *h_pbs_lut_indexes = (uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t)); for (uint index = 0; index < input_lwe_ciphertext_count; index++) { @@ -190,19 +189,19 @@ extern "C" void cuda_boolean_nand_32( cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async( (glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), stream, 
gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_bootstrap_amortized_lwe_ciphertext_vector_32( v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes, lwe_buffer_3, bootstrapping_key, input_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count, input_lwe_ciphertext_count, 1, 0, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(lwe_buffer_3, stream, gpu_index); cuda_drop_async(pbs_lut, stream, gpu_index); @@ -254,7 +253,7 @@ extern "C" void cuda_boolean_nor_32( cuda_memcpy_async_to_gpu(false_plaintext_array, h_false_plaintext_array, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async( (input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), @@ -282,7 +281,7 @@ extern "C" void cuda_boolean_nor_32( (glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *h_pbs_lut_indexes = (uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t)); for (uint index = 0; index < input_lwe_ciphertext_count; index++) { @@ -293,19 +292,19 @@ extern "C" void cuda_boolean_nor_32( cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async( (glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_bootstrap_amortized_lwe_ciphertext_vector_32( v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes, lwe_buffer_3, bootstrapping_key, 
input_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count, input_lwe_ciphertext_count, 1, 0, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(lwe_buffer_3, stream, gpu_index); cuda_drop_async(pbs_lut, stream, gpu_index); @@ -349,7 +348,7 @@ extern "C" void cuda_boolean_or_32( cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async( (input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), @@ -377,7 +376,7 @@ extern "C" void cuda_boolean_or_32( (glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *h_pbs_lut_indexes = (uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t)); for (uint index = 0; index < input_lwe_ciphertext_count; index++) { @@ -388,19 +387,19 @@ extern "C" void cuda_boolean_or_32( cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async( (glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_bootstrap_amortized_lwe_ciphertext_vector_32( v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes, lwe_buffer_2, bootstrapping_key, input_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count, input_lwe_ciphertext_count, 1, 0, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); 
cuda_drop_async(lwe_buffer_2, stream, gpu_index); cuda_drop_async(pbs_lut, stream, gpu_index); @@ -444,7 +443,7 @@ extern "C" void cuda_boolean_xor_32( cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async( (input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), @@ -468,7 +467,7 @@ extern "C" void cuda_boolean_xor_32( cuda_memcpy_async_to_gpu(cleartext_array, h_cleartext_array, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async( (input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), @@ -493,7 +492,7 @@ extern "C" void cuda_boolean_xor_32( (glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *h_pbs_lut_indexes = (uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t)); for (uint index = 0; index < input_lwe_ciphertext_count; index++) { @@ -504,19 +503,19 @@ extern "C" void cuda_boolean_xor_32( cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async( (glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_bootstrap_amortized_lwe_ciphertext_vector_32( v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes, lwe_buffer_3, bootstrapping_key, input_lwe_dimension, glwe_dimension, polynomial_size, 
pbs_base_log, pbs_level_count, input_lwe_ciphertext_count, 1, 0, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(lwe_buffer_3, stream, gpu_index); cuda_drop_async(pbs_lut, stream, gpu_index); @@ -560,7 +559,7 @@ extern "C" void cuda_boolean_xnor_32( cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async( (input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), @@ -591,7 +590,7 @@ extern "C" void cuda_boolean_xnor_32( cuda_memcpy_async_to_gpu(cleartext_array, h_cleartext_array, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_buffer_4 = (uint32_t *)cuda_malloc_async( (input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), @@ -616,7 +615,7 @@ extern "C" void cuda_boolean_xnor_32( (glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *h_pbs_lut_indexes = (uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t)); for (uint index = 0; index < input_lwe_ciphertext_count; index++) { @@ -627,19 +626,19 @@ extern "C" void cuda_boolean_xnor_32( cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes, input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async( (glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); 
cuda_bootstrap_amortized_lwe_ciphertext_vector_32( v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes, lwe_buffer_4, bootstrapping_key, input_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count, input_lwe_ciphertext_count, 1, 0, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(lwe_buffer_4, stream, gpu_index); cuda_drop_async(pbs_lut, stream, gpu_index); diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh index 0135e35b2..c937f3c32 100644 --- a/src/bootstrap_amortized.cuh +++ b/src/bootstrap_amortized.cuh @@ -1,7 +1,6 @@ #ifdef __CDT_PARSER__ #undef __CUDA_RUNTIME_H__ #include -#include #endif #ifndef CNCRT_AMORTIZED_PBS_H @@ -15,9 +14,7 @@ #include "crypto/torus.cuh" #include "device.h" #include "fft/bnsmfft.cuh" -#include "fft/smfft.cuh" #include "fft/twiddles.cuh" -#include "helper_cuda.h" #include "polynomial/functions.cuh" #include "polynomial/parameters.cuh" #include "polynomial/polynomial.cuh" @@ -327,10 +324,10 @@ __host__ void host_bootstrap_amortized( // device then has to be allocated dynamically. 
// For lower compute capabilities, this call // just does nothing and the amount of shared memory used is 48 KB - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( device_bootstrap_amortized, cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL)); - checkCudaErrors(cudaFuncSetCacheConfig( + check_cuda_error(cudaFuncSetCacheConfig( device_bootstrap_amortized, cudaFuncCachePreferShared)); d_mem = (char *)cuda_malloc_async(0, stream, gpu_index); @@ -341,7 +338,7 @@ __host__ void host_bootstrap_amortized( bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size, base_log, level_count, lwe_idx, 0); } - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(d_mem, stream, gpu_index); } diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh index 33ec3d005..877950b1f 100644 --- a/src/bootstrap_low_latency.cuh +++ b/src/bootstrap_low_latency.cuh @@ -1,7 +1,6 @@ #ifdef __CDT_PARSER__ #undef __CUDA_RUNTIME_H__ #include -#include #endif #ifndef LOWLAT_PBS_H @@ -15,9 +14,7 @@ #include "crypto/torus.cuh" #include "device.h" #include "fft/bnsmfft.cuh" -#include "fft/smfft.cuh" #include "fft/twiddles.cuh" -#include "helper_cuda.h" #include "polynomial/parameters.cuh" #include "polynomial/polynomial.cuh" #include "polynomial/polynomial_math.cuh" @@ -299,12 +296,12 @@ __host__ void host_bootstrap_low_latency( if (max_shared_memory < SM_PART) { kernel_args[11] = &DM_FULL; - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count * level_count * 2, stream, gpu_index); - checkCudaErrors(cudaGetLastError()); - checkCudaErrors(cudaLaunchCooperativeKernel( + check_cuda_error(cudaGetLastError()); + check_cuda_error(cudaLaunchCooperativeKernel( (void *)device_bootstrap_low_latency, grid, thds, (void **)kernel_args, 0, *stream)); } else if (max_shared_memory < SM_FULL) { @@ -312,14 +309,14 @@ __host__ void 
host_bootstrap_low_latency( d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count * level_count * 2, stream, gpu_index); - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( device_bootstrap_low_latency, cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART)); cudaFuncSetCacheConfig( device_bootstrap_low_latency, cudaFuncCachePreferShared); - checkCudaErrors(cudaGetLastError()); - checkCudaErrors(cudaLaunchCooperativeKernel( + check_cuda_error(cudaGetLastError()); + check_cuda_error(cudaLaunchCooperativeKernel( (void *)device_bootstrap_low_latency, grid, thds, (void **)kernel_args, SM_PART, *stream)); @@ -327,17 +324,17 @@ __host__ void host_bootstrap_low_latency( int DM_NONE = 0; kernel_args[11] = &DM_NONE; d_mem = (char *)cuda_malloc_async(0, stream, gpu_index); - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( device_bootstrap_low_latency, cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL)); cudaFuncSetCacheConfig(device_bootstrap_low_latency, cudaFuncCachePreferShared); - checkCudaErrors(cudaLaunchCooperativeKernel( + check_cuda_error(cudaLaunchCooperativeKernel( (void *)device_bootstrap_low_latency, grid, thds, (void **)kernel_args, SM_FULL, *stream)); } - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); // Synchronize the streams before copying the result to lwe_array_out at the // right place cuda_drop_async(mask_buffer_fft, stream, gpu_index); diff --git a/src/circuit_bootstrap.cuh b/src/circuit_bootstrap.cuh index af3194bf3..dbca600a2 100644 --- a/src/circuit_bootstrap.cuh +++ b/src/circuit_bootstrap.cuh @@ -5,7 +5,6 @@ #include "bootstrap.h" #include "bootstrap_amortized.cuh" #include "device.h" -#include "helper_cuda.h" #include "keyswitch.cuh" #include "polynomial/parameters.cuh" #include "utils/timer.cuh" diff --git a/src/crypto/bootstrapping_key.cuh b/src/crypto/bootstrapping_key.cuh index 3631648c8..71e40f7ce 100644 --- 
a/src/crypto/bootstrapping_key.cuh +++ b/src/crypto/bootstrapping_key.cuh @@ -85,10 +85,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, case 512: if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - checkCudaErrors(cudaFuncSetCacheConfig( + check_cuda_error(cudaFuncSetCacheConfig( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> @@ -104,10 +104,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, case 1024: if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - checkCudaErrors(cudaFuncSetCacheConfig( + check_cuda_error(cudaFuncSetCacheConfig( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> @@ -123,10 +123,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, case 2048: if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - checkCudaErrors(cudaFuncSetCacheConfig( + check_cuda_error(cudaFuncSetCacheConfig( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> @@ -142,10 +142,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, case 4096: 
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - checkCudaErrors(cudaFuncSetCacheConfig( + check_cuda_error(cudaFuncSetCacheConfig( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> @@ -161,10 +161,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, case 8192: if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - checkCudaErrors(cudaFuncSetCacheConfig( + check_cuda_error(cudaFuncSetCacheConfig( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> diff --git a/src/crypto/ggsw.cuh b/src/crypto/ggsw.cuh index 8ab73ed99..5db8bc399 100644 --- a/src/crypto/ggsw.cuh +++ b/src/crypto/ggsw.cuh @@ -64,13 +64,13 @@ void batch_fft_ggsw_vector(cudaStream_t *stream, double2 *dest, T *src, d_mem = (char *)cuda_malloc_async(shared_memory_size, stream, gpu_index); device_batch_fft_ggsw_vector <<>>(dest, src, d_mem); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(d_mem, stream, gpu_index); } else { device_batch_fft_ggsw_vector <<>>(dest, src, d_mem); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); } } diff --git a/src/device.cu b/src/device.cu index c31e33d63..b9dab7594 100644 --- a/src/device.cu +++ b/src/device.cu @@ -2,7 +2,6 @@ #include #include #include -#include /// Unsafe function to create a CUDA stream, must check first that GPU exists cudaStream_t 
*cuda_create_stream(uint32_t gpu_index) { @@ -25,7 +24,8 @@ int cuda_destroy_stream(cudaStream_t *stream, uint32_t gpu_index) { void *cuda_malloc(uint64_t size, uint32_t gpu_index) { cudaSetDevice(gpu_index); void *ptr; - checkCudaErrors(cudaMalloc((void **)&ptr, size)); + cudaMalloc((void **)&ptr, size); + check_cuda_error(cudaGetLastError()); return ptr; } @@ -37,13 +37,14 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t *stream, void *ptr; int support_async_alloc; - checkCudaErrors(cudaDeviceGetAttribute( + check_cuda_error(cudaDeviceGetAttribute( &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index)); - if (support_async_alloc) - checkCudaErrors(cudaMallocAsync((void **)&ptr, size, *stream)); - else - checkCudaErrors(cudaMalloc((void **)&ptr, size)); + if (support_async_alloc) { + check_cuda_error(cudaMallocAsync((void **)&ptr, size, *stream)); + } else { + check_cuda_error(cudaMalloc((void **)&ptr, size)); + } return ptr; } @@ -91,7 +92,7 @@ int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size, } cudaSetDevice(gpu_index); - checkCudaErrors( + check_cuda_error( cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, *stream)); return 0; } @@ -133,7 +134,7 @@ int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size, } cudaSetDevice(gpu_index); - checkCudaErrors( + check_cuda_error( cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, *stream)); return 0; } @@ -152,7 +153,7 @@ int cuda_drop(void *ptr, uint32_t gpu_index) { return -2; } cudaSetDevice(gpu_index); - checkCudaErrors(cudaFree(ptr)); + check_cuda_error(cudaFree(ptr)); return 0; } @@ -160,13 +161,14 @@ int cuda_drop(void *ptr, uint32_t gpu_index) { int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index) { int support_async_alloc; - checkCudaErrors(cudaDeviceGetAttribute( + check_cuda_error(cudaDeviceGetAttribute( &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index)); - if (support_async_alloc) - 
checkCudaErrors(cudaFreeAsync(ptr, *stream)); - else - checkCudaErrors(cudaFree(ptr)); + if (support_async_alloc) { + check_cuda_error(cudaFreeAsync(ptr, *stream)); + } else { + check_cuda_error(cudaFree(ptr)); + } return 0; } diff --git a/src/fft/bnsmfft.cuh b/src/fft/bnsmfft.cuh index 23cd9a05e..9b68b8065 100644 --- a/src/fft/bnsmfft.cuh +++ b/src/fft/bnsmfft.cuh @@ -1,5 +1,5 @@ -#ifndef GPU_BOOTSTRAP_FFT_1024_CUH -#define GPU_BOOTSTRAP_FFT_1024_CUH +#ifndef GPU_BOOTSTRAP_FFT_CUH +#define GPU_BOOTSTRAP_FFT_CUH #include "complex/operations.cuh" #include "polynomial/functions.cuh" @@ -21,9 +21,6 @@ * w_j,k = exp(-i pi j/2^k) * is replaced with: * \zeta_j,k = exp(-i pi (2j-1)/2^k) - * - this technique also implies a correction of the - * complex obtained after the FFT, which is done in the - * forward_negacyclic_fft_inplace function of bootstrap.cuh */ template __device__ void NSMFFT_direct(double2 *A) { @@ -118,7 +115,7 @@ template __device__ void NSMFFT_inverse(double2 *A) { // none of the twiddles have equal real and imag part, so // complete complex multiplication has to be done - // here we have more than one twiddles + // here we have more than one twiddle while (m > 1) { tid = threadIdx.x; m >>= 1; @@ -145,7 +142,7 @@ template __device__ void NSMFFT_inverse(double2 *A) { /* * global batch fft * does fft in half size - * unrolling halfsize fft result in half size + 1 eleemnts + * unrolling half size fft result in half size + 1 elements * this function must be called with actual degree * function takes as input already compressed input */ @@ -174,4 +171,4 @@ __global__ void batch_NSMFFT(double2 *d_input, double2 *d_output, } } -#endif // GPU_BOOTSTRAP_FFT_1024_CUH +#endif // GPU_BOOTSTRAP_FFT_CUH diff --git a/src/fft/smfft.cuh b/src/fft/smfft.cuh deleted file mode 100644 index bc31d42a3..000000000 --- a/src/fft/smfft.cuh +++ /dev/null @@ -1,402 +0,0 @@ -/* -#ifndef GPU_BOOTSTRAP_SMFFT_CUH -#define GPU_BOOTSTRAP_SMFFT_CUH - -#include "../complex/operations.cuh" 
-#include "twiddles.cuh" - -__device__ __inline__ double2 Get_W_value_inverse(int index) { - double2 ctemp = _gTwiddles[index]; - ctemp.y = -ctemp.y; - return (ctemp); -} -template -__device__ double2 Get_after_inverse_fft_twiddle(int index) { - double2 ctemp; - switch (params::degree) { - case 512: - ctemp = INVERSE_TWIDDLES_512[index]; - break; - case 1024: - ctemp = gTwiddles1024[index]; - ctemp.x /= params::degree; - ctemp.y /= -params::degree; - break; - default: - break; - } - - return ctemp; -} - -__device__ __inline__ double shfl(double *value, int par) { -#if (CUDART_VERSION >= 9000) - return (__shfl_sync(0xffffffff, (*value), par)); -#else - return (__shfl((*value), par)); -#endif -} - -__device__ __inline__ double shfl_xor(double *value, int par) { -#if (CUDART_VERSION >= 9000) - return (__shfl_xor_sync(0xffffffff, (*value), par)); -#else - return (__shfl_xor((*value), par)); -#endif -} - -__device__ __inline__ double shfl_down(double *value, int par) { -#if (CUDART_VERSION >= 9000) - return (__shfl_down_sync(0xffffffff, (*value), par)); -#else - return (__shfl_down((*value), par)); -#endif -} - -__device__ __inline__ void -reorder_16_register(double2 *A_DFT_value, double2 *B_DFT_value, - double2 *C_DFT_value, double2 *D_DFT_value, int *local_id) { - double2 Af2temp, Bf2temp, Cf2temp, Df2temp; - unsigned int target = (((unsigned int)__brev(((*local_id) & 15))) >> (28)) + - 16 * ((*local_id) >> 4); - Af2temp.x = shfl(&(A_DFT_value->x), target); - Af2temp.y = shfl(&(A_DFT_value->y), target); - Bf2temp.x = shfl(&(B_DFT_value->x), target); - Bf2temp.y = shfl(&(B_DFT_value->y), target); - Cf2temp.x = shfl(&(C_DFT_value->x), target); - Cf2temp.y = shfl(&(C_DFT_value->y), target); - Df2temp.x = shfl(&(D_DFT_value->x), target); - Df2temp.y = shfl(&(D_DFT_value->y), target); - __syncwarp(); - (*A_DFT_value) = Af2temp; - (*B_DFT_value) = Bf2temp; - (*C_DFT_value) = Cf2temp; - (*D_DFT_value) = Df2temp; -} - -__device__ __inline__ void reorder_32_register(double2 
*A_DFT_value, - double2 *B_DFT_value, - double2 *C_DFT_value, - double2 *D_DFT_value) { - double2 Af2temp, Bf2temp, Cf2temp, Df2temp; - unsigned int target = ((unsigned int)__brev(threadIdx.x)) >> (27); - Af2temp.x = shfl(&(A_DFT_value->x), target); - Af2temp.y = shfl(&(A_DFT_value->y), target); - Bf2temp.x = shfl(&(B_DFT_value->x), target); - Bf2temp.y = shfl(&(B_DFT_value->y), target); - Cf2temp.x = shfl(&(C_DFT_value->x), target); - Cf2temp.y = shfl(&(C_DFT_value->y), target); - Df2temp.x = shfl(&(D_DFT_value->x), target); - Df2temp.y = shfl(&(D_DFT_value->y), target); - __syncwarp(); - (*A_DFT_value) = Af2temp; - (*B_DFT_value) = Bf2temp; - (*C_DFT_value) = Cf2temp; - (*D_DFT_value) = Df2temp; -} - -template -__device__ __inline__ void -reorder_512(double2 *s_input, double2 *A_DFT_value, double2 *B_DFT_value, - double2 *C_DFT_value, double2 *D_DFT_value) { - int local_id = threadIdx.x & (params::warp - 1); - int warp_id = threadIdx.x / params::warp; - - // reorder elements within warp so we can save them in semi-transposed manner - // into shared memory - reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value); - - // reorder elements within warp so we can save them in semi-transposed manner - // into shared memory - __syncthreads(); - unsigned int sm_store_pos = - (local_id >> 1) + 16 * (local_id & 1) + warp_id * 132; - s_input[sm_store_pos] = *A_DFT_value; - s_input[sm_store_pos + 33] = *B_DFT_value; - s_input[66 + sm_store_pos] = *C_DFT_value; - s_input[66 + sm_store_pos + 33] = *D_DFT_value; - - __syncthreads(); - - // Read shared memory to get reordered input - unsigned int sm_read_pos = (local_id & 15) * 32 + local_id + warp_id * 4; - __syncthreads(); - *A_DFT_value = s_input[sm_read_pos + 0]; - *B_DFT_value = s_input[sm_read_pos + 1]; - *C_DFT_value = s_input[sm_read_pos + 2]; - *D_DFT_value = s_input[sm_read_pos + 3]; - - __syncthreads(); - reorder_16_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value, - &local_id); - - 
__syncthreads(); -} - -template -__device__ __inline__ void -reorder_1024(double2 *s_input, double2 *A_DFT_value, double2 *B_DFT_value, - double2 *C_DFT_value, double2 *D_DFT_value) { - int local_id = threadIdx.x & (params::warp - 1); - int warp_id = threadIdx.x / params::warp; - - // reorder elements within params::warp so we can save them in semi-transposed - // manner into shared memory - reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value); - - // reorder elements within params::warp so we can save them in semi-transposed - // manner into shared memory - __syncthreads(); - unsigned int sm_store_pos = - (local_id >> 0) + 32 * (local_id & 0) + warp_id * 132; - s_input[sm_store_pos] = *A_DFT_value; - s_input[sm_store_pos + 33] = *B_DFT_value; - s_input[66 + sm_store_pos] = *C_DFT_value; - s_input[66 + sm_store_pos + 33] = *D_DFT_value; - - // Read shared memory to get reordered input - unsigned int sm_read_pos = (local_id & 31) * 32 + local_id + warp_id * 4; - __syncthreads(); - *A_DFT_value = s_input[sm_read_pos + 0]; - *B_DFT_value = s_input[sm_read_pos + 1]; - *C_DFT_value = s_input[sm_read_pos + 2]; - *D_DFT_value = s_input[sm_read_pos + 3]; - - __syncthreads(); - reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value); -} - -__device__ bool printOnce = true; - -template __device__ void do_SMFFT_CT_DIT(double2 *s_input) { - double2 A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value; - double2 W; - double2 Aftemp, Bftemp, Cftemp, Dftemp; - - int j, m_param; - int parity, itemp; - int A_read_index, B_read_index, C_read_index, D_read_index; - int PoT, PoTp1, q; - - int local_id = threadIdx.x & (params::warp - 1); - int warp_id = threadIdx.x / params::warp; - A_DFT_value = s_input[local_id + (warp_id << 2) * params::warp]; - B_DFT_value = - s_input[local_id + (warp_id << 2) * params::warp + params::warp]; - C_DFT_value = - s_input[local_id + (warp_id << 2) * params::warp + 2 * params::warp]; - D_DFT_value = - s_input[local_id + 
(warp_id << 2) * params::warp + 3 * params::warp]; - - switch (params::log2_degree) { - case 9: - reorder_512(s_input, &A_DFT_value, &B_DFT_value, &C_DFT_value, - &D_DFT_value); - break; - case 10: - reorder_1024(s_input, &A_DFT_value, &B_DFT_value, &C_DFT_value, - &D_DFT_value); - break; - // case 11: - // reorder_2048(s_input, &A_DFT_value, &B_DFT_value, - //&C_DFT_value, &D_DFT_value); break; - default: - break; - } - - //----> FFT - PoT = 1; - PoTp1 = 2; - - //--> First iteration - itemp = local_id & 1; - parity = (1 - itemp * 2); - - A_DFT_value.x = parity * A_DFT_value.x + shfl_xor(&A_DFT_value.x, 1); - A_DFT_value.y = parity * A_DFT_value.y + shfl_xor(&A_DFT_value.y, 1); - B_DFT_value.x = parity * B_DFT_value.x + shfl_xor(&B_DFT_value.x, 1); - B_DFT_value.y = parity * B_DFT_value.y + shfl_xor(&B_DFT_value.y, 1); - C_DFT_value.x = parity * C_DFT_value.x + shfl_xor(&C_DFT_value.x, 1); - C_DFT_value.y = parity * C_DFT_value.y + shfl_xor(&C_DFT_value.y, 1); - D_DFT_value.x = parity * D_DFT_value.x + shfl_xor(&D_DFT_value.x, 1); - D_DFT_value.y = parity * D_DFT_value.y + shfl_xor(&D_DFT_value.y, 1); - - //--> Second through Fifth iteration (no synchronization) - PoT = 2; - PoTp1 = 4; - for (q = 1; q < 5; q++) { - m_param = (local_id & (PoTp1 - 1)); - itemp = m_param >> q; - parity = ((itemp << 1) - 1); - if (params::fft_direction) - W = Get_W_value_inverse((q - 1) * 257 + itemp * m_param); - else - W = _gTwiddles[(q - 1) * 257 + itemp * m_param]; - Aftemp.x = W.x * A_DFT_value.x - W.y * A_DFT_value.y; - Aftemp.y = W.x * A_DFT_value.y + W.y * A_DFT_value.x; - Bftemp.x = W.x * B_DFT_value.x - W.y * B_DFT_value.y; - Bftemp.y = W.x * B_DFT_value.y + W.y * B_DFT_value.x; - Cftemp.x = W.x * C_DFT_value.x - W.y * C_DFT_value.y; - Cftemp.y = W.x * C_DFT_value.y + W.y * C_DFT_value.x; - Dftemp.x = W.x * D_DFT_value.x - W.y * D_DFT_value.y; - Dftemp.y = W.x * D_DFT_value.y + W.y * D_DFT_value.x; - - A_DFT_value.x = Aftemp.x + parity * shfl_xor(&Aftemp.x, PoT); - 
A_DFT_value.y = Aftemp.y + parity * shfl_xor(&Aftemp.y, PoT); - B_DFT_value.x = Bftemp.x + parity * shfl_xor(&Bftemp.x, PoT); - B_DFT_value.y = Bftemp.y + parity * shfl_xor(&Bftemp.y, PoT); - C_DFT_value.x = Cftemp.x + parity * shfl_xor(&Cftemp.x, PoT); - C_DFT_value.y = Cftemp.y + parity * shfl_xor(&Cftemp.y, PoT); - D_DFT_value.x = Dftemp.x + parity * shfl_xor(&Dftemp.x, PoT); - D_DFT_value.y = Dftemp.y + parity * shfl_xor(&Dftemp.y, PoT); - - PoT = PoT << 1; - PoTp1 = PoTp1 << 1; - } - - itemp = local_id + (warp_id << 2) * params::warp; - s_input[itemp] = A_DFT_value; - s_input[itemp + params::warp] = B_DFT_value; - s_input[itemp + 2 * params::warp] = C_DFT_value; - s_input[itemp + 3 * params::warp] = D_DFT_value; - - for (q = 5; q < (params::log2_degree - 1); q++) { - __syncthreads(); - m_param = threadIdx.x & (PoT - 1); - j = threadIdx.x >> q; - - if (params::fft_direction) - W = Get_W_value_inverse((q - 1) * 257 + m_param); - else - W = _gTwiddles[(q - 1) * 257 + m_param]; - - A_read_index = j * (PoTp1 << 1) + m_param; - B_read_index = j * (PoTp1 << 1) + m_param + PoT; - C_read_index = j * (PoTp1 << 1) + m_param + PoTp1; - D_read_index = j * (PoTp1 << 1) + m_param + 3 * PoT; - - Aftemp = s_input[A_read_index]; - Bftemp = s_input[B_read_index]; - A_DFT_value.x = Aftemp.x + W.x * Bftemp.x - W.y * Bftemp.y; - A_DFT_value.y = Aftemp.y + W.x * Bftemp.y + W.y * Bftemp.x; - B_DFT_value.x = Aftemp.x - W.x * Bftemp.x + W.y * Bftemp.y; - B_DFT_value.y = Aftemp.y - W.x * Bftemp.y - W.y * Bftemp.x; - - Cftemp = s_input[C_read_index]; - Dftemp = s_input[D_read_index]; - C_DFT_value.x = Cftemp.x + W.x * Dftemp.x - W.y * Dftemp.y; - C_DFT_value.y = Cftemp.y + W.x * Dftemp.y + W.y * Dftemp.x; - D_DFT_value.x = Cftemp.x - W.x * Dftemp.x + W.y * Dftemp.y; - D_DFT_value.y = Cftemp.y - W.x * Dftemp.y - W.y * Dftemp.x; - - s_input[A_read_index] = A_DFT_value; - s_input[B_read_index] = B_DFT_value; - s_input[C_read_index] = C_DFT_value; - s_input[D_read_index] = D_DFT_value; - - 
PoT = PoT << 1; - PoTp1 = PoTp1 << 1; - } - - // last iteration - if (params::log2_degree > 6) { - __syncthreads(); - m_param = threadIdx.x; - - if (params::fft_direction) - W = Get_W_value_inverse((q - 1) * 257 + m_param); - else - W = _gTwiddles[(q - 1) * 257 + m_param]; - - A_read_index = m_param; - B_read_index = m_param + PoT; - C_read_index = m_param + (PoT >> 1); - D_read_index = m_param + 3 * (PoT >> 1); - - Aftemp = s_input[A_read_index]; - Bftemp = s_input[B_read_index]; - A_DFT_value.x = Aftemp.x + W.x * Bftemp.x - W.y * Bftemp.y; - A_DFT_value.y = Aftemp.y + W.x * Bftemp.y + W.y * Bftemp.x; - B_DFT_value.x = Aftemp.x - W.x * Bftemp.x + W.y * Bftemp.y; - B_DFT_value.y = Aftemp.y - W.x * Bftemp.y - W.y * Bftemp.x; - - Cftemp = s_input[C_read_index]; - Dftemp = s_input[D_read_index]; - C_DFT_value.x = Cftemp.x + W.y * Dftemp.x + W.x * Dftemp.y; - C_DFT_value.y = Cftemp.y + W.y * Dftemp.y - W.x * Dftemp.x; - D_DFT_value.x = Cftemp.x - W.y * Dftemp.x - W.x * Dftemp.y; - D_DFT_value.y = Cftemp.y - W.y * Dftemp.y + W.x * Dftemp.x; - - s_input[A_read_index] = A_DFT_value; - s_input[B_read_index] = B_DFT_value; - s_input[C_read_index] = C_DFT_value; - s_input[D_read_index] = D_DFT_value; - } -} - -template -__global__ void SMFFT_DIT_external(double2 *d_input, double2 *d_output) { - __syncthreads(); - - extern __shared__ double2 sharedmemBSK[]; - - double2 *s_input = sharedmemBSK; - - int cTid = threadIdx.x * params::opt; -#pragma unroll - for (int i = 0; i < params::opt; i++) { - double2 tmp; - switch (params::degree) { - case 512: - tmp = INVERSE_TWIDDLES_512[cTid]; - tmp.x *= params::degree; - tmp.y *= -params::degree; - break; - case 1024: - tmp = gTwiddles1024[cTid]; - break; - default: - break; - } - - d_input[blockIdx.x * params::degree + cTid] *= tmp; - cTid++; - } - - __syncthreads(); - - s_input[threadIdx.x] = d_input[threadIdx.x + blockIdx.x * params::degree]; - s_input[threadIdx.x + params::quarter] = - d_input[threadIdx.x + blockIdx.x * 
params::degree + params::quarter]; - s_input[threadIdx.x + params::half] = - d_input[threadIdx.x + blockIdx.x * params::degree + params::half]; - s_input[threadIdx.x + params::three_quarters] = - d_input[threadIdx.x + blockIdx.x * params::degree + - params::three_quarters]; - - __syncthreads(); - - do_SMFFT_CT_DIT(s_input); - if (threadIdx.x == 0 && blockIdx.x == 0) { - for (int i = 0; i < 1024; i++) - printf("smfft[%u] %.10f %.10f\n", i, s_input[i].x, s_input[i].y); - } - __syncthreads(); - - - - __syncthreads(); - d_output[threadIdx.x + blockIdx.x * params::degree] = s_input[threadIdx.x]; - d_output[threadIdx.x + blockIdx.x * params::degree + params::quarter] = - s_input[threadIdx.x + params::quarter]; - d_output[threadIdx.x + blockIdx.x * params::degree + params::half] = - s_input[threadIdx.x + params::half]; - d_output[threadIdx.x + blockIdx.x * params::degree + params::three_quarters] = - s_input[threadIdx.x + params::three_quarters]; - - __syncthreads(); -} - -#endif // GPU_BOOTSTRAP_SMFFT_CUH - -*/ \ No newline at end of file diff --git a/src/keyswitch.cuh b/src/keyswitch.cuh index c384c72c7..7dd339c51 100644 --- a/src/keyswitch.cuh +++ b/src/keyswitch.cuh @@ -203,7 +203,7 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector( keyswitch<<>>( lwe_array_out, lwe_array_in, ksk, lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower, lwe_upper, cutoff); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); } template diff --git a/src/multiplication.cuh b/src/multiplication.cuh index 3f770966e..0fa7cc662 100644 --- a/src/multiplication.cuh +++ b/src/multiplication.cuh @@ -4,10 +4,9 @@ #ifdef __CDT_PARSER__ #undef __CUDA_RUNTIME_H__ #include -#include #endif -#include "../include/helper_cuda.h" +#include "device.h" #include "linear_algebra.h" #include "utils/kernel_dimensions.cuh" @@ -46,7 +45,7 @@ host_cleartext_multiplication(void *v_stream, uint32_t gpu_index, T *output, auto stream = static_cast(v_stream); 
cleartext_multiplication<<>>( output, lwe_input, cleartext_input, input_lwe_dimension, num_entries); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); } #endif // CUDA_MULT_H diff --git a/src/negation.cuh b/src/negation.cuh index b6536f7be..e02ab3e24 100644 --- a/src/negation.cuh +++ b/src/negation.cuh @@ -4,10 +4,9 @@ #ifdef __CDT_PARSER__ #undef __CUDA_RUNTIME_H__ #include -#include #endif -#include "../include/helper_cuda.h" +#include "device.h" #include "linear_algebra.h" #include "utils/kernel_dimensions.cuh" @@ -40,7 +39,7 @@ __host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output, auto stream = static_cast(v_stream); negation<<>>(output, input, num_entries); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); } #endif // CUDA_NEGATE_H diff --git a/src/polynomial/functions.cuh b/src/polynomial/functions.cuh index e40773820..6cc1e49aa 100644 --- a/src/polynomial/functions.cuh +++ b/src/polynomial/functions.cuh @@ -1,6 +1,6 @@ #ifndef GPU_POLYNOMIAL_FUNCTIONS #define GPU_POLYNOMIAL_FUNCTIONS -#include "helper_cuda.h" +#include "device.h" #include "utils/timer.cuh" /* diff --git a/src/polynomial/polynomial.cuh b/src/polynomial/polynomial.cuh index e5022e21d..3c2c7d1c6 100644 --- a/src/polynomial/polynomial.cuh +++ b/src/polynomial/polynomial.cuh @@ -3,9 +3,8 @@ #include "complex/operations.cuh" #include "crypto/torus.cuh" +#include "device.h" #include "fft/bnsmfft.cuh" -#include "fft/smfft.cuh" -#include "helper_cuda.h" #include "parameters.cuh" #include "utils/timer.cuh" #include diff --git a/src/vertical_packing.cuh b/src/vertical_packing.cuh index d3ef42446..917273ecd 100644 --- a/src/vertical_packing.cuh +++ b/src/vertical_packing.cuh @@ -8,9 +8,7 @@ #include "crypto/torus.cuh" #include "device.h" #include "fft/bnsmfft.cuh" -#include "fft/smfft.cuh" #include "fft/twiddles.cuh" -#include "helper_cuda.h" #include "polynomial/functions.cuh" #include "polynomial/parameters.cuh" #include 
"polynomial/polynomial.cuh" @@ -181,12 +179,12 @@ template __host__ void add_padding_to_lut_async(Torus *lut_out, Torus *lut_in, uint32_t glwe_dimension, uint32_t num_lut, cudaStream_t *stream) { - checkCudaErrors(cudaMemsetAsync(lut_out, 0, - num_lut * (glwe_dimension + 1) * - params::degree * sizeof(Torus), - *stream)); + check_cuda_error(cudaMemsetAsync(lut_out, 0, + num_lut * (glwe_dimension + 1) * + params::degree * sizeof(Torus), + *stream)); for (int i = 0; i < num_lut; i++) - checkCudaErrors(cudaMemcpyAsync( + check_cuda_error(cudaMemcpyAsync( lut_out + (2 * i + 1) * params::degree, lut_in + i * params::degree, params::degree * sizeof(Torus), cudaMemcpyDeviceToDevice, *stream)); } @@ -304,10 +302,10 @@ __host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index, d_mem = (char *)cuda_malloc_async( memory_needed_per_block * (1 << (r - 1)) * tau, stream, gpu_index); } else { - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( device_batch_cmux, cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block)); - checkCudaErrors( + check_cuda_error( cudaFuncSetCacheConfig(device_batch_cmux, cudaFuncCachePreferShared)); } @@ -349,11 +347,11 @@ __host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index, polynomial_size, base_log, level_count, layer_idx, // r num_lut); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); } for (int i = 0; i < tau; i++) - checkCudaErrors(cudaMemcpyAsync( + check_cuda_error(cudaMemcpyAsync( glwe_array_out + i * glwe_size, output + i * num_lut * glwe_size, glwe_size * sizeof(Torus), cudaMemcpyDeviceToDevice, *stream)); @@ -480,11 +478,11 @@ __host__ void host_blind_rotate_and_sample_extraction( d_mem = (char *)cuda_malloc_async(memory_needed_per_block * tau, stream, gpu_index); else { - checkCudaErrors(cudaFuncSetAttribute( + check_cuda_error(cudaFuncSetAttribute( device_blind_rotation_and_sample_extraction, cudaFuncAttributeMaxDynamicSharedMemorySize, 
memory_needed_per_block)); - checkCudaErrors(cudaFuncSetCacheConfig( + check_cuda_error(cudaFuncSetCacheConfig( device_blind_rotation_and_sample_extraction, cudaFuncCachePreferShared)); @@ -499,7 +497,7 @@ __host__ void host_blind_rotate_and_sample_extraction( batch_fft_ggsw_vector( stream, d_ggsw_fft_in, ggsw_in, mbr_size, glwe_dimension, polynomial_size, l_gadget, gpu_index, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); // dim3 thds(polynomial_size / params::opt, 1, 1); @@ -519,7 +517,7 @@ __host__ void host_blind_rotate_and_sample_extraction( glwe_dimension, // k polynomial_size, base_log, l_gadget, memory_needed_per_block, d_mem); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); // cuda_drop_async(d_ggsw_fft_in, stream, gpu_index); diff --git a/src/wop_bootstrap.cuh b/src/wop_bootstrap.cuh index de5bced06..9a8bb9acb 100644 --- a/src/wop_bootstrap.cuh +++ b/src/wop_bootstrap.cuh @@ -6,7 +6,7 @@ #include "bit_extraction.cuh" #include "bootstrap.h" #include "circuit_bootstrap.cuh" -#include "helper_cuda.h" +#include "device.h" #include "utils/kernel_dimensions.cuh" #include "utils/timer.cuh" #include "vertical_packing.cuh" @@ -77,7 +77,7 @@ __host__ void host_circuit_bootstrap_vertical_packing( cuda_memcpy_async_to_gpu( lut_vector_indexes, h_lut_vector_indexes, number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); uint32_t bits = sizeof(Torus) * 8; uint32_t delta_log = (bits - 1); @@ -89,7 +89,7 @@ __host__ void host_circuit_bootstrap_vertical_packing( polynomial_size, glwe_dimension, lwe_dimension, level_count_bsk, base_log_bsk, level_count_pksk, base_log_pksk, level_count_cbs, base_log_cbs, number_of_inputs, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); // Free memory cuda_drop_async(lwe_array_in_fp_ks_buffer, stream, gpu_index); 
@@ -112,7 +112,7 @@ __host__ void host_circuit_bootstrap_vertical_packing( v_stream, gpu_index, glwe_array_out, ggsw_out, lut_vector, glwe_dimension, polynomial_size, base_log_cbs, level_count_cbs, r, tau, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); // Blind rotation + sample extraction // mbr = tau * p - r = log2(N) @@ -151,7 +151,7 @@ __host__ void host_wop_pbs( (uint32_t *)cuda_malloc_async(sizeof(uint32_t), stream, gpu_index); cuda_memcpy_async_to_gpu(lut_vector_indexes, h_lut_vector_indexes, sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); Torus *lut_pbs = (Torus *)cuda_malloc_async( (2 * polynomial_size) * sizeof(Torus), stream, gpu_index); Torus *lwe_array_in_buffer = (Torus *)cuda_malloc_async( @@ -176,7 +176,7 @@ __host__ void host_wop_pbs( number_of_bits_to_extract, delta_log, polynomial_size, lwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, number_of_inputs, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(lut_pbs, stream, gpu_index); cuda_drop_async(lut_vector_indexes, stream, gpu_index); cuda_drop_async(lwe_array_in_buffer, stream, gpu_index); @@ -192,7 +192,7 @@ __host__ void host_wop_pbs( number_of_inputs * number_of_bits_to_extract, number_of_inputs, max_shared_memory); - checkCudaErrors(cudaGetLastError()); + check_cuda_error(cudaGetLastError()); cuda_drop_async(lwe_array_out_bit_extract, stream, gpu_index); } #endif // WOP_PBS_H