diff --git a/src/addition.cuh b/src/addition.cuh index 90383d6e7..c33a7e5eb 100644 --- a/src/addition.cuh +++ b/src/addition.cuh @@ -58,6 +58,7 @@ __host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output, auto stream = static_cast(v_stream); addition<<>>(output, input_1, input_2, num_entries); + checkCudaErrors(cudaGetLastError()); cudaStreamSynchronize(*stream); } @@ -84,6 +85,7 @@ __host__ void host_addition_plaintext(void *v_stream, uint32_t gpu_index, cudaMemcpyDeviceToDevice, *stream)); plaintext_addition<<>>( output, lwe_input, plaintext_input, input_lwe_dimension, num_entries); + checkCudaErrors(cudaGetLastError()); cudaStreamSynchronize(*stream); } diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh index 6fcd8a51e..ff2b8777e 100644 --- a/src/bootstrap_amortized.cuh +++ b/src/bootstrap_amortized.cuh @@ -378,6 +378,8 @@ __host__ void host_bootstrap_amortized( bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size, base_log, level_count, lwe_idx, 0); } + checkCudaErrors(cudaGetLastError()); + // Synchronize the streams before copying the result to lwe_array_out at the // right place cudaStreamSynchronize(*stream); diff --git a/src/bootstrap_wop.cuh b/src/bootstrap_wop.cuh index 82bc35b66..30a887134 100644 --- a/src/bootstrap_wop.cuh +++ b/src/bootstrap_wop.cuh @@ -493,6 +493,7 @@ __host__ void host_extract_bits( copy_and_shift_lwe<<>>( lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_in, 1ll << (ciphertext_n_bits - delta_log - 1)); + checkCudaErrors(cudaGetLastError()); for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) { cuda_keyswitch_lwe_ciphertext_vector( @@ -502,6 +503,7 @@ __host__ void host_extract_bits( copy_small_lwe<<<1, 256, 0, *stream>>>( list_lwe_array_out, lwe_array_out_ks_buffer, lwe_dimension_out + 1, number_of_bits, number_of_bits - bit_idx - 1); + checkCudaErrors(cudaGetLastError()); if (bit_idx == number_of_bits - 1) { break; @@ -510,10 +512,12 @@ __host__ void host_extract_bits( add_to_body<<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer, lwe_dimension_out, 1ll << (ciphertext_n_bits - 2)); + checkCudaErrors(cudaGetLastError()); fill_lut_body_for_current_bit <<>>( lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx)); + checkCudaErrors(cudaGetLastError()); host_bootstrap_low_latency( v_stream, lwe_array_out_pbs_buffer, lut_pbs, lut_vector_indexes, @@ -524,6 +528,7 @@ __host__ void host_extract_bits( lwe_array_in_shifted_buffer, lwe_array_in_buffer, lwe_array_out_pbs_buffer, 1ll << (delta_log - 1 + bit_idx), 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2)); + checkCudaErrors(cudaGetLastError()); } } diff --git a/src/keyswitch.cuh b/src/keyswitch.cuh index 86d2090cf..6c1afcc78 100644 --- a/src/keyswitch.cuh +++ b/src/keyswitch.cuh @@ -143,6 +143,7 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector( keyswitch<<>>( lwe_array_out, lwe_array_in, ksk, lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower, lwe_upper, cutoff); + checkCudaErrors(cudaGetLastError()); cudaStreamSynchronize(*stream); } diff --git a/src/multiplication.cuh b/src/multiplication.cuh index 74db53fc8..40c5d67f7 100644 --- a/src/multiplication.cuh +++ b/src/multiplication.cuh @@ -7,6 +7,7 @@ #include #endif +#include "../include/helper_cuda.h" #include "linear_algebra.h" #include "utils/kernel_dimensions.cuh" @@ -45,6 +46,7 @@ host_cleartext_multiplication(void *v_stream, uint32_t gpu_index, T *output, auto stream = static_cast(v_stream); cleartext_multiplication<<>>( output, lwe_input, cleartext_input, input_lwe_dimension, num_entries); + checkCudaErrors(cudaGetLastError()); cudaStreamSynchronize(*stream); } diff --git a/src/negation.cuh b/src/negation.cuh index 7ab0e5e51..fd701ea3f 100644 --- a/src/negation.cuh +++ b/src/negation.cuh @@ -7,9 +7,11 @@ #include #endif +#include "../include/helper_cuda.h" #include "linear_algebra.h" #include "utils/kernel_dimensions.cuh" + template __global__ void negation(T *output, T *input, uint32_t num_entries) { @@ -39,6 +41,7 @@ __host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output, auto stream = static_cast(v_stream); negation<<>>(output, input, num_entries); + checkCudaErrors(cudaGetLastError()); cudaStreamSynchronize(*stream); }