diff --git a/src/addition.cuh b/src/addition.cuh
index 90383d6e7..c33a7e5eb 100644
--- a/src/addition.cuh
+++ b/src/addition.cuh
@@ -58,6 +58,7 @@ __host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,
 
   auto stream = static_cast<cudaStream_t *>(v_stream);
   addition<<<grid, thds, 0, *stream>>>(output, input_1, input_2, num_entries);
+  checkCudaErrors(cudaGetLastError());
 
   cudaStreamSynchronize(*stream);
 }
@@ -84,6 +85,7 @@ __host__ void host_addition_plaintext(void *v_stream, uint32_t gpu_index,
                                   cudaMemcpyDeviceToDevice, *stream));
   plaintext_addition<<<grid, thds, 0, *stream>>>(
       output, lwe_input, plaintext_input, input_lwe_dimension, num_entries);
+  checkCudaErrors(cudaGetLastError());
 
   cudaStreamSynchronize(*stream);
 }
diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh
index 6fcd8a51e..ff2b8777e 100644
--- a/src/bootstrap_amortized.cuh
+++ b/src/bootstrap_amortized.cuh
@@ -378,6 +378,8 @@ __host__ void host_bootstrap_amortized(
             bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
             base_log, level_count, lwe_idx, 0);
   }
+  checkCudaErrors(cudaGetLastError());
+
   // Synchronize the streams before copying the result to lwe_array_out at the
   // right place
   cudaStreamSynchronize(*stream);
diff --git a/src/bootstrap_wop.cuh b/src/bootstrap_wop.cuh
index 82bc35b66..30a887134 100644
--- a/src/bootstrap_wop.cuh
+++ b/src/bootstrap_wop.cuh
@@ -493,6 +493,7 @@ __host__ void host_extract_bits(
   copy_and_shift_lwe<Torus, params><<<blocks, threads, 0, *stream>>>(
       lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_in,
       1ll << (ciphertext_n_bits - delta_log - 1));
+  checkCudaErrors(cudaGetLastError());
 
   for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) {
     cuda_keyswitch_lwe_ciphertext_vector(
@@ -502,6 +503,7 @@ __host__ void host_extract_bits(
     copy_small_lwe<<<1, 256, 0, *stream>>>(
         list_lwe_array_out, lwe_array_out_ks_buffer, lwe_dimension_out + 1,
         number_of_bits, number_of_bits - bit_idx - 1);
+    checkCudaErrors(cudaGetLastError());
 
     if (bit_idx == number_of_bits - 1) {
       break;
@@ -510,10 +512,12 @@ __host__ void host_extract_bits(
     add_to_body<Torus><<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer,
                                              lwe_dimension_out,
                                              1ll << (ciphertext_n_bits - 2));
+    checkCudaErrors(cudaGetLastError());
 
     fill_lut_body_for_current_bit<Torus, params>
         <<<blocks, threads, 0, *stream>>>(
             lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx));
+    checkCudaErrors(cudaGetLastError());
 
     host_bootstrap_low_latency<Torus, params>(
         v_stream, lwe_array_out_pbs_buffer, lut_pbs, lut_vector_indexes,
@@ -524,6 +528,7 @@ __host__ void host_extract_bits(
         lwe_array_in_shifted_buffer, lwe_array_in_buffer,
         lwe_array_out_pbs_buffer, 1ll << (delta_log - 1 + bit_idx),
         1ll << (ciphertext_n_bits - delta_log - bit_idx - 2));
+    checkCudaErrors(cudaGetLastError());
   }
 }
 
diff --git a/src/keyswitch.cuh b/src/keyswitch.cuh
index 86d2090cf..6c1afcc78 100644
--- a/src/keyswitch.cuh
+++ b/src/keyswitch.cuh
@@ -143,6 +143,7 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
   keyswitch<<<grid, threads, shared_mem, *stream>>>(
       lwe_array_out, lwe_array_in, ksk, lwe_dimension_in, lwe_dimension_out,
       base_log, level_count, lwe_lower, lwe_upper, cutoff);
+  checkCudaErrors(cudaGetLastError());
 
   cudaStreamSynchronize(*stream);
 }
diff --git a/src/multiplication.cuh b/src/multiplication.cuh
index 74db53fc8..40c5d67f7 100644
--- a/src/multiplication.cuh
+++ b/src/multiplication.cuh
@@ -7,6 +7,7 @@
 #include <helper_cuda.h>
 #endif
 
+#include "../include/helper_cuda.h"
 #include "linear_algebra.h"
 #include "utils/kernel_dimensions.cuh"
 
@@ -45,6 +46,7 @@ host_cleartext_multiplication(void *v_stream, uint32_t gpu_index, T *output,
   auto stream = static_cast<cudaStream_t *>(v_stream);
   cleartext_multiplication<<<grid, thds, 0, *stream>>>(
       output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
+  checkCudaErrors(cudaGetLastError());
 
   cudaStreamSynchronize(*stream);
 }
diff --git a/src/negation.cuh b/src/negation.cuh
index 7ab0e5e51..fd701ea3f 100644
--- a/src/negation.cuh
+++ b/src/negation.cuh
@@ -7,9 +7,11 @@
 #include <helper_cuda.h>
 #endif
 
+#include "../include/helper_cuda.h"
 #include "linear_algebra.h"
 #include "utils/kernel_dimensions.cuh"
 
+
 template <typename T>
 __global__ void negation(T *output, T *input, uint32_t num_entries) {
 
@@ -39,6 +41,7 @@ __host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output,
 
   auto stream = static_cast<cudaStream_t *>(v_stream);
   negation<<<grid, thds, 0, *stream>>>(output, input, num_entries);
+  checkCudaErrors(cudaGetLastError());
 
   cudaStreamSynchronize(*stream);
 }