feat(cuda): Check for errors after each kernel launch.

2026-02-08 11:35:02 -05:00 · 2022-11-24 10:49:37 -03:00
parent 9d25f9248d
commit d59b2f6dda
6 changed files with 15 additions and 0 deletions
--- a/src/addition.cuh
+++ b/src/addition.cuh
@@ -58,6 +58,7 @@ __host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,

  auto stream = static_cast<cudaStream_t *>(v_stream);
  addition<<<grid, thds, 0, *stream>>>(output, input_1, input_2, num_entries);
+  checkCudaErrors(cudaGetLastError());

  cudaStreamSynchronize(*stream);
 }
@@ -84,6 +85,7 @@ __host__ void host_addition_plaintext(void *v_stream, uint32_t gpu_index,
                                  cudaMemcpyDeviceToDevice, *stream));
  plaintext_addition<<<grid, thds, 0, *stream>>>(
      output, lwe_input, plaintext_input, input_lwe_dimension, num_entries);
+  checkCudaErrors(cudaGetLastError());

  cudaStreamSynchronize(*stream);
 }
--- a/src/bootstrap_amortized.cuh
+++ b/src/bootstrap_amortized.cuh
@@ -378,6 +378,8 @@ __host__ void host_bootstrap_amortized(
            bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
            base_log, level_count, lwe_idx, 0);
  }
+  checkCudaErrors(cudaGetLastError());
+
  // Synchronize the streams before copying the result to lwe_array_out at the
  // right place
  cudaStreamSynchronize(*stream);
--- a/src/bootstrap_wop.cuh
+++ b/src/bootstrap_wop.cuh
@@ -493,6 +493,7 @@ __host__ void host_extract_bits(
  copy_and_shift_lwe<Torus, params><<<blocks, threads, 0, *stream>>>(
      lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_in,
      1ll << (ciphertext_n_bits - delta_log - 1));
+  checkCudaErrors(cudaGetLastError());

  for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) {
    cuda_keyswitch_lwe_ciphertext_vector(
@@ -502,6 +503,7 @@ __host__ void host_extract_bits(
    copy_small_lwe<<<1, 256, 0, *stream>>>(
        list_lwe_array_out, lwe_array_out_ks_buffer, lwe_dimension_out + 1,
        number_of_bits, number_of_bits - bit_idx - 1);
+    checkCudaErrors(cudaGetLastError());

    if (bit_idx == number_of_bits - 1) {
      break;
@@ -510,10 +512,12 @@ __host__ void host_extract_bits(
    add_to_body<Torus><<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer,
                                             lwe_dimension_out,
                                             1ll << (ciphertext_n_bits - 2));
+    checkCudaErrors(cudaGetLastError());

    fill_lut_body_for_current_bit<Torus, params>
        <<<blocks, threads, 0, *stream>>>(
            lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx));
+    checkCudaErrors(cudaGetLastError());

    host_bootstrap_low_latency<Torus, params>(
        v_stream, lwe_array_out_pbs_buffer, lut_pbs, lut_vector_indexes,
@@ -524,6 +528,7 @@ __host__ void host_extract_bits(
        lwe_array_in_shifted_buffer, lwe_array_in_buffer,
        lwe_array_out_pbs_buffer, 1ll << (delta_log - 1 + bit_idx),
        1ll << (ciphertext_n_bits - delta_log - bit_idx - 2));
+    checkCudaErrors(cudaGetLastError());
  }
 }

--- a/src/keyswitch.cuh
+++ b/src/keyswitch.cuh
@@ -143,6 +143,7 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
  keyswitch<<<grid, threads, shared_mem, *stream>>>(
      lwe_array_out, lwe_array_in, ksk, lwe_dimension_in, lwe_dimension_out,
      base_log, level_count, lwe_lower, lwe_upper, cutoff);
+  checkCudaErrors(cudaGetLastError());

  cudaStreamSynchronize(*stream);
 }
--- a/src/multiplication.cuh
+++ b/src/multiplication.cuh
@@ -7,6 +7,7 @@
 #include <helper_cuda.h>
 #endif

+#include "../include/helper_cuda.h"
 #include "linear_algebra.h"
 #include "utils/kernel_dimensions.cuh"

@@ -45,6 +46,7 @@ host_cleartext_multiplication(void *v_stream, uint32_t gpu_index, T *output,
  auto stream = static_cast<cudaStream_t *>(v_stream);
  cleartext_multiplication<<<grid, thds, 0, *stream>>>(
      output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
+  checkCudaErrors(cudaGetLastError());

  cudaStreamSynchronize(*stream);
 }
--- a/src/negation.cuh
+++ b/src/negation.cuh
@@ -7,9 +7,11 @@
 #include <helper_cuda.h>
 #endif

+#include "../include/helper_cuda.h"
 #include "linear_algebra.h"
 #include "utils/kernel_dimensions.cuh"

+
 template <typename T>
 __global__ void negation(T *output, T *input, uint32_t num_entries) {

@@ -39,6 +41,7 @@ __host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output,

  auto stream = static_cast<cudaStream_t *>(v_stream);
  negation<<<grid, thds, 0, *stream>>>(output, input, num_entries);
+  checkCudaErrors(cudaGetLastError());

  cudaStreamSynchronize(*stream);
 }