feat(cuda): Check for errors after each kernel launch.

This commit is contained in:
Pedro Alves
2022-11-24 10:49:37 -03:00
committed by Agnès Leroy
parent 9d25f9248d
commit d59b2f6dda
6 changed files with 15 additions and 0 deletions

View File

@@ -58,6 +58,7 @@ __host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,
auto stream = static_cast<cudaStream_t *>(v_stream);
addition<<<grid, thds, 0, *stream>>>(output, input_1, input_2, num_entries);
checkCudaErrors(cudaGetLastError());
cudaStreamSynchronize(*stream);
}
@@ -84,6 +85,7 @@ __host__ void host_addition_plaintext(void *v_stream, uint32_t gpu_index,
cudaMemcpyDeviceToDevice, *stream));
plaintext_addition<<<grid, thds, 0, *stream>>>(
output, lwe_input, plaintext_input, input_lwe_dimension, num_entries);
checkCudaErrors(cudaGetLastError());
cudaStreamSynchronize(*stream);
}

View File

@@ -378,6 +378,8 @@ __host__ void host_bootstrap_amortized(
bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
base_log, level_count, lwe_idx, 0);
}
checkCudaErrors(cudaGetLastError());
// Synchronize the streams before copying the result to lwe_array_out at the
// right place
cudaStreamSynchronize(*stream);

View File

@@ -493,6 +493,7 @@ __host__ void host_extract_bits(
copy_and_shift_lwe<Torus, params><<<blocks, threads, 0, *stream>>>(
lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_in,
1ll << (ciphertext_n_bits - delta_log - 1));
checkCudaErrors(cudaGetLastError());
for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) {
cuda_keyswitch_lwe_ciphertext_vector(
@@ -502,6 +503,7 @@ __host__ void host_extract_bits(
copy_small_lwe<<<1, 256, 0, *stream>>>(
list_lwe_array_out, lwe_array_out_ks_buffer, lwe_dimension_out + 1,
number_of_bits, number_of_bits - bit_idx - 1);
checkCudaErrors(cudaGetLastError());
if (bit_idx == number_of_bits - 1) {
break;
@@ -510,10 +512,12 @@ __host__ void host_extract_bits(
add_to_body<Torus><<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer,
lwe_dimension_out,
1ll << (ciphertext_n_bits - 2));
checkCudaErrors(cudaGetLastError());
fill_lut_body_for_current_bit<Torus, params>
<<<blocks, threads, 0, *stream>>>(
lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx));
checkCudaErrors(cudaGetLastError());
host_bootstrap_low_latency<Torus, params>(
v_stream, lwe_array_out_pbs_buffer, lut_pbs, lut_vector_indexes,
@@ -524,6 +528,7 @@ __host__ void host_extract_bits(
lwe_array_in_shifted_buffer, lwe_array_in_buffer,
lwe_array_out_pbs_buffer, 1ll << (delta_log - 1 + bit_idx),
1ll << (ciphertext_n_bits - delta_log - bit_idx - 2));
checkCudaErrors(cudaGetLastError());
}
}

View File

@@ -143,6 +143,7 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
keyswitch<<<grid, threads, shared_mem, *stream>>>(
lwe_array_out, lwe_array_in, ksk, lwe_dimension_in, lwe_dimension_out,
base_log, level_count, lwe_lower, lwe_upper, cutoff);
checkCudaErrors(cudaGetLastError());
cudaStreamSynchronize(*stream);
}

View File

@@ -7,6 +7,7 @@
#include <helper_cuda.h>
#endif
#include "../include/helper_cuda.h"
#include "linear_algebra.h"
#include "utils/kernel_dimensions.cuh"
@@ -45,6 +46,7 @@ host_cleartext_multiplication(void *v_stream, uint32_t gpu_index, T *output,
auto stream = static_cast<cudaStream_t *>(v_stream);
cleartext_multiplication<<<grid, thds, 0, *stream>>>(
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
checkCudaErrors(cudaGetLastError());
cudaStreamSynchronize(*stream);
}

View File

@@ -7,9 +7,11 @@
#include <helper_cuda.h>
#endif
#include "../include/helper_cuda.h"
#include "linear_algebra.h"
#include "utils/kernel_dimensions.cuh"
template <typename T>
__global__ void negation(T *output, T *input, uint32_t num_entries) {
@@ -39,6 +41,7 @@ __host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output,
auto stream = static_cast<cudaStream_t *>(v_stream);
negation<<<grid, thds, 0, *stream>>>(output, input, num_entries);
checkCudaErrors(cudaGetLastError());
cudaStreamSynchronize(*stream);
}