chore(gpu): add assert macro

This commit is contained in:
Andrei Stoian
2025-08-04 15:39:13 +02:00
committed by Andrei Stoian
parent 451458df97
commit 71f427de9e
12 changed files with 109 additions and 67 deletions

View File

@@ -19,6 +19,11 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
std::abort();
}
}
+// The PANIC macro should be used to validate user inputs to GPU functions;
+// it executes in all build targets, including production settings. E.g., a
+// cudaMemcpy to the device should check that the destination pointer is a
+// device pointer.
#define PANIC(format, ...) \
{ \
std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__, \
@@ -26,6 +31,30 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
std::abort(); \
}
+// A generic assertion-checking macro with a user-defined printf-style
+// message.
+#define PANIC_IF_FALSE(cond, format, ...) \
+  do { \
+    if (!(cond)) { \
+      PANIC(format "\n\n %s\n", ##__VA_ARGS__, #cond); \
+    } \
+  } while (0)
+
+#ifndef GPU_ASSERTS_DISABLE
+// GPU_ASSERT should be used to validate assumptions made by algorithms:
+// for example, that two user-provided quantities have a certain
+// relationship, or that a buffer passed to a function is large enough when
+// it is filled by an algorithm whose output size depends on user-provided
+// inputs. E.g., the OPRF corrections buffer should not be larger than the
+// number of blocks in the generated datatype.
+#define GPU_ASSERT(cond, format, ...) \
+  PANIC_IF_FALSE(cond, format, ##__VA_ARGS__)
+#else
+// Disabled variant: must accept the same arguments so call sites still
+// compile.
+#define GPU_ASSERT(cond, format, ...) \
+  do { \
+  } while (0)
+#endif
uint32_t cuda_get_device();
void cuda_set_device(uint32_t gpu_index);
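
A minimal usage sketch of the two macros (the function and parameter names below are hypothetical, not part of this commit): PANIC_IF_FALSE validates user-provided inputs in every build, while GPU_ASSERT checks algorithmic invariants and compiles away when GPU_ASSERTS_DISABLE is defined.

// Sketch only; assumes the macros above are in scope via this header.
void copy_lut_to_gpu(uint64_t *d_dest, uint64_t const *h_src,
                     uint32_t num_entries, uint32_t dest_capacity) {
  // User-input validation: always active, even in production builds.
  PANIC_IF_FALSE(d_dest != nullptr && h_src != nullptr,
                 "Cuda error: null pointer passed to copy_lut_to_gpu");
  // Algorithmic invariant: removed when built with -DGPU_ASSERTS_DISABLE.
  GPU_ASSERT(num_entries <= dest_capacity,
             "copy_lut_to_gpu: %u entries exceed capacity %u", num_entries,
             dest_capacity);
  cudaMemcpy(d_dest, h_src, num_entries * sizeof(uint64_t),
             cudaMemcpyHostToDevice);
}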

View File

@@ -54,9 +54,11 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
uint32_t polynomial_size, uint32_t level_count,
uint32_t max_shared_memory) {
-  if (gpu_count != 1)
-    PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
-          "supported yet.")
+  PANIC_IF_FALSE(gpu_count == 1,
+                 "GPU error (batch_fft_ggsw_vector): multi-GPU execution on %u "
+                 "GPUs is not supported yet.",
+                 gpu_count);
cuda_set_device(gpu_indexes[0]);
int shared_memory_size = sizeof(double) * polynomial_size;

View File

@@ -124,8 +124,10 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
num_blocks_per_sample, num_threads_x);
int shared_mem = sizeof(Torus) * num_threads_y * num_threads_x;
-  if (num_blocks_per_sample > 65536)
-    PANIC("Cuda error (Keyswitch): number of blocks per sample is too large");
+  PANIC_IF_FALSE(
+      num_blocks_per_sample <= 65536,
+      "Cuda error (Keyswitch): number of blocks per sample (%d) is too large",
+      num_blocks_per_sample);
// In multiplication of large integers (512, 1024, 2048), the number of
// samples can be larger than 65536, so we need to set it in the first

View File

@@ -204,8 +204,9 @@ __host__ void host_packing_keyswitch_lwe_list_to_glwe(
// Shared memory requirement is 8192 bytes for 64-bit Torus elements
uint32_t shared_mem_size = get_shared_mem_size_tgemm<Torus>();
-  if (shared_mem_size > 8192)
-    PANIC("GEMM kernel error: shared memory required might be too large");
+  // Sanity check: the shared memory size is a constant defined by the
+  // algorithm.
+  GPU_ASSERT(shared_mem_size <= 8192,
+             "GEMM kernel error: shared memory required might be too large");
tgemm<Torus><<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0, fp_ksk_array,
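
GPU_ASSERT fits here rather than PANIC_IF_FALSE because the shared memory size is an internal constant of the algorithm, not a user input, so the check can be stripped from release builds. A sketch of that build-time behavior (hypothetical translation unit; the header name is assumed):

// With GPU_ASSERTS_DISABLE defined (e.g. -DGPU_ASSERTS_DISABLE in a release
// build), GPU_ASSERT expands to an empty statement while PANIC_IF_FALSE
// stays active.
#define GPU_ASSERTS_DISABLE
#include "device.h" // assumed header providing the macros

void launch_tgemm_checked(uint32_t shared_mem_size, uint32_t num_lwes) {
  PANIC_IF_FALSE(num_lwes > 0, "Cuda error: num_lwes must be non-zero");
  GPU_ASSERT(shared_mem_size <= 8192, // no-op in this build
             "GEMM kernel error: shared memory required might be too large");
}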

View File

@@ -298,15 +298,14 @@ __host__ void host_improve_noise_modulus_switch(
const double input_variance, const double r_sigma, const double bound,
uint32_t log_modulus) {
-  if (lwe_size < 512) {
-    PANIC("The lwe_size is less than 512, this is not supported\n");
-    return;
-  }
-  if (lwe_size > 1024) {
-    PANIC("The lwe_size is greater than 1024, this is not supported\n");
-    return;
-  }
+  PANIC_IF_FALSE(lwe_size >= 512,
+                 "The lwe_size (%u) is less than 512, which is not supported",
+                 lwe_size);
+  PANIC_IF_FALSE(
+      lwe_size <= 1024,
+      "The lwe_size (%u) is greater than 1024, which is not supported",
+      lwe_size);
cuda_set_device(gpu_index);
// This reduction requires a power of two num of threads
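
The power-of-two requirement noted above is exactly the kind of algorithmic assumption the new GPU_ASSERT targets; a sketch (hypothetical helper, not part of this commit):

// x is a power of two iff x != 0 and (x & (x - 1)) == 0.
inline void assert_power_of_two_threads(uint32_t num_threads) {
  GPU_ASSERT(num_threads != 0 && (num_threads & (num_threads - 1)) == 0,
             "reduction requires a power-of-two thread count, got %u",
             num_threads);
}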

View File

@@ -196,14 +196,14 @@ void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
return;
cudaPointerAttributes attr_dest;
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  if (attr_dest.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
-  }
+  PANIC_IF_FALSE(
+      attr_dest.type == cudaMemoryTypeDevice,
+      "Cuda error: invalid dest device pointer in copy from GPU to GPU.");
cudaPointerAttributes attr_src;
check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  if (attr_src.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
-  }
+  PANIC_IF_FALSE(
+      attr_src.type == cudaMemoryTypeDevice,
+      "Cuda error: invalid src device pointer in copy from GPU to GPU.");
cuda_set_device(gpu_index);
if (attr_src.device == attr_dest.device) {
check_cuda_error(
@@ -227,14 +227,14 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
return;
cudaPointerAttributes attr_dest;
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  if (attr_dest.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
-  }
+  PANIC_IF_FALSE(
+      attr_dest.type == cudaMemoryTypeDevice,
+      "Cuda error: invalid dest device pointer in copy from GPU to GPU.");
cudaPointerAttributes attr_src;
check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  if (attr_src.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
-  }
+  PANIC_IF_FALSE(
+      attr_src.type == cudaMemoryTypeDevice,
+      "Cuda error: invalid src device pointer in copy from GPU to GPU.");
cuda_set_device(gpu_index);
if (attr_src.device == attr_dest.device) {
check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice));
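
The same dest/src validation now appears four times across these two copy functions and could be factored into a helper; a sketch (the helper name is hypothetical, not part of this commit):

// Hypothetical helper: validate that a pointer refers to device memory
// before a GPU-to-GPU copy; `label` names the argument for the message.
static inline void panic_if_not_device_ptr(void const *ptr,
                                           const char *label) {
  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
  PANIC_IF_FALSE(attr.type == cudaMemoryTypeDevice,
                 "Cuda error: invalid %s device pointer in copy from GPU to "
                 "GPU.",
                 label);
}

Call sites would shrink to panic_if_not_device_ptr(dest, "dest") and panic_if_not_device_ptr(src, "src"); since both functions also compare attr_src.device with attr_dest.device afterwards, the helper would need to return the attributes rather than discard them.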

View File

@@ -20,12 +20,15 @@ __host__ void host_integer_radix_bitop_kb(
void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
-  if (lwe_array_out->num_radix_blocks != lwe_array_1->num_radix_blocks ||
-      lwe_array_out->num_radix_blocks != lwe_array_2->num_radix_blocks)
-    PANIC("Cuda error: input and output num radix blocks must be equal")
-  if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
-      lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
-    PANIC("Cuda error: input and output lwe dimension must be equal")
+  PANIC_IF_FALSE(
+      lwe_array_out->num_radix_blocks == lwe_array_1->num_radix_blocks &&
+          lwe_array_out->num_radix_blocks == lwe_array_2->num_radix_blocks,
+      "Cuda error: input and output num radix blocks must be equal");
+  PANIC_IF_FALSE(lwe_array_out->lwe_dimension == lwe_array_1->lwe_dimension &&
+                     lwe_array_out->lwe_dimension == lwe_array_2->lwe_dimension,
+                 "Cuda error: input and output lwe dimension must be equal");
auto lut = mem_ptr->lut;
uint64_t degrees[lwe_array_1->num_radix_blocks];
if (mem_ptr->op == BITOP_TYPE::BITAND) {

View File

@@ -25,10 +25,10 @@ __host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
const uint32_t input_start_lwe_index =
input->num_radix_blocks - output->num_radix_blocks;
-  if (input->num_radix_blocks <= output->num_radix_blocks) {
-    PANIC("Cuda error: input num blocks should be greater than output num "
-          "blocks");
-  }
+  PANIC_IF_FALSE(input->num_radix_blocks > output->num_radix_blocks,
+                 "Cuda error: input num blocks (%u) should be greater than "
+                 "output num blocks (%u)",
+                 input->num_radix_blocks, output->num_radix_blocks);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], output, 0, output->num_radix_blocks, input,
@@ -70,9 +70,7 @@ __host__ void host_extend_radix_with_sign_msb(
PUSH_RANGE("cast/extend")
const uint32_t input_blocks = input->num_radix_blocks;
-  if (input_blocks == 0) {
-    PANIC("Cuda error: input blocks cannot be zero");
-  }
+  PANIC_IF_FALSE(input_blocks > 0, "Cuda error: input blocks cannot be zero");
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
0, input_blocks, input, 0,

View File

@@ -15,14 +15,18 @@ zero_out_if(cudaStream_t const *streams, uint32_t const *gpu_indexes,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
-  if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
-      lwe_array_input->num_radix_blocks < num_radix_blocks)
-    PANIC("Cuda error: input or output radix ciphertexts does not have enough "
-          "blocks")
-  if (lwe_array_out->lwe_dimension != lwe_array_input->lwe_dimension ||
-      lwe_array_input->lwe_dimension != lwe_condition->lwe_dimension)
-    PANIC("Cuda error: input and output radix ciphertexts must have the same "
-          "lwe dimension")
+  PANIC_IF_FALSE(
+      lwe_array_out->num_radix_blocks >= num_radix_blocks &&
+          lwe_array_input->num_radix_blocks >= num_radix_blocks,
+      "Cuda error: input or output radix ciphertexts do not have enough "
+      "blocks");
+  PANIC_IF_FALSE(
+      lwe_array_out->lwe_dimension == lwe_array_input->lwe_dimension &&
+          lwe_array_input->lwe_dimension == lwe_condition->lwe_dimension,
+      "Cuda error: input and output radix ciphertexts must have the same "
+      "lwe dimension");
cuda_set_device(gpu_indexes[0]);
auto params = mem_ptr->params;

View File

@@ -101,17 +101,20 @@ __host__ void host_radix_blocks_rotate_right(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *dst,
CudaRadixCiphertextFFI *src, uint32_t rotations, uint32_t num_blocks) {
-  if (src == dst) {
-    PANIC("Cuda error (blocks_rotate_right): the source and destination "
-          "pointers should be different");
-  }
-  if (dst->lwe_dimension != src->lwe_dimension)
-    PANIC("Cuda error: input and output should have the same "
-          "lwe dimension")
-  if (dst->num_radix_blocks < num_blocks || src->num_radix_blocks < num_blocks)
-    PANIC("Cuda error: input and output should have more blocks than asked for "
-          "in the "
-          "function call")
+  PANIC_IF_FALSE(src != dst,
+                 "Cuda error (blocks_rotate_right): the source and destination "
+                 "pointers should be different");
+  PANIC_IF_FALSE(dst->lwe_dimension == src->lwe_dimension,
+                 "Cuda error: input and output should have the same "
+                 "lwe dimension");
+  PANIC_IF_FALSE(dst->num_radix_blocks >= num_blocks &&
+                     src->num_radix_blocks >= num_blocks,
+                 "Cuda error: input and output should have at least as many "
+                 "blocks as requested in the function call");
auto lwe_size = src->lwe_dimension + 1;

View File

@@ -229,8 +229,9 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
-  if (base_log > 64)
-    PANIC("Cuda error (multi-bit PBS): base log should be <= 64")
+  PANIC_IF_FALSE(base_log <= 64,
+                 "Cuda error (multi-bit PBS): base log (%u) should be <= 64",
+                 base_log);
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;

View File

@@ -31,13 +31,13 @@ __host__ void host_expand_without_verification(
: mem_ptr->casting_params.small_lwe_dimension);
auto d_lwe_compact_input_indexes = mem_ptr->d_lwe_compact_input_indexes;
auto d_body_id_per_compact_list = mem_ptr->d_body_id_per_compact_list;
-  if (sizeof(Torus) == 8) {
-    cuda_lwe_expand_64(streams[0], gpu_indexes[0], expanded_lwes,
-                       lwe_flattened_compact_array_in, lwe_dimension, num_lwes,
-                       d_lwe_compact_input_indexes, d_body_id_per_compact_list);
-  } else
-    PANIC("Cuda error: expand is only supported on 64 bits")
+  GPU_ASSERT(sizeof(Torus) == 8,
+             "Cuda error: expand is only supported on 64 bits");
+  cuda_lwe_expand_64(streams[0], gpu_indexes[0], expanded_lwes,
+                     lwe_flattened_compact_array_in, lwe_dimension, num_lwes,
+                     d_lwe_compact_input_indexes, d_body_id_per_compact_list);
auto ksks = casting_keys;
auto lwe_array_input = expanded_lwes;
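
Because Torus is a template parameter here, sizeof(Torus) is known at compile time, so an alternative is to reject unsupported instantiations at build time rather than assert at run time. A sketch (standalone illustration, not part of this commit):

#include <cstdint>

// static_assert fires when the template is instantiated for a non-64-bit
// Torus, instead of panicking (or silently proceeding with asserts
// disabled) at run time.
template <typename Torus>
void expand_checked(Torus const * /*in*/, Torus * /*out*/) {
  static_assert(sizeof(Torus) == 8,
                "expand is only supported on 64-bit Torus types");
  // ... dispatch to the 64-bit expansion kernel here ...
}

The trade-off: static_assert fails any build that instantiates the template for a 32-bit Torus, even on paths that never execute, while the run-time GPU_ASSERT disappears entirely under GPU_ASSERTS_DISABLE.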