Mirror of https://github.com/zama-ai/tfhe-rs.git (synced 2026-01-09 14:47:56 -05:00)
chore(gpu): add assert macro
committed by Andrei Stoian
parent 451458df97
commit 71f427de9e
@@ -19,6 +19,11 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
     std::abort();
   }
 }
+
+// The PANIC macro should be used to validate user-inputs to GPU functions
+// it will execute in all targets, including production settings
+// e.g., cudaMemCopy to the device should check that the destination pointer is
+// a device pointer
 #define PANIC(format, ...) \
   { \
     std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__, \
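As a usage illustration (not taken from this commit): a minimal sketch of the kind of user-input check the comment above describes, assuming PANIC is available from the backend's device header; the variable names are illustrative only.

// Sketch: guard a caller-supplied destination pointer before using it on the GPU.
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
if (attr.type != cudaMemoryTypeDevice)
  PANIC("Cuda error: destination is not a device pointer");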
@@ -26,6 +31,30 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
     std::abort(); \
   }
 
+// This is a generic assertion checking macro with user defined printf-style
+// message
+#define PANIC_IF_FALSE(cond, format, ...) \
+  do { \
+    if (!(cond)) { \
+      PANIC(format "\n\n %s\n", ##__VA_ARGS__, #cond); \
+    } \
+  } while (0)
+
+#ifndef GPU_ASSERTS_DISABLE
+// The GPU assert should be used to validate assumptions in algorithms,
+// for example, checking that two user-provided quantities have a certain
+// relationship or that the size of the buffer provided to a function is
+// sufficient when it is filled with some algorithm that depends on
+// user-provided inputs e.g., OPRF corrections buffer should not have a size
+// higher than the number of blocks in the datatype that is generated
+#define GPU_ASSERT(cond, format, ...) \
+  PANIC_IF_FALSE(cond, format, ##__VA_ARGS__)
+#else
+#define GPU_ASSERT(cond) \
+  do { \
+  } while (0)
+#endif
+
 uint32_t cuda_get_device();
 void cuda_set_device(uint32_t gpu_index);
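Again as an illustration (not part of the commit): a sketch of the intended division of labor between the two macros, assuming they and check_cuda_error are included from the backend's device header. The function below and its parameters are hypothetical; only PANIC_IF_FALSE, GPU_ASSERT and check_cuda_error come from the code above. PANIC_IF_FALSE always runs, so it guards caller-supplied values; GPU_ASSERT documents internal assumptions and disappears when GPU_ASSERTS_DISABLE is defined.

#include <cstdint>
#include <cuda_runtime.h>
// plus the header that defines PANIC_IF_FALSE / GPU_ASSERT / check_cuda_error (path assumed)

// Hypothetical GPU entry point: copy `num_blocks` radix blocks of user data
// into a device buffer that holds `capacity` blocks.
void scatter_blocks_to_gpu(uint64_t *d_dest, const uint64_t *h_src,
                           uint32_t num_blocks, uint32_t capacity,
                           uint32_t block_len, cudaStream_t stream) {
  // User-input validation: executes in every build, including production.
  PANIC_IF_FALSE(num_blocks <= capacity,
                 "Cuda error: num_blocks (%u) exceeds buffer capacity (%u)",
                 num_blocks, capacity);

  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, d_dest));
  PANIC_IF_FALSE(attr.type == cudaMemoryTypeDevice,
                 "Cuda error: invalid dest device pointer in host-to-GPU copy");

  // Internal algorithmic assumption: compiled out when GPU_ASSERTS_DISABLE is set.
  GPU_ASSERT(block_len % 2 == 0,
             "Cuda error: block_len (%u) is expected to be even", block_len);

  check_cuda_error(cudaMemcpyAsync(
      d_dest, h_src, (size_t)num_blocks * block_len * sizeof(uint64_t),
      cudaMemcpyHostToDevice, stream));
}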
@@ -54,9 +54,11 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
                            int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
                            uint32_t polynomial_size, uint32_t level_count,
                            uint32_t max_shared_memory) {
-  if (gpu_count != 1)
-    PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
-          "supported yet.")
+  PANIC_IF_FALSE(gpu_count == 1,
+                 "GPU error (batch_fft_ggsw_vector): multi-GPU execution on %d "
+                 "gpus is not supported yet.",
+                 gpu_count);
+
   cuda_set_device(gpu_indexes[0]);
 
   int shared_memory_size = sizeof(double) * polynomial_size;
@@ -124,8 +124,10 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
       num_blocks_per_sample, num_threads_x);
 
   int shared_mem = sizeof(Torus) * num_threads_y * num_threads_x;
-  if (num_blocks_per_sample > 65536)
-    PANIC("Cuda error (Keyswitch): number of blocks per sample is too large");
+  PANIC_IF_FALSE(
+      num_blocks_per_sample <= 65536,
+      "Cuda error (Keyswitch): number of blocks per sample (%d) is too large",
+      num_blocks_per_sample);
 
   // In multiplication of large integers (512, 1024, 2048), the number of
   // samples can be larger than 65536, so we need to set it in the first
@@ -204,8 +204,9 @@ __host__ void host_packing_keyswitch_lwe_list_to_glwe(
 
   // Shared memory requirement is 8192 bytes for 64-bit Torus elements
   uint32_t shared_mem_size = get_shared_mem_size_tgemm<Torus>();
-  if (shared_mem_size > 8192)
-    PANIC("GEMM kernel error: shared memory required might be too large");
+  // Sanity check: the shared memory size is a constant defined by the algorithm
+  GPU_ASSERT(shared_mem_size <= 8192,
+             "GEMM kernel error: shared memory required might be too large");
 
   tgemm<Torus><<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
       num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0, fp_ksk_array,
@@ -298,15 +298,14 @@ __host__ void host_improve_noise_modulus_switch(
     const double input_variance, const double r_sigma, const double bound,
     uint32_t log_modulus) {
 
-  if (lwe_size < 512) {
-    PANIC("The lwe_size is less than 512, this is not supported\n");
-    return;
-  }
+  PANIC_IF_FALSE(lwe_size >= 512,
+                 "The lwe_size (%d) is less than 512, this is not supported\n",
+                 lwe_size);
+  PANIC_IF_FALSE(
+      lwe_size <= 1024,
+      "The lwe_size (%d) is greater than 1024, this is not supported\n",
+      lwe_size);
 
-  if (lwe_size > 1024) {
-    PANIC("The lwe_size is greater than 1024, this is not supported\n");
-    return;
-  }
   cuda_set_device(gpu_index);
 
   // This reduction requires a power of two num of threads
@@ -196,14 +196,14 @@ void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
     return;
   cudaPointerAttributes attr_dest;
   check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  if (attr_dest.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
-  }
+  PANIC_IF_FALSE(
+      attr_dest.type == cudaMemoryTypeDevice,
+      "Cuda error: invalid dest device pointer in copy from GPU to GPU.");
   cudaPointerAttributes attr_src;
   check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  if (attr_src.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
-  }
+  PANIC_IF_FALSE(
+      attr_src.type == cudaMemoryTypeDevice,
+      "Cuda error: invalid src device pointer in copy from GPU to GPU.");
   cuda_set_device(gpu_index);
   if (attr_src.device == attr_dest.device) {
     check_cuda_error(
@@ -227,14 +227,14 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
     return;
   cudaPointerAttributes attr_dest;
   check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  if (attr_dest.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
-  }
+  PANIC_IF_FALSE(
+      attr_dest.type == cudaMemoryTypeDevice,
+      "Cuda error: invalid dest device pointer in copy from GPU to GPU.");
   cudaPointerAttributes attr_src;
   check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  if (attr_src.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
-  }
+  PANIC_IF_FALSE(
+      attr_src.type == cudaMemoryTypeDevice,
+      "Cuda error: invalid src device pointer in copy from GPU to GPU.");
   cuda_set_device(gpu_index);
   if (attr_src.device == attr_dest.device) {
     check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice));
@@ -20,12 +20,15 @@ __host__ void host_integer_radix_bitop_kb(
     void *const *bsks, Torus *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
 
-  if (lwe_array_out->num_radix_blocks != lwe_array_1->num_radix_blocks ||
-      lwe_array_out->num_radix_blocks != lwe_array_2->num_radix_blocks)
-    PANIC("Cuda error: input and output num radix blocks must be equal")
-  if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
-      lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
-    PANIC("Cuda error: input and output lwe dimension must be equal")
+  PANIC_IF_FALSE(
+      lwe_array_out->num_radix_blocks == lwe_array_1->num_radix_blocks &&
+          lwe_array_out->num_radix_blocks == lwe_array_2->num_radix_blocks,
+      "Cuda error: input and output num radix blocks must be equal");
+
+  PANIC_IF_FALSE(lwe_array_out->lwe_dimension == lwe_array_1->lwe_dimension &&
+                     lwe_array_out->lwe_dimension == lwe_array_2->lwe_dimension,
+                 "Cuda error: input and output lwe dimension must be equal");
+
   auto lut = mem_ptr->lut;
   uint64_t degrees[lwe_array_1->num_radix_blocks];
   if (mem_ptr->op == BITOP_TYPE::BITAND) {
@@ -25,10 +25,10 @@ __host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
   const uint32_t input_start_lwe_index =
       input->num_radix_blocks - output->num_radix_blocks;
 
-  if (input->num_radix_blocks <= output->num_radix_blocks) {
-    PANIC("Cuda error: input num blocks should be greater than output num "
-          "blocks");
-  }
+  PANIC_IF_FALSE(input->num_radix_blocks > output->num_radix_blocks,
+                 "Cuda error: input num blocks (%d) should be greater than "
+                 "output num blocks (%d)",
+                 input->num_radix_blocks, output->num_radix_blocks);
 
   copy_radix_ciphertext_slice_async<Torus>(
       streams[0], gpu_indexes[0], output, 0, output->num_radix_blocks, input,
@@ -70,9 +70,7 @@ __host__ void host_extend_radix_with_sign_msb(
   PUSH_RANGE("cast/extend")
   const uint32_t input_blocks = input->num_radix_blocks;
 
-  if (input_blocks == 0) {
-    PANIC("Cuda error: input blocks cannot be zero");
-  }
+  PANIC_IF_FALSE(input_blocks > 0, "Cuda error: input blocks cannot be zero");
 
   copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
                                            0, input_blocks, input, 0,
@@ -15,14 +15,18 @@ zero_out_if(cudaStream_t const *streams, uint32_t const *gpu_indexes,
             Torus *const *ksks,
             CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
             uint32_t num_radix_blocks) {
-  if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
-      lwe_array_input->num_radix_blocks < num_radix_blocks)
-    PANIC("Cuda error: input or output radix ciphertexts does not have enough "
-          "blocks")
-  if (lwe_array_out->lwe_dimension != lwe_array_input->lwe_dimension ||
-      lwe_array_input->lwe_dimension != lwe_condition->lwe_dimension)
-    PANIC("Cuda error: input and output radix ciphertexts must have the same "
-          "lwe dimension")
+  PANIC_IF_FALSE(
+      lwe_array_out->num_radix_blocks >= num_radix_blocks &&
+          lwe_array_input->num_radix_blocks >= num_radix_blocks,
+      "Cuda error: input or output radix ciphertexts does not have enough "
+      "blocks");
+
+  PANIC_IF_FALSE(
+      lwe_array_out->lwe_dimension == lwe_array_input->lwe_dimension &&
+          lwe_array_input->lwe_dimension == lwe_condition->lwe_dimension,
+      "Cuda error: input and output radix ciphertexts must have the same "
+      "lwe dimension");
+
   cuda_set_device(gpu_indexes[0]);
   auto params = mem_ptr->params;
 
@@ -101,17 +101,20 @@ __host__ void host_radix_blocks_rotate_right(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, CudaRadixCiphertextFFI *dst,
     CudaRadixCiphertextFFI *src, uint32_t rotations, uint32_t num_blocks) {
-  if (src == dst) {
-    PANIC("Cuda error (blocks_rotate_right): the source and destination "
-          "pointers should be different");
-  }
-  if (dst->lwe_dimension != src->lwe_dimension)
-    PANIC("Cuda error: input and output should have the same "
-          "lwe dimension")
-  if (dst->num_radix_blocks < num_blocks || src->num_radix_blocks < num_blocks)
-    PANIC("Cuda error: input and output should have more blocks than asked for "
-          "in the "
-          "function call")
+  PANIC_IF_FALSE(src != dst,
+                 "Cuda error (blocks_rotate_right): the source and destination "
+                 "pointers should be different");
+
+  PANIC_IF_FALSE(dst->lwe_dimension == src->lwe_dimension,
+                 "Cuda error: input and output should have the same "
+                 "lwe dimension");
+
+  PANIC_IF_FALSE(
+      dst->num_radix_blocks >= num_blocks &&
+          src->num_radix_blocks >= num_blocks,
+      "Cuda error: input and output should have more blocks than asked for "
+      "in the "
+      "function call");
 
   auto lwe_size = src->lwe_dimension + 1;
 
@@ -229,8 +229,9 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
     uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
     uint32_t lut_stride) {
 
-  if (base_log > 64)
-    PANIC("Cuda error (multi-bit PBS): base log should be <= 64")
+  PANIC_IF_FALSE(base_log <= 64,
+                 "Cuda error (multi-bit PBS): base log (%d) should be <= 64",
+                 base_log);
 
   pbs_buffer<uint64_t, MULTI_BIT> *buffer =
       (pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
@@ -31,13 +31,13 @@ __host__ void host_expand_without_verification(
                         : mem_ptr->casting_params.small_lwe_dimension);
   auto d_lwe_compact_input_indexes = mem_ptr->d_lwe_compact_input_indexes;
   auto d_body_id_per_compact_list = mem_ptr->d_body_id_per_compact_list;
-  if (sizeof(Torus) == 8) {
-    cuda_lwe_expand_64(streams[0], gpu_indexes[0], expanded_lwes,
-                       lwe_flattened_compact_array_in, lwe_dimension, num_lwes,
-                       d_lwe_compact_input_indexes, d_body_id_per_compact_list);
-
-  } else
-    PANIC("Cuda error: expand is only supported on 64 bits")
+  GPU_ASSERT(sizeof(Torus) == 8,
+             "Cuda error: expand is only supported on 64 bits");
+
+  cuda_lwe_expand_64(streams[0], gpu_indexes[0], expanded_lwes,
+                     lwe_flattened_compact_array_in, lwe_dimension, num_lwes,
+                     d_lwe_compact_input_indexes, d_body_id_per_compact_list);
 
   auto ksks = casting_keys;
   auto lwe_array_input = expanded_lwes;