fix(gpu): add upper bound to lwe_chunk_size calculation

This commit is contained in:
Guillermo Oyarzun
2025-11-05 13:22:27 +01:00
committed by Agnès Leroy
parent 6f105cd82e
commit 12426573fa
7 changed files with 79 additions and 60 deletions

View File

@@ -97,12 +97,13 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint64_t full_sm_keybundle);
uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t level_count, uint64_t full_sm_keybundle);
template <typename Torus, class params>
uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
uint64_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t level_count,
uint64_t full_sm_keybundle);
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
int8_t *d_mem_keybundle = NULL;
@@ -110,7 +111,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
int8_t *d_mem_acc_step_two = NULL;
int8_t *d_mem_acc_cg = NULL;
int8_t *d_mem_acc_tbc = NULL;
uint32_t lwe_chunk_size;
uint64_t lwe_chunk_size;
double2 *keybundle_fft;
Torus *global_accumulator;
double2 *global_join_buffer;
@@ -120,7 +121,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
uint32_t input_lwe_ciphertext_count, uint64_t lwe_chunk_size,
PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
@@ -295,7 +296,7 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT> {
int8_t *d_mem_acc_step_two = NULL;
int8_t *d_mem_acc_cg = NULL;
int8_t *d_mem_acc_tbc = NULL;
uint32_t lwe_chunk_size;
uint64_t lwe_chunk_size;
double *keybundle_fft;
__uint128_t *global_accumulator;
double *global_join_buffer;
@@ -306,7 +307,7 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT> {
pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t lwe_chunk_size, PBS_VARIANT pbs_variant,
uint64_t lwe_chunk_size, PBS_VARIANT pbs_variant,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
cuda_set_device(gpu_index);

View File

@@ -30,7 +30,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
Torus *global_accumulator, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
uint64_t lwe_chunk_size, uint64_t keybundle_size_per_input,
int8_t *device_mem, uint64_t device_memory_size_per_block,
uint32_t num_many_lut, uint32_t lut_stride) {
@@ -193,7 +193,7 @@ template <typename Torus>
uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t lwe_chunk_size) {
uint32_t grouping_factor, uint64_t lwe_chunk_size) {
uint64_t buffer_size = 0;
buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
@@ -280,9 +280,9 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size =
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
polynomial_size, full_sm_keybundle);
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
level_count, full_sm_keybundle);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
@@ -317,12 +317,12 @@ __host__ void execute_cg_external_product_loop(
auto lwe_chunk_size = buffer->lwe_chunk_size;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
uint32_t keybundle_size_per_input =
uint64_t keybundle_size_per_input =
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
uint64_t chunk_size = std::min(
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
auto d_mem = buffer->d_mem_acc_cg;
auto keybundle_fft = buffer->keybundle_fft;

View File

@@ -456,9 +456,9 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
* benchmarking on an RTX 4090 GPU, balancing performance and resource use.
*/
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint64_t full_sm_keybundle) {
uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t level_count, uint64_t full_sm_keybundle) {
int max_blocks_per_sm;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
@@ -479,6 +479,22 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
check_cuda_error(cudaDeviceGetAttribute(
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
size_t total_mem, free_mem;
check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
// Estimate the size of one chunk
uint64_t size_one_chunk = max_num_pbs * polynomial_size *
(glwe_dimension + 1) * (glwe_dimension + 1) *
level_count * sizeof(Torus);
// We calculate the maximum number of chunks that can fit in 50% of the free
// memory. We don't want the PBS temp array to use more than 50% of the free
// memory; if one chunk doesn't fit in 50% of the free memory, we panic.
uint32_t max_num_chunks =
static_cast<uint32_t>(free_mem / (2 * size_one_chunk));
PANIC_IF_FALSE(
max_num_chunks > 0,
"Cuda error (multi-bit PBS): Not enough GPU memory to allocate PBS "
"temporary arrays.");
int x = num_sms * max_blocks_per_sm;
int count = 0;
@@ -500,7 +516,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
// applied only to few number of samples(8) because it can have a negative
// effect of over saturation.
if (max_num_pbs <= 8) {
return num_sms / 2;
return (max_num_chunks > num_sms / 2) ? num_sms / 2 : max_num_chunks;
}
#endif
@@ -514,8 +530,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
}
}
}
return divisor;
return (max_num_chunks > divisor) ? divisor : max_num_chunks;
}
template uint64_t scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(

View File

@@ -45,8 +45,8 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size,
uint32_t keybundle_size_per_input, int8_t *device_mem,
uint32_t level_count, uint32_t lwe_offset, uint64_t lwe_chunk_size,
uint64_t keybundle_size_per_input, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
extern __shared__ int8_t sharedmem[];
@@ -164,8 +164,8 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
uint32_t lwe_offset, uint32_t lwe_chunk_size,
uint32_t keybundle_size_per_input) {
uint32_t lwe_offset, uint64_t lwe_chunk_size,
uint64_t keybundle_size_per_input) {
constexpr uint32_t polynomial_size = 2048;
constexpr uint32_t grouping_factor = 4;
@@ -387,7 +387,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
double2 *join_buffer, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t iteration, uint32_t lwe_chunk_size,
uint32_t level_count, uint32_t iteration, uint64_t lwe_chunk_size,
int8_t *device_mem, uint64_t device_memory_size_per_block,
uint32_t num_many_lut, uint32_t lut_stride) {
// We use shared memory for the polynomials that are used often during the
@@ -658,9 +658,9 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size =
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
polynomial_size, full_sm_keybundle);
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
level_count, full_sm_keybundle);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
@@ -679,10 +679,10 @@ __host__ void execute_compute_keybundle(
cuda_set_device(gpu_index);
auto lwe_chunk_size = buffer->lwe_chunk_size;
uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
uint64_t chunk_size = std::min(
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
uint32_t keybundle_size_per_input =
uint64_t keybundle_size_per_input =
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
@@ -859,8 +859,9 @@ __host__ void host_multi_bit_programmable_bootstrap(
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
// Accumulate
uint32_t chunk_size = std::min(
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
uint32_t chunk_size =
std::min((uint32_t)lwe_chunk_size,
(lwe_dimension / grouping_factor) - lwe_offset);
for (uint32_t j = 0; j < chunk_size; j++) {
bool is_first_iter = (j + lwe_offset) == 0;
bool is_last_iter =

View File

@@ -307,8 +307,9 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
* benchmarking on an RTX 4090 GPU, balancing performance and resource use.
*/
template <typename Torus, class params>
uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
uint64_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t level_count,
uint64_t full_sm_keybundle) {
int max_blocks_per_sm;

View File

@@ -23,8 +23,8 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_128(
const InputTorus *__restrict__ lwe_input_indexes, double *keybundle_array,
const __uint128_t *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size,
uint32_t keybundle_size_per_input, int8_t *device_mem,
uint32_t level_count, uint32_t lwe_offset, uint64_t lwe_chunk_size,
uint64_t keybundle_size_per_input, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
extern __shared__ int8_t sharedmem[];
@@ -237,7 +237,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
const double *__restrict__ keybundle_array,
__uint128_t *global_accumulator, double *global_accumulator_fft,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t iteration, uint32_t lwe_chunk_size, int8_t *device_mem,
uint32_t iteration, uint64_t lwe_chunk_size, int8_t *device_mem,
uint64_t device_memory_size_per_block, uint32_t num_many_lut,
uint32_t lut_stride) {
// We use shared memory for the polynomials that are used often during the
@@ -372,7 +372,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
__uint128_t *global_accumulator, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
uint64_t lwe_chunk_size, uint64_t keybundle_size_per_input,
int8_t *device_mem, uint64_t device_memory_size_per_block,
uint32_t num_many_lut, uint32_t lut_stride) {
@@ -546,10 +546,10 @@ __host__ void execute_compute_keybundle_128(
cuda_set_device(gpu_index);
auto lwe_chunk_size = buffer->lwe_chunk_size;
uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
uint64_t chunk_size = std::min(
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
uint32_t keybundle_size_per_input =
uint64_t keybundle_size_per_input =
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2) * 4;
@@ -703,8 +703,9 @@ __host__ void host_multi_bit_programmable_bootstrap_128(
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
// Accumulate
uint32_t chunk_size = std::min(
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
uint64_t chunk_size =
std::min((uint32_t)lwe_chunk_size,
(lwe_dimension / grouping_factor) - lwe_offset);
for (uint32_t j = 0; j < chunk_size; j++) {
bool is_first_iter = (j + lwe_offset) == 0;
bool is_last_iter =
@@ -761,12 +762,12 @@ __host__ void execute_cg_external_product_loop_128(
auto lwe_chunk_size = buffer->lwe_chunk_size;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
uint32_t keybundle_size_per_input =
uint64_t keybundle_size_per_input =
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2) * 4;
uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
uint64_t chunk_size = std::min(
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
auto d_mem = buffer->d_mem_acc_cg;
auto keybundle_fft = buffer->keybundle_fft;
@@ -994,8 +995,8 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap_128(
}
auto lwe_chunk_size = get_lwe_chunk_size_128<InputTorus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size,
full_sm_keybundle);
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
level_count, full_sm_keybundle);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer_128<InputTorus, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
@@ -1079,8 +1080,8 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap_128(
}
auto lwe_chunk_size = get_lwe_chunk_size_128<InputTorus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size,
full_sm_keybundle);
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
level_count, full_sm_keybundle);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer_128<InputTorus, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,

View File

@@ -30,7 +30,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
Torus *global_accumulator, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
uint64_t lwe_chunk_size, uint64_t keybundle_size_per_input,
int8_t *device_mem, uint64_t device_memory_size_per_block,
bool support_dsm, uint32_t num_many_lut, uint32_t lut_stride) {
@@ -207,7 +207,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ keybundle_array, double2 *join_buffer,
Torus *global_accumulator, uint32_t lwe_dimension, uint32_t lwe_offset,
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
uint64_t lwe_chunk_size, uint64_t keybundle_size_per_input,
uint32_t num_many_lut, uint32_t lut_stride) {
constexpr uint32_t level_count = 1;
@@ -502,9 +502,9 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size =
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
polynomial_size, full_sm_keybundle);
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
level_count, full_sm_keybundle);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
@@ -544,12 +544,12 @@ __host__ void execute_tbc_external_product_loop(
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint32_t keybundle_size_per_input =
uint64_t keybundle_size_per_input =
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
uint64_t chunk_size = std::min(
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
auto d_mem = buffer->d_mem_acc_tbc;
auto keybundle_fft = buffer->keybundle_fft;