mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-07 22:04:10 -05:00
fix(gpu): add upper bound to lwe_chunk_size calculation
This commit is contained in:
committed by
Agnès Leroy
parent
6f105cd82e
commit
12426573fa
@@ -97,12 +97,13 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
|
||||
template <typename Torus, class params>
|
||||
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size,
|
||||
uint64_t full_sm_keybundle);
|
||||
uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension,
|
||||
uint32_t level_count, uint64_t full_sm_keybundle);
|
||||
template <typename Torus, class params>
|
||||
uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint64_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t glwe_dimension, uint32_t level_count,
|
||||
uint64_t full_sm_keybundle);
|
||||
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
int8_t *d_mem_keybundle = NULL;
|
||||
@@ -110,7 +111,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
int8_t *d_mem_acc_step_two = NULL;
|
||||
int8_t *d_mem_acc_cg = NULL;
|
||||
int8_t *d_mem_acc_tbc = NULL;
|
||||
uint32_t lwe_chunk_size;
|
||||
uint64_t lwe_chunk_size;
|
||||
double2 *keybundle_fft;
|
||||
Torus *global_accumulator;
|
||||
double2 *global_join_buffer;
|
||||
@@ -120,7 +121,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
|
||||
pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint64_t lwe_chunk_size,
|
||||
PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
|
||||
uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
@@ -295,7 +296,7 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT> {
|
||||
int8_t *d_mem_acc_step_two = NULL;
|
||||
int8_t *d_mem_acc_cg = NULL;
|
||||
int8_t *d_mem_acc_tbc = NULL;
|
||||
uint32_t lwe_chunk_size;
|
||||
uint64_t lwe_chunk_size;
|
||||
double *keybundle_fft;
|
||||
__uint128_t *global_accumulator;
|
||||
double *global_join_buffer;
|
||||
@@ -306,7 +307,7 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT> {
|
||||
pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t lwe_chunk_size, PBS_VARIANT pbs_variant,
|
||||
uint64_t lwe_chunk_size, PBS_VARIANT pbs_variant,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
@@ -30,7 +30,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
Torus *global_accumulator, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
|
||||
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
|
||||
uint64_t lwe_chunk_size, uint64_t keybundle_size_per_input,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
@@ -193,7 +193,7 @@ template <typename Torus>
|
||||
uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t grouping_factor, uint32_t lwe_chunk_size) {
|
||||
uint32_t grouping_factor, uint64_t lwe_chunk_size) {
|
||||
|
||||
uint64_t buffer_size = 0;
|
||||
buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
|
||||
@@ -280,9 +280,9 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
auto lwe_chunk_size =
|
||||
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
|
||||
polynomial_size, full_sm_keybundle);
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
|
||||
level_count, full_sm_keybundle);
|
||||
uint64_t size_tracker = 0;
|
||||
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
@@ -317,12 +317,12 @@ __host__ void execute_cg_external_product_loop(
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
|
||||
uint32_t keybundle_size_per_input =
|
||||
uint64_t keybundle_size_per_input =
|
||||
lwe_chunk_size * level_count * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * (polynomial_size / 2);
|
||||
|
||||
uint32_t chunk_size =
|
||||
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
|
||||
uint64_t chunk_size = std::min(
|
||||
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
|
||||
|
||||
auto d_mem = buffer->d_mem_acc_cg;
|
||||
auto keybundle_fft = buffer->keybundle_fft;
|
||||
|
||||
@@ -456,9 +456,9 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
|
||||
* benchmarking on an RTX 4090 GPU, balancing performance and resource use.
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size,
|
||||
uint64_t full_sm_keybundle) {
|
||||
uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension,
|
||||
uint32_t level_count, uint64_t full_sm_keybundle) {
|
||||
|
||||
int max_blocks_per_sm;
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
@@ -479,6 +479,22 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
check_cuda_error(cudaDeviceGetAttribute(
|
||||
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
|
||||
|
||||
size_t total_mem, free_mem;
|
||||
check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
|
||||
// Estimate the size of one chunk
|
||||
uint64_t size_one_chunk = max_num_pbs * polynomial_size *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) *
|
||||
level_count * sizeof(Torus);
|
||||
|
||||
// We calculate the maximum number of chunks that can fit in the 50% of free
|
||||
// memory. We don't want the pbs temp array uses more than 50% of the free
|
||||
// memory if 1 chunk doesn't fit in the 50% of free memory we panic
|
||||
uint32_t max_num_chunks =
|
||||
static_cast<uint32_t>(free_mem / (2 * size_one_chunk));
|
||||
PANIC_IF_FALSE(
|
||||
max_num_chunks > 0,
|
||||
"Cuda error (multi-bit PBS): Not enough GPU memory to allocate PBS "
|
||||
"temporary arrays.");
|
||||
int x = num_sms * max_blocks_per_sm;
|
||||
int count = 0;
|
||||
|
||||
@@ -500,7 +516,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
// applied only to few number of samples(8) because it can have a negative
|
||||
// effect of over saturation.
|
||||
if (max_num_pbs <= 8) {
|
||||
return num_sms / 2;
|
||||
return (max_num_chunks > num_sms / 2) ? num_sms / 2 : max_num_chunks;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -514,8 +530,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return divisor;
|
||||
return (max_num_chunks > divisor) ? divisor : max_num_chunks;
|
||||
}
|
||||
|
||||
template uint64_t scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
|
||||
@@ -45,8 +45,8 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
|
||||
const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size,
|
||||
uint32_t keybundle_size_per_input, int8_t *device_mem,
|
||||
uint32_t level_count, uint32_t lwe_offset, uint64_t lwe_chunk_size,
|
||||
uint64_t keybundle_size_per_input, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block) {
|
||||
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
@@ -164,8 +164,8 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
|
||||
const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
|
||||
uint32_t lwe_offset, uint32_t lwe_chunk_size,
|
||||
uint32_t keybundle_size_per_input) {
|
||||
uint32_t lwe_offset, uint64_t lwe_chunk_size,
|
||||
uint64_t keybundle_size_per_input) {
|
||||
|
||||
constexpr uint32_t polynomial_size = 2048;
|
||||
constexpr uint32_t grouping_factor = 4;
|
||||
@@ -387,7 +387,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
|
||||
double2 *join_buffer, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t iteration, uint32_t lwe_chunk_size,
|
||||
uint32_t level_count, uint32_t iteration, uint64_t lwe_chunk_size,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
@@ -658,9 +658,9 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
auto lwe_chunk_size =
|
||||
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
|
||||
polynomial_size, full_sm_keybundle);
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
|
||||
level_count, full_sm_keybundle);
|
||||
uint64_t size_tracker = 0;
|
||||
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
@@ -679,10 +679,10 @@ __host__ void execute_compute_keybundle(
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
uint32_t chunk_size =
|
||||
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
|
||||
uint64_t chunk_size = std::min(
|
||||
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
|
||||
|
||||
uint32_t keybundle_size_per_input =
|
||||
uint64_t keybundle_size_per_input =
|
||||
lwe_chunk_size * level_count * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * (polynomial_size / 2);
|
||||
|
||||
@@ -859,8 +859,9 @@ __host__ void host_multi_bit_programmable_bootstrap(
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, level_count, lwe_offset);
|
||||
// Accumulate
|
||||
uint32_t chunk_size = std::min(
|
||||
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
|
||||
uint32_t chunk_size =
|
||||
std::min((uint32_t)lwe_chunk_size,
|
||||
(lwe_dimension / grouping_factor) - lwe_offset);
|
||||
for (uint32_t j = 0; j < chunk_size; j++) {
|
||||
bool is_first_iter = (j + lwe_offset) == 0;
|
||||
bool is_last_iter =
|
||||
|
||||
@@ -307,8 +307,9 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
|
||||
* benchmarking on an RTX 4090 GPU, balancing performance and resource use.
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint64_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t glwe_dimension, uint32_t level_count,
|
||||
uint64_t full_sm_keybundle) {
|
||||
|
||||
int max_blocks_per_sm;
|
||||
|
||||
@@ -23,8 +23,8 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_128(
|
||||
const InputTorus *__restrict__ lwe_input_indexes, double *keybundle_array,
|
||||
const __uint128_t *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size,
|
||||
uint32_t keybundle_size_per_input, int8_t *device_mem,
|
||||
uint32_t level_count, uint32_t lwe_offset, uint64_t lwe_chunk_size,
|
||||
uint64_t keybundle_size_per_input, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block) {
|
||||
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
@@ -237,7 +237,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
const double *__restrict__ keybundle_array,
|
||||
__uint128_t *global_accumulator, double *global_accumulator_fft,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t iteration, uint32_t lwe_chunk_size, int8_t *device_mem,
|
||||
uint32_t iteration, uint64_t lwe_chunk_size, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
@@ -372,7 +372,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
__uint128_t *global_accumulator, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
|
||||
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
|
||||
uint64_t lwe_chunk_size, uint64_t keybundle_size_per_input,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
@@ -546,10 +546,10 @@ __host__ void execute_compute_keybundle_128(
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
uint32_t chunk_size =
|
||||
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
|
||||
uint64_t chunk_size = std::min(
|
||||
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
|
||||
|
||||
uint32_t keybundle_size_per_input =
|
||||
uint64_t keybundle_size_per_input =
|
||||
lwe_chunk_size * level_count * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * (polynomial_size / 2) * 4;
|
||||
|
||||
@@ -703,8 +703,9 @@ __host__ void host_multi_bit_programmable_bootstrap_128(
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, level_count, lwe_offset);
|
||||
// Accumulate
|
||||
uint32_t chunk_size = std::min(
|
||||
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
|
||||
uint64_t chunk_size =
|
||||
std::min((uint32_t)lwe_chunk_size,
|
||||
(lwe_dimension / grouping_factor) - lwe_offset);
|
||||
for (uint32_t j = 0; j < chunk_size; j++) {
|
||||
bool is_first_iter = (j + lwe_offset) == 0;
|
||||
bool is_last_iter =
|
||||
@@ -761,12 +762,12 @@ __host__ void execute_cg_external_product_loop_128(
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
|
||||
uint32_t keybundle_size_per_input =
|
||||
uint64_t keybundle_size_per_input =
|
||||
lwe_chunk_size * level_count * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * (polynomial_size / 2) * 4;
|
||||
|
||||
uint32_t chunk_size =
|
||||
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
|
||||
uint64_t chunk_size = std::min(
|
||||
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
|
||||
|
||||
auto d_mem = buffer->d_mem_acc_cg;
|
||||
auto keybundle_fft = buffer->keybundle_fft;
|
||||
@@ -994,8 +995,8 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap_128(
|
||||
}
|
||||
|
||||
auto lwe_chunk_size = get_lwe_chunk_size_128<InputTorus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size,
|
||||
full_sm_keybundle);
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
|
||||
level_count, full_sm_keybundle);
|
||||
uint64_t size_tracker = 0;
|
||||
*buffer = new pbs_buffer_128<InputTorus, MULTI_BIT>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
@@ -1079,8 +1080,8 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap_128(
|
||||
}
|
||||
|
||||
auto lwe_chunk_size = get_lwe_chunk_size_128<InputTorus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size,
|
||||
full_sm_keybundle);
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
|
||||
level_count, full_sm_keybundle);
|
||||
uint64_t size_tracker = 0;
|
||||
*buffer = new pbs_buffer_128<InputTorus, MULTI_BIT>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
|
||||
@@ -30,7 +30,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
Torus *global_accumulator, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
|
||||
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
|
||||
uint64_t lwe_chunk_size, uint64_t keybundle_size_per_input,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block,
|
||||
bool support_dsm, uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
@@ -207,7 +207,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const double2 *__restrict__ keybundle_array, double2 *join_buffer,
|
||||
Torus *global_accumulator, uint32_t lwe_dimension, uint32_t lwe_offset,
|
||||
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
|
||||
uint64_t lwe_chunk_size, uint64_t keybundle_size_per_input,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
constexpr uint32_t level_count = 1;
|
||||
@@ -502,9 +502,9 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
auto lwe_chunk_size =
|
||||
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
|
||||
polynomial_size, full_sm_keybundle);
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
|
||||
level_count, full_sm_keybundle);
|
||||
uint64_t size_tracker = 0;
|
||||
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
@@ -544,12 +544,12 @@ __host__ void execute_tbc_external_product_loop(
|
||||
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
uint32_t keybundle_size_per_input =
|
||||
uint64_t keybundle_size_per_input =
|
||||
lwe_chunk_size * level_count * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * (polynomial_size / 2);
|
||||
|
||||
uint32_t chunk_size =
|
||||
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
|
||||
uint64_t chunk_size = std::min(
|
||||
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
|
||||
|
||||
auto d_mem = buffer->d_mem_acc_tbc;
|
||||
auto keybundle_fft = buffer->keybundle_fft;
|
||||
|
||||
Reference in New Issue
Block a user