diff --git a/backends/concrete-cuda/implementation/src/bit_extraction.cuh b/backends/concrete-cuda/implementation/src/bit_extraction.cuh
index 3428e07f3..d3de451e7 100644
--- a/backends/concrete-cuda/implementation/src/bit_extraction.cuh
+++ b/backends/concrete-cuda/implementation/src/bit_extraction.cuh
@@ -130,12 +130,11 @@ __global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value,
 }
 
 template
-__host__ __device__ int
-get_buffer_size_extract_bits(uint32_t glwe_dimension, uint32_t lwe_dimension,
-                             uint32_t polynomial_size,
-                             uint32_t number_of_inputs) {
+__host__ __device__ uint64_t get_buffer_size_extract_bits(
+    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
+    uint32_t number_of_inputs) {
 
-  int buffer_size =
+  uint64_t buffer_size =
       sizeof(Torus) * number_of_inputs // lut_vector_indexes
       + ((glwe_dimension + 1) * polynomial_size) * sizeof(Torus) // lut_pbs
       + (glwe_dimension * polynomial_size + 1) *
@@ -159,7 +158,7 @@ scratch_extract_bits(void *v_stream, uint32_t gpu_index,
   cudaSetDevice(gpu_index);
   auto stream = static_cast(v_stream);
 
-  int buffer_size =
+  uint64_t buffer_size =
       get_buffer_size_extract_bits(glwe_dimension, lwe_dimension,
                                    polynomial_size, number_of_inputs) +
       get_buffer_size_bootstrap_low_latency(
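Note: this change (and the analogous ones in the files below) widens the scratch-size accumulators from `int` to `uint64_t`. A minimal standalone sketch of the failure mode, using illustrative parameter values that are not taken from this patch: on a 64-bit host the product is computed in `size_t`, but storing it in an `int` truncates it before it reaches the allocation.

```
#include <cstdint>
#include <cstdio>

int main() {
  // Illustrative values only; real parameter sets come from the caller.
  uint32_t polynomial_size = 8192, glwe_dimension = 1, level_count = 4;
  uint32_t input_lwe_ciphertext_count = 10000;

  // ~5.2 billion bytes, well above INT_MAX.
  uint64_t wide = sizeof(uint64_t) * uint64_t(polynomial_size) *
                  (glwe_dimension + 1) * level_count *
                  input_lwe_ciphertext_count;

  // Storing the same value in an int wraps it (on typical platforms,
  // to ~0.9 billion bytes), so a later allocation would be far too small.
  int narrow = static_cast<int>(wide);

  printf("uint64_t: %llu bytes, int: %d bytes\n",
         static_cast<unsigned long long>(wide), narrow);
  return 0;
}
```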
diff --git a/backends/concrete-cuda/implementation/src/bootstrap_amortized.cuh b/backends/concrete-cuda/implementation/src/bootstrap_amortized.cuh
index 9ad8c9d25..c8d1c7ac4 100644
--- a/backends/concrete-cuda/implementation/src/bootstrap_amortized.cuh
+++ b/backends/concrete-cuda/implementation/src/bootstrap_amortized.cuh
@@ -212,9 +212,8 @@ __global__ void device_bootstrap_amortized(
 }
 
 template
-__host__ __device__ int
-get_buffer_size_full_sm_bootstrap_amortized(uint32_t polynomial_size,
-                                            uint32_t glwe_dimension) {
+__host__ __device__ uint64_t get_buffer_size_full_sm_bootstrap_amortized(
+    uint32_t polynomial_size, uint32_t glwe_dimension) {
   return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
          sizeof(Torus) * polynomial_size *
              (glwe_dimension + 1) + // accumulator rotated
@@ -224,23 +223,23 @@ get_buffer_size_full_sm_bootstrap_amortized(uint32_t polynomial_size,
 }
 
 template
-__host__ __device__ int
+__host__ __device__ uint64_t
 get_buffer_size_partial_sm_bootstrap_amortized(uint32_t polynomial_size) {
   return sizeof(double2) * polynomial_size / 2; // accumulator fft
 }
 
 template
-__host__ __device__ int get_buffer_size_bootstrap_amortized(
+__host__ __device__ uint64_t get_buffer_size_bootstrap_amortized(
     uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
 
-  int full_sm = get_buffer_size_full_sm_bootstrap_amortized(
+  uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized(
       polynomial_size, glwe_dimension);
-  int partial_sm =
+  uint64_t partial_sm =
       get_buffer_size_partial_sm_bootstrap_amortized(polynomial_size);
-  int partial_dm = full_sm - partial_sm;
-  int full_dm = full_sm;
-  int device_mem = 0;
+  uint64_t partial_dm = full_sm - partial_sm;
+  uint64_t full_dm = full_sm;
+  uint64_t device_mem = 0;
   if (max_shared_memory < partial_sm) {
     device_mem = full_dm * input_lwe_ciphertext_count;
   } else if (max_shared_memory < full_sm) {
@@ -260,9 +259,9 @@ __host__ void scratch_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
   cudaSetDevice(gpu_index);
   auto stream = static_cast(v_stream);
 
-  int full_sm = get_buffer_size_full_sm_bootstrap_amortized(
+  uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized(
       polynomial_size, glwe_dimension);
-  int partial_sm =
+  uint64_t partial_sm =
       get_buffer_size_partial_sm_bootstrap_amortized(polynomial_size);
   if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
     cudaFuncSetAttribute(device_bootstrap_amortized,
@@ -279,7 +278,7 @@ __host__ void scratch_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
         cudaFuncCachePreferShared));
   }
   if (allocate_gpu_memory) {
-    int buffer_size = get_buffer_size_bootstrap_amortized(
+    uint64_t buffer_size = get_buffer_size_bootstrap_amortized(
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
         max_shared_memory);
     *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
@@ -297,15 +296,15 @@ __host__ void host_bootstrap_amortized(
     uint32_t lwe_idx, uint32_t max_shared_memory) {
   cudaSetDevice(gpu_index);
 
-  int SM_FULL = get_buffer_size_full_sm_bootstrap_amortized(
+  uint64_t SM_FULL = get_buffer_size_full_sm_bootstrap_amortized(
       polynomial_size, glwe_dimension);
 
-  int SM_PART =
+  uint64_t SM_PART =
       get_buffer_size_partial_sm_bootstrap_amortized(polynomial_size);
 
-  int DM_PART = SM_FULL - SM_PART;
+  uint64_t DM_PART = SM_FULL - SM_PART;
 
-  int DM_FULL = SM_FULL;
+  uint64_t DM_FULL = SM_FULL;
 
   auto stream = static_cast(v_stream);
diff --git a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cuh b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cuh
index 3a0d2d70b..17fb66a6a 100644
--- a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cuh
+++ b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cuh
@@ -135,7 +135,7 @@ __global__ void device_bootstrap_low_latency(
     Torus *lwe_array_in, double2 *bootstrapping_key, double2 *join_buffer,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, int8_t *device_mem,
-    int device_memory_size_per_block) {
+    uint64_t device_memory_size_per_block) {
 
   grid_group grid = this_grid();
 
@@ -144,14 +144,15 @@ __global__ void device_bootstrap_low_latency(
   // much faster than global memory
   extern __shared__ int8_t sharedmem[];
   int8_t *selected_memory;
-  int block_index =
-      blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
   uint32_t glwe_dimension = gridDim.y - 1;
 
-  if constexpr (SMD == FULLSM)
+  if constexpr (SMD == FULLSM) {
     selected_memory = sharedmem;
-  else
+  } else {
+    int block_index = blockIdx.x + blockIdx.y * gridDim.x +
+                      blockIdx.z * gridDim.x * gridDim.y;
     selected_memory = &device_mem[block_index * device_memory_size_per_block];
+  }
 
   // We always compute the pointer with most restrictive alignment to avoid
   // alignment issues
@@ -245,7 +246,7 @@ __global__ void device_bootstrap_low_latency(
 }
 
 template
-__host__ __device__ int
+__host__ __device__ uint64_t
 get_buffer_size_full_sm_bootstrap_low_latency(uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size + // accumulator_rotated
          sizeof(Torus) * polynomial_size + // accumulator
@@ -253,23 +254,23 @@ get_buffer_size_full_sm_bootstrap_low_latency(uint32_t polynomial_size) {
 }
 
 template
-__host__ __device__ int
+__host__ __device__ uint64_t
 get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
   return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
 }
 
 template
-__host__ __device__ int get_buffer_size_bootstrap_low_latency(
+__host__ __device__ uint64_t get_buffer_size_bootstrap_low_latency(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
 
-  int full_sm =
+  uint64_t full_sm =
       get_buffer_size_full_sm_bootstrap_low_latency(polynomial_size);
-  int partial_sm =
+  uint64_t partial_sm =
       get_buffer_size_partial_sm_bootstrap_low_latency(polynomial_size);
-  int partial_dm = full_sm - partial_sm;
-  int full_dm = full_sm;
-  int device_mem = 0;
+  uint64_t partial_dm = full_sm - partial_sm;
+  uint64_t full_dm = full_sm;
+  uint64_t device_mem = 0;
   if (max_shared_memory < partial_sm) {
     device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                  (glwe_dimension + 1);
@@ -277,9 +278,9 @@ __host__ __device__ int get_buffer_size_bootstrap_low_latency(
     device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
                  (glwe_dimension + 1);
   }
-  int buffer_size = device_mem + (glwe_dimension + 1) * level_count *
-                                     input_lwe_ciphertext_count *
-                                     polynomial_size / 2 * sizeof(double2);
+  uint64_t buffer_size = device_mem + (glwe_dimension + 1) * level_count *
+                                          input_lwe_ciphertext_count *
+                                          polynomial_size / 2 * sizeof(double2);
   return buffer_size + buffer_size % sizeof(double2);
 }
 
@@ -292,9 +293,9 @@ __host__ void scratch_bootstrap_low_latency(
   cudaSetDevice(gpu_index);
   auto stream = static_cast(v_stream);
 
-  int full_sm =
+  uint64_t full_sm =
       get_buffer_size_full_sm_bootstrap_low_latency(polynomial_size);
-  int partial_sm =
+  uint64_t partial_sm =
       get_buffer_size_partial_sm_bootstrap_low_latency(polynomial_size);
   if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
     check_cuda_error(cudaFuncSetAttribute(
@@ -313,7 +314,7 @@ __host__ void scratch_bootstrap_low_latency(
     check_cuda_error(cudaGetLastError());
   }
   if (allocate_gpu_memory) {
-    int buffer_size = get_buffer_size_bootstrap_low_latency(
+    uint64_t buffer_size = get_buffer_size_bootstrap_low_latency(
         glwe_dimension, polynomial_size, level_count,
         input_lwe_ciphertext_count, max_shared_memory);
     *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
@@ -339,15 +340,15 @@ __host__ void host_bootstrap_low_latency(
 
   // With SM each block corresponds to either the mask or body, no need to
   // duplicate data for each
-  int full_sm =
+  uint64_t full_sm =
       get_buffer_size_full_sm_bootstrap_low_latency(polynomial_size);
-  int partial_sm =
+  uint64_t partial_sm =
       get_buffer_size_partial_sm_bootstrap_low_latency(polynomial_size);
 
-  int full_dm = full_sm;
+  uint64_t full_dm = full_sm;
 
-  int partial_dm = full_dm - partial_sm;
+  uint64_t partial_dm = full_dm - partial_sm;
 
   int8_t *d_mem = pbs_buffer;
 
   double2 *buffer_fft =
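For context, the hunk at line 144 above only restructures the scratch-pointer selection: block_index is now computed solely on the device-memory path, and the widened uint64_t size keeps the indexing arithmetic 64-bit. A simplified sketch of that pattern follows; FULLSM and SMD mirror the names used in the diff, everything else (the enum stand-in and the helper name) is illustrative.

```
#include <cstdint>

enum SharedMemDegree { NOSM, PARTIALSM, FULLSM }; // illustrative stand-in

template <SharedMemDegree SMD>
__device__ int8_t *select_scratch(int8_t *device_mem,
                                  uint64_t bytes_per_block) {
  extern __shared__ int8_t sharedmem[];
  if constexpr (SMD == FULLSM) {
    // Whole working set fits in shared memory: no per-block indexing needed.
    return sharedmem;
  } else {
    // Spill to global memory: each block gets its own slice, so the linear
    // block index is only computed on this path.
    int block_index = blockIdx.x + blockIdx.y * gridDim.x +
                      blockIdx.z * gridDim.x * gridDim.y;
    return &device_mem[block_index * bytes_per_block];
  }
}
```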
diff --git a/backends/concrete-cuda/implementation/src/circuit_bootstrap.cuh b/backends/concrete-cuda/implementation/src/circuit_bootstrap.cuh
index 7e50c66bd..4c4c78b46 100644
--- a/backends/concrete-cuda/implementation/src/circuit_bootstrap.cuh
+++ b/backends/concrete-cuda/implementation/src/circuit_bootstrap.cuh
@@ -101,21 +101,23 @@ __global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
 }
 
 template
-__host__ __device__ int
-get_buffer_size_cbs(uint32_t glwe_dimension, uint32_t lwe_dimension,
-                    uint32_t polynomial_size, uint32_t level_count_cbs,
-                    uint32_t number_of_inputs) {
+__host__ __device__ uint64_t get_buffer_size_cbs(uint32_t glwe_dimension,
+                                                 uint32_t lwe_dimension,
+                                                 uint32_t polynomial_size,
+                                                 uint32_t level_count_cbs,
+                                                 uint32_t number_of_inputs) {
 
-  int buffer_size = number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
-                        (glwe_dimension * polynomial_size + 1) *
-                        sizeof(Torus) + // lwe_array_in_fp_ks_buffer
-                    number_of_inputs * level_count_cbs *
-                        (glwe_dimension * polynomial_size + 1) *
-                        sizeof(Torus) + // lwe_array_out_pbs_buffer
-                    number_of_inputs * level_count_cbs * (lwe_dimension + 1) *
-                        sizeof(Torus) + // lwe_array_in_shifted_buffer
-                    level_count_cbs * (glwe_dimension + 1) * polynomial_size *
-                        sizeof(Torus); // lut_vector_cbs
+  uint64_t buffer_size =
+      number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
+          (glwe_dimension * polynomial_size + 1) *
+          sizeof(Torus) + // lwe_array_in_fp_ks_buffer
+      number_of_inputs * level_count_cbs *
+          (glwe_dimension * polynomial_size + 1) *
+          sizeof(Torus) + // lwe_array_out_pbs_buffer
+      number_of_inputs * level_count_cbs * (lwe_dimension + 1) *
+          sizeof(Torus) + // lwe_array_in_shifted_buffer
+      level_count_cbs * (glwe_dimension + 1) * polynomial_size *
+          sizeof(Torus); // lut_vector_cbs
 
   return buffer_size + buffer_size % sizeof(double2);
 }
@@ -132,7 +134,7 @@ __host__ void scratch_circuit_bootstrap(
   int pbs_count = number_of_inputs * level_count_cbs;
   // allocate and initialize device pointers for circuit bootstrap
   if (allocate_gpu_memory) {
-    int buffer_size =
+    uint64_t buffer_size =
         get_buffer_size_cbs(glwe_dimension, lwe_dimension, polynomial_size,
                             level_count_cbs, number_of_inputs) +
diff --git a/backends/concrete-cuda/implementation/src/vertical_packing.cuh b/backends/concrete-cuda/implementation/src/vertical_packing.cuh
index 9af502147..65bc587f3 100644
--- a/backends/concrete-cuda/implementation/src/vertical_packing.cuh
+++ b/backends/concrete-cuda/implementation/src/vertical_packing.cuh
@@ -210,9 +210,8 @@ __global__ void device_batch_cmux(Torus *glwe_array_out, Torus *glwe_array_in,
 }
 
 template
-__host__ __device__ int
-get_memory_needed_per_block_cmux_tree(uint32_t glwe_dimension,
-                                      uint32_t polynomial_size) {
+__host__ __device__ uint64_t get_memory_needed_per_block_cmux_tree(
+    uint32_t glwe_dimension, uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // glwe_sub
          sizeof(double2) * polynomial_size / 2 *
              (glwe_dimension + 1) + // res_fft
@@ -220,27 +219,27 @@ get_memory_needed_per_block_cmux_tree(uint32_t glwe_dimension,
 }
 
 template
-__host__ __device__ int
-get_buffer_size_cmux_tree(uint32_t glwe_dimension, uint32_t polynomial_size,
-                          uint32_t level_count, uint32_t r, uint32_t tau,
-                          uint32_t max_shared_memory) {
+__host__ __device__ uint64_t get_buffer_size_cmux_tree(
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t r, uint32_t tau, uint32_t max_shared_memory) {
 
-  int memory_needed_per_block = get_memory_needed_per_block_cmux_tree(
-      glwe_dimension, polynomial_size);
-  int num_lut = (1 << r);
-  int ggsw_size = polynomial_size * (glwe_dimension + 1) *
-                  (glwe_dimension + 1) * level_count;
-  int glwe_size = (glwe_dimension + 1) * polynomial_size;
-  int device_mem = 0;
+  uint64_t memory_needed_per_block =
+      get_memory_needed_per_block_cmux_tree(glwe_dimension,
+                                            polynomial_size);
+  uint64_t num_lut = (1 << r);
+  uint64_t ggsw_size = polynomial_size * (glwe_dimension + 1) *
+                       (glwe_dimension + 1) * level_count;
+  uint64_t glwe_size = (glwe_dimension + 1) * polynomial_size;
+  uint64_t device_mem = 0;
   if (max_shared_memory < memory_needed_per_block) {
     device_mem = memory_needed_per_block * (1 << (r - 1)) * tau;
   }
   if (max_shared_memory < polynomial_size * sizeof(double)) {
     device_mem += polynomial_size * sizeof(double);
   }
-  int buffer_size = r * ggsw_size * sizeof(double) +
-                    num_lut * tau * glwe_size * sizeof(Torus) +
-                    num_lut * tau * glwe_size * sizeof(Torus) + device_mem;
+  uint64_t buffer_size = r * ggsw_size * sizeof(double) +
+                         num_lut * tau * glwe_size * sizeof(Torus) +
+                         num_lut * tau * glwe_size * sizeof(Torus) + device_mem;
 
   return buffer_size + buffer_size % sizeof(double2);
 }
@@ -253,8 +252,9 @@ scratch_cmux_tree(void *v_stream, uint32_t gpu_index, int8_t **cmux_tree_buffer,
   cudaSetDevice(gpu_index);
   auto stream = static_cast(v_stream);
 
-  int memory_needed_per_block = get_memory_needed_per_block_cmux_tree(
-      glwe_dimension, polynomial_size);
+  uint64_t memory_needed_per_block =
+      get_memory_needed_per_block_cmux_tree(glwe_dimension,
+                                            polynomial_size);
   if (max_shared_memory >= memory_needed_per_block) {
     check_cuda_error(cudaFuncSetAttribute(
         device_batch_cmux,
@@ -265,7 +265,7 @@ scratch_cmux_tree(void *v_stream, uint32_t gpu_index, int8_t **cmux_tree_buffer,
   }
 
   if (allocate_gpu_memory) {
-    int buffer_size = get_buffer_size_cmux_tree(
+    uint64_t buffer_size = get_buffer_size_cmux_tree(
         glwe_dimension, polynomial_size, level_count, r, tau,
         max_shared_memory);
     *cmux_tree_buffer =
@@ -308,8 +308,9 @@ host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
     return;
   }
 
-  int memory_needed_per_block = get_memory_needed_per_block_cmux_tree(
-      glwe_dimension, polynomial_size);
+  uint64_t memory_needed_per_block =
+      get_memory_needed_per_block_cmux_tree(glwe_dimension,
+                                            polynomial_size);
 
   dim3 thds(polynomial_size / params::opt, 1, 1);
@@ -467,7 +468,7 @@ __global__ void device_blind_rotation_and_sample_extraction(
 }
 
 template
-__host__ __device__ int
+__host__ __device__ uint64_t
 get_memory_needed_per_block_blind_rotation_sample_extraction(
     uint32_t glwe_dimension, uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size *
@@ -479,14 +480,14 @@ get_memory_needed_per_block_blind_rotation_sample_extraction(
 }
 
 template
-__host__ __device__ int get_buffer_size_blind_rotation_sample_extraction(
+__host__ __device__ uint64_t get_buffer_size_blind_rotation_sample_extraction(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory) {
 
-  int memory_needed_per_block =
+  uint64_t memory_needed_per_block =
       get_memory_needed_per_block_blind_rotation_sample_extraction(
           glwe_dimension, polynomial_size);
-  int device_mem = 0;
+  uint64_t device_mem = 0;
   if (max_shared_memory < memory_needed_per_block) {
     device_mem = memory_needed_per_block * tau;
   }
@@ -495,8 +496,8 @@ __host__ __device__ int get_buffer_size_blind_rotation_sample_extraction(
   }
   int ggsw_size = polynomial_size * (glwe_dimension + 1) *
                   (glwe_dimension + 1) * level_count;
-  int buffer_size = mbr_size * ggsw_size * sizeof(double) // d_ggsw_fft_in
-                    + device_mem;
+  uint64_t buffer_size = mbr_size * ggsw_size * sizeof(double) // d_ggsw_fft_in
+                         + device_mem;
 
   return buffer_size + buffer_size % sizeof(double2);
 }
@@ -509,7 +510,7 @@ __host__ void scratch_blind_rotation_sample_extraction(
   cudaSetDevice(gpu_index);
   auto stream = static_cast(v_stream);
 
-  int memory_needed_per_block =
+  uint64_t memory_needed_per_block =
       get_memory_needed_per_block_blind_rotation_sample_extraction(
           glwe_dimension, polynomial_size);
   if (max_shared_memory >= memory_needed_per_block) {
@@ -524,9 +525,10 @@ __host__ void scratch_blind_rotation_sample_extraction(
   }
 
   if (allocate_gpu_memory) {
-    int buffer_size = get_buffer_size_blind_rotation_sample_extraction(
-        glwe_dimension, polynomial_size, level_count, mbr_size, tau,
-        max_shared_memory);
+    uint64_t buffer_size =
+        get_buffer_size_blind_rotation_sample_extraction(
+            glwe_dimension, polynomial_size, level_count, mbr_size, tau,
+            max_shared_memory);
     *br_se_buffer =
         (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
     check_cuda_error(cudaGetLastError());
   }
@@ -542,7 +544,7 @@ __host__ void host_blind_rotate_and_sample_extraction(
   cudaSetDevice(gpu_index);
   auto stream = static_cast(v_stream);
 
-  int memory_needed_per_block =
+  uint64_t memory_needed_per_block =
       get_memory_needed_per_block_blind_rotation_sample_extraction(
           glwe_dimension, polynomial_size);
 
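Since the get_buffer_size_* helpers now report sizes that can exceed what fits in 32 bits, a caller can sensibly compare the result against free device memory before calling the corresponding scratch_* function. A hypothetical pre-flight check, not part of this patch (cudaMemGetInfo is the standard CUDA runtime API):

```
#include <cuda_runtime.h>
#include <cstdint>

// Hypothetical helper: returns true when a scratch buffer of
// `required_bytes` (as reported by a get_buffer_size_* helper) fits in the
// currently free device memory.
bool scratch_fits_on_device(uint64_t required_bytes) {
  size_t free_bytes = 0, total_bytes = 0;
  if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess)
    return false;
  return required_bytes <= static_cast<uint64_t>(free_bytes);
}
```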
diff --git a/backends/concrete-cuda/implementation/src/wop_bootstrap.cuh b/backends/concrete-cuda/implementation/src/wop_bootstrap.cuh
index eda2cf359..28fe5bab6 100644
--- a/backends/concrete-cuda/implementation/src/wop_bootstrap.cuh
+++ b/backends/concrete-cuda/implementation/src/wop_bootstrap.cuh
@@ -27,14 +27,15 @@ __global__ void device_build_lut(Torus *lut_out, Torus *lut_in,
 }
 
 template
-__host__ __device__ int
-get_buffer_size_cbs_vp(uint32_t glwe_dimension, uint32_t polynomial_size,
-                       uint32_t level_count_cbs, uint32_t tau,
-                       uint32_t number_of_inputs) {
+__host__ __device__ uint64_t get_buffer_size_cbs_vp(uint32_t glwe_dimension,
+                                                    uint32_t polynomial_size,
+                                                    uint32_t level_count_cbs,
+                                                    uint32_t tau,
+                                                    uint32_t number_of_inputs) {
   int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
                   (glwe_dimension + 1) * polynomial_size;
-  int buffer_size =
+  uint64_t buffer_size =
       number_of_inputs * level_count_cbs * sizeof(Torus) + // lut_vector_indexes
       number_of_inputs * ggsw_size * sizeof(Torus) +       // ggsw_out_cbs
       tau * (glwe_dimension + 1) * polynomial_size *
@@ -58,7 +59,7 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
       (Torus *)malloc(number_of_inputs * level_count_cbs * sizeof(Torus));
   uint32_t r = number_of_inputs - params::log2_degree;
   uint32_t mbr_size = number_of_inputs - r;
-  int buffer_size =
+  uint64_t buffer_size =
       get_buffer_size_cbs_vp(glwe_dimension, polynomial_size, level_count_cbs,
                              tau, number_of_inputs) +
       get_buffer_size_cbs(glwe_dimension, lwe_dimension, polynomial_size,
@@ -83,7 +84,7 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
     h_lut_vector_indexes[index] = index % level_count_cbs;
   }
   // lut_vector_indexes is the last buffer in the cbs_vp_buffer
-  int lut_vector_indexes_size =
+  uint64_t lut_vector_indexes_size =
       number_of_inputs * level_count_cbs * sizeof(Torus);
   int8_t *d_lut_vector_indexes =
       (int8_t *)*cbs_vp_buffer +
@@ -192,13 +193,13 @@ __host__ void host_circuit_bootstrap_vertical_packing(
 }
 
 template
-__host__ __device__ int
+__host__ __device__ uint64_t
 get_buffer_size_wop_pbs(uint32_t lwe_dimension,
                         uint32_t number_of_bits_of_message_including_padding) {
 
-  int buffer_size = (lwe_dimension + 1) *
-                    (number_of_bits_of_message_including_padding) *
-                    sizeof(Torus); // lwe_array_out_bit_extract
+  uint64_t buffer_size = (lwe_dimension + 1) *
+                         (number_of_bits_of_message_including_padding) *
+                         sizeof(Torus); // lwe_array_out_bit_extract
 
   return buffer_size + buffer_size % sizeof(double2);
 }
@@ -216,7 +217,7 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
   cudaSetDevice(gpu_index);
   auto stream = static_cast(v_stream);
 
-  int bit_extract_buffer_size =
+  uint64_t bit_extract_buffer_size =
       get_buffer_size_extract_bits(glwe_dimension, lwe_dimension,
                                    polynomial_size, number_of_inputs) +
       get_buffer_size_bootstrap_low_latency(
@@ -228,7 +229,7 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
   uint32_t r = cbs_vp_number_of_inputs - params::log2_degree;
   uint32_t mbr_size = cbs_vp_number_of_inputs - r;
   if (allocate_gpu_memory) {
-    int buffer_size =
+    uint64_t buffer_size =
         bit_extract_buffer_size +
         get_buffer_size_wop_pbs(
             lwe_dimension, number_of_bits_of_message_including_padding) +