refactor(cuda): introduce scratch for amortized PBS

This commit is contained in:
Agnes Leroy
2023-02-16 10:04:50 +01:00
committed by Agnès Leroy
parent 2a487ffbfd
commit 5cd0cb5d19
10 changed files with 612 additions and 247 deletions

View File

@@ -17,19 +17,40 @@ void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *test_vector,
void *test_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_test_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *test_vector,
void *test_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_test_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory);
void cleanup_cuda_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer);
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *test_vector,
@@ -67,26 +88,6 @@ void cuda_extract_bits_64(
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
uint32_t max_shared_memory);
void cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_out_pbs_buffer,
void *lwe_array_in_fp_ks_buffer, uint32_t delta_log,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
uint32_t number_of_samples, uint32_t max_shared_memory);
void cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_out_pbs_buffer,
void *lwe_array_in_fp_ks_buffer, uint32_t delta_log,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
uint32_t number_of_samples, uint32_t max_shared_memory);
void scratch_cuda_circuit_bootstrap_vertical_packing_32(
void *v_stream, uint32_t gpu_index, int8_t **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,

View File

@@ -0,0 +1,42 @@
// C API of the GPU circuit bootstrap: scratch allocation, execution and
// cleanup entry points, callable from non-CUDA translation units.
#ifndef CUDA_CIRCUIT_BOOTSTRAP_H
#define CUDA_CIRCUIT_BOOTSTRAP_H
#include <cstdint>
extern "C" {
/* Allocates the device scratch buffer for the 32-bit circuit bootstrap into
 * `*cbs_buffer` (when `allocate_gpu_memory` is set) and configures GPU
 * shared-memory options for the selected parameters. */
void scratch_cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory);
/* Same as the 32-bit variant, for 64-bit inputs. */
void scratch_cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory);
/* Runs the circuit bootstrap on a batch of 32-bit LWE ciphertexts, writing
 * GGSW ciphertexts to `ggsw_out`. `cbs_buffer` must come from the matching
 * scratch function with the same parameters. */
void cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory);
/* Same as the 32-bit variant, for 64-bit LWE ciphertexts. */
void cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory);
/* Frees the scratch buffer allocated by the scratch functions above. */
void cleanup_cuda_circuit_bootstrap(void *v_stream, uint32_t gpu_index,
int8_t **cbs_buffer);
}
#endif // CUDA_CIRCUIT_BOOTSTRAP_H

View File

@@ -3,7 +3,8 @@ set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h)
file(GLOB SOURCES
"*.cu"
"*.h"

View File

@@ -93,11 +93,16 @@ extern "C" void cuda_boolean_and_32(
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_amortized_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_2, bootstrapping_key, input_lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level_count,
lwe_buffer_2, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
@@ -196,11 +201,16 @@ extern "C" void cuda_boolean_nand_32(
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_amortized_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_3, bootstrapping_key, input_lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level_count,
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
@@ -299,11 +309,16 @@ extern "C" void cuda_boolean_nor_32(
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_amortized_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_3, bootstrapping_key, input_lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level_count,
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
@@ -394,11 +409,16 @@ extern "C" void cuda_boolean_or_32(
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_amortized_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_2, bootstrapping_key, input_lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level_count,
lwe_buffer_2, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
@@ -510,11 +530,16 @@ extern "C" void cuda_boolean_xor_32(
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_amortized_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_3, bootstrapping_key, input_lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level_count,
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
@@ -633,11 +658,16 @@ extern "C" void cuda_boolean_xnor_32(
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_amortized_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_4, bootstrapping_key, input_lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level_count,
lwe_buffer_4, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_4, stream, gpu_index);

View File

@@ -1,15 +1,113 @@
#include "bootstrap_amortized.cuh"
/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the amortized PBS on 32 bits inputs, into `pbs_buffer`. It also
 * configures SM options on the GPU in case FULLSM mode is going to be used.
 */
void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
                                         int8_t **pbs_buffer,
                                         uint32_t glwe_dimension,
                                         uint32_t polynomial_size,
                                         uint32_t input_lwe_ciphertext_count,
                                         uint32_t max_shared_memory,
                                         bool allocate_gpu_memory) {
  // Instantiate the templated scratch routine for the requested polynomial
  // degree (a compile-time template parameter, hence the explicit dispatch).
  // Degrees outside the supported set are silently ignored, matching the
  // other polynomial-size dispatchers in this file.
  if (polynomial_size == 256)
    scratch_bootstrap_amortized<uint32_t, int32_t, Degree<256>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 512)
    scratch_bootstrap_amortized<uint32_t, int32_t, Degree<512>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 1024)
    scratch_bootstrap_amortized<uint32_t, int32_t, Degree<1024>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 2048)
    scratch_bootstrap_amortized<uint32_t, int32_t, Degree<2048>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 4096)
    scratch_bootstrap_amortized<uint32_t, int32_t, Degree<4096>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 8192)
    scratch_bootstrap_amortized<uint32_t, int32_t, Degree<8192>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
}
/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the amortized PBS on 64 bits inputs, into `pbs_buffer`. It also
 * configures SM options on the GPU in case FULLSM mode is going to be used.
 */
void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
                                         int8_t **pbs_buffer,
                                         uint32_t glwe_dimension,
                                         uint32_t polynomial_size,
                                         uint32_t input_lwe_ciphertext_count,
                                         uint32_t max_shared_memory,
                                         bool allocate_gpu_memory) {
  // Instantiate the templated scratch routine for the requested polynomial
  // degree (a compile-time template parameter, hence the explicit dispatch).
  // Degrees outside the supported set are silently ignored, matching the
  // other polynomial-size dispatchers in this file.
  if (polynomial_size == 256)
    scratch_bootstrap_amortized<uint64_t, int64_t, Degree<256>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 512)
    scratch_bootstrap_amortized<uint64_t, int64_t, Degree<512>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 1024)
    scratch_bootstrap_amortized<uint64_t, int64_t, Degree<1024>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 2048)
    scratch_bootstrap_amortized<uint64_t, int64_t, Degree<2048>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 4096)
    scratch_bootstrap_amortized<uint64_t, int64_t, Degree<4096>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 8192)
    scratch_bootstrap_amortized<uint64_t, int64_t, Degree<8192>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
}
/* Perform the programmable bootstrapping on a batch of input u32 LWE
* ciphertexts. See the corresponding operation on 64 bits for more details.
*/
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(
("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32));
@@ -25,7 +123,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
host_bootstrap_amortized<uint32_t, Degree<256>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -33,7 +131,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
host_bootstrap_amortized<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -41,7 +139,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
host_bootstrap_amortized<uint32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -49,7 +147,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
host_bootstrap_amortized<uint32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -57,7 +155,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
host_bootstrap_amortized<uint32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -65,7 +163,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
host_bootstrap_amortized<uint32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -142,9 +240,10 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(
("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64));
@@ -160,7 +259,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
host_bootstrap_amortized<uint64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -168,7 +267,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
host_bootstrap_amortized<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -176,7 +275,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
host_bootstrap_amortized<uint64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -184,7 +283,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
host_bootstrap_amortized<uint64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -192,7 +291,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
host_bootstrap_amortized<uint64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -200,7 +299,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
host_bootstrap_amortized<uint64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -208,3 +307,14 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
break;
}
}
/*
* This cleanup function frees the data for the amortized PBS on GPU in
* pbs_buffer for 32 or 64 bits inputs.
*/
/*
 * Frees the device scratch buffer allocated by the amortized-PBS scratch
 * functions (32- or 64-bit variants). The free is stream-ordered on the
 * caller's stream. The caller's pointer is nulled afterwards so a stale
 * handle cannot be reused or freed twice.
 */
void cleanup_cuda_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
                                      int8_t **pbs_buffer) {
  auto stream = static_cast<cudaStream_t *>(v_stream);
  // Free memory (asynchronously, ordered on the caller's stream)
  cuda_drop_async(*pbs_buffer, stream, gpu_index);
  // Prevent a dangling pointer in the caller: a later accidental reuse or a
  // second cleanup call now sees nullptr instead of freed memory.
  *pbs_buffer = nullptr;
}

View File

@@ -213,27 +213,100 @@ __global__ void device_bootstrap_amortized(
glwe_dimension);
}
template <typename Torus>
__host__ __device__ int
get_buffer_size_full_sm_bootstrap_amortized(uint32_t polynomial_size,
                                            uint32_t glwe_dimension) {
  // Per-sample scratch size when the whole working set fits in shared
  // memory: two Torus-valued buffers (accumulator and rotated accumulator)
  // plus two complex buffers (accumulator fft and the fft working buffer).
  size_t torus_buffer =
      sizeof(Torus) * polynomial_size * (glwe_dimension + 1);
  size_t complex_buffer =
      sizeof(double2) * polynomial_size / 2 * (glwe_dimension + 1);
  return static_cast<int>(2 * torus_buffer + 2 * complex_buffer);
}
template <typename Torus>
__host__ __device__ int
get_buffer_size_partial_sm_bootstrap_amortized(uint32_t polynomial_size,
                                               uint32_t glwe_dimension) {
  // In PARTIALSM mode only the fft working buffer is kept in shared memory;
  // this is its per-sample size in bytes. (Torus is unused here but kept so
  // the signature parallels the other buffer-size helpers.)
  size_t fft_working_buffer =
      sizeof(double2) * polynomial_size / 2 * (glwe_dimension + 1);
  return static_cast<int>(fft_working_buffer);
}
template <typename Torus>
__host__ __device__ int get_buffer_size_bootstrap_amortized(
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
  // Total global-memory scratch required for a batch, depending on which
  // shared-memory variant the available shared memory allows.
  int full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
      polynomial_size, glwe_dimension);
  int partial_sm = get_buffer_size_partial_sm_bootstrap_amortized<Torus>(
      polynomial_size, glwe_dimension);
  // Per-sample device-memory need:
  //   NOSM      -> the whole working set spills to global memory,
  //   PARTIALSM -> everything except the fft buffer spills,
  //   FULLSM    -> nothing spills.
  int per_sample_dm;
  if (max_shared_memory < partial_sm)
    per_sample_dm = full_sm;
  else if (max_shared_memory < full_sm)
    per_sample_dm = full_sm - partial_sm;
  else
    per_sample_dm = 0;
  // NOTE(review): the int result may overflow for very large batches —
  // confirm upstream bounds on input_lwe_ciphertext_count.
  return per_sample_dm * input_lwe_ciphertext_count;
}
/*
 * Configures the shared-memory carve-out of the amortized-PBS kernel for the
 * variant that will be used (PARTIALSM or FULLSM) and, when
 * `allocate_gpu_memory` is set, allocates the device scratch buffer into
 * `*pbs_buffer` (stream-ordered allocation on the caller's stream).
 */
template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
                                          int8_t **pbs_buffer,
                                          uint32_t glwe_dimension,
                                          uint32_t polynomial_size,
                                          uint32_t input_lwe_ciphertext_count,
                                          uint32_t max_shared_memory,
                                          bool allocate_gpu_memory) {
  cudaSetDevice(gpu_index);
  auto stream = static_cast<cudaStream_t *>(v_stream);
  int full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
      polynomial_size, glwe_dimension);
  int partial_sm = get_buffer_size_partial_sm_bootstrap_amortized<Torus>(
      polynomial_size, glwe_dimension);
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
    // PARTIALSM: only the fft working buffer lives in shared memory.
    // These calls are now error-checked, consistently with the FULLSM
    // branch below (they were previously unchecked only here).
    check_cuda_error(cudaFuncSetAttribute(
        device_bootstrap_amortized<Torus, params, PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
    check_cuda_error(cudaFuncSetCacheConfig(
        device_bootstrap_amortized<Torus, params, PARTIALSM>,
        cudaFuncCachePreferShared));
  } else if (max_shared_memory >= partial_sm) {
    // FULLSM (max_shared_memory >= full_sm here): the whole per-sample
    // working set fits in shared memory. Devices with compute capability
    // 7.x+ require an explicit opt-in for >48 KB of dynamic shared memory.
    check_cuda_error(cudaFuncSetAttribute(
        device_bootstrap_amortized<Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
    check_cuda_error(cudaFuncSetCacheConfig(
        device_bootstrap_amortized<Torus, params, FULLSM>,
        cudaFuncCachePreferShared));
  }
  if (allocate_gpu_memory) {
    int buffer_size = get_buffer_size_bootstrap_amortized<Torus>(
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory);
    *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
    check_cuda_error(cudaGetLastError());
  }
}
template <typename Torus, class params>
__host__ void host_bootstrap_amortized(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
int SM_FULL =
sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
sizeof(Torus) * polynomial_size *
(glwe_dimension + 1) + // accumulator rotated
sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1) + // accumulator fft
sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1); // calculate buffer fft
int SM_FULL = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
int SM_PART = sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1); // calculate buffer fft
int SM_PART = get_buffer_size_partial_sm_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
int DM_PART = SM_FULL - SM_PART;
@@ -241,8 +314,6 @@ __host__ void host_bootstrap_amortized(
auto stream = static_cast<cudaStream_t *>(v_stream);
int8_t *d_mem;
// Create a 1-dimensional grid of threads
// where each block handles 1 sample and each thread
// handles opt polynomial coefficients
@@ -257,23 +328,15 @@ __host__ void host_bootstrap_amortized(
// from one of three templates (no use, partial use or full use
// of shared memory)
if (max_shared_memory < SM_PART) {
d_mem = (int8_t *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count,
stream, gpu_index);
device_bootstrap_amortized<Torus, params, NOSM><<<grid, thds, 0, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, DM_FULL);
} else if (max_shared_memory < SM_FULL) {
cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART);
cudaFuncSetCacheConfig(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
d_mem = (int8_t *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count,
stream, gpu_index);
device_bootstrap_amortized<Torus, params, PARTIALSM>
<<<grid, thds, SM_PART, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, DM_PART);
} else {
// For devices with compute capability 7.x a single thread block can
@@ -281,22 +344,13 @@ __host__ void host_bootstrap_amortized(
// device then has to be allocated dynamically.
// For lower compute capabilities, this call
// just does nothing and the amount of shared memory used is 48 KB
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
check_cuda_error(cudaFuncSetCacheConfig(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncCachePreferShared));
d_mem = (int8_t *)cuda_malloc_async(0, stream, gpu_index);
device_bootstrap_amortized<Torus, params, FULLSM>
<<<grid, thds, SM_FULL, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, 0);
}
check_cuda_error(cudaGetLastError());
cuda_drop_async(d_mem, stream, gpu_index);
}
template <typename Torus, class params>

View File

@@ -1,4 +1,99 @@
#include "circuit_bootstrap.cuh"
#include "circuit_bootstrap.h"
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the circuit bootstrap on 32 bits inputs, into `cbs_buffer`. It also
* configures SM options on the GPU in case FULLSM mode is going to be used.
*/
void scratch_cuda_circuit_bootstrap_32(
    void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t level_count_cbs, uint32_t number_of_inputs,
    uint32_t max_shared_memory, bool allocate_gpu_memory) {
  // Instantiate the templated scratch routine for the requested polynomial
  // degree (a compile-time template parameter, hence the explicit dispatch).
  // Unsupported degrees are silently ignored, as in the other dispatchers.
  if (polynomial_size == 512)
    scratch_circuit_bootstrap<uint32_t, int32_t, Degree<512>>(
        v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
        allocate_gpu_memory);
  else if (polynomial_size == 1024)
    scratch_circuit_bootstrap<uint32_t, int32_t, Degree<1024>>(
        v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
        allocate_gpu_memory);
  else if (polynomial_size == 2048)
    scratch_circuit_bootstrap<uint32_t, int32_t, Degree<2048>>(
        v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
        allocate_gpu_memory);
  else if (polynomial_size == 4096)
    scratch_circuit_bootstrap<uint32_t, int32_t, Degree<4096>>(
        v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
        allocate_gpu_memory);
  else if (polynomial_size == 8192)
    scratch_circuit_bootstrap<uint32_t, int32_t, Degree<8192>>(
        v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
        allocate_gpu_memory);
}
/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the circuit bootstrap on 64 bits inputs, into `cbs_buffer`. It also
 * configures SM options on the GPU in case FULLSM mode is going to be used.
 */
void scratch_cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 512:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 1024:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 2048:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 4096:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 8192:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
default:
break;
}
}
/*
* Perform circuit bootstrapping for the batch of 32 bit LWE ciphertexts.
@@ -6,13 +101,12 @@
*/
void cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_out_pbs_buffer,
void *lwe_array_in_fp_ks_buffer, uint32_t delta_log,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
uint32_t number_of_samples, uint32_t max_shared_memory) {
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
@@ -28,61 +122,51 @@ void cuda_circuit_bootstrap_32(
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_samples <= number_of_sm / 4. / 2. / level_bsk));
number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
switch (polynomial_size) {
case 512:
host_circuit_bootstrap<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer,
(uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 1024:
host_circuit_bootstrap<uint32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer,
(uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 2048:
host_circuit_bootstrap<uint32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer,
(uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 4096:
host_circuit_bootstrap<uint32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer,
(uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 8192:
host_circuit_bootstrap<uint32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer,
(uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
default:
@@ -96,31 +180,20 @@ void cuda_circuit_bootstrap_32(
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - 'ggsw_out' output batch of ggsw with size:
* 'number_of_samples' * 'level_cbs' * ('glwe_dimension' + 1)^2 *
* 'number_of_inputs' * 'level_cbs' * ('glwe_dimension' + 1)^2 *
* polynomial_size * sizeof(u64)
* - 'lwe_array_in' input batch of lwe ciphertexts, with size:
* 'number_of_samples' * '(lwe_dimension' + 1) * sizeof(u64)
* 'number_of_inputs' * '(lwe_dimension' + 1) * sizeof(u64)
* - 'fourier_bsk' bootstrapping key in fourier domain with size:
* 'lwe_dimension' * 'level_bsk' * ('glwe_dimension' + 1)^2 *
* 'polynomial_size' / 2 * sizeof(double2)
* - 'fp_ksk_array' batch of fp-keyswitch keys with size:
* ('polynomial_size' + 1) * 'level_pksk' * ('glwe_dimension' + 1)^2 *
* 'polynomial_size' * sizeof(u64)
* The following 5 parameters are used during calculations, they are not actual
* inputs of the function, they are just allocated memory for calculation
* - 'cbs_buffer': buffer used during calculations, it is not an actual
* inputs of the function, just allocated memory for calculation
* process, like this, memory can be allocated once and can be used as much
* as needed for different calls of circuit_bootstrap function
* - 'lwe_array_in_shifted_buffer' with size:
* 'number_of_samples' * 'level_cbs' * ('lwe_dimension' + 1) * sizeof(u64)
* - 'lut_vector' with size:
* 'level_cbs' * ('glwe_dimension' + 1) * 'polynomial_size' * sizeof(u64)
* - 'lut_vector_indexes' stores the index corresponding to which test
* vector to use
* - 'lwe_array_out_pbs_buffer' with size
* 'number_of_samples' * 'level_cbs' * ('polynomial_size' + 1) * sizeof(u64)
* - 'lwe_array_in_fp_ks_buffer' with size
* 'number_of_samples' * 'level_cbs' * ('glwe_dimension' + 1) *
* ('polynomial_size' + 1) * sizeof(u64)
*
* This function calls a wrapper to a device kernel that performs the
* circuit bootstrap. The kernel is templatized based on integer discretization
@@ -128,13 +201,12 @@ void cuda_circuit_bootstrap_32(
*/
void cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_out_pbs_buffer,
void *lwe_array_in_fp_ks_buffer, uint32_t delta_log,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
uint32_t number_of_samples, uint32_t max_shared_memory) {
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
@@ -150,65 +222,66 @@ void cuda_circuit_bootstrap_64(
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_samples <= number_of_sm / 4. / 2. / level_bsk));
number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
// The number of samples should be lower than the number of streaming
switch (polynomial_size) {
case 512:
host_circuit_bootstrap<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer,
(uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 1024:
host_circuit_bootstrap<uint64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer,
(uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 2048:
host_circuit_bootstrap<uint64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer,
(uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 4096:
host_circuit_bootstrap<uint64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer,
(uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 8192:
host_circuit_bootstrap<uint64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer,
(uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
default:
break;
}
}
/*
* This cleanup function frees the data for the circuit bootstrap on GPU in
* cbs_buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_circuit_bootstrap(void *v_stream, uint32_t gpu_index,
int8_t **cbs_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*cbs_buffer, stream, gpu_index);
}

View File

@@ -1,5 +1,5 @@
#ifndef CBS_H
#define CBS_H
#ifndef CBS_CUH
#define CBS_CUH
#include "bit_extraction.cuh"
#include "bootstrap.h"
@@ -96,6 +96,51 @@ __global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
}
}
template <typename Torus>
__host__ __device__ int
get_buffer_size_cbs(uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_inputs) {
return number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
(polynomial_size + 1) *
sizeof(Torus) + // lwe_array_in_fp_ks_buffer
number_of_inputs * level_count_cbs * (polynomial_size + 1) *
sizeof(Torus) + // lwe_array_out_pbs_buffer
number_of_inputs * level_count_cbs * (lwe_dimension + 1) *
sizeof(Torus) + // lwe_array_in_shifted_buffer
level_count_cbs * (glwe_dimension + 1) * polynomial_size *
sizeof(Torus); // lut_vector_cbs
}
template <typename Torus, typename STorus, typename params>
__host__ void scratch_circuit_bootstrap(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
int pbs_count = number_of_inputs * level_count_cbs;
// allocate and initialize device pointers for circuit bootstrap and vertical
// packing
if (allocate_gpu_memory) {
int buffer_size =
get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs,
number_of_inputs) +
get_buffer_size_bootstrap_amortized<Torus>(
glwe_dimension, polynomial_size, pbs_count, max_shared_memory);
*cbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
}
scratch_bootstrap_amortized<Torus, STorus, params>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, polynomial_size,
pbs_count, max_shared_memory, false);
}
/*
* Host function for cuda circuit bootstrap.
* It executes device functions in specific order and manages
@@ -104,24 +149,37 @@ __global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
template <typename Torus, class params>
__host__ void host_circuit_bootstrap(
void *v_stream, uint32_t gpu_index, Torus *ggsw_out, Torus *lwe_array_in,
double2 *fourier_bsk, Torus *fp_ksk_array,
Torus *lwe_array_in_shifted_buffer, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_out_pbs_buffer,
Torus *lwe_array_in_fp_ks_buffer, uint32_t delta_log,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
uint32_t number_of_samples, uint32_t max_shared_memory) {
double2 *fourier_bsk, Torus *fp_ksk_array, Torus *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t ciphertext_n_bits = sizeof(Torus) * 8;
uint32_t lwe_size = lwe_dimension + 1;
int pbs_count = number_of_samples * level_cbs;
int pbs_count = number_of_inputs * level_cbs;
dim3 blocks(level_cbs, number_of_samples, 1);
dim3 blocks(level_cbs, number_of_inputs, 1);
int threads = 256;
Torus *lwe_array_in_fp_ks_buffer = (Torus *)cbs_buffer;
Torus *lwe_array_out_pbs_buffer =
(Torus *)lwe_array_in_fp_ks_buffer +
(ptrdiff_t)(number_of_inputs * level_cbs * (glwe_dimension + 1) *
(polynomial_size + 1));
Torus *lwe_array_in_shifted_buffer =
(Torus *)lwe_array_out_pbs_buffer +
(ptrdiff_t)(number_of_inputs * level_cbs * (polynomial_size + 1));
Torus *lut_vector =
(Torus *)lwe_array_in_shifted_buffer +
(ptrdiff_t)(number_of_inputs * level_cbs * (lwe_dimension + 1));
int8_t *pbs_buffer =
(int8_t *)lut_vector + (ptrdiff_t)(level_cbs * (glwe_dimension + 1) *
polynomial_size * sizeof(Torus));
// Shift message LSB on padding bit, at this point we expect to have messages
// with only 1 bit of information
shift_lwe_cbs<Torus, params><<<blocks, threads, 0, *stream>>>(
@@ -143,7 +201,7 @@ __host__ void host_circuit_bootstrap(
// MSB and no bit of padding
host_bootstrap_amortized<Torus, params>(
v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_vector,
lut_vector_indexes, lwe_array_in_shifted_buffer, fourier_bsk,
lut_vector_indexes, lwe_array_in_shifted_buffer, fourier_bsk, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, level_bsk,
pbs_count, level_cbs, 0, max_shared_memory);
@@ -161,4 +219,4 @@ __host__ void host_circuit_bootstrap(
level_pksk, pbs_count * (glwe_dimension + 1), glwe_dimension + 1);
}
#endif // CBS_H
#endif // CBS_CUH

View File

@@ -1,7 +1,6 @@
#ifndef VERTICAL_PACKING_CUH
#define VERTICAL_PACKING_CUH
#include "../include/vertical_packing.h"
#include "bootstrap.h"
#include "complex/operations.cuh"
#include "crypto/gadget.cuh"

View File

@@ -28,26 +28,17 @@ __global__ void device_build_lut(Torus *lut_out, Torus *lut_in,
template <typename Torus>
__host__ __device__ int
get_buffer_size_cbs_vp(uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau) {
get_buffer_size_cbs_vp(uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t tau,
uint32_t number_of_inputs) {
int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size;
return number_of_inputs * level_count_cbs *
sizeof(Torus) // lut_vector_indexes
+ number_of_inputs * ggsw_size * sizeof(Torus) // ggsw_out
+
number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
(polynomial_size + 1) * sizeof(Torus) // lwe_array_in_fp_ks_buffer
+ number_of_inputs * level_count_cbs * (polynomial_size + 1) *
sizeof(Torus) // lwe_array_out_pbs_buffer
+ number_of_inputs * level_count_cbs * (lwe_dimension + 1) *
sizeof(Torus) // lwe_array_in_shifted_buffer
+ level_count_cbs * (glwe_dimension + 1) * polynomial_size *
sizeof(Torus) // lut_vector_cbs
+ tau * (glwe_dimension + 1) * polynomial_size *
sizeof(Torus); // glwe_array_out
sizeof(Torus) + // lut_vector_indexes
number_of_inputs * ggsw_size * sizeof(Torus) + // ggsw_out_cbs
tau * (glwe_dimension + 1) * polynomial_size *
sizeof(Torus); // glwe_array_out_cmux_tree
}
template <typename Torus, typename STorus, typename params>
@@ -70,9 +61,14 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
// packing
if (allocate_gpu_memory) {
int buffer_size =
get_buffer_size_cbs_vp<Torus>(glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs,
number_of_inputs, tau) +
get_buffer_size_cbs_vp<Torus>(glwe_dimension, polynomial_size,
level_count_cbs, tau, number_of_inputs) +
get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs,
number_of_inputs) +
get_buffer_size_bootstrap_amortized<Torus>(
glwe_dimension, polynomial_size, number_of_inputs * level_count_cbs,
max_shared_memory) +
get_buffer_size_cmux_tree<Torus>(glwe_dimension, polynomial_size,
level_count_cbs, r, tau,
max_shared_memory) +
@@ -96,6 +92,10 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
uint32_t bits = sizeof(Torus) * 8;
*cbs_delta_log = (bits - 1);
scratch_circuit_bootstrap<Torus, STorus, params>(
v_stream, gpu_index, cbs_vp_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
false);
scratch_cmux_tree<Torus, STorus, params>(
v_stream, gpu_index, cbs_vp_buffer, glwe_dimension, polynomial_size,
level_count_cbs, r, tau, max_shared_memory, false);
@@ -107,13 +107,8 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
/*
* Cleanup functions free the necessary data on the GPU and on the CPU.
* Data that lives on the CPU is prefixed with `h_`. This cleanup function thus
* frees the data for the circuit bootstrap and vertical packing on GPU:
* - ggsw_out
* - lwe_array_in_fp_ks_buffer
* - lwe_array_out_pbs_buffer
* - lwe_array_in_shifted buffer
* - lut_vector_cbs
* - lut_vector_indexes
* frees the data for the circuit bootstrap and vertical packing on GPU
* contained in cbs_vp_buffer
*/
__host__ void
cleanup_circuit_bootstrap_vertical_packing(void *v_stream, uint32_t gpu_index,
@@ -139,32 +134,25 @@ __host__ void host_circuit_bootstrap_vertical_packing(
int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size;
Torus *lut_vector_indexes = (Torus *)cbs_vp_buffer;
Torus *ggsw_out = (Torus *)lut_vector_indexes +
(ptrdiff_t)(number_of_inputs * level_count_cbs);
Torus *lwe_array_in_fp_ks_buffer =
(Torus *)ggsw_out + (ptrdiff_t)(number_of_inputs * ggsw_size);
Torus *lwe_array_out_pbs_buffer =
(Torus *)lwe_array_in_fp_ks_buffer +
(ptrdiff_t)(number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
(polynomial_size + 1));
Torus *lwe_array_in_shifted_buffer =
(Torus *)lwe_array_out_pbs_buffer +
(ptrdiff_t)(number_of_inputs * level_count_cbs * (polynomial_size + 1));
Torus *lut_vector_cbs =
(Torus *)lwe_array_in_shifted_buffer +
(ptrdiff_t)(number_of_inputs * level_count_cbs * (lwe_dimension + 1));
Torus *glwe_array_out =
(Torus *)lut_vector_cbs +
(ptrdiff_t)(level_count_cbs * (glwe_dimension + 1) * polynomial_size);
Torus *lut_vector_indexes = (Torus *)cbs_vp_buffer;
int8_t *cbs_buffer =
(int8_t *)lut_vector_indexes +
(ptrdiff_t)(number_of_inputs * level_count_cbs * sizeof(Torus));
int8_t *ggsw_out_cbs =
cbs_buffer +
(ptrdiff_t)(get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs,
number_of_inputs) +
get_buffer_size_bootstrap_amortized<Torus>(
glwe_dimension, polynomial_size,
number_of_inputs * level_count_cbs, max_shared_memory));
host_circuit_bootstrap<Torus, params>(
v_stream, gpu_index, ggsw_out, lwe_array_in, fourier_bsk, cbs_fpksk,
lwe_array_in_shifted_buffer, lut_vector_cbs, lut_vector_indexes,
lwe_array_out_pbs_buffer, lwe_array_in_fp_ks_buffer, cbs_delta_log,
polynomial_size, glwe_dimension, lwe_dimension, level_count_bsk,
base_log_bsk, level_count_pksk, base_log_pksk, level_count_cbs,
base_log_cbs, number_of_inputs, max_shared_memory);
v_stream, gpu_index, (Torus *)ggsw_out_cbs, lwe_array_in, fourier_bsk,
cbs_fpksk, lut_vector_indexes, cbs_buffer, cbs_delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_count_bsk, base_log_bsk,
level_count_pksk, base_log_pksk, level_count_cbs, base_log_cbs,
number_of_inputs, max_shared_memory);
check_cuda_error(cudaGetLastError());
// number_of_inputs = tau * p is the total number of GGSWs
@@ -172,29 +160,33 @@ __host__ void host_circuit_bootstrap_vertical_packing(
// lsb GGSW is for the last blind rotation.
uint32_t r = number_of_inputs - params::log2_degree;
int8_t *cmux_tree_buffer =
(int8_t *)glwe_array_out +
(ptrdiff_t)(tau * (glwe_dimension + 1) * polynomial_size * sizeof(Torus));
ggsw_out_cbs + (ptrdiff_t)(number_of_inputs * ggsw_size * sizeof(Torus));
int8_t *glwe_array_out_cmux_tree =
cmux_tree_buffer + (ptrdiff_t)(get_buffer_size_cmux_tree<Torus>(
glwe_dimension, polynomial_size, level_count_cbs,
r, tau, max_shared_memory));
// CMUX Tree
// r = tau * p - log2(N)
host_cmux_tree<Torus, STorus, params>(
v_stream, gpu_index, glwe_array_out, ggsw_out, lut_vector,
cmux_tree_buffer, glwe_dimension, polynomial_size, base_log_cbs,
level_count_cbs, r, tau, max_shared_memory);
v_stream, gpu_index, (Torus *)glwe_array_out_cmux_tree,
(Torus *)ggsw_out_cbs, lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log_cbs, level_count_cbs, r, tau,
max_shared_memory);
check_cuda_error(cudaGetLastError());
// Blind rotation + sample extraction
// mbr = tau * p - r = log2(N)
Torus *br_ggsw = (Torus *)ggsw_out +
Torus *br_ggsw = (Torus *)ggsw_out_cbs +
(ptrdiff_t)(r * level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size);
int8_t *br_se_buffer =
cmux_tree_buffer + (ptrdiff_t)(get_buffer_size_cmux_tree<Torus>(
glwe_dimension, polynomial_size, level_count_cbs,
r, tau, max_shared_memory));
glwe_array_out_cmux_tree +
(ptrdiff_t)(tau * (glwe_dimension + 1) * polynomial_size * sizeof(Torus));
host_blind_rotate_and_sample_extraction<Torus, STorus, params>(
v_stream, gpu_index, lwe_array_out, br_ggsw, glwe_array_out, br_se_buffer,
number_of_inputs - r, tau, glwe_dimension, polynomial_size, base_log_cbs,
level_count_cbs, max_shared_memory);
v_stream, gpu_index, lwe_array_out, br_ggsw,
(Torus *)glwe_array_out_cmux_tree, br_se_buffer, number_of_inputs - r,
tau, glwe_dimension, polynomial_size, base_log_cbs, level_count_cbs,
max_shared_memory);
}
template <typename Torus>
@@ -238,9 +230,14 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
uint32_t r = cbs_vp_number_of_inputs - params::log2_degree;
uint32_t mbr_size = cbs_vp_number_of_inputs - r;
int buffer_size =
get_buffer_size_cbs_vp<Torus>(glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs,
cbs_vp_number_of_inputs, tau) +
get_buffer_size_cbs_vp<Torus>(glwe_dimension, polynomial_size,
level_count_cbs, tau,
cbs_vp_number_of_inputs) +
get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension, polynomial_size,
level_count_cbs, cbs_vp_number_of_inputs) +
get_buffer_size_bootstrap_amortized<Torus>(
glwe_dimension, polynomial_size,
cbs_vp_number_of_inputs * level_count_cbs, max_shared_memory) +
get_buffer_size_cmux_tree<Torus>(glwe_dimension, polynomial_size,
level_count_cbs, r, tau,
max_shared_memory) +