mirror of
https://github.com/zama-ai/concrete.git
synced 2026-02-08 11:35:02 -05:00
refactor(cuda): introduce scratch for amortized PBS
This commit is contained in:
@@ -17,19 +17,40 @@ void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
|
||||
uint32_t glwe_dim, uint32_t level_count,
|
||||
uint32_t polynomial_size);
|
||||
|
||||
void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *test_vector,
|
||||
void *test_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_test_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
|
||||
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
|
||||
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *test_vector,
|
||||
void *test_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_test_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
|
||||
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
|
||||
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
void cleanup_cuda_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *test_vector,
|
||||
@@ -67,26 +88,6 @@ void cuda_extract_bits_64(
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
void cuda_circuit_bootstrap_32(
|
||||
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
|
||||
void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_out_pbs_buffer,
|
||||
void *lwe_array_in_fp_ks_buffer, uint32_t delta_log,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
|
||||
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
|
||||
uint32_t number_of_samples, uint32_t max_shared_memory);
|
||||
|
||||
void cuda_circuit_bootstrap_64(
|
||||
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
|
||||
void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_out_pbs_buffer,
|
||||
void *lwe_array_in_fp_ks_buffer, uint32_t delta_log,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
|
||||
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
|
||||
uint32_t number_of_samples, uint32_t max_shared_memory);
|
||||
|
||||
void scratch_cuda_circuit_bootstrap_vertical_packing_32(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **cbs_vp_buffer,
|
||||
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
|
||||
42
include/circuit_bootstrap.h
Normal file
42
include/circuit_bootstrap.h
Normal file
@@ -0,0 +1,42 @@
|
||||
#ifndef CUDA_CIRCUIT_BOOTSTRAP_H
|
||||
#define CUDA_CIRCUIT_BOOTSTRAP_H
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
extern "C" {
|
||||
|
||||
void scratch_cuda_circuit_bootstrap_32(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count_cbs, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_circuit_bootstrap_64(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count_cbs, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_circuit_bootstrap_32(
|
||||
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
|
||||
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
|
||||
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
|
||||
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
|
||||
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
void cuda_circuit_bootstrap_64(
|
||||
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
|
||||
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
|
||||
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
|
||||
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
|
||||
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
void cleanup_cuda_circuit_bootstrap(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **cbs_buffer);
|
||||
}
|
||||
|
||||
#endif // CUDA_CIRCUIT_BOOTSTRAP_H
|
||||
@@ -3,7 +3,8 @@ set(SOURCES
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h)
|
||||
file(GLOB SOURCES
|
||||
"*.cu"
|
||||
"*.h"
|
||||
|
||||
@@ -93,11 +93,16 @@ extern "C" void cuda_boolean_and_32(
|
||||
stream, gpu_index);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
int8_t *pbs_buffer = nullptr;
|
||||
scratch_cuda_bootstrap_amortized_32(
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_2, bootstrapping_key, input_lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level_count,
|
||||
lwe_buffer_2, bootstrapping_key, pbs_buffer, input_lwe_dimension,
|
||||
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
|
||||
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
|
||||
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
|
||||
@@ -196,11 +201,16 @@ extern "C" void cuda_boolean_nand_32(
|
||||
stream, gpu_index);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
int8_t *pbs_buffer = nullptr;
|
||||
scratch_cuda_bootstrap_amortized_32(
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_3, bootstrapping_key, input_lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level_count,
|
||||
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
|
||||
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
|
||||
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
|
||||
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
|
||||
@@ -299,11 +309,16 @@ extern "C" void cuda_boolean_nor_32(
|
||||
stream, gpu_index);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
int8_t *pbs_buffer = nullptr;
|
||||
scratch_cuda_bootstrap_amortized_32(
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_3, bootstrapping_key, input_lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level_count,
|
||||
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
|
||||
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
|
||||
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
|
||||
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
|
||||
@@ -394,11 +409,16 @@ extern "C" void cuda_boolean_or_32(
|
||||
stream, gpu_index);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
int8_t *pbs_buffer = nullptr;
|
||||
scratch_cuda_bootstrap_amortized_32(
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_2, bootstrapping_key, input_lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level_count,
|
||||
lwe_buffer_2, bootstrapping_key, pbs_buffer, input_lwe_dimension,
|
||||
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
|
||||
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
|
||||
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
|
||||
@@ -510,11 +530,16 @@ extern "C" void cuda_boolean_xor_32(
|
||||
stream, gpu_index);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
int8_t *pbs_buffer = nullptr;
|
||||
scratch_cuda_bootstrap_amortized_32(
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_3, bootstrapping_key, input_lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level_count,
|
||||
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
|
||||
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
|
||||
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
|
||||
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
|
||||
@@ -633,11 +658,16 @@ extern "C" void cuda_boolean_xnor_32(
|
||||
stream, gpu_index);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
int8_t *pbs_buffer = nullptr;
|
||||
scratch_cuda_bootstrap_amortized_32(
|
||||
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, true);
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
|
||||
lwe_buffer_4, bootstrapping_key, input_lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level_count,
|
||||
lwe_buffer_4, bootstrapping_key, pbs_buffer, input_lwe_dimension,
|
||||
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
|
||||
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
|
||||
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_async(lwe_buffer_4, stream, gpu_index);
|
||||
|
||||
@@ -1,15 +1,113 @@
|
||||
#include "bootstrap_amortized.cuh"
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the amortized PBS on 32 bits inputs, into `cmux_tree_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM mode is going to be used.
|
||||
*/
|
||||
void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_bootstrap_amortized<uint32_t, int32_t, Degree<256>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_bootstrap_amortized<uint32_t, int32_t, Degree<512>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_bootstrap_amortized<uint32_t, int32_t, Degree<1024>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_bootstrap_amortized<uint32_t, int32_t, Degree<2048>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_bootstrap_amortized<uint32_t, int32_t, Degree<4096>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_bootstrap_amortized<uint32_t, int32_t, Degree<8192>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the amortized PBS on 64 bits inputs, into `cmux_tree_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM mode is going to be used.
|
||||
*/
|
||||
void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_bootstrap_amortized<uint64_t, int64_t, Degree<256>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_bootstrap_amortized<uint64_t, int64_t, Degree<512>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_bootstrap_amortized<uint64_t, int64_t, Degree<1024>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_bootstrap_amortized<uint64_t, int64_t, Degree<2048>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_bootstrap_amortized<uint64_t, int64_t, Degree<4096>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_bootstrap_amortized<uint64_t, int64_t, Degree<8192>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Perform the programmable bootstrapping on a batch of input u32 LWE
|
||||
* ciphertexts. See the corresponding operation on 64 bits for more details.
|
||||
*/
|
||||
|
||||
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
|
||||
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
|
||||
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
assert(
|
||||
("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32));
|
||||
@@ -25,7 +123,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
host_bootstrap_amortized<uint32_t, Degree<256>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
@@ -33,7 +131,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
host_bootstrap_amortized<uint32_t, Degree<512>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
@@ -41,7 +139,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
host_bootstrap_amortized<uint32_t, Degree<1024>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
@@ -49,7 +147,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
host_bootstrap_amortized<uint32_t, Degree<2048>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
@@ -57,7 +155,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
host_bootstrap_amortized<uint32_t, Degree<4096>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
@@ -65,7 +163,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
host_bootstrap_amortized<uint32_t, Degree<8192>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
@@ -142,9 +240,10 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
|
||||
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
|
||||
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
assert(
|
||||
("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64));
|
||||
@@ -160,7 +259,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
host_bootstrap_amortized<uint64_t, Degree<256>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
@@ -168,7 +267,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
host_bootstrap_amortized<uint64_t, Degree<512>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
@@ -176,7 +275,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
host_bootstrap_amortized<uint64_t, Degree<1024>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
@@ -184,7 +283,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
host_bootstrap_amortized<uint64_t, Degree<2048>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
@@ -192,7 +291,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
host_bootstrap_amortized<uint64_t, Degree<4096>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
@@ -200,7 +299,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
host_bootstrap_amortized<uint64_t, Degree<8192>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
@@ -208,3 +307,14 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This cleanup function frees the data for the amortized PBS on GPU in
|
||||
* pbs_buffer for 32 or 64 bits inputs.
|
||||
*/
|
||||
void cleanup_cuda_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer) {
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
// Free memory
|
||||
cuda_drop_async(*pbs_buffer, stream, gpu_index);
|
||||
}
|
||||
|
||||
@@ -213,27 +213,100 @@ __global__ void device_bootstrap_amortized(
|
||||
glwe_dimension);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ int
|
||||
get_buffer_size_full_sm_bootstrap_amortized(uint32_t polynomial_size,
|
||||
uint32_t glwe_dimension) {
|
||||
return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
|
||||
sizeof(Torus) * polynomial_size *
|
||||
(glwe_dimension + 1) + // accumulator rotated
|
||||
sizeof(double2) * polynomial_size / 2 *
|
||||
(glwe_dimension + 1) + // accumulator fft
|
||||
sizeof(double2) * polynomial_size / 2 *
|
||||
(glwe_dimension + 1); // calculate buffer fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ int
|
||||
get_buffer_size_partial_sm_bootstrap_amortized(uint32_t polynomial_size,
|
||||
uint32_t glwe_dimension) {
|
||||
return sizeof(double2) * polynomial_size / 2 *
|
||||
(glwe_dimension + 1); // calculate buffer fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ int get_buffer_size_bootstrap_amortized(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
|
||||
int full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
|
||||
polynomial_size, glwe_dimension);
|
||||
int partial_sm = get_buffer_size_partial_sm_bootstrap_amortized<Torus>(
|
||||
polynomial_size, glwe_dimension);
|
||||
int partial_dm = full_sm - partial_sm;
|
||||
int full_dm = full_sm;
|
||||
int device_mem = 0;
|
||||
if (max_shared_memory < partial_sm) {
|
||||
device_mem = full_dm * input_lwe_ciphertext_count;
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_mem = partial_dm * input_lwe_ciphertext_count;
|
||||
}
|
||||
return device_mem;
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
__host__ void scratch_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
cudaSetDevice(gpu_index);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
int full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
|
||||
polynomial_size, glwe_dimension);
|
||||
int partial_sm = get_buffer_size_partial_sm_bootstrap_amortized<Torus>(
|
||||
polynomial_size, glwe_dimension);
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
|
||||
cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
partial_sm);
|
||||
cudaFuncSetCacheConfig(device_bootstrap_amortized<Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
} else if (max_shared_memory >= partial_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_amortized<Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_bootstrap_amortized<Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
}
|
||||
if (allocate_gpu_memory) {
|
||||
int buffer_size = get_buffer_size_bootstrap_amortized<Torus>(
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory);
|
||||
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_bootstrap_amortized(
|
||||
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
int SM_FULL =
|
||||
sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
|
||||
sizeof(Torus) * polynomial_size *
|
||||
(glwe_dimension + 1) + // accumulator rotated
|
||||
sizeof(double2) * polynomial_size / 2 *
|
||||
(glwe_dimension + 1) + // accumulator fft
|
||||
sizeof(double2) * polynomial_size / 2 *
|
||||
(glwe_dimension + 1); // calculate buffer fft
|
||||
int SM_FULL = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
|
||||
polynomial_size, glwe_dimension);
|
||||
|
||||
int SM_PART = sizeof(double2) * polynomial_size / 2 *
|
||||
(glwe_dimension + 1); // calculate buffer fft
|
||||
int SM_PART = get_buffer_size_partial_sm_bootstrap_amortized<Torus>(
|
||||
polynomial_size, glwe_dimension);
|
||||
|
||||
int DM_PART = SM_FULL - SM_PART;
|
||||
|
||||
@@ -241,8 +314,6 @@ __host__ void host_bootstrap_amortized(
|
||||
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
int8_t *d_mem;
|
||||
|
||||
// Create a 1-dimensional grid of threads
|
||||
// where each block handles 1 sample and each thread
|
||||
// handles opt polynomial coefficients
|
||||
@@ -257,23 +328,15 @@ __host__ void host_bootstrap_amortized(
|
||||
// from one of three templates (no use, partial use or full use
|
||||
// of shared memory)
|
||||
if (max_shared_memory < SM_PART) {
|
||||
d_mem = (int8_t *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count,
|
||||
stream, gpu_index);
|
||||
device_bootstrap_amortized<Torus, params, NOSM><<<grid, thds, 0, *stream>>>(
|
||||
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, lwe_idx, DM_FULL);
|
||||
} else if (max_shared_memory < SM_FULL) {
|
||||
cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART);
|
||||
cudaFuncSetCacheConfig(device_bootstrap_amortized<Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
d_mem = (int8_t *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count,
|
||||
stream, gpu_index);
|
||||
device_bootstrap_amortized<Torus, params, PARTIALSM>
|
||||
<<<grid, thds, SM_PART, *stream>>>(
|
||||
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, lwe_idx, DM_PART);
|
||||
} else {
|
||||
// For devices with compute capability 7.x a single thread block can
|
||||
@@ -281,22 +344,13 @@ __host__ void host_bootstrap_amortized(
|
||||
// device then has to be allocated dynamically.
|
||||
// For lower compute capabilities, this call
|
||||
// just does nothing and the amount of shared memory used is 48 KB
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_amortized<Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_bootstrap_amortized<Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
d_mem = (int8_t *)cuda_malloc_async(0, stream, gpu_index);
|
||||
|
||||
device_bootstrap_amortized<Torus, params, FULLSM>
|
||||
<<<grid, thds, SM_FULL, *stream>>>(
|
||||
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, lwe_idx, 0);
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
cuda_drop_async(d_mem, stream, gpu_index);
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
|
||||
@@ -1,4 +1,99 @@
|
||||
#include "circuit_bootstrap.cuh"
|
||||
#include "circuit_bootstrap.h"
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the circuit bootstrap on 32 bits inputs, into `cbs_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM mode is going to be used.
|
||||
*/
|
||||
void scratch_cuda_circuit_bootstrap_32(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count_cbs, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<512>>(
|
||||
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<1024>>(
|
||||
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<2048>>(
|
||||
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<4096>>(
|
||||
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<8192>>(
|
||||
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the circuit bootstrap on 32 bits inputs, into `cbs_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM mode is going to be used.
|
||||
*/
|
||||
void scratch_cuda_circuit_bootstrap_64(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count_cbs, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<512>>(
|
||||
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<1024>>(
|
||||
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<2048>>(
|
||||
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<4096>>(
|
||||
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<8192>>(
|
||||
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform circuit bootstrapping for the batch of 32 bit LWE ciphertexts.
|
||||
@@ -6,13 +101,12 @@
|
||||
*/
|
||||
void cuda_circuit_bootstrap_32(
|
||||
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
|
||||
void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_out_pbs_buffer,
|
||||
void *lwe_array_in_fp_ks_buffer, uint32_t delta_log,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
|
||||
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
|
||||
uint32_t number_of_samples, uint32_t max_shared_memory) {
|
||||
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
|
||||
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
|
||||
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
|
||||
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
|
||||
"512, 1024, 2048, 4096, 8192",
|
||||
polynomial_size == 512 || polynomial_size == 1024 ||
|
||||
@@ -28,61 +122,51 @@ void cuda_circuit_bootstrap_32(
|
||||
"equal to the "
|
||||
"number of streaming multiprocessors on the device divided by 8 * "
|
||||
"level_count_bsk",
|
||||
number_of_samples <= number_of_sm / 4. / 2. / level_bsk));
|
||||
number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
host_circuit_bootstrap<uint32_t, Degree<512>>(
|
||||
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
|
||||
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
|
||||
(uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer,
|
||||
(uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
|
||||
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
|
||||
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_circuit_bootstrap<uint32_t, Degree<1024>>(
|
||||
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
|
||||
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
|
||||
(uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer,
|
||||
(uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
|
||||
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
|
||||
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_circuit_bootstrap<uint32_t, Degree<2048>>(
|
||||
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
|
||||
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
|
||||
(uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer,
|
||||
(uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
|
||||
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
|
||||
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_circuit_bootstrap<uint32_t, Degree<4096>>(
|
||||
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
|
||||
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
|
||||
(uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer,
|
||||
(uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
|
||||
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
|
||||
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_circuit_bootstrap<uint32_t, Degree<8192>>(
|
||||
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
|
||||
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
|
||||
(uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer,
|
||||
(uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
|
||||
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
|
||||
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
|
||||
max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
@@ -96,31 +180,20 @@ void cuda_circuit_bootstrap_32(
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - 'ggsw_out' output batch of ggsw with size:
|
||||
* 'number_of_samples' * 'level_cbs' * ('glwe_dimension' + 1)^2 *
|
||||
* 'number_of_inputs' * 'level_cbs' * ('glwe_dimension' + 1)^2 *
|
||||
* polynomial_size * sizeof(u64)
|
||||
* - 'lwe_array_in' input batch of lwe ciphertexts, with size:
|
||||
* 'number_of_samples' * '(lwe_dimension' + 1) * sizeof(u64)
|
||||
* 'number_of_inputs' * '(lwe_dimension' + 1) * sizeof(u64)
|
||||
* - 'fourier_bsk' bootstrapping key in fourier domain with size:
|
||||
* 'lwe_dimension' * 'level_bsk' * ('glwe_dimension' + 1)^2 *
|
||||
* 'polynomial_size' / 2 * sizeof(double2)
|
||||
* - 'fp_ksk_array' batch of fp-keyswitch keys with size:
|
||||
* ('polynomial_size' + 1) * 'level_pksk' * ('glwe_dimension' + 1)^2 *
|
||||
* 'polynomial_size' * sizeof(u64)
|
||||
* The following 5 parameters are used during calculations, they are not actual
|
||||
* inputs of the function, they are just allocated memory for calculation
|
||||
* - 'cbs_buffer': buffer used during calculations, it is not an actual
|
||||
* inputs of the function, just allocated memory for calculation
|
||||
* process, like this, memory can be allocated once and can be used as much
|
||||
* as needed for different calls of circuit_bootstrap function
|
||||
* - 'lwe_array_in_shifted_buffer' with size:
|
||||
* 'number_of_samples' * 'level_cbs' * ('lwe_dimension' + 1) * sizeof(u64)
|
||||
* - 'lut_vector' with size:
|
||||
* 'level_cbs' * ('glwe_dimension' + 1) * 'polynomial_size' * sizeof(u64)
|
||||
* - 'lut_vector_indexes' stores the index corresponding to which test
|
||||
* vector to use
|
||||
* - 'lwe_array_out_pbs_buffer' with size
|
||||
* 'number_of_samples' * 'level_cbs' * ('polynomial_size' + 1) * sizeof(u64)
|
||||
* - 'lwe_array_in_fp_ks_buffer' with size
|
||||
* 'number_of_samples' * 'level_cbs' * ('glwe_dimension' + 1) *
|
||||
* ('polynomial_size' + 1) * sizeof(u64)
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the
|
||||
* circuit bootstrap. The kernel is templatized based on integer discretization
|
||||
@@ -128,13 +201,12 @@ void cuda_circuit_bootstrap_32(
|
||||
*/
|
||||
void cuda_circuit_bootstrap_64(
|
||||
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
|
||||
void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_out_pbs_buffer,
|
||||
void *lwe_array_in_fp_ks_buffer, uint32_t delta_log,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
|
||||
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
|
||||
uint32_t number_of_samples, uint32_t max_shared_memory) {
|
||||
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
|
||||
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
|
||||
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
|
||||
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
|
||||
"512, 1024, 2048, 4096, 8192",
|
||||
polynomial_size == 512 || polynomial_size == 1024 ||
|
||||
@@ -150,65 +222,66 @@ void cuda_circuit_bootstrap_64(
|
||||
"equal to the "
|
||||
"number of streaming multiprocessors on the device divided by 8 * "
|
||||
"level_count_bsk",
|
||||
number_of_samples <= number_of_sm / 4. / 2. / level_bsk));
|
||||
number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
|
||||
// The number of samples should be lower than the number of streaming
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
host_circuit_bootstrap<uint64_t, Degree<512>>(
|
||||
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
|
||||
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
|
||||
(uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer,
|
||||
(uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
|
||||
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
|
||||
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_circuit_bootstrap<uint64_t, Degree<1024>>(
|
||||
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
|
||||
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
|
||||
(uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer,
|
||||
(uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
|
||||
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
|
||||
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_circuit_bootstrap<uint64_t, Degree<2048>>(
|
||||
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
|
||||
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
|
||||
(uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer,
|
||||
(uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
|
||||
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
|
||||
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_circuit_bootstrap<uint64_t, Degree<4096>>(
|
||||
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
|
||||
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
|
||||
(uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer,
|
||||
(uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
|
||||
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
|
||||
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_circuit_bootstrap<uint64_t, Degree<8192>>(
|
||||
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
|
||||
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
|
||||
(uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer,
|
||||
(uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
|
||||
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
|
||||
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
|
||||
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
|
||||
max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This cleanup function frees the data for the circuit bootstrap on GPU in
|
||||
* cbs_buffer for 32 or 64 bits inputs.
|
||||
*/
|
||||
void cleanup_cuda_circuit_bootstrap(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **cbs_buffer) {
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
// Free memory
|
||||
cuda_drop_async(*cbs_buffer, stream, gpu_index);
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#ifndef CBS_H
|
||||
#define CBS_H
|
||||
#ifndef CBS_CUH
|
||||
#define CBS_CUH
|
||||
|
||||
#include "bit_extraction.cuh"
|
||||
#include "bootstrap.h"
|
||||
@@ -96,6 +96,51 @@ __global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ int
|
||||
get_buffer_size_cbs(uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count_cbs,
|
||||
uint32_t number_of_inputs) {
|
||||
|
||||
return number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
|
||||
(polynomial_size + 1) *
|
||||
sizeof(Torus) + // lwe_array_in_fp_ks_buffer
|
||||
number_of_inputs * level_count_cbs * (polynomial_size + 1) *
|
||||
sizeof(Torus) + // lwe_array_out_pbs_buffer
|
||||
number_of_inputs * level_count_cbs * (lwe_dimension + 1) *
|
||||
sizeof(Torus) + // lwe_array_in_shifted_buffer
|
||||
level_count_cbs * (glwe_dimension + 1) * polynomial_size *
|
||||
sizeof(Torus); // lut_vector_cbs
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
__host__ void scratch_circuit_bootstrap(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count_cbs, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
int pbs_count = number_of_inputs * level_count_cbs;
|
||||
// allocate and initialize device pointers for circuit bootstrap and vertical
|
||||
// packing
|
||||
if (allocate_gpu_memory) {
|
||||
int buffer_size =
|
||||
get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs,
|
||||
number_of_inputs) +
|
||||
get_buffer_size_bootstrap_amortized<Torus>(
|
||||
glwe_dimension, polynomial_size, pbs_count, max_shared_memory);
|
||||
*cbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
|
||||
}
|
||||
|
||||
scratch_bootstrap_amortized<Torus, STorus, params>(
|
||||
v_stream, gpu_index, cbs_buffer, glwe_dimension, polynomial_size,
|
||||
pbs_count, max_shared_memory, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* Host function for cuda circuit bootstrap.
|
||||
* It executes device functions in specific order and manages
|
||||
@@ -104,24 +149,37 @@ __global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_circuit_bootstrap(
|
||||
void *v_stream, uint32_t gpu_index, Torus *ggsw_out, Torus *lwe_array_in,
|
||||
double2 *fourier_bsk, Torus *fp_ksk_array,
|
||||
Torus *lwe_array_in_shifted_buffer, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_out_pbs_buffer,
|
||||
Torus *lwe_array_in_fp_ks_buffer, uint32_t delta_log,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
|
||||
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
|
||||
uint32_t number_of_samples, uint32_t max_shared_memory) {
|
||||
double2 *fourier_bsk, Torus *fp_ksk_array, Torus *lut_vector_indexes,
|
||||
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
|
||||
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
|
||||
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory) {
|
||||
cudaSetDevice(gpu_index);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
uint32_t ciphertext_n_bits = sizeof(Torus) * 8;
|
||||
uint32_t lwe_size = lwe_dimension + 1;
|
||||
int pbs_count = number_of_samples * level_cbs;
|
||||
int pbs_count = number_of_inputs * level_cbs;
|
||||
|
||||
dim3 blocks(level_cbs, number_of_samples, 1);
|
||||
dim3 blocks(level_cbs, number_of_inputs, 1);
|
||||
int threads = 256;
|
||||
|
||||
Torus *lwe_array_in_fp_ks_buffer = (Torus *)cbs_buffer;
|
||||
Torus *lwe_array_out_pbs_buffer =
|
||||
(Torus *)lwe_array_in_fp_ks_buffer +
|
||||
(ptrdiff_t)(number_of_inputs * level_cbs * (glwe_dimension + 1) *
|
||||
(polynomial_size + 1));
|
||||
Torus *lwe_array_in_shifted_buffer =
|
||||
(Torus *)lwe_array_out_pbs_buffer +
|
||||
(ptrdiff_t)(number_of_inputs * level_cbs * (polynomial_size + 1));
|
||||
Torus *lut_vector =
|
||||
(Torus *)lwe_array_in_shifted_buffer +
|
||||
(ptrdiff_t)(number_of_inputs * level_cbs * (lwe_dimension + 1));
|
||||
int8_t *pbs_buffer =
|
||||
(int8_t *)lut_vector + (ptrdiff_t)(level_cbs * (glwe_dimension + 1) *
|
||||
polynomial_size * sizeof(Torus));
|
||||
|
||||
// Shift message LSB on padding bit, at this point we expect to have messages
|
||||
// with only 1 bit of information
|
||||
shift_lwe_cbs<Torus, params><<<blocks, threads, 0, *stream>>>(
|
||||
@@ -143,7 +201,7 @@ __host__ void host_circuit_bootstrap(
|
||||
// MSB and no bit of padding
|
||||
host_bootstrap_amortized<Torus, params>(
|
||||
v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in_shifted_buffer, fourier_bsk,
|
||||
lut_vector_indexes, lwe_array_in_shifted_buffer, fourier_bsk, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, level_bsk,
|
||||
pbs_count, level_cbs, 0, max_shared_memory);
|
||||
|
||||
@@ -161,4 +219,4 @@ __host__ void host_circuit_bootstrap(
|
||||
level_pksk, pbs_count * (glwe_dimension + 1), glwe_dimension + 1);
|
||||
}
|
||||
|
||||
#endif // CBS_H
|
||||
#endif // CBS_CUH
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#ifndef VERTICAL_PACKING_CUH
|
||||
#define VERTICAL_PACKING_CUH
|
||||
|
||||
#include "../include/vertical_packing.h"
|
||||
#include "bootstrap.h"
|
||||
#include "complex/operations.cuh"
|
||||
#include "crypto/gadget.cuh"
|
||||
|
||||
@@ -28,26 +28,17 @@ __global__ void device_build_lut(Torus *lut_out, Torus *lut_in,
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ int
|
||||
get_buffer_size_cbs_vp(uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count_cbs,
|
||||
uint32_t number_of_inputs, uint32_t tau) {
|
||||
get_buffer_size_cbs_vp(uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count_cbs, uint32_t tau,
|
||||
uint32_t number_of_inputs) {
|
||||
|
||||
int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * polynomial_size;
|
||||
return number_of_inputs * level_count_cbs *
|
||||
sizeof(Torus) // lut_vector_indexes
|
||||
+ number_of_inputs * ggsw_size * sizeof(Torus) // ggsw_out
|
||||
+
|
||||
number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
|
||||
(polynomial_size + 1) * sizeof(Torus) // lwe_array_in_fp_ks_buffer
|
||||
+ number_of_inputs * level_count_cbs * (polynomial_size + 1) *
|
||||
sizeof(Torus) // lwe_array_out_pbs_buffer
|
||||
+ number_of_inputs * level_count_cbs * (lwe_dimension + 1) *
|
||||
sizeof(Torus) // lwe_array_in_shifted_buffer
|
||||
+ level_count_cbs * (glwe_dimension + 1) * polynomial_size *
|
||||
sizeof(Torus) // lut_vector_cbs
|
||||
+ tau * (glwe_dimension + 1) * polynomial_size *
|
||||
sizeof(Torus); // glwe_array_out
|
||||
sizeof(Torus) + // lut_vector_indexes
|
||||
number_of_inputs * ggsw_size * sizeof(Torus) + // ggsw_out_cbs
|
||||
tau * (glwe_dimension + 1) * polynomial_size *
|
||||
sizeof(Torus); // glwe_array_out_cmux_tree
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
@@ -70,9 +61,14 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
|
||||
// packing
|
||||
if (allocate_gpu_memory) {
|
||||
int buffer_size =
|
||||
get_buffer_size_cbs_vp<Torus>(glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs,
|
||||
number_of_inputs, tau) +
|
||||
get_buffer_size_cbs_vp<Torus>(glwe_dimension, polynomial_size,
|
||||
level_count_cbs, tau, number_of_inputs) +
|
||||
get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs,
|
||||
number_of_inputs) +
|
||||
get_buffer_size_bootstrap_amortized<Torus>(
|
||||
glwe_dimension, polynomial_size, number_of_inputs * level_count_cbs,
|
||||
max_shared_memory) +
|
||||
get_buffer_size_cmux_tree<Torus>(glwe_dimension, polynomial_size,
|
||||
level_count_cbs, r, tau,
|
||||
max_shared_memory) +
|
||||
@@ -96,6 +92,10 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
|
||||
|
||||
uint32_t bits = sizeof(Torus) * 8;
|
||||
*cbs_delta_log = (bits - 1);
|
||||
scratch_circuit_bootstrap<Torus, STorus, params>(
|
||||
v_stream, gpu_index, cbs_vp_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
|
||||
false);
|
||||
scratch_cmux_tree<Torus, STorus, params>(
|
||||
v_stream, gpu_index, cbs_vp_buffer, glwe_dimension, polynomial_size,
|
||||
level_count_cbs, r, tau, max_shared_memory, false);
|
||||
@@ -107,13 +107,8 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
|
||||
/*
|
||||
* Cleanup functions free the necessary data on the GPU and on the CPU.
|
||||
* Data that lives on the CPU is prefixed with `h_`. This cleanup function thus
|
||||
* frees the data for the circuit bootstrap and vertical packing on GPU:
|
||||
* - ggsw_out
|
||||
* - lwe_array_in_fp_ks_buffer
|
||||
* - lwe_array_out_pbs_buffer
|
||||
* - lwe_array_in_shifted buffer
|
||||
* - lut_vector_cbs
|
||||
* - lut_vector_indexes
|
||||
* frees the data for the circuit bootstrap and vertical packing on GPU
|
||||
* contained in cbs_vp_buffer
|
||||
*/
|
||||
__host__ void
|
||||
cleanup_circuit_bootstrap_vertical_packing(void *v_stream, uint32_t gpu_index,
|
||||
@@ -139,32 +134,25 @@ __host__ void host_circuit_bootstrap_vertical_packing(
|
||||
|
||||
int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * polynomial_size;
|
||||
Torus *lut_vector_indexes = (Torus *)cbs_vp_buffer;
|
||||
Torus *ggsw_out = (Torus *)lut_vector_indexes +
|
||||
(ptrdiff_t)(number_of_inputs * level_count_cbs);
|
||||
Torus *lwe_array_in_fp_ks_buffer =
|
||||
(Torus *)ggsw_out + (ptrdiff_t)(number_of_inputs * ggsw_size);
|
||||
Torus *lwe_array_out_pbs_buffer =
|
||||
(Torus *)lwe_array_in_fp_ks_buffer +
|
||||
(ptrdiff_t)(number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
|
||||
(polynomial_size + 1));
|
||||
Torus *lwe_array_in_shifted_buffer =
|
||||
(Torus *)lwe_array_out_pbs_buffer +
|
||||
(ptrdiff_t)(number_of_inputs * level_count_cbs * (polynomial_size + 1));
|
||||
Torus *lut_vector_cbs =
|
||||
(Torus *)lwe_array_in_shifted_buffer +
|
||||
(ptrdiff_t)(number_of_inputs * level_count_cbs * (lwe_dimension + 1));
|
||||
Torus *glwe_array_out =
|
||||
(Torus *)lut_vector_cbs +
|
||||
(ptrdiff_t)(level_count_cbs * (glwe_dimension + 1) * polynomial_size);
|
||||
|
||||
Torus *lut_vector_indexes = (Torus *)cbs_vp_buffer;
|
||||
int8_t *cbs_buffer =
|
||||
(int8_t *)lut_vector_indexes +
|
||||
(ptrdiff_t)(number_of_inputs * level_count_cbs * sizeof(Torus));
|
||||
int8_t *ggsw_out_cbs =
|
||||
cbs_buffer +
|
||||
(ptrdiff_t)(get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs,
|
||||
number_of_inputs) +
|
||||
get_buffer_size_bootstrap_amortized<Torus>(
|
||||
glwe_dimension, polynomial_size,
|
||||
number_of_inputs * level_count_cbs, max_shared_memory));
|
||||
host_circuit_bootstrap<Torus, params>(
|
||||
v_stream, gpu_index, ggsw_out, lwe_array_in, fourier_bsk, cbs_fpksk,
|
||||
lwe_array_in_shifted_buffer, lut_vector_cbs, lut_vector_indexes,
|
||||
lwe_array_out_pbs_buffer, lwe_array_in_fp_ks_buffer, cbs_delta_log,
|
||||
polynomial_size, glwe_dimension, lwe_dimension, level_count_bsk,
|
||||
base_log_bsk, level_count_pksk, base_log_pksk, level_count_cbs,
|
||||
base_log_cbs, number_of_inputs, max_shared_memory);
|
||||
v_stream, gpu_index, (Torus *)ggsw_out_cbs, lwe_array_in, fourier_bsk,
|
||||
cbs_fpksk, lut_vector_indexes, cbs_buffer, cbs_delta_log, polynomial_size,
|
||||
glwe_dimension, lwe_dimension, level_count_bsk, base_log_bsk,
|
||||
level_count_pksk, base_log_pksk, level_count_cbs, base_log_cbs,
|
||||
number_of_inputs, max_shared_memory);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
// number_of_inputs = tau * p is the total number of GGSWs
|
||||
@@ -172,29 +160,33 @@ __host__ void host_circuit_bootstrap_vertical_packing(
|
||||
// lsb GGSW is for the last blind rotation.
|
||||
uint32_t r = number_of_inputs - params::log2_degree;
|
||||
int8_t *cmux_tree_buffer =
|
||||
(int8_t *)glwe_array_out +
|
||||
(ptrdiff_t)(tau * (glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
ggsw_out_cbs + (ptrdiff_t)(number_of_inputs * ggsw_size * sizeof(Torus));
|
||||
int8_t *glwe_array_out_cmux_tree =
|
||||
cmux_tree_buffer + (ptrdiff_t)(get_buffer_size_cmux_tree<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count_cbs,
|
||||
r, tau, max_shared_memory));
|
||||
// CMUX Tree
|
||||
// r = tau * p - log2(N)
|
||||
host_cmux_tree<Torus, STorus, params>(
|
||||
v_stream, gpu_index, glwe_array_out, ggsw_out, lut_vector,
|
||||
cmux_tree_buffer, glwe_dimension, polynomial_size, base_log_cbs,
|
||||
level_count_cbs, r, tau, max_shared_memory);
|
||||
v_stream, gpu_index, (Torus *)glwe_array_out_cmux_tree,
|
||||
(Torus *)ggsw_out_cbs, lut_vector, cmux_tree_buffer, glwe_dimension,
|
||||
polynomial_size, base_log_cbs, level_count_cbs, r, tau,
|
||||
max_shared_memory);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
// Blind rotation + sample extraction
|
||||
// mbr = tau * p - r = log2(N)
|
||||
Torus *br_ggsw = (Torus *)ggsw_out +
|
||||
Torus *br_ggsw = (Torus *)ggsw_out_cbs +
|
||||
(ptrdiff_t)(r * level_count_cbs * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * polynomial_size);
|
||||
int8_t *br_se_buffer =
|
||||
cmux_tree_buffer + (ptrdiff_t)(get_buffer_size_cmux_tree<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count_cbs,
|
||||
r, tau, max_shared_memory));
|
||||
glwe_array_out_cmux_tree +
|
||||
(ptrdiff_t)(tau * (glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
host_blind_rotate_and_sample_extraction<Torus, STorus, params>(
|
||||
v_stream, gpu_index, lwe_array_out, br_ggsw, glwe_array_out, br_se_buffer,
|
||||
number_of_inputs - r, tau, glwe_dimension, polynomial_size, base_log_cbs,
|
||||
level_count_cbs, max_shared_memory);
|
||||
v_stream, gpu_index, lwe_array_out, br_ggsw,
|
||||
(Torus *)glwe_array_out_cmux_tree, br_se_buffer, number_of_inputs - r,
|
||||
tau, glwe_dimension, polynomial_size, base_log_cbs, level_count_cbs,
|
||||
max_shared_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -238,9 +230,14 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
|
||||
uint32_t r = cbs_vp_number_of_inputs - params::log2_degree;
|
||||
uint32_t mbr_size = cbs_vp_number_of_inputs - r;
|
||||
int buffer_size =
|
||||
get_buffer_size_cbs_vp<Torus>(glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs,
|
||||
cbs_vp_number_of_inputs, tau) +
|
||||
get_buffer_size_cbs_vp<Torus>(glwe_dimension, polynomial_size,
|
||||
level_count_cbs, tau,
|
||||
cbs_vp_number_of_inputs) +
|
||||
get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension, polynomial_size,
|
||||
level_count_cbs, cbs_vp_number_of_inputs) +
|
||||
get_buffer_size_bootstrap_amortized<Torus>(
|
||||
glwe_dimension, polynomial_size,
|
||||
cbs_vp_number_of_inputs * level_count_cbs, max_shared_memory) +
|
||||
get_buffer_size_cmux_tree<Torus>(glwe_dimension, polynomial_size,
|
||||
level_count_cbs, r, tau,
|
||||
max_shared_memory) +
|
||||
|
||||
Reference in New Issue
Block a user