chore(backend/cuda): reduces memory consumption in the bootstrap benchmark

This commit is contained in:
Pedro Alves
2023-03-28 18:19:02 -03:00
committed by Agnès Leroy
parent d9652b8936
commit c236dffdd8
7 changed files with 141 additions and 67 deletions

View File

@@ -147,6 +147,14 @@ void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index,
void cleanup_cuda_circuit_bootstrap_vertical_packing(void *v_stream,
uint32_t gpu_index,
int8_t **cbs_vp_buffer);
uint64_t get_buffer_size_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
uint64_t get_buffer_size_bootstrap_low_latency_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
}
#ifdef __CUDACC__

View File

@@ -1,5 +1,16 @@
#include "bootstrap_amortized.cuh"
/*
 * Returns the size (in bytes) of the scratch buffer needed by the 64-bit
 * amortized bootstrap for the given GLWE dimension, polynomial size and
 * number of input LWE ciphertexts. Thin wrapper forwarding to the
 * uint64_t instantiation of the templated size computation.
 */
uint64_t get_buffer_size_bootstrap_amortized_64(
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
  uint64_t buffer_size = get_buffer_size_bootstrap_amortized<uint64_t>(
      glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
      max_shared_memory);
  return buffer_size;
}
/*
* Runs standard checks to validate the inputs
*/

View File

@@ -1,5 +1,16 @@
#include "bootstrap_low_latency.cuh"
/*
 * Returns the size (in bytes) of the scratch buffer needed by the 64-bit
 * low-latency bootstrap for the given GLWE dimension, polynomial size,
 * decomposition level count and number of input LWE ciphertexts. Thin
 * wrapper forwarding to the uint64_t instantiation of the templated size
 * computation.
 */
uint64_t get_buffer_size_bootstrap_low_latency_64(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
  uint64_t buffer_size = get_buffer_size_bootstrap_low_latency<uint64_t>(
      glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, max_shared_memory);
  return buffer_size;
}
/*
* Runs standard checks to validate the inputs
*/

View File

@@ -39,8 +39,6 @@ protected:
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
int8_t *amortized_pbs_buffer;
int8_t *lowlat_pbs_buffer;
public:
void SetUp(const ::benchmark::State &state) {
@@ -56,11 +54,10 @@ public:
bootstrap_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_fourier_bsk_array, &plaintexts, &d_lut_pbs_identity,
&d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_ct_out_array,
&amortized_pbs_buffer, &lowlat_pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, pbs_base_log, pbs_level,
message_modulus, carry_modulus, &payload_modulus, &delta,
input_lwe_ciphertext_count, 1, 1, gpu_index);
lwe_dimension, glwe_dimension, polynomial_size,
lwe_modular_variance, glwe_modular_variance, pbs_base_log,
pbs_level, message_modulus, carry_modulus, &payload_modulus,
&delta, input_lwe_ciphertext_count, 1, 1, gpu_index);
// We keep the following for the benchmarks with copies
lwe_ct_array = (uint64_t *)malloc(
@@ -71,31 +68,56 @@ public:
bootstrap_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
d_fourier_bsk_array, plaintexts, d_lut_pbs_identity,
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_ct_out_array,
amortized_pbs_buffer, lowlat_pbs_buffer, gpu_index);
gpu_index);
free(lwe_ct_array);
cudaDeviceReset();
}
};
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_AmortizedPBS)
(benchmark::State &st) {
void *v_stream = (void *)stream;
size_t free, total;
cudaMemGetInfo(&free, &total);
uint64_t buffer_size = get_buffer_size_bootstrap_amortized_64(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
if (buffer_size > free)
st.SkipWithError("Not enough free memory in the device. Skipping...");
int8_t *pbs_buffer;
scratch_cuda_bootstrap_amortized_64(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index), true);
for (auto _ : st) {
// Execute PBS
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array,
amortized_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
pbs_base_log, pbs_level, input_lwe_ciphertext_count,
input_lwe_ciphertext_count, 0, cuda_get_max_shared_memory(gpu_index));
cuda_synchronize_stream(v_stream);
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array, pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
input_lwe_ciphertext_count, input_lwe_ciphertext_count, 0,
cuda_get_max_shared_memory(gpu_index));
cuda_synchronize_stream(stream);
}
cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
cuda_synchronize_stream(stream);
}
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_CopiesPlusAmortizedPBS)
(benchmark::State &st) {
void *v_stream = (void *)stream;
size_t free, total;
cudaMemGetInfo(&free, &total);
uint64_t buffer_size = get_buffer_size_bootstrap_amortized_64(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
if (buffer_size > free)
st.SkipWithError("Not enough free memory in the device. Skipping...");
int8_t *pbs_buffer;
scratch_cuda_bootstrap_amortized_64(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index), true);
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_array,
@@ -107,37 +129,66 @@ BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_CopiesPlusAmortizedPBS)
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array,
amortized_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
pbs_base_log, pbs_level, input_lwe_ciphertext_count,
input_lwe_ciphertext_count, 0, cuda_get_max_shared_memory(gpu_index));
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array, pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
input_lwe_ciphertext_count, input_lwe_ciphertext_count, 0,
cuda_get_max_shared_memory(gpu_index));
cuda_memcpy_async_to_cpu(lwe_ct_array, d_lwe_ct_out_array,
(lwe_dimension + 1) * input_lwe_ciphertext_count *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(v_stream);
cuda_synchronize_stream(stream);
}
cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
cuda_synchronize_stream(stream);
}
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_LowLatencyPBS)
(benchmark::State &st) {
size_t free, total;
cudaMemGetInfo(&free, &total);
uint64_t buffer_size = get_buffer_size_bootstrap_low_latency_64(
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
if (buffer_size > free)
st.SkipWithError("Not enough free memory in the device. Skipping...");
int8_t *pbs_buffer;
scratch_cuda_bootstrap_low_latency_64(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level, input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index), true);
for (auto _ : st) {
// Execute PBS
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array,
lowlat_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
pbs_base_log, pbs_level, 1, 1, 0,
cuda_get_max_shared_memory(gpu_index));
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array, pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
1, 1, 0, cuda_get_max_shared_memory(gpu_index));
cuda_synchronize_stream(stream);
}
cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &pbs_buffer);
cuda_synchronize_stream(stream);
}
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_CopiesPlusLowLatencyPBS)
(benchmark::State &st) {
void *v_stream = (void *)stream;
size_t free, total;
cudaMemGetInfo(&free, &total);
uint64_t buffer_size = get_buffer_size_bootstrap_low_latency_64(
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
if (buffer_size > free)
st.SkipWithError("Not enough free memory in the device. Skipping...");
int8_t *pbs_buffer;
scratch_cuda_bootstrap_low_latency_64(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level, input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index), true);
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_array,
@@ -148,17 +199,18 @@ BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_CopiesPlusLowLatencyPBS)
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array,
lowlat_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
pbs_base_log, pbs_level, 1, 1, 0,
cuda_get_max_shared_memory(gpu_index));
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array, pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
1, 1, 0, cuda_get_max_shared_memory(gpu_index));
cuda_memcpy_async_to_cpu(lwe_ct_array, d_lwe_ct_out_array,
(lwe_dimension + 1) * input_lwe_ciphertext_count *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(v_stream);
cuda_synchronize_stream(stream);
}
cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &pbs_buffer);
cuda_synchronize_stream(stream);
}
static void

View File

@@ -16,9 +16,8 @@ void bootstrap_setup(cudaStream_t *stream, Csprng **csprng,
double **d_fourier_bsk_array, uint64_t **plaintexts,
uint64_t **d_lut_pbs_identity,
uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
uint64_t **d_lwe_ct_out_array,
int8_t **amortized_pbs_buffer, int8_t **lowlat_pbs_buffer,
int lwe_dimension, int glwe_dimension, int polynomial_size,
uint64_t **d_lwe_ct_out_array, int lwe_dimension,
int glwe_dimension, int polynomial_size,
double lwe_modular_variance, double glwe_modular_variance,
int pbs_base_log, int pbs_level, int message_modulus,
int carry_modulus, int *payload_modulus, uint64_t *delta,
@@ -30,9 +29,7 @@ void bootstrap_teardown(cudaStream_t *stream, Csprng *csprng,
uint64_t *d_lut_pbs_identity,
uint64_t *d_lut_pbs_indexes,
uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_ct_out_array,
int8_t *amortized_pbs_buffer, int8_t *lowlat_pbs_buffer,
int gpu_index);
uint64_t *d_lwe_ct_out_array, int gpu_index);
void keyswitch_setup(cudaStream_t *stream, Csprng **csprng,
uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,

View File

@@ -7,9 +7,8 @@ void bootstrap_setup(cudaStream_t *stream, Csprng **csprng,
double **d_fourier_bsk_array, uint64_t **plaintexts,
uint64_t **d_lut_pbs_identity,
uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
uint64_t **d_lwe_ct_out_array,
int8_t **amortized_pbs_buffer, int8_t **lowlat_pbs_buffer,
int lwe_dimension, int glwe_dimension, int polynomial_size,
uint64_t **d_lwe_ct_out_array, int lwe_dimension,
int glwe_dimension, int polynomial_size,
double lwe_modular_variance, double glwe_modular_variance,
int pbs_base_log, int pbs_level, int message_modulus,
int carry_modulus, int *payload_modulus, uint64_t *delta,
@@ -93,13 +92,6 @@ void bootstrap_setup(cudaStream_t *stream, Csprng **csprng,
(lwe_dimension + 1) * sizeof(uint64_t),
stream, gpu_index);
scratch_cuda_bootstrap_amortized_64(
stream, gpu_index, amortized_pbs_buffer, glwe_dimension, polynomial_size,
number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
scratch_cuda_bootstrap_low_latency_64(
stream, gpu_index, lowlat_pbs_buffer, glwe_dimension, polynomial_size,
pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
cuda_synchronize_stream(v_stream);
free(lwe_ct_in_array);
@@ -112,9 +104,7 @@ void bootstrap_teardown(cudaStream_t *stream, Csprng *csprng,
uint64_t *d_lut_pbs_identity,
uint64_t *d_lut_pbs_indexes,
uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_ct_out_array,
int8_t *amortized_pbs_buffer, int8_t *lowlat_pbs_buffer,
int gpu_index) {
uint64_t *d_lwe_ct_out_array, int gpu_index) {
void *v_stream = (void *)stream;
cuda_synchronize_stream(v_stream);
@@ -128,8 +118,6 @@ void bootstrap_teardown(cudaStream_t *stream, Csprng *csprng,
cuda_drop_async(d_lut_pbs_indexes, stream, gpu_index);
cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index);
cleanup_cuda_bootstrap_amortized(stream, gpu_index, &amortized_pbs_buffer);
cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &lowlat_pbs_buffer);
cuda_destroy_stream(stream, gpu_index);
}

View File

@@ -50,8 +50,6 @@ protected:
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *lwe_ct_out_array;
int8_t *amortized_pbs_buffer;
int8_t *lowlat_pbs_buffer;
public:
// Test arithmetic functions
@@ -75,11 +73,10 @@ public:
bootstrap_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_fourier_bsk_array, &plaintexts, &d_lut_pbs_identity,
&d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_ct_out_array,
&amortized_pbs_buffer, &lowlat_pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, pbs_base_log, pbs_level,
message_modulus, carry_modulus, &payload_modulus, &delta,
number_of_inputs, repetitions, samples, gpu_index);
lwe_dimension, glwe_dimension, polynomial_size,
lwe_modular_variance, glwe_modular_variance, pbs_base_log,
pbs_level, message_modulus, carry_modulus, &payload_modulus,
&delta, number_of_inputs, repetitions, samples, gpu_index);
lwe_ct_out_array =
(uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
@@ -91,11 +88,16 @@ public:
bootstrap_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
d_fourier_bsk_array, plaintexts, d_lut_pbs_identity,
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_ct_out_array,
amortized_pbs_buffer, lowlat_pbs_buffer, gpu_index);
gpu_index);
}
};
TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) {
int8_t *pbs_buffer;
scratch_cuda_bootstrap_amortized_64(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
// Here execute the PBS
@@ -112,10 +114,9 @@ TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) {
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, amortized_pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, number_of_inputs, 1, 0,
cuda_get_max_shared_memory(gpu_index));
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index));
// Copy result back
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
@@ -147,9 +148,15 @@ TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) {
}
}
}
cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
}
TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
int8_t *pbs_buffer;
scratch_cuda_bootstrap_low_latency_64(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
if (number_of_inputs > number_of_sm * 4 / (glwe_dimension + 1) / pbs_level)
@@ -170,10 +177,9 @@ TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, lowlat_pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, number_of_inputs, 1, 0,
cuda_get_max_shared_memory(gpu_index));
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index));
// Copy result back
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
@@ -204,6 +210,7 @@ TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
}
}
}
cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &pbs_buffer);
}
// Defines for which parameters set the PBS will be tested.