mirror of
https://github.com/zama-ai/concrete.git
synced 2026-04-17 03:00:54 -04:00
chore(backend/cuda): reduces memory consumption in the bootstrap benchmark
This commit is contained in:
@@ -147,6 +147,14 @@ void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index,
|
||||
void cleanup_cuda_circuit_bootstrap_vertical_packing(void *v_stream,
|
||||
uint32_t gpu_index,
|
||||
int8_t **cbs_vp_buffer);
|
||||
|
||||
uint64_t get_buffer_size_bootstrap_amortized_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
|
||||
|
||||
uint64_t get_buffer_size_bootstrap_low_latency_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
|
||||
@@ -1,5 +1,16 @@
|
||||
#include "bootstrap_amortized.cuh"
|
||||
|
||||
/*
|
||||
* Returns the buffer size for 64 bits executions
|
||||
*/
|
||||
uint64_t get_buffer_size_bootstrap_amortized_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
return get_buffer_size_bootstrap_amortized<uint64_t>(
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory);
|
||||
}
|
||||
|
||||
/*
|
||||
* Runs standard checks to validate the inputs
|
||||
*/
|
||||
|
||||
@@ -1,5 +1,16 @@
|
||||
#include "bootstrap_low_latency.cuh"
|
||||
|
||||
/*
|
||||
* Returns the buffer size for 64 bits executions
|
||||
*/
|
||||
uint64_t get_buffer_size_bootstrap_low_latency_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
return get_buffer_size_bootstrap_low_latency<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory);
|
||||
}
|
||||
|
||||
/*
|
||||
* Runs standard checks to validate the inputs
|
||||
*/
|
||||
|
||||
@@ -39,8 +39,6 @@ protected:
|
||||
Csprng *csprng;
|
||||
cudaStream_t *stream;
|
||||
int gpu_index = 0;
|
||||
int8_t *amortized_pbs_buffer;
|
||||
int8_t *lowlat_pbs_buffer;
|
||||
|
||||
public:
|
||||
void SetUp(const ::benchmark::State &state) {
|
||||
@@ -56,11 +54,10 @@ public:
|
||||
bootstrap_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
|
||||
&d_fourier_bsk_array, &plaintexts, &d_lut_pbs_identity,
|
||||
&d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_ct_out_array,
|
||||
&amortized_pbs_buffer, &lowlat_pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, lwe_modular_variance,
|
||||
glwe_modular_variance, pbs_base_log, pbs_level,
|
||||
message_modulus, carry_modulus, &payload_modulus, &delta,
|
||||
input_lwe_ciphertext_count, 1, 1, gpu_index);
|
||||
lwe_dimension, glwe_dimension, polynomial_size,
|
||||
lwe_modular_variance, glwe_modular_variance, pbs_base_log,
|
||||
pbs_level, message_modulus, carry_modulus, &payload_modulus,
|
||||
&delta, input_lwe_ciphertext_count, 1, 1, gpu_index);
|
||||
|
||||
// We keep the following for the benchmarks with copies
|
||||
lwe_ct_array = (uint64_t *)malloc(
|
||||
@@ -71,31 +68,56 @@ public:
|
||||
bootstrap_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
|
||||
d_fourier_bsk_array, plaintexts, d_lut_pbs_identity,
|
||||
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_ct_out_array,
|
||||
amortized_pbs_buffer, lowlat_pbs_buffer, gpu_index);
|
||||
gpu_index);
|
||||
free(lwe_ct_array);
|
||||
cudaDeviceReset();
|
||||
}
|
||||
};
|
||||
|
||||
/*
 * Benchmarks the amortized PBS on device-resident inputs.
 *
 * The scratch buffer is allocated per benchmark (instead of once in the
 * fixture) and released before returning. The benchmark is skipped when the
 * device cannot be queried or does not report enough free memory for the
 * buffer.
 */
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_AmortizedPBS)
(benchmark::State &st) {
  // Named *_bytes to avoid shadowing the C free() function.
  size_t free_bytes = 0, total_bytes = 0;
  if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) {
    st.SkipWithError("Failed to query the device memory. Skipping...");
    return;
  }
  uint64_t buffer_size = get_buffer_size_bootstrap_amortized_64(
      glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
      cuda_get_max_shared_memory(gpu_index));
  if (buffer_size > free_bytes) {
    st.SkipWithError("Not enough free memory in the device. Skipping...");
    // SkipWithError only marks the run as skipped; without this return the
    // scratch allocation below would still be attempted.
    return;
  }

  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_amortized_64(
      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
      input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index), true);

  for (auto _ : st) {
    // Execute PBS
    cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
        stream, gpu_index, (void *)d_lwe_ct_out_array,
        (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
        (void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array, pbs_buffer,
        lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
        input_lwe_ciphertext_count, input_lwe_ciphertext_count, 0,
        cuda_get_max_shared_memory(gpu_index));
    cuda_synchronize_stream(stream);
  }
  cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
  cuda_synchronize_stream(stream);
}
|
||||
|
||||
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_CopiesPlusAmortizedPBS)
|
||||
(benchmark::State &st) {
|
||||
void *v_stream = (void *)stream;
|
||||
size_t free, total;
|
||||
cudaMemGetInfo(&free, &total);
|
||||
uint64_t buffer_size = get_buffer_size_bootstrap_amortized_64(
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
cuda_get_max_shared_memory(gpu_index));
|
||||
if (buffer_size > free)
|
||||
st.SkipWithError("Not enough free memory in the device. Skipping...");
|
||||
|
||||
int8_t *pbs_buffer;
|
||||
scratch_cuda_bootstrap_amortized_64(
|
||||
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index), true);
|
||||
|
||||
for (auto _ : st) {
|
||||
cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_array,
|
||||
@@ -107,37 +129,66 @@ BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_CopiesPlusAmortizedPBS)
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
stream, gpu_index, (void *)d_lwe_ct_out_array,
|
||||
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
|
||||
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array,
|
||||
amortized_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
pbs_base_log, pbs_level, input_lwe_ciphertext_count,
|
||||
input_lwe_ciphertext_count, 0, cuda_get_max_shared_memory(gpu_index));
|
||||
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array, pbs_buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
input_lwe_ciphertext_count, input_lwe_ciphertext_count, 0,
|
||||
cuda_get_max_shared_memory(gpu_index));
|
||||
|
||||
cuda_memcpy_async_to_cpu(lwe_ct_array, d_lwe_ct_out_array,
|
||||
(lwe_dimension + 1) * input_lwe_ciphertext_count *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
cuda_synchronize_stream(v_stream);
|
||||
cuda_synchronize_stream(stream);
|
||||
}
|
||||
cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
|
||||
cuda_synchronize_stream(stream);
|
||||
}
|
||||
|
||||
/*
 * Benchmarks the low latency PBS on device-resident inputs.
 *
 * The scratch buffer is allocated per benchmark (instead of once in the
 * fixture) and released before returning. The benchmark is skipped when the
 * device cannot be queried or does not report enough free memory for the
 * buffer.
 */
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_LowLatencyPBS)
(benchmark::State &st) {
  // Named *_bytes to avoid shadowing the C free() function.
  size_t free_bytes = 0, total_bytes = 0;
  if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) {
    st.SkipWithError("Failed to query the device memory. Skipping...");
    return;
  }
  uint64_t buffer_size = get_buffer_size_bootstrap_low_latency_64(
      glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
      cuda_get_max_shared_memory(gpu_index));
  if (buffer_size > free_bytes) {
    st.SkipWithError("Not enough free memory in the device. Skipping...");
    // SkipWithError only marks the run as skipped; without this return the
    // scratch allocation below would still be attempted.
    return;
  }

  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_low_latency_64(
      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
      pbs_level, input_lwe_ciphertext_count,
      cuda_get_max_shared_memory(gpu_index), true);

  for (auto _ : st) {
    // Execute PBS
    cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
        stream, gpu_index, (void *)d_lwe_ct_out_array,
        (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
        (void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array, pbs_buffer,
        lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
        1, 1, 0, cuda_get_max_shared_memory(gpu_index));
    cuda_synchronize_stream(stream);
  }
  cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &pbs_buffer);
  cuda_synchronize_stream(stream);
}
|
||||
|
||||
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_CopiesPlusLowLatencyPBS)
|
||||
(benchmark::State &st) {
|
||||
void *v_stream = (void *)stream;
|
||||
size_t free, total;
|
||||
cudaMemGetInfo(&free, &total);
|
||||
uint64_t buffer_size = get_buffer_size_bootstrap_low_latency_64(
|
||||
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
|
||||
cuda_get_max_shared_memory(gpu_index));
|
||||
if (buffer_size > free)
|
||||
st.SkipWithError("Not enough free memory in the device. Skipping...");
|
||||
|
||||
int8_t *pbs_buffer;
|
||||
scratch_cuda_bootstrap_low_latency_64(
|
||||
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
|
||||
pbs_level, input_lwe_ciphertext_count,
|
||||
cuda_get_max_shared_memory(gpu_index), true);
|
||||
|
||||
for (auto _ : st) {
|
||||
cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_array,
|
||||
@@ -148,17 +199,18 @@ BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_CopiesPlusLowLatencyPBS)
|
||||
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
stream, gpu_index, (void *)d_lwe_ct_out_array,
|
||||
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
|
||||
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array,
|
||||
lowlat_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
pbs_base_log, pbs_level, 1, 1, 0,
|
||||
cuda_get_max_shared_memory(gpu_index));
|
||||
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array, pbs_buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
1, 1, 0, cuda_get_max_shared_memory(gpu_index));
|
||||
|
||||
cuda_memcpy_async_to_cpu(lwe_ct_array, d_lwe_ct_out_array,
|
||||
(lwe_dimension + 1) * input_lwe_ciphertext_count *
|
||||
sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
cuda_synchronize_stream(v_stream);
|
||||
cuda_synchronize_stream(stream);
|
||||
}
|
||||
cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &pbs_buffer);
|
||||
cuda_synchronize_stream(stream);
|
||||
}
|
||||
|
||||
static void
|
||||
|
||||
@@ -16,9 +16,8 @@ void bootstrap_setup(cudaStream_t *stream, Csprng **csprng,
|
||||
double **d_fourier_bsk_array, uint64_t **plaintexts,
|
||||
uint64_t **d_lut_pbs_identity,
|
||||
uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
|
||||
uint64_t **d_lwe_ct_out_array,
|
||||
int8_t **amortized_pbs_buffer, int8_t **lowlat_pbs_buffer,
|
||||
int lwe_dimension, int glwe_dimension, int polynomial_size,
|
||||
uint64_t **d_lwe_ct_out_array, int lwe_dimension,
|
||||
int glwe_dimension, int polynomial_size,
|
||||
double lwe_modular_variance, double glwe_modular_variance,
|
||||
int pbs_base_log, int pbs_level, int message_modulus,
|
||||
int carry_modulus, int *payload_modulus, uint64_t *delta,
|
||||
@@ -30,9 +29,7 @@ void bootstrap_teardown(cudaStream_t *stream, Csprng *csprng,
|
||||
uint64_t *d_lut_pbs_identity,
|
||||
uint64_t *d_lut_pbs_indexes,
|
||||
uint64_t *d_lwe_ct_in_array,
|
||||
uint64_t *d_lwe_ct_out_array,
|
||||
int8_t *amortized_pbs_buffer, int8_t *lowlat_pbs_buffer,
|
||||
int gpu_index);
|
||||
uint64_t *d_lwe_ct_out_array, int gpu_index);
|
||||
|
||||
void keyswitch_setup(cudaStream_t *stream, Csprng **csprng,
|
||||
uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
|
||||
|
||||
@@ -7,9 +7,8 @@ void bootstrap_setup(cudaStream_t *stream, Csprng **csprng,
|
||||
double **d_fourier_bsk_array, uint64_t **plaintexts,
|
||||
uint64_t **d_lut_pbs_identity,
|
||||
uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
|
||||
uint64_t **d_lwe_ct_out_array,
|
||||
int8_t **amortized_pbs_buffer, int8_t **lowlat_pbs_buffer,
|
||||
int lwe_dimension, int glwe_dimension, int polynomial_size,
|
||||
uint64_t **d_lwe_ct_out_array, int lwe_dimension,
|
||||
int glwe_dimension, int polynomial_size,
|
||||
double lwe_modular_variance, double glwe_modular_variance,
|
||||
int pbs_base_log, int pbs_level, int message_modulus,
|
||||
int carry_modulus, int *payload_modulus, uint64_t *delta,
|
||||
@@ -93,13 +92,6 @@ void bootstrap_setup(cudaStream_t *stream, Csprng **csprng,
|
||||
(lwe_dimension + 1) * sizeof(uint64_t),
|
||||
stream, gpu_index);
|
||||
|
||||
scratch_cuda_bootstrap_amortized_64(
|
||||
stream, gpu_index, amortized_pbs_buffer, glwe_dimension, polynomial_size,
|
||||
number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
|
||||
scratch_cuda_bootstrap_low_latency_64(
|
||||
stream, gpu_index, lowlat_pbs_buffer, glwe_dimension, polynomial_size,
|
||||
pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
|
||||
|
||||
cuda_synchronize_stream(v_stream);
|
||||
|
||||
free(lwe_ct_in_array);
|
||||
@@ -112,9 +104,7 @@ void bootstrap_teardown(cudaStream_t *stream, Csprng *csprng,
|
||||
uint64_t *d_lut_pbs_identity,
|
||||
uint64_t *d_lut_pbs_indexes,
|
||||
uint64_t *d_lwe_ct_in_array,
|
||||
uint64_t *d_lwe_ct_out_array,
|
||||
int8_t *amortized_pbs_buffer, int8_t *lowlat_pbs_buffer,
|
||||
int gpu_index) {
|
||||
uint64_t *d_lwe_ct_out_array, int gpu_index) {
|
||||
void *v_stream = (void *)stream;
|
||||
cuda_synchronize_stream(v_stream);
|
||||
|
||||
@@ -128,8 +118,6 @@ void bootstrap_teardown(cudaStream_t *stream, Csprng *csprng,
|
||||
cuda_drop_async(d_lut_pbs_indexes, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
|
||||
cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index);
|
||||
cleanup_cuda_bootstrap_amortized(stream, gpu_index, &amortized_pbs_buffer);
|
||||
cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &lowlat_pbs_buffer);
|
||||
cuda_destroy_stream(stream, gpu_index);
|
||||
}
|
||||
|
||||
|
||||
@@ -50,8 +50,6 @@ protected:
|
||||
uint64_t *d_lwe_ct_in_array;
|
||||
uint64_t *d_lwe_ct_out_array;
|
||||
uint64_t *lwe_ct_out_array;
|
||||
int8_t *amortized_pbs_buffer;
|
||||
int8_t *lowlat_pbs_buffer;
|
||||
|
||||
public:
|
||||
// Test arithmetic functions
|
||||
@@ -75,11 +73,10 @@ public:
|
||||
bootstrap_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
|
||||
&d_fourier_bsk_array, &plaintexts, &d_lut_pbs_identity,
|
||||
&d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_ct_out_array,
|
||||
&amortized_pbs_buffer, &lowlat_pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, lwe_modular_variance,
|
||||
glwe_modular_variance, pbs_base_log, pbs_level,
|
||||
message_modulus, carry_modulus, &payload_modulus, &delta,
|
||||
number_of_inputs, repetitions, samples, gpu_index);
|
||||
lwe_dimension, glwe_dimension, polynomial_size,
|
||||
lwe_modular_variance, glwe_modular_variance, pbs_base_log,
|
||||
pbs_level, message_modulus, carry_modulus, &payload_modulus,
|
||||
&delta, number_of_inputs, repetitions, samples, gpu_index);
|
||||
|
||||
lwe_ct_out_array =
|
||||
(uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
|
||||
@@ -91,11 +88,16 @@ public:
|
||||
bootstrap_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
|
||||
d_fourier_bsk_array, plaintexts, d_lut_pbs_identity,
|
||||
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_ct_out_array,
|
||||
amortized_pbs_buffer, lowlat_pbs_buffer, gpu_index);
|
||||
gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) {
|
||||
int8_t *pbs_buffer;
|
||||
scratch_cuda_bootstrap_amortized_64(
|
||||
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
|
||||
number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
|
||||
|
||||
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
|
||||
polynomial_size * (lwe_dimension + 1);
|
||||
// Here execute the PBS
|
||||
@@ -112,10 +114,9 @@ TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) {
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
stream, gpu_index, (void *)d_lwe_ct_out_array,
|
||||
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
|
||||
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, amortized_pbs_buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, number_of_inputs, 1, 0,
|
||||
cuda_get_max_shared_memory(gpu_index));
|
||||
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index));
|
||||
// Copy result back
|
||||
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
|
||||
(glwe_dimension * polynomial_size + 1) *
|
||||
@@ -147,9 +148,15 @@ TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) {
|
||||
}
|
||||
}
|
||||
}
|
||||
cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
|
||||
}
|
||||
|
||||
TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
|
||||
int8_t *pbs_buffer;
|
||||
scratch_cuda_bootstrap_low_latency_64(
|
||||
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
|
||||
pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
|
||||
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
if (number_of_inputs > number_of_sm * 4 / (glwe_dimension + 1) / pbs_level)
|
||||
@@ -170,10 +177,9 @@ TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
|
||||
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
stream, gpu_index, (void *)d_lwe_ct_out_array,
|
||||
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
|
||||
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, lowlat_pbs_buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, number_of_inputs, 1, 0,
|
||||
cuda_get_max_shared_memory(gpu_index));
|
||||
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index));
|
||||
// Copy result back
|
||||
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
|
||||
(glwe_dimension * polynomial_size + 1) *
|
||||
@@ -204,6 +210,7 @@ TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
|
||||
}
|
||||
}
|
||||
}
|
||||
cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &pbs_buffer);
|
||||
}
|
||||
|
||||
// Defines for which parameters set the PBS will be tested.
|
||||
|
||||
Reference in New Issue
Block a user