bench(backend): add a benchmark tool for concrete-cuda

This commit is contained in:
Pedro Alves
2023-03-13 15:18:12 -03:00
committed by Agnès Leroy
parent 2728046ae7
commit 08e8012061
28 changed files with 2793 additions and 1174 deletions

View File

@@ -104,7 +104,7 @@ jobs:
if: ${{ !cancelled() }}
run: |
cd backends/concrete-cuda/implementation/build
./test/test_concrete_cuda
./test_and_benchmark/test/test_concrete_cuda
- name: Export variables for CUDA 11.1
run: |
@@ -124,7 +124,7 @@ jobs:
if: ${{ !cancelled() }}
run: |
cd backends/concrete-cuda/implementation/build-old-cuda
./test/test_concrete_cuda --gtest_filter="Wop*"
./test_and_benchmark/test/test_concrete_cuda --gtest_filter="Wop*"
stop-runner:
name: Stop EC2 runner

View File

@@ -66,7 +66,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 ${CUD
set(INCLUDE_DIR include)
add_subdirectory(src)
add_subdirectory(test)
add_subdirectory(test_and_benchmark)
target_include_directories(concrete_cuda PRIVATE ${INCLUDE_DIR})
# This is required for rust cargo build

View File

@@ -1,7 +1,7 @@
#!/bin/bash
# Reformat, in place, every .cpp/.cu/.h/.cuh file found under the listed
# directories with clang-format-11, using the style file picked up by
# -style='file'.
find ./{include,src,test} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-11 -i -style='file'
find ./{include,src,test_and_benchmark} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-11 -i -style='file'
# Reformat the CMakeLists.txt of this directory with the cmake-format
# configuration shared with the concrete-compiler project.
cmake-format -i CMakeLists.txt -c ../../../compilers/concrete-compiler/compiler/.cmake-format-config.py
# Apply the same cmake-format configuration to every CMakeLists.txt found
# under the listed directories.
find ./{include,src,test} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c ../../../compilers/concrete-compiler/compiler/.cmake-format-config.py'
find ./{include,src,test_and_benchmark} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c ../../../compilers/concrete-compiler/compiler/.cmake-format-config.py'

View File

@@ -1,340 +0,0 @@
#include "../include/bootstrap.h"
#include "../include/device.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <functional>
#include <stdio.h>
#include <stdlib.h>
// One complete parameter set for a bootstrap test case. The fields are listed
// in the order used by the brace-initialised entries of the parameter table.
struct BootstrapTestParams {
  // Dimensions (n, k and N in the parameter table).
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  // Noise variances handed to LWE encryption and bootstrap key generation.
  double lwe_modular_variance;
  double glwe_modular_variance;
  // Decomposition parameters of the bootstrap.
  int pbs_base_log;
  int pbs_level;
  // Their product is the payload modulus used to encode messages.
  int message_modulus;
  int carry_modulus;
  // Number of ciphertexts per bootstrap call.
  int number_of_inputs;
  // Number of key sets generated, and number of batches run per key set.
  int repetitions;
  int samples;
};
// Test fixture shared by the 64-bit bootstrap tests.
//
// SetUp() generates `repetitions` input/output secret keys and bootstrap
// keys, builds an identity LUT, and encrypts
// `repetitions * samples * number_of_inputs` plaintexts which are uploaded to
// the GPU in a single buffer. TearDown() releases every host and device
// resource acquired by SetUp().
class BootstrapTestPrimitives_u64
    : public ::testing::TestWithParam<BootstrapTestParams> {
protected:
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  double lwe_modular_variance;
  double glwe_modular_variance;
  int pbs_base_log;
  int pbs_level;
  int message_modulus;
  int carry_modulus;
  int payload_modulus;
  int number_of_inputs;
  int repetitions;
  int samples;
  uint64_t delta;
  Csprng *csprng;
  cudaStream_t *stream;
  int gpu_index = 0;
  // Host side: secret keys and encoded plaintexts.
  uint64_t *lwe_sk_in_array;
  uint64_t *lwe_sk_out_array;
  uint64_t *plaintexts;
  // Device side (d_ prefix): bootstrap keys, LUT, LUT indexes, ciphertexts.
  double *d_fourier_bsk_array;
  uint64_t *d_lut_pbs_identity;
  uint64_t *d_lut_pbs_indexes;
  uint64_t *d_lwe_ct_in_array;
  uint64_t *d_lwe_ct_out_array;

public:
  // Reads the test parameters and prepares keys, LUT and input ciphertexts.
  void SetUp() {
    stream = cuda_create_stream(gpu_index);
    void *v_stream = (void *)stream;

    // TestParams
    const BootstrapTestParams params = GetParam();
    lwe_dimension = params.lwe_dimension;
    glwe_dimension = params.glwe_dimension;
    polynomial_size = params.polynomial_size;
    // The variances are doubles far below 1: they must not go through an
    // integer cast, which would truncate them to 0 and silently remove the
    // encryption noise from the test.
    lwe_modular_variance = params.lwe_modular_variance;
    glwe_modular_variance = params.glwe_modular_variance;
    pbs_base_log = params.pbs_base_log;
    pbs_level = params.pbs_level;
    message_modulus = params.message_modulus;
    carry_modulus = params.carry_modulus;
    number_of_inputs = params.number_of_inputs;
    repetitions = params.repetitions;
    samples = params.samples;

    payload_modulus = message_modulus * carry_modulus;
    // Value of the shift we multiply our messages by
    delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus);

    // Create a Csprng
    csprng =
        (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
    uint8_t seed[16] = {(uint8_t)0};
    concrete_cpu_construct_concrete_csprng(
        csprng, Uint128{.little_endian_bytes = {*seed}});

    // Generate the keys
    generate_lwe_secret_keys(&lwe_sk_in_array, lwe_dimension, csprng,
                             repetitions);
    generate_lwe_secret_keys(&lwe_sk_out_array,
                             glwe_dimension * polynomial_size, csprng,
                             repetitions);
    generate_lwe_bootstrap_keys(
        stream, gpu_index, &d_fourier_bsk_array, lwe_sk_in_array,
        lwe_sk_out_array, lwe_dimension, glwe_dimension, polynomial_size,
        pbs_level, pbs_base_log, csprng, glwe_modular_variance, repetitions);
    plaintexts = generate_plaintexts(payload_modulus, delta, number_of_inputs,
                                     repetitions, samples);

    // Create the LUT
    uint64_t *lut_pbs_identity = generate_identity_lut_pbs(
        polynomial_size, glwe_dimension, message_modulus, carry_modulus,
        [](int x) -> int { return x; });

    // Copy the LUT
    d_lut_pbs_identity = (uint64_t *)cuda_malloc_async(
        (glwe_dimension + 1) * polynomial_size * sizeof(uint64_t), stream,
        gpu_index);
    d_lut_pbs_indexes = (uint64_t *)cuda_malloc_async(
        number_of_inputs * sizeof(uint64_t), stream, gpu_index);
    cuda_synchronize_stream(v_stream);
    cuda_memset_async(d_lut_pbs_indexes, 0,
                      number_of_inputs * sizeof(uint64_t), stream, gpu_index);
    cuda_memcpy_async_to_gpu(d_lut_pbs_identity, lut_pbs_identity,
                             polynomial_size * (glwe_dimension + 1) *
                                 sizeof(uint64_t),
                             stream, gpu_index);
    // The host LUT may only be released once the copy has completed.
    cuda_synchronize_stream(v_stream);
    free(lut_pbs_identity);

    d_lwe_ct_out_array =
        (uint64_t *)cuda_malloc_async((glwe_dimension * polynomial_size + 1) *
                                          number_of_inputs * sizeof(uint64_t),
                                      stream, gpu_index);
    d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
        (lwe_dimension + 1) * number_of_inputs * repetitions * samples *
            sizeof(uint64_t),
        stream, gpu_index);
    uint64_t *lwe_ct_in_array =
        (uint64_t *)malloc((lwe_dimension + 1) * number_of_inputs *
                           repetitions * samples * sizeof(uint64_t));

    // Create the input ciphertexts: one per (repetition, sample, input),
    // each encrypted under the secret key of its repetition.
    for (int r = 0; r < repetitions; r++) {
      uint64_t *lwe_sk_in = lwe_sk_in_array + (ptrdiff_t)(r * lwe_dimension);
      for (int s = 0; s < samples; s++) {
        for (int i = 0; i < number_of_inputs; i++) {
          uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
                                          s * number_of_inputs + i];
          uint64_t *lwe_ct_in =
              lwe_ct_in_array + (ptrdiff_t)((r * samples * number_of_inputs +
                                             s * number_of_inputs + i) *
                                            (lwe_dimension + 1));
          concrete_cpu_encrypt_lwe_ciphertext_u64(
              lwe_sk_in, lwe_ct_in, plaintext, lwe_dimension,
              lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
        }
      }
    }
    cuda_synchronize_stream(v_stream);
    cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_in_array,
                             repetitions * samples * number_of_inputs *
                                 (lwe_dimension + 1) * sizeof(uint64_t),
                             stream, gpu_index);
    // Wait for the asynchronous upload before releasing its host source.
    cuda_synchronize_stream(v_stream);
    free(lwe_ct_in_array);
  }

  // Releases everything acquired in SetUp().
  void TearDown() {
    // Make sure no queued work still refers to the buffers released below.
    cuda_synchronize_stream((void *)stream);
    concrete_cpu_destroy_concrete_csprng(csprng);
    free(csprng);
    free(lwe_sk_in_array);
    free(lwe_sk_out_array);
    free(plaintexts);
    cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
    cuda_drop_async(d_lut_pbs_identity, stream, gpu_index);
    cuda_drop_async(d_lut_pbs_indexes, stream, gpu_index);
    cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
    cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index);
    cuda_destroy_stream(stream, gpu_index);
  }
};
// Runs the amortized PBS on every (repetition, sample) batch prepared by the
// fixture, decrypts the outputs on the CPU and checks that each decoded
// message matches the original one (the LUT is the identity).
TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) {
  void *v_stream = (void *)stream;
  const int lwe_out_dimension = glwe_dimension * polynomial_size;
  const size_t lwe_out_bytes =
      (size_t)(lwe_out_dimension + 1) * number_of_inputs * sizeof(uint64_t);
  uint64_t *lwe_ct_out_array = (uint64_t *)malloc(lwe_out_bytes);

  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_amortized_64(
      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
      number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);

  // Number of elements of one bootstrap key; computed in a 64-bit type so the
  // per-repetition offset cannot overflow an int.
  const ptrdiff_t bsk_size = (ptrdiff_t)(glwe_dimension + 1) *
                             (glwe_dimension + 1) * pbs_level *
                             polynomial_size * (lwe_dimension + 1);

  for (int r = 0; r < repetitions; r++) {
    double *d_fourier_bsk = d_fourier_bsk_array + bsk_size * r;
    uint64_t *lwe_sk_out =
        lwe_sk_out_array + (ptrdiff_t)r * lwe_out_dimension;
    for (int s = 0; s < samples; s++) {
      // Index of the first plaintext/ciphertext of this batch.
      const ptrdiff_t batch_start =
          ((ptrdiff_t)r * samples + s) * number_of_inputs;
      uint64_t *d_lwe_ct_in =
          d_lwe_ct_in_array + batch_start * (lwe_dimension + 1);
      // Execute PBS
      cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_ct_out_array,
          (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
          (void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer,
          lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
          pbs_level, number_of_inputs, 1, 0,
          cuda_get_max_shared_memory(gpu_index));
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               lwe_out_bytes, stream, gpu_index);
      // The copy is asynchronous: wait for it before reading on the host.
      cuda_synchronize_stream(v_stream);

      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result =
            lwe_ct_out_array + (ptrdiff_t)j * (lwe_out_dimension + 1);
        uint64_t plaintext = plaintexts[batch_start + j];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk_out, result,
                                                lwe_out_dimension, &decrypted);
        // The raw decryption is expected to differ from the encoded
        // plaintext (it carries noise) before rounding.
        EXPECT_NE(decrypted, plaintext);
        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta);
      }
    }
  }
  cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
  free(lwe_ct_out_array);
}
// Runs the low latency PBS on every (repetition, sample) batch prepared by
// the fixture, decrypts the outputs on the CPU and checks that each decoded
// message matches the original one (the LUT is the identity). The test is
// skipped when the batch is too large for the number of SMs of the device.
TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
  int number_of_sm = 0;
  // Query the device the fixture actually runs on.
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount,
                         gpu_index);
  if (number_of_inputs > number_of_sm * 4 / (glwe_dimension + 1) / pbs_level)
    GTEST_SKIP() << "The Low Latency PBS does not support this configuration";

  void *v_stream = (void *)stream;
  const int lwe_out_dimension = glwe_dimension * polynomial_size;
  const size_t lwe_out_bytes =
      (size_t)(lwe_out_dimension + 1) * number_of_inputs * sizeof(uint64_t);
  uint64_t *lwe_ct_out_array = (uint64_t *)malloc(lwe_out_bytes);

  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_low_latency_64(
      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
      pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index),
      true);

  // Number of elements of one bootstrap key; computed in a 64-bit type so the
  // per-repetition offset cannot overflow an int.
  const ptrdiff_t bsk_size = (ptrdiff_t)(glwe_dimension + 1) *
                             (glwe_dimension + 1) * pbs_level *
                             polynomial_size * (lwe_dimension + 1);

  for (int r = 0; r < repetitions; r++) {
    double *d_fourier_bsk = d_fourier_bsk_array + bsk_size * r;
    uint64_t *lwe_sk_out =
        lwe_sk_out_array + (ptrdiff_t)r * lwe_out_dimension;
    for (int s = 0; s < samples; s++) {
      // Index of the first plaintext/ciphertext of this batch.
      const ptrdiff_t batch_start =
          ((ptrdiff_t)r * samples + s) * number_of_inputs;
      uint64_t *d_lwe_ct_in =
          d_lwe_ct_in_array + batch_start * (lwe_dimension + 1);
      // Execute PBS
      cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_ct_out_array,
          (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
          (void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer,
          lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
          pbs_level, number_of_inputs, 1, 0,
          cuda_get_max_shared_memory(gpu_index));
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               lwe_out_bytes, stream, gpu_index);
      // The copy is asynchronous: wait for it before reading on the host.
      cuda_synchronize_stream(v_stream);

      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result =
            lwe_ct_out_array + (ptrdiff_t)j * (lwe_out_dimension + 1);
        uint64_t plaintext = plaintexts[batch_start + j];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk_out, result,
                                                lwe_out_dimension, &decrypted);
        // The raw decryption is expected to differ from the encoded
        // plaintext (it carries noise) before rounding.
        EXPECT_NE(decrypted, plaintext);
        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta);
      }
    }
  }
  cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &pbs_buffer);
  free(lwe_ct_out_array);
}
// Defines the parameter sets for which the PBS is tested. Each entry below is
// one complete, explicitly listed configuration passed to ::testing::Values;
// every TEST_P of the fixture is run once per entry.
::testing::internal::ParamGenerator<BootstrapTestParams> pbs_params_u64 =
::testing::Values(
// Field order of each entry:
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
// message_modulus, carry_modulus, number_of_inputs, repetitions,
// samples
(BootstrapTestParams){567, 5, 256, 0.000007069849454709433,
0.00000000000000029403601535432533, 15, 1, 2, 1,
5, 2, 50},
(BootstrapTestParams){623, 6, 256, 7.52316384526264e-37,
7.52316384526264e-37, 9, 3, 2, 2, 5, 2, 50},
(BootstrapTestParams){694, 3, 512, 0.000007069849454709433,
0.00000000000000029403601535432533, 18, 1, 2, 1,
5, 2, 50},
(BootstrapTestParams){769, 2, 1024, 0.000007069849454709433,
0.00000000000000029403601535432533, 23, 1, 2, 1,
5, 2, 50},
(BootstrapTestParams){754, 1, 2048, 0.000007069849454709433,
0.00000000000000029403601535432533, 23, 1, 4, 1,
5, 2, 50},
(BootstrapTestParams){847, 1, 4096, 0.000007069849454709433,
0.00000000000000029403601535432533, 2, 12, 2, 1,
2, 1, 50},
(BootstrapTestParams){881, 1, 8192, 0.000007069849454709433,
0.00000000000000029403601535432533, 22, 1, 2, 1,
2, 1, 25},
(BootstrapTestParams){976, 1, 16384, 0.000007069849454709433,
0.00000000000000029403601535432533, 11, 3, 4, 1,
2, 1, 10});
// Builds the suffix appended to each instantiated test name from the
// dimensions, the PBS decomposition parameters and the batch size.
std::string printParamName(::testing::TestParamInfo<BootstrapTestParams> p) {
  const BootstrapTestParams &cfg = p.param;
  std::string label;
  label += "n_";
  label += std::to_string(cfg.lwe_dimension);
  label += "_k_";
  label += std::to_string(cfg.glwe_dimension);
  label += "_N_";
  label += std::to_string(cfg.polynomial_size);
  label += "_pbs_base_log_";
  label += std::to_string(cfg.pbs_base_log);
  label += "_pbs_level_";
  label += std::to_string(cfg.pbs_level);
  label += "_number_of_inputs_";
  label += std::to_string(cfg.number_of_inputs);
  return label;
}
// Instantiate every TEST_P of BootstrapTestPrimitives_u64 once per entry of
// pbs_params_u64, naming each instance with printParamName.
INSTANTIATE_TEST_CASE_P(BootstrapInstantiation, BootstrapTestPrimitives_u64,
pbs_params_u64, printParamName);

View File

@@ -1,340 +0,0 @@
#include "../include/device.h"
#include "../include/linear_algebra.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <functional>
#include <stdio.h>
#include <stdlib.h>
// Number of secret keys generated by the fixture; each test loops over them.
const unsigned REPETITIONS = 5;
// Number of batches of `number_of_inputs` plaintexts tested per secret key.
const unsigned SAMPLES = 100;
// One complete parameter set for a linear algebra test case. The fields are
// listed in the order used by the brace-initialised entries of the parameter
// table.
struct LinearAlgebraTestParams {
  // Dimension of the LWE secret key / ciphertext mask.
  int lwe_dimension;
  // Noise variance handed to LWE encryption.
  double noise_variance;
  // Their product is the payload modulus used to encode messages.
  int message_modulus;
  int carry_modulus;
  // Number of ciphertexts processed per call.
  int number_of_inputs;
};
// Test fixture shared by the 64-bit linear algebra tests.
//
// SetUp() generates REPETITIONS secret keys and two sets of plaintexts, and
// allocates host and device buffers large enough for one batch of
// `number_of_inputs` LWE ciphertexts each. TearDown() releases every resource
// acquired by SetUp(), including the CUDA stream.
class LinearAlgebraTestPrimitives_u64
    : public ::testing::TestWithParam<LinearAlgebraTestParams> {
protected:
  int lwe_dimension;
  double noise_variance;
  int message_modulus;
  int carry_modulus;
  int number_of_inputs;
  int payload_modulus;
  uint64_t delta;
  Csprng *csprng;
  cudaStream_t *stream;
  int gpu_index = 0;
  uint64_t *lwe_sk_array;
  // Device buffers (d_ prefix) and their host counterparts.
  uint64_t *d_lwe_in_1_ct;
  uint64_t *d_lwe_in_2_ct;
  uint64_t *d_lwe_out_ct;
  uint64_t *lwe_in_1_ct;
  uint64_t *lwe_in_2_ct;
  uint64_t *lwe_out_ct;
  uint64_t *plaintexts_1;
  uint64_t *plaintexts_2;
  int num_samples;

public:
  // Reads the test parameters and allocates keys, plaintexts and buffers.
  void SetUp() {
    stream = cuda_create_stream(gpu_index);
    void *v_stream = (void *)stream;

    // TestParams
    const LinearAlgebraTestParams params = GetParam();
    lwe_dimension = params.lwe_dimension;
    noise_variance = params.noise_variance;
    message_modulus = params.message_modulus;
    carry_modulus = params.carry_modulus;
    number_of_inputs = params.number_of_inputs;

    payload_modulus = message_modulus * carry_modulus;
    // Value of the shift we multiply our messages by
    delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus);

    // Create a Csprng
    csprng =
        (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
    uint8_t seed[16] = {(uint8_t)0};
    concrete_cpu_construct_concrete_csprng(
        csprng, Uint128{.little_endian_bytes = {*seed}});

    // Generate the keys
    generate_lwe_secret_keys(&lwe_sk_array, lwe_dimension, csprng,
                             REPETITIONS);
    plaintexts_1 = generate_plaintexts(payload_modulus, delta,
                                       number_of_inputs, REPETITIONS, SAMPLES);
    plaintexts_2 = generate_plaintexts(payload_modulus, delta,
                                       number_of_inputs, REPETITIONS, SAMPLES);

    // Every buffer holds one batch of number_of_inputs LWE ciphertexts.
    const size_t batch_bytes =
        number_of_inputs * (lwe_dimension + 1) * sizeof(uint64_t);
    d_lwe_in_1_ct =
        (uint64_t *)cuda_malloc_async(batch_bytes, stream, gpu_index);
    d_lwe_in_2_ct =
        (uint64_t *)cuda_malloc_async(batch_bytes, stream, gpu_index);
    d_lwe_out_ct =
        (uint64_t *)cuda_malloc_async(batch_bytes, stream, gpu_index);
    lwe_in_1_ct = (uint64_t *)malloc(batch_bytes);
    lwe_in_2_ct = (uint64_t *)malloc(batch_bytes);
    lwe_out_ct = (uint64_t *)malloc(batch_bytes);
    cuda_synchronize_stream(v_stream);
  }

  // Releases everything acquired in SetUp().
  void TearDown() {
    void *v_stream = (void *)stream;
    // Make sure no queued work still refers to the buffers released below.
    cuda_synchronize_stream(v_stream);
    concrete_cpu_destroy_concrete_csprng(csprng);
    free(csprng);
    cuda_drop_async(d_lwe_in_1_ct, stream, gpu_index);
    cuda_drop_async(d_lwe_in_2_ct, stream, gpu_index);
    cuda_drop_async(d_lwe_out_ct, stream, gpu_index);
    free(lwe_in_1_ct);
    free(lwe_in_2_ct);
    free(lwe_out_ct);
    free(lwe_sk_array);
    free(plaintexts_1);
    free(plaintexts_2);
    // The stream created in SetUp() must be destroyed as well, otherwise one
    // stream is leaked per test.
    cuda_destroy_stream(stream, gpu_index);
  }
};
// Encrypts two batches of plaintexts, adds the ciphertexts on the GPU and
// checks that each result decrypts and decodes to the sum of the two
// messages.
TEST_P(LinearAlgebraTestPrimitives_u64, addition) {
  void *v_stream = (void *)stream;
  const size_t lwe_size = (size_t)lwe_dimension + 1;
  const size_t batch_bytes = number_of_inputs * lwe_size * sizeof(uint64_t);

  for (uint r = 0; r < REPETITIONS; r++) {
    uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
    for (uint s = 0; s < SAMPLES; s++) {
      // Index of the first plaintext of this batch.
      const size_t batch_start = ((size_t)r * SAMPLES + s) * number_of_inputs;

      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext_1 = plaintexts_1[batch_start + i];
        uint64_t plaintext_2 = plaintexts_2[batch_start + i];
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_in_1_ct + i * lwe_size, plaintext_1, lwe_dimension,
            noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_in_2_ct + i * lwe_size, plaintext_2, lwe_dimension,
            noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
      }
      cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct, batch_bytes, stream,
                               gpu_index);
      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, lwe_in_2_ct, batch_bytes, stream,
                               gpu_index);
      // Execute addition
      cuda_add_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
          (void *)d_lwe_in_2_ct, lwe_dimension, number_of_inputs);
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, batch_bytes, stream,
                               gpu_index);
      // The copies are asynchronous: wait before reading the result on the
      // host and before the next iteration overwrites the host inputs.
      cuda_synchronize_stream(v_stream);

      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext_1 = plaintexts_1[batch_start + i];
        uint64_t plaintext_2 = plaintexts_2[batch_start + i];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_out_ct + i * lwe_size, lwe_dimension, &decrypted);
        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, (plaintext_1 + plaintext_2) / delta);
      }
    }
  }
}
// Encrypts one batch of plaintexts, adds a second batch of plaintexts to the
// ciphertexts on the GPU and checks that each result decrypts and decodes to
// the sum of the two messages.
TEST_P(LinearAlgebraTestPrimitives_u64, plaintext_addition) {
  void *v_stream = (void *)stream;
  const size_t lwe_size = (size_t)lwe_dimension + 1;
  const size_t batch_bytes = number_of_inputs * lwe_size * sizeof(uint64_t);

  for (uint r = 0; r < REPETITIONS; r++) {
    uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
    for (uint s = 0; s < SAMPLES; s++) {
      // Index of the first plaintext of this batch.
      const size_t batch_start = ((size_t)r * SAMPLES + s) * number_of_inputs;

      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext_1 = plaintexts_1[batch_start + i];
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_in_1_ct + i * lwe_size, plaintext_1, lwe_dimension,
            noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
      }
      cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct, batch_bytes, stream,
                               gpu_index);
      // The second operand is a vector of plaintexts (one word per input),
      // stored at the beginning of the second device buffer.
      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, &plaintexts_2[batch_start],
                               number_of_inputs * sizeof(uint64_t), stream,
                               gpu_index);
      // Execute plaintext addition
      cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
          (void *)d_lwe_in_2_ct, lwe_dimension, number_of_inputs);
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, batch_bytes, stream,
                               gpu_index);
      // The copies are asynchronous: wait before reading the result on the
      // host and before the next iteration overwrites the host inputs.
      cuda_synchronize_stream(v_stream);

      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext_1 = plaintexts_1[batch_start + i];
        uint64_t plaintext_2 = plaintexts_2[batch_start + i];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_out_ct + i * lwe_size, lwe_dimension, &decrypted);
        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, (plaintext_1 + plaintext_2) / delta);
      }
    }
  }
}
// Encrypts one batch of messages (re-encoded with delta_2 so that the product
// fits in the payload), multiplies each ciphertext by a cleartext on the GPU
// and checks that each result decrypts and decodes to the product.
TEST_P(LinearAlgebraTestPrimitives_u64, cleartext_multiplication) {
  void *v_stream = (void *)stream;
  const size_t lwe_size = (size_t)lwe_dimension + 1;
  const size_t batch_bytes = number_of_inputs * lwe_size * sizeof(uint64_t);
  const size_t cleartext_bytes = number_of_inputs * sizeof(uint64_t);
  // Encoding shift leaving room for the product of two payloads.
  uint64_t delta_2 =
      ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus * payload_modulus);

  // Allocated once and released at the end of the test; allocating it inside
  // the loops without freeing it leaked one buffer per iteration.
  uint64_t *cleartext_array = (uint64_t *)malloc(cleartext_bytes);

  for (uint r = 0; r < REPETITIONS; r++) {
    uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
    for (uint s = 0; s < SAMPLES; s++) {
      // Index of the first plaintext of this batch.
      const size_t batch_start = ((size_t)r * SAMPLES + s) * number_of_inputs;

      for (int i = 0; i < number_of_inputs; i++) {
        // Re-encode the first message with delta_2 instead of delta.
        uint64_t plaintext_1 = plaintexts_1[batch_start + i] / delta * delta_2;
        uint64_t plaintext_2 = plaintexts_2[batch_start + i];
        cleartext_array[i] = plaintext_2 / delta;
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_in_1_ct + i * lwe_size, plaintext_1, lwe_dimension,
            noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
      }
      cuda_synchronize_stream(v_stream);
      cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct, batch_bytes, stream,
                               gpu_index);
      cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, cleartext_array, cleartext_bytes,
                               stream, gpu_index);
      // Execute cleartext multiplication
      cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
          (void *)d_lwe_in_2_ct, lwe_dimension, number_of_inputs);
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, batch_bytes, stream,
                               gpu_index);
      // Wait for the copies: the result is read below and the host inputs
      // (including cleartext_array) are overwritten by the next iteration.
      cuda_synchronize_stream(v_stream);

      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext = plaintexts_1[batch_start + i] / delta * delta_2;
        uint64_t cleartext = plaintexts_2[batch_start + i] / delta;
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_out_ct + i * lwe_size, lwe_dimension, &decrypted);
        // The bit before the message
        uint64_t rounding_bit = delta_2 >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta_2;
        EXPECT_EQ(decoded, plaintext / delta_2 * cleartext);
      }
    }
  }
  free(cleartext_array);
}
// Encrypts one batch of plaintexts, negates the ciphertexts on the GPU and
// checks that each result decrypts and decodes to the negated message.
TEST_P(LinearAlgebraTestPrimitives_u64, negate) {
  void *v_stream = (void *)stream;
  const size_t lwe_size = (size_t)lwe_dimension + 1;
  const size_t batch_bytes = number_of_inputs * lwe_size * sizeof(uint64_t);

  for (uint r = 0; r < REPETITIONS; r++) {
    uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
    for (uint s = 0; s < SAMPLES; s++) {
      // Index of the first plaintext of this batch.
      const size_t batch_start = ((size_t)r * SAMPLES + s) * number_of_inputs;

      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext = plaintexts_1[batch_start + i];
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_in_1_ct + i * lwe_size, plaintext, lwe_dimension,
            noise_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
      }
      cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct, batch_bytes, stream,
                               gpu_index);
      // Execute negation
      cuda_negate_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
          lwe_dimension, number_of_inputs);
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, batch_bytes, stream,
                               gpu_index);
      // The copies are asynchronous: wait before reading the result on the
      // host and before the next iteration overwrites the host inputs.
      cuda_synchronize_stream(v_stream);

      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext = plaintexts_1[batch_start + i];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_out_ct + i * lwe_size, lwe_dimension, &decrypted);
        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        // Negation is taken modulo 2^64 on the encoded plaintext.
        EXPECT_EQ(decoded, -plaintext / delta);
      }
    }
  }
}
// Defines the parameter sets for which the linear algebra operations are
// tested. Each entry is one complete configuration passed to
// ::testing::Values; every TEST_P of the fixture is run once per entry.
::testing::internal::ParamGenerator<LinearAlgebraTestParams>
linear_algebra_params_u64 = ::testing::Values(
// Field order of each entry:
// n, noise_variance, message_modulus, carry_modulus, number_of_inputs
(LinearAlgebraTestParams){600, 7.52316384526264e-37, 2, 2, 10});
// Builds the suffix appended to each instantiated test name from the LWE
// dimension of the parameter set.
std::string
printParamName(::testing::TestParamInfo<LinearAlgebraTestParams> p) {
  const LinearAlgebraTestParams &cfg = p.param;
  std::string label = "n_";
  label += std::to_string(cfg.lwe_dimension);
  return label;
}
// Instantiate every TEST_P of LinearAlgebraTestPrimitives_u64 once per entry
// of linear_algebra_params_u64, naming each instance with printParamName.
INSTANTIATE_TEST_CASE_P(LinearAlgebraInstantiation,
LinearAlgebraTestPrimitives_u64,
linear_algebra_params_u64, printParamName);

View File

@@ -0,0 +1,2 @@
# Build the test suite and the benchmark tool from their own subdirectories.
add_subdirectory(test)
add_subdirectory(benchmark)

View File

@@ -0,0 +1,48 @@
# Build script of the benchmark_concrete_cuda executable.
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
find_package(CUDAToolkit)
# Benchmarks are only meaningful with optimizations: default to Release.
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
# Disable the Google Benchmark requirement on Google Test
set(BENCHMARK_ENABLE_GTEST_TESTS OFF)
set(BENCHMARK_ENABLE_TESTING OFF)
# FetchContent_Declare / FetchContent_MakeAvailable are provided by the
# FetchContent module. Include it here so this file does not rely on another
# directory having included it first.
include(FetchContent)
FetchContent_Declare(
googlebenchmark
GIT_REPOSITORY https://github.com/google/benchmark.git
GIT_TAG v1.7.1)
FetchContent_MakeAvailable(googlebenchmark)
# Enable ExternalProject CMake module
include(ExternalProject)
# Locations of the concrete-cpu static library and headers, and of the
# concrete-cuda sources, relative to this directory.
set(CONCRETE_CPU_BINARY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../concrete-cpu/implementation/target/release")
set(CONCRETE_CPU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../concrete-cpu/implementation")
set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../")
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
include_directories(${CONCRETE_CPU_SOURCE_DIR}/include)
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)
# concrete-cpu is consumed as a prebuilt static library.
add_library(concrete_cpu_lib STATIC IMPORTED)
set_target_properties(concrete_cpu_lib PROPERTIES IMPORTED_LOCATION ${CONCRETE_CPU_BINARY_DIR}/libconcrete_cpu.a)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed -ldl")
set(BINARY benchmark_concrete_cuda)
# Collect the benchmark sources of this directory.
file(
GLOB_RECURSE BENCH_SOURCES
LIST_DIRECTORIES false
benchmark*.cpp main.cpp)
set(SOURCES ${BENCH_SOURCES})
# utils.cpp and setup_and_teardown.cpp come from the parent directory.
add_executable(${BINARY} ${BENCH_SOURCES} ../utils.cpp ../setup_and_teardown.cpp)
set_target_properties(benchmark_concrete_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(
benchmark_concrete_cuda
PUBLIC benchmark::benchmark concrete_cpu_lib concrete_cuda
PRIVATE CUDA::cudart)

View File

@@ -0,0 +1,78 @@
# benchmark_concrete_cuda
This benchmark tool is built on top of the Google Benchmark library. It measures the performance of the
CUDA-accelerated functions of the Concrete framework and helps to identify potential
bottlenecks.
## How to Compile
The first step in compiling code with CMake is to create a build directory. This directory will
contain all the files generated during the build process, such as object files and executables.
We recommend creating this directory outside of the source directory, but inside the
implementation folder, to keep the source directory clean.
```bash
$ cd concrete-open-source/backends/concrete-cuda/implementation
$ mkdir build
$ cd build
```
Run CMake to generate the build files and then use make to compile the project.
```bash
$ cmake ..
$ make
```
## How to Run Benchmarks
To run benchmarks, execute the `benchmark_concrete_cuda` executable from the build directory with no arguments:
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda
```
This will run all the benchmarks in the code.
## How to Filter Benchmarks
You can filter benchmarks by specifying a regular expression as an argument. Only benchmarks whose name matches the regular expression will be executed.
For example, to run only benchmarks whose name contains the word "Bootstrap", you can execute:
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_filter=Bootstrap
```
## How to Set the Time Unit
By default, benchmarks are reported in seconds. However, you can change the time unit to one of the following:
* `ns` (nanoseconds)
* `us` (microseconds)
* `ms` (milliseconds)
* `s` (seconds)
To set the time unit, use the `--benchmark_time_unit` option followed by the desired time unit:
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_time_unit=us
```
## How to Set the Number of Iterations
By default, each benchmark is executed for a number of iterations that is automatically determined by the Google Benchmark library.
However, you can raise the minimum time spent on each measurement, and therefore the number of
iterations, with the `--benchmark_min_time` option. For instance:
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_min_time=10
```
will make each benchmark run for at least 10 seconds.
## Conclusion
With these options, you can easily run benchmarks, filter benchmarks, set the time unit, and the number of iterations of benchmark_concrete_cuda. If you have any questions or issues, please feel free to contact us.
To learn more about Google Benchmark library, please refer to the [official user guide](https://github.com/google/benchmark/blob/main/docs/user_guide.md).

View File

@@ -0,0 +1,113 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <setup_and_teardown.h>
// One complete parameter set for a bit extraction benchmark. The fields are
// listed in the order used by the brace-initialised entries of the parameter
// list.
struct BitExtractionBenchmarkParams {
  // Dimensions.
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  // Decomposition parameters of the bootstrap.
  int pbs_base_log;
  int pbs_level;
  // Decomposition parameters of the keyswitch.
  int ks_base_log;
  int ks_level;
  // Message layout and number of bits extracted from it.
  int number_of_bits_of_message_including_padding;
  int number_of_bits_to_extract;
  // Number of input ciphertexts.
  int number_of_inputs;
};
class BitExtractionBenchmark_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance = 7.52316384526264e-37;
double glwe_modular_variance = 7.52316384526264e-37;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
int number_of_bits_of_message_including_padding;
int number_of_bits_to_extract;
int number_of_inputs;
uint64_t delta;
int delta_log;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *plaintexts;
double *d_fourier_bsk;
uint64_t *d_ksk;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
int8_t *bit_extract_buffer;
uint64_t *lwe_sk_in;
uint64_t *lwe_sk_out;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
pbs_base_log = state.range(3);
pbs_level = state.range(4);
ks_base_log = state.range(5);
ks_level = state.range(6);
number_of_bits_of_message_including_padding = state.range(7);
number_of_bits_to_extract = state.range(8);
number_of_inputs = state.range(9);
bit_extraction_setup(
stream, &csprng, &lwe_sk_in, &lwe_sk_out, &d_fourier_bsk, &d_ksk,
&plaintexts, &d_lwe_ct_in_array, &d_lwe_ct_out_array,
&bit_extract_buffer, lwe_dimension, glwe_dimension, polynomial_size,
lwe_modular_variance, glwe_modular_variance, ks_base_log, ks_level,
pbs_base_log, pbs_level, number_of_bits_of_message_including_padding,
number_of_bits_to_extract, &delta_log, &delta, number_of_inputs, 1, 1,
gpu_index);
}
void TearDown() {
bit_extraction_teardown(stream, csprng, lwe_sk_in, lwe_sk_out,
d_fourier_bsk, d_ksk, plaintexts, d_lwe_ct_in_array,
d_lwe_ct_out_array, bit_extract_buffer, gpu_index);
}
};
// Times one call to cuda_extract_bits_64 followed by a stream
// synchronization, so that each iteration measures the complete execution of
// the operation on the GPU.
BENCHMARK_DEFINE_F(BitExtractionBenchmark_u64, BitExtraction)
(benchmark::State &st) {
  // Loop-invariant device query: done once, outside of the timed region.
  const auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  for (auto _ : st) {
    // Execute bit extract
    cuda_extract_bits_64(
        stream, gpu_index, (void *)d_lwe_ct_out_array,
        (void *)d_lwe_ct_in_array, bit_extract_buffer, (void *)d_ksk,
        (void *)d_fourier_bsk, number_of_bits_to_extract, delta_log,
        glwe_dimension * polynomial_size, lwe_dimension, glwe_dimension,
        polynomial_size, pbs_base_log, pbs_level, ks_base_log, ks_level,
        number_of_inputs, max_shared_memory);
    // The call above is asynchronous: wait for the GPU before ending the
    // iteration.
    cuda_synchronize_stream(stream);
  }
}
// Feeds every bit-extraction parameter set to the benchmark as one Args()
// tuple, in the order the fixture's SetUp reads them back.
static void
BitExtractionBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  const std::vector<BitExtractionBenchmarkParams> parameter_sets = {
      BitExtractionBenchmarkParams{585, 1, 1024, 10, 2, 4, 7, 5, 5, 1},
  };
  for (const auto &set : parameter_sets) {
    b->Args({set.lwe_dimension, set.glwe_dimension, set.polynomial_size,
             set.pbs_base_log, set.pbs_level, set.ks_base_log, set.ks_level,
             set.number_of_bits_of_message_including_padding,
             set.number_of_bits_to_extract, set.number_of_inputs});
  }
}
// Register the BitExtraction benchmark with the argument tuples produced by
// BitExtractionBenchmarkGenerateParams.
BENCHMARK_REGISTER_F(BitExtractionBenchmark_u64, BitExtraction)
->Apply(BitExtractionBenchmarkGenerateParams);

View File

@@ -0,0 +1,195 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <setup_and_teardown.h>
// One parameter set for the bootstrap benchmarks; the field order is the
// order used when brace-initialising entries in
// BootstrapBenchmarkGenerateParams.
struct BootstrapBenchmarkParams {
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  int pbs_base_log;
  int pbs_level;
  int input_lwe_ciphertext_count;
};
class BootstrapBenchmark_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int input_lwe_ciphertext_count;
double lwe_modular_variance = 0.000007069849454709433;
double glwe_modular_variance = 0.00000000000000029403601535432533;
int pbs_base_log;
int pbs_level;
int message_modulus = 4;
int carry_modulus = 4;
int payload_modulus;
uint64_t delta;
double *d_fourier_bsk_array;
uint64_t *d_lut_pbs_identity;
uint64_t *d_lut_pbs_indexes;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *lwe_ct_array;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
int8_t *amortized_pbs_buffer;
int8_t *lowlat_pbs_buffer;
public:
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
pbs_base_log = state.range(3);
pbs_level = state.range(4);
input_lwe_ciphertext_count = state.range(5);
bootstrap_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_fourier_bsk_array, &plaintexts, &d_lut_pbs_identity,
&d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_ct_out_array,
&amortized_pbs_buffer, &lowlat_pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, pbs_base_log, pbs_level,
message_modulus, carry_modulus, &payload_modulus, &delta,
input_lwe_ciphertext_count, 1, 1, gpu_index);
// We keep the following for the benchmarks with copies
lwe_ct_array = (uint64_t *)malloc(
(lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint64_t));
}
void TearDown() {
bootstrap_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
d_fourier_bsk_array, plaintexts, d_lut_pbs_identity,
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_ct_out_array,
amortized_pbs_buffer, lowlat_pbs_buffer, gpu_index);
free(lwe_ct_array);
}
};
// Times the amortized PBS on input_lwe_ciphertext_count ciphertexts: one
// call followed by a stream synchronisation per iteration.
BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, AmortizedPBS)(benchmark::State &st) {
  void *const v_stream = static_cast<void *>(stream);
  void *const ct_out = static_cast<void *>(d_lwe_ct_out_array);
  void *const lut = static_cast<void *>(d_lut_pbs_identity);
  void *const lut_indexes = static_cast<void *>(d_lut_pbs_indexes);
  void *const ct_in = static_cast<void *>(d_lwe_ct_in_array);
  void *const bsk = static_cast<void *>(d_fourier_bsk_array);
  for (auto _ : st) {
    const auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
    cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
        stream, gpu_index, ct_out, lut, lut_indexes, ct_in, bsk,
        amortized_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
        pbs_base_log, pbs_level, input_lwe_ciphertext_count,
        input_lwe_ciphertext_count, 0, max_shared_memory);
    cuda_synchronize_stream(v_stream);
  }
}
// Times host-to-device copy + amortized PBS + device-to-host copy, with a
// single stream synchronisation at the end of each iteration.
// NOTE(review): the copy back uses the same (lwe_dimension + 1)-word size as
// the upload because that is how the host buffer is sized; confirm whether
// the output ciphertexts are actually that size.
BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, CopiesPlusAmortizedPBS)
(benchmark::State &st) {
  void *const v_stream = static_cast<void *>(stream);
  void *const ct_out = static_cast<void *>(d_lwe_ct_out_array);
  void *const lut = static_cast<void *>(d_lut_pbs_identity);
  void *const lut_indexes = static_cast<void *>(d_lut_pbs_indexes);
  void *const ct_in = static_cast<void *>(d_lwe_ct_in_array);
  void *const bsk = static_cast<void *>(d_fourier_bsk_array);
  const auto copy_bytes =
      (lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint64_t);
  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_array, copy_bytes,
                             stream, gpu_index);
    const auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
    cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
        stream, gpu_index, ct_out, lut, lut_indexes, ct_in, bsk,
        amortized_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
        pbs_base_log, pbs_level, input_lwe_ciphertext_count,
        input_lwe_ciphertext_count, 0, max_shared_memory);
    cuda_memcpy_async_to_cpu(lwe_ct_array, d_lwe_ct_out_array, copy_bytes,
                             stream, gpu_index);
    cuda_synchronize_stream(v_stream);
  }
}
// Times the low-latency PBS: one call followed by a stream synchronisation
// per iteration.
// NOTE(review): the two arguments after pbs_level are the literal 1 where the
// amortized benchmark passes input_lwe_ciphertext_count. If they are the
// sample / LUT counts, this benchmark processes one ciphertext whatever count
// it was registered with - confirm that this is intended.
BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, LowLatencyPBS)
(benchmark::State &st) {
  void *const ct_out = static_cast<void *>(d_lwe_ct_out_array);
  void *const lut = static_cast<void *>(d_lut_pbs_identity);
  void *const lut_indexes = static_cast<void *>(d_lut_pbs_indexes);
  void *const ct_in = static_cast<void *>(d_lwe_ct_in_array);
  void *const bsk = static_cast<void *>(d_fourier_bsk_array);
  for (auto _ : st) {
    const auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
    cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
        stream, gpu_index, ct_out, lut, lut_indexes, ct_in, bsk,
        lowlat_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
        pbs_base_log, pbs_level, 1, 1, 0, max_shared_memory);
    cuda_synchronize_stream(stream);
  }
}
// Times host-to-device copy + low-latency PBS + device-to-host copy, with a
// single stream synchronisation at the end of each iteration.
// NOTE(review): the copies move input_lwe_ciphertext_count ciphertexts while
// the PBS call receives the literal 1 in the two arguments after pbs_level -
// confirm that processing a single ciphertext is intended.
BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, CopiesPlusLowLatencyPBS)
(benchmark::State &st) {
  void *const v_stream = static_cast<void *>(stream);
  void *const ct_out = static_cast<void *>(d_lwe_ct_out_array);
  void *const lut = static_cast<void *>(d_lut_pbs_identity);
  void *const lut_indexes = static_cast<void *>(d_lut_pbs_indexes);
  void *const ct_in = static_cast<void *>(d_lwe_ct_in_array);
  void *const bsk = static_cast<void *>(d_fourier_bsk_array);
  const auto copy_bytes =
      (lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint64_t);
  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_array, copy_bytes,
                             stream, gpu_index);
    const auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
    cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
        stream, gpu_index, ct_out, lut, lut_indexes, ct_in, bsk,
        lowlat_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
        pbs_base_log, pbs_level, 1, 1, 0, max_shared_memory);
    cuda_memcpy_async_to_cpu(lwe_ct_array, d_lwe_ct_out_array, copy_bytes,
                             stream, gpu_index);
    cuda_synchronize_stream(v_stream);
  }
}
// Registers every bootstrap parameter set at 1, 10, 100, 1000 and 10000
// input ciphertexts. The count stored in each table entry is not used; the
// swept value is passed instead.
static void
BootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  // Field order: lwe_dimension, glwe_dimension, polynomial_size,
  // pbs_base_log, pbs_level, input_lwe_ciphertext_count
  const std::vector<BootstrapBenchmarkParams> parameter_sets = {
      BootstrapBenchmarkParams{567, 5, 256, 15, 1, 1},
      BootstrapBenchmarkParams{577, 6, 256, 12, 3, 1},
      BootstrapBenchmarkParams{553, 4, 512, 12, 3, 1},
      BootstrapBenchmarkParams{769, 2, 1024, 23, 1, 1},
      BootstrapBenchmarkParams{714, 2, 1024, 15, 2, 1},
      BootstrapBenchmarkParams{694, 2, 1024, 8, 5, 1},
      BootstrapBenchmarkParams{881, 1, 8192, 22, 1, 1},
      BootstrapBenchmarkParams{879, 1, 8192, 11, 3, 1},
  };
  for (const auto &set : parameter_sets) {
    int num_samples = 1;
    while (num_samples <= 10000) {
      b->Args({set.lwe_dimension, set.glwe_dimension, set.polynomial_size,
               set.pbs_base_log, set.pbs_level, num_samples});
      num_samples *= 10;
    }
  }
}
// Register the four bootstrap benchmarks; each one runs over the argument
// tuples produced by BootstrapBenchmarkGenerateParams.
BENCHMARK_REGISTER_F(BootstrapBenchmark_u64, AmortizedPBS)
->Apply(BootstrapBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(BootstrapBenchmark_u64, LowLatencyPBS)
->Apply(BootstrapBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(BootstrapBenchmark_u64, CopiesPlusAmortizedPBS)
->Apply(BootstrapBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(BootstrapBenchmark_u64, CopiesPlusLowLatencyPBS)
->Apply(BootstrapBenchmarkGenerateParams);

View File

@@ -0,0 +1,120 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
// One parameter set for the circuit-bootstrap benchmark; the field order is
// the order used when brace-initialising entries in
// CircuitBootstrapBenchmarkGenerateParams.
struct CircuitBootstrapBenchmarkParams {
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  int pbs_base_log;
  int pbs_level;
  int pksk_base_log;
  int pksk_level;
  int cbs_base_log;
  int cbs_level;
  int number_of_inputs;
};
class CircuitBootstrapBenchmark_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance = 7.52316384526264e-37;
double glwe_modular_variance = 7.52316384526264e-37;
int pbs_base_log;
int pbs_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int number_of_inputs;
int number_of_bits_of_message_including_padding;
int ggsw_size;
uint64_t delta;
int delta_log;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_in;
uint64_t *lwe_sk_out;
uint64_t *plaintexts;
double *d_fourier_bsk;
uint64_t *d_pksk;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_ggsw_ct_out_array;
uint64_t *d_lut_vector_indexes;
int8_t *cbs_buffer;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
pbs_base_log = state.range(3);
pbs_level = state.range(4);
pksk_base_log = state.range(5);
pksk_level = state.range(6);
cbs_base_log = state.range(7);
cbs_level = state.range(8);
number_of_inputs = state.range(9);
// We generate binary messages
number_of_bits_of_message_including_padding = 2;
ggsw_size = cbs_level * (glwe_dimension + 1) * (glwe_dimension + 1) *
polynomial_size;
circuit_bootstrap_setup(
stream, &csprng, &lwe_sk_in, &lwe_sk_out, &d_fourier_bsk, &d_pksk,
&plaintexts, &d_lwe_ct_in_array, &d_ggsw_ct_out_array,
&d_lut_vector_indexes, &cbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, lwe_modular_variance, glwe_modular_variance,
pksk_base_log, pksk_level, pbs_base_log, pbs_level, cbs_level,
number_of_bits_of_message_including_padding, ggsw_size, &delta_log,
&delta, number_of_inputs, 1, 1, gpu_index);
}
void TearDown() {
circuit_bootstrap_teardown(stream, csprng, lwe_sk_in, lwe_sk_out,
d_fourier_bsk, d_pksk, plaintexts,
d_lwe_ct_in_array, d_lut_vector_indexes,
d_ggsw_ct_out_array, cbs_buffer, gpu_index);
}
};
// Times one circuit-bootstrap call followed by a stream synchronisation per
// iteration.
BENCHMARK_DEFINE_F(CircuitBootstrapBenchmark_u64, CircuitBootstrap)
(benchmark::State &st) {
  void *const ggsw_out = static_cast<void *>(d_ggsw_ct_out_array);
  void *const ct_in = static_cast<void *>(d_lwe_ct_in_array);
  void *const bsk = static_cast<void *>(d_fourier_bsk);
  void *const pksk = static_cast<void *>(d_pksk);
  void *const lut_indexes = static_cast<void *>(d_lut_vector_indexes);
  for (auto _ : st) {
    const auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
    cuda_circuit_bootstrap_64(stream, gpu_index, ggsw_out, ct_in, bsk, pksk,
                              lut_indexes, cbs_buffer, delta_log,
                              polynomial_size, glwe_dimension, lwe_dimension,
                              pbs_level, pbs_base_log, pksk_level,
                              pksk_base_log, cbs_level, cbs_base_log,
                              number_of_inputs, max_shared_memory);
    cuda_synchronize_stream(stream);
  }
}
// Feeds every circuit-bootstrap parameter set to the benchmark as one Args()
// tuple, in the order the fixture's SetUp reads them back.
static void
CircuitBootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  const std::vector<CircuitBootstrapBenchmarkParams> parameter_sets = {
      CircuitBootstrapBenchmarkParams{10, 2, 512, 11, 2, 15, 2, 10, 1, 10},
  };
  for (const auto &set : parameter_sets) {
    b->Args({set.lwe_dimension, set.glwe_dimension, set.polynomial_size,
             set.pbs_base_log, set.pbs_level, set.pksk_base_log,
             set.pksk_level, set.cbs_base_log, set.cbs_level,
             set.number_of_inputs});
  }
}
// Register the CircuitBootstrap benchmark with the argument tuples produced
// by CircuitBootstrapBenchmarkGenerateParams.
BENCHMARK_REGISTER_F(CircuitBootstrapBenchmark_u64, CircuitBootstrap)
->Apply(CircuitBootstrapBenchmarkGenerateParams);

View File

@@ -0,0 +1,94 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <functional>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
// One parameter set for the CMUX-tree benchmark; the field order is the
// order used when brace-initialising entries in
// CMUXTreeBenchmarkGenerateParams.
struct CMUXTreeBenchmarkParams {
  int glwe_dimension;
  int polynomial_size;
  int r;
  int tau;
  int base_log;
  int level_count;
};
class CMUXTreeBenchmark_u64 : public benchmark::Fixture {
protected:
int glwe_dimension;
int polynomial_size;
int r_lut;
int tau;
double glwe_modular_variance = 0.00000000000000029403601535432533;
int base_log;
int level_count;
uint64_t delta;
int delta_log = 60;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *d_lut_identity;
uint64_t *d_ggsw_bit_array;
uint64_t *plaintexts;
uint64_t *d_glwe_out;
uint64_t *glwe_sk;
int8_t *cmux_tree_buffer = nullptr;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// TestParams
glwe_dimension = state.range(0);
polynomial_size = state.range(1);
r_lut = state.range(2);
tau = state.range(3);
base_log = state.range(4);
level_count = state.range(5);
// Value of the shift we multiply our messages by
delta = ((uint64_t)(1) << delta_log);
cmux_tree_setup(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts,
&d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out,
glwe_dimension, polynomial_size, base_log, level_count,
glwe_modular_variance, r_lut, tau, delta_log, 1, 1,
gpu_index);
}
void TearDown() {
cmux_tree_teardown(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts,
&d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out,
gpu_index);
}
};
// Times one CMUX-tree call followed by a stream synchronisation per
// iteration.
BENCHMARK_DEFINE_F(CMUXTreeBenchmark_u64, CMUXTree)(benchmark::State &st) {
  void *const glwe_out = static_cast<void *>(d_glwe_out);
  void *const ggsw_bits = static_cast<void *>(d_ggsw_bit_array);
  void *const lut = static_cast<void *>(d_lut_identity);
  for (auto _ : st) {
    const auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
    cuda_cmux_tree_64(stream, gpu_index, glwe_out, ggsw_bits, lut,
                      cmux_tree_buffer, glwe_dimension, polynomial_size,
                      base_log, level_count, r_lut, tau, max_shared_memory);
    cuda_synchronize_stream(stream);
  }
}
// Feeds every CMUX-tree parameter set to the benchmark as one Args() tuple.
// Field order: glwe_dimension, polynomial_size, r, tau, base_log, level_count
static void CMUXTreeBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  const std::vector<CMUXTreeBenchmarkParams> parameter_sets = {
      CMUXTreeBenchmarkParams{2, 256, 10, 6, 6, 3},
  };
  for (const auto &set : parameter_sets) {
    b->Args({set.glwe_dimension, set.polynomial_size, set.r, set.tau,
             set.base_log, set.level_count});
  }
}
// Register the CMUXTree benchmark with the argument tuples produced by
// CMUXTreeBenchmarkGenerateParams.
BENCHMARK_REGISTER_F(CMUXTreeBenchmark_u64, CMUXTree)
->Apply(CMUXTreeBenchmarkGenerateParams);

View File

@@ -0,0 +1,117 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
// One parameter set for the keyswitch benchmarks; the field order is the
// order used when brace-initialising entries in
// KeyswitchBenchmarkGenerateParams.
struct KeyswitchBenchmarkParams {
  int input_lwe_dimension;
  int output_lwe_dimension;
  int ksk_base_log;
  int ksk_level;
  int number_of_inputs;
};
class KeyswitchBenchmark_u64 : public benchmark::Fixture {
protected:
int input_lwe_dimension;
int output_lwe_dimension;
double noise_variance = 2.9802322387695312e-08;
int ksk_base_log;
int ksk_level;
int message_modulus = 4;
int carry_modulus = 4;
int payload_modulus;
int number_of_inputs;
uint64_t delta;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *plaintexts;
uint64_t *d_ksk_array;
uint64_t *d_lwe_out_ct_array;
uint64_t *d_lwe_in_ct_array;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// TestParams
input_lwe_dimension = state.range(0);
output_lwe_dimension = state.range(1);
ksk_base_log = state.range(2);
ksk_level = state.range(3);
number_of_inputs = state.range(4);
keyswitch_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_ksk_array, &plaintexts, &d_lwe_in_ct_array,
&d_lwe_out_ct_array, input_lwe_dimension,
output_lwe_dimension, noise_variance, ksk_base_log,
ksk_level, message_modulus, carry_modulus, &payload_modulus,
&delta, number_of_inputs, 1, 1, gpu_index);
}
void TearDown() {
keyswitch_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
d_ksk_array, plaintexts, d_lwe_in_ct_array,
d_lwe_out_ct_array, gpu_index);
}
};
// Times one keyswitch call followed by a stream synchronisation per
// iteration.
BENCHMARK_DEFINE_F(KeyswitchBenchmark_u64, Keyswitch)(benchmark::State &st) {
  void *const ct_out = static_cast<void *>(d_lwe_out_ct_array);
  void *const ct_in = static_cast<void *>(d_lwe_in_ct_array);
  void *const ksk = static_cast<void *>(d_ksk_array);
  for (auto _ : st) {
    cuda_keyswitch_lwe_ciphertext_vector_64(
        stream, gpu_index, ct_out, ct_in, ksk, input_lwe_dimension,
        output_lwe_dimension, ksk_base_log, ksk_level, number_of_inputs);
    cuda_synchronize_stream(stream);
  }
}
// Times host-to-device copy + keyswitch + device-to-host copy, with a single
// stream synchronisation at the end of each iteration.
//
// Input and output ciphertexts have different sizes
// ((input_lwe_dimension + 1) vs (output_lwe_dimension + 1) words each), so
// the copy back needs its own host buffer: writing the output into a buffer
// sized for the input overflows the heap whenever
// output_lwe_dimension > input_lwe_dimension.
BENCHMARK_DEFINE_F(KeyswitchBenchmark_u64, CopiesPlusKeyswitch)
(benchmark::State &st) {
  const size_t in_bytes = (size_t)number_of_inputs * (input_lwe_dimension + 1) *
                          sizeof(uint64_t);
  const size_t out_bytes = (size_t)number_of_inputs *
                           (output_lwe_dimension + 1) * sizeof(uint64_t);
  // The input buffer is uploaded without ever being written on the host, so
  // zero-initialise it rather than sending indeterminate bytes.
  uint64_t *lwe_in_ct = (uint64_t *)calloc(1, in_bytes);
  uint64_t *lwe_out_ct = (uint64_t *)malloc(out_bytes);
  if (lwe_in_ct == nullptr || lwe_out_ct == nullptr) {
    free(lwe_in_ct);
    free(lwe_out_ct);
    st.SkipWithError("host buffer allocation failed");
    return;
  }
  void *v_stream = (void *)stream;
  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_in_ct_array, lwe_in_ct, in_bytes, stream,
                             gpu_index);
    // Execute keyswitch
    cuda_keyswitch_lwe_ciphertext_vector_64(
        stream, gpu_index, (void *)d_lwe_out_ct_array,
        (void *)d_lwe_in_ct_array, (void *)d_ksk_array, input_lwe_dimension,
        output_lwe_dimension, ksk_base_log, ksk_level, number_of_inputs);
    cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct_array, out_bytes, stream,
                             gpu_index);
    cuda_synchronize_stream(v_stream);
  }
  free(lwe_in_ct);
  free(lwe_out_ct);
}
// Feeds every keyswitch parameter set to the benchmark as one Args() tuple.
// Field order: input_lwe_dimension, output_lwe_dimension, ksk_base_log,
// ksk_level, number_of_inputs
static void
KeyswitchBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  const std::vector<KeyswitchBenchmarkParams> parameter_sets = {
      KeyswitchBenchmarkParams{600, 1024, 3, 8, 10},
  };
  for (const auto &set : parameter_sets) {
    b->Args({set.input_lwe_dimension, set.output_lwe_dimension,
             set.ksk_base_log, set.ksk_level, set.number_of_inputs});
  }
}
// Register both keyswitch benchmarks; each one runs over the argument tuples
// produced by KeyswitchBenchmarkGenerateParams.
BENCHMARK_REGISTER_F(KeyswitchBenchmark_u64, Keyswitch)
->Apply(KeyswitchBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(KeyswitchBenchmark_u64, CopiesPlusKeyswitch)
->Apply(KeyswitchBenchmarkGenerateParams);

View File

@@ -0,0 +1,227 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
// One parameter set for the linear-algebra benchmarks; the field order is
// the order used when brace-initialising entries in
// LinearAlgebraBenchmarkGenerateParams.
struct LinearAlgebraBenchmarkParams {
  int lwe_dimension;
  int input_lwe_ciphertext_count;
};
class LinearAlgebraBenchmark_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
double noise_variance = 2.9802322387695312e-08;
int ksk_base_log;
int ksk_level;
int message_modulus = 4;
int carry_modulus = 4;
int num_samples;
uint64_t delta;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *d_lwe_in_1_ct;
uint64_t *d_lwe_in_2_ct;
uint64_t *d_lwe_out_ct;
uint64_t *plaintexts_1;
uint64_t *plaintexts_2;
uint64_t *d_plaintext_2;
uint64_t *d_cleartext;
uint64_t *lwe_in_1_ct;
uint64_t *lwe_in_2_ct;
uint64_t *lwe_out_ct;
uint64_t *lwe_sk_array;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = state.range(0);
num_samples = state.range(1);
int payload_modulus = message_modulus * carry_modulus;
// Value of the shift we multiply our messages by
delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus);
linear_algebra_setup(
stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct, &d_lwe_in_2_ct,
&d_lwe_out_ct, &lwe_in_1_ct, &lwe_in_2_ct, &lwe_out_ct, &plaintexts_1,
&plaintexts_2, &d_plaintext_2, &d_cleartext, lwe_dimension,
noise_variance, payload_modulus, delta, num_samples, 1, 1, gpu_index);
}
void TearDown() {
linear_algebra_teardown(
stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct, &d_lwe_in_2_ct,
&d_lwe_out_ct, &lwe_in_1_ct, &lwe_in_2_ct, &lwe_out_ct, &plaintexts_1,
&plaintexts_2, &d_plaintext_2, &d_cleartext, gpu_index);
}
};
// Times one ciphertext-vector addition followed by a stream synchronisation
// per iteration.
BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, Addition)(benchmark::State &st) {
  void *const ct_out = static_cast<void *>(d_lwe_out_ct);
  void *const lhs = static_cast<void *>(d_lwe_in_1_ct);
  void *const rhs = static_cast<void *>(d_lwe_in_2_ct);
  for (auto _ : st) {
    cuda_add_lwe_ciphertext_vector_64(stream, gpu_index, ct_out, lhs, rhs,
                                      lwe_dimension, num_samples);
    cuda_synchronize_stream(stream);
  }
}
// Times uploading both operands + ciphertext-vector addition + downloading
// the result, with a single stream synchronisation per iteration.
BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, CopiesPlusAddition)
(benchmark::State &st) {
  void *const ct_out = static_cast<void *>(d_lwe_out_ct);
  void *const lhs = static_cast<void *>(d_lwe_in_1_ct);
  void *const rhs = static_cast<void *>(d_lwe_in_2_ct);
  const auto ct_bytes = num_samples * (lwe_dimension + 1) * sizeof(uint64_t);
  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct, ct_bytes, stream,
                             gpu_index);
    cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, lwe_in_2_ct, ct_bytes, stream,
                             gpu_index);
    cuda_add_lwe_ciphertext_vector_64(stream, gpu_index, ct_out, lhs, rhs,
                                      lwe_dimension, num_samples);
    cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, ct_bytes, stream,
                             gpu_index);
    cuda_synchronize_stream(stream);
  }
}
// Times one ciphertext-plus-plaintext vector addition followed by a stream
// synchronisation per iteration.
BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, PlaintextAddition)
(benchmark::State &st) {
  void *const ct_out = static_cast<void *>(d_lwe_out_ct);
  void *const ct_in = static_cast<void *>(d_lwe_in_1_ct);
  void *const plaintext = static_cast<void *>(d_plaintext_2);
  for (auto _ : st) {
    cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
        stream, gpu_index, ct_out, ct_in, plaintext, lwe_dimension,
        num_samples);
    cuda_synchronize_stream(stream);
  }
}
// Times uploading the ciphertexts and plaintexts + plaintext addition +
// downloading the result, with a single stream synchronisation per iteration.
BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, CopiesPlusPlaintextAddition)
(benchmark::State &st) {
  void *const ct_out = static_cast<void *>(d_lwe_out_ct);
  void *const ct_in = static_cast<void *>(d_lwe_in_1_ct);
  void *const plaintext = static_cast<void *>(d_plaintext_2);
  const auto ct_bytes = num_samples * (lwe_dimension + 1) * sizeof(uint64_t);
  const auto plaintext_bytes = num_samples * sizeof(uint64_t);
  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct, ct_bytes, stream,
                             gpu_index);
    cuda_memcpy_async_to_gpu(d_plaintext_2, plaintexts_2, plaintext_bytes,
                             stream, gpu_index);
    cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
        stream, gpu_index, ct_out, ct_in, plaintext, lwe_dimension,
        num_samples);
    cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, ct_bytes, stream,
                             gpu_index);
    cuda_synchronize_stream(stream);
  }
}
// Times one ciphertext-by-cleartext vector multiplication followed by a
// stream synchronisation per iteration.
BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, PlaintextMultiplication)
(benchmark::State &st) {
  void *const ct_out = static_cast<void *>(d_lwe_out_ct);
  void *const ct_in = static_cast<void *>(d_lwe_in_1_ct);
  void *const cleartext = static_cast<void *>(d_cleartext);
  for (auto _ : st) {
    cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
        stream, gpu_index, ct_out, ct_in, cleartext, lwe_dimension,
        num_samples);
    cuda_synchronize_stream(stream);
  }
}
// Times uploading the ciphertexts and cleartexts + cleartext multiplication +
// downloading the result, with a single stream synchronisation per iteration.
BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64,
                   CopiesPlusPlaintextMultiplication)
(benchmark::State &st) {
  void *const ct_out = static_cast<void *>(d_lwe_out_ct);
  void *const ct_in = static_cast<void *>(d_lwe_in_1_ct);
  void *const cleartext = static_cast<void *>(d_cleartext);
  const auto ct_bytes = num_samples * (lwe_dimension + 1) * sizeof(uint64_t);
  const auto cleartext_bytes = num_samples * sizeof(uint64_t);
  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct, ct_bytes, stream,
                             gpu_index);
    cuda_memcpy_async_to_gpu(d_cleartext, plaintexts_2, cleartext_bytes, stream,
                             gpu_index);
    cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
        stream, gpu_index, ct_out, ct_in, cleartext, lwe_dimension,
        num_samples);
    cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, ct_bytes, stream,
                             gpu_index);
    cuda_synchronize_stream(stream);
  }
}
// Times one ciphertext-vector negation followed by a stream synchronisation
// per iteration.
BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, Negate)(benchmark::State &st) {
  void *const ct_out = static_cast<void *>(d_lwe_out_ct);
  void *const ct_in = static_cast<void *>(d_lwe_in_1_ct);
  for (auto _ : st) {
    cuda_negate_lwe_ciphertext_vector_64(stream, gpu_index, ct_out, ct_in,
                                         lwe_dimension, num_samples);
    cuda_synchronize_stream(stream);
  }
}
// Times uploading the ciphertexts + negation + downloading the result, with
// a single stream synchronisation per iteration.
BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, CopiesPlusNegate)
(benchmark::State &st) {
  void *const ct_out = static_cast<void *>(d_lwe_out_ct);
  void *const ct_in = static_cast<void *>(d_lwe_in_1_ct);
  const auto ct_bytes = num_samples * (lwe_dimension + 1) * sizeof(uint64_t);
  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct, ct_bytes, stream,
                             gpu_index);
    cuda_negate_lwe_ciphertext_vector_64(stream, gpu_index, ct_out, ct_in,
                                         lwe_dimension, num_samples);
    cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct, ct_bytes, stream,
                             gpu_index);
    cuda_synchronize_stream(stream);
  }
}
// Feeds every linear-algebra parameter set to the benchmark as one Args()
// tuple. Field order: lwe_dimension, input_lwe_ciphertext_count
static void
LinearAlgebraBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  const std::vector<LinearAlgebraBenchmarkParams> parameter_sets = {
      LinearAlgebraBenchmarkParams{600, 10},
  };
  for (const auto &set : parameter_sets) {
    b->Args({set.lwe_dimension, set.input_lwe_ciphertext_count});
  }
}
// Register the eight linear-algebra benchmarks; each one runs over the
// argument tuples produced by LinearAlgebraBenchmarkGenerateParams.
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, Addition)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, CopiesPlusAddition)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, PlaintextAddition)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, CopiesPlusPlaintextAddition)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, PlaintextMultiplication)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64,
CopiesPlusPlaintextMultiplication)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, Negate)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, CopiesPlusNegate)
->Apply(LinearAlgebraBenchmarkGenerateParams);

View File

@@ -0,0 +1,169 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
// One parameter set for the WoP-PBS benchmarks; the field order is the order
// used when brace-initialising entries in WopPBSBenchmarkGenerateParams.
struct WopPBSBenchmarkParams {
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  int pbs_base_log;
  int pbs_level;
  int ks_base_log;
  int ks_level;
  int pksk_base_log;
  int pksk_level;
  int cbs_base_log;
  int cbs_level;
  int tau;
};
class WopPBSBenchmark_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance = 7.52316384526264e-37;
double glwe_modular_variance = 7.52316384526264e-37;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int tau;
int p;
int input_lwe_dimension;
uint64_t delta;
int cbs_delta_log;
int delta_log;
int delta_log_lut;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *plaintexts;
double *d_fourier_bsk;
uint64_t *lwe_sk_in;
uint64_t *lwe_sk_out;
uint64_t *d_ksk;
uint64_t *d_pksk;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *d_lut_vector;
int8_t *wop_pbs_buffer;
uint64_t *lwe_ct_in_array;
uint64_t *lwe_ct_out_array;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
pbs_base_log = state.range(3);
pbs_level = state.range(4);
ks_base_log = state.range(5);
ks_level = state.range(6);
pksk_base_log = state.range(7);
pksk_level = state.range(8);
cbs_base_log = state.range(9);
cbs_level = state.range(10);
tau = state.range(11);
p = 10 / tau;
wop_pbs_setup(stream, &csprng, &lwe_sk_in, &lwe_sk_out, &d_ksk,
&d_fourier_bsk, &d_pksk, &plaintexts, &d_lwe_ct_in_array,
&d_lwe_ct_out_array, &d_lut_vector, &wop_pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size,
lwe_modular_variance, glwe_modular_variance, ks_base_log,
ks_level, pksk_base_log, pksk_level, pbs_base_log, pbs_level,
cbs_level, p, &delta_log, &cbs_delta_log, &delta_log_lut,
&delta, tau, 1, 1, gpu_index);
// We keep the following for the benchmarks with copies
lwe_ct_in_array = (uint64_t *)malloc(
(glwe_dimension * polynomial_size + 1) * tau * sizeof(uint64_t));
for (int i = 0; i < tau; i++) {
uint64_t plaintext = plaintexts[i];
uint64_t *lwe_ct_in =
lwe_ct_in_array +
(ptrdiff_t)(i * (glwe_dimension * polynomial_size + 1));
concrete_cpu_encrypt_lwe_ciphertext_u64(
lwe_sk_in, lwe_ct_in, plaintext, glwe_dimension * polynomial_size,
lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
}
lwe_ct_out_array = (uint64_t *)malloc(
(glwe_dimension * polynomial_size + 1) * tau * sizeof(uint64_t));
}
void TearDown() {
wop_pbs_teardown(stream, csprng, lwe_sk_in, lwe_sk_out, d_ksk,
d_fourier_bsk, d_pksk, plaintexts, d_lwe_ct_in_array,
d_lut_vector, d_lwe_ct_out_array, wop_pbs_buffer,
gpu_index);
free(lwe_ct_in_array);
free(lwe_ct_out_array);
}
};
// Times one WoP-PBS call followed by a stream synchronisation per iteration.
BENCHMARK_DEFINE_F(WopPBSBenchmark_u64, WopPBS)(benchmark::State &st) {
  void *const ct_out = static_cast<void *>(d_lwe_ct_out_array);
  void *const ct_in = static_cast<void *>(d_lwe_ct_in_array);
  void *const lut = static_cast<void *>(d_lut_vector);
  void *const bsk = static_cast<void *>(d_fourier_bsk);
  void *const ksk = static_cast<void *>(d_ksk);
  void *const pksk = static_cast<void *>(d_pksk);
  for (auto _ : st) {
    const auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
    cuda_wop_pbs_64(stream, gpu_index, ct_out, ct_in, lut, bsk, ksk, pksk,
                    wop_pbs_buffer, cbs_delta_log, glwe_dimension,
                    lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
                    ks_base_log, ks_level, pksk_base_log, pksk_level,
                    cbs_base_log, cbs_level, p, p, delta_log, tau,
                    max_shared_memory);
    cuda_synchronize_stream(stream);
  }
}
// Times host-to-device copy + WoP-PBS + device-to-host copy, with a single
// stream synchronisation at the end of each iteration.
//
// The transfer size is derived from glwe_dimension * polynomial_size, the
// dimension the fixture uses to allocate and encrypt the host buffers, and
// not from the input_lwe_dimension member, which SetUp never assigns.
// NOTE(review): assumes the device arrays prepared by wop_pbs_setup() are at
// least this large - confirm against its implementation.
BENCHMARK_DEFINE_F(WopPBSBenchmark_u64, CopiesPlusWopPBS)
(benchmark::State &st) {
  const size_t ct_bytes = (size_t)(glwe_dimension * polynomial_size + 1) * tau *
                          sizeof(uint64_t);
  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_in_array, ct_bytes,
                             stream, gpu_index);
    // Execute wop pbs
    cuda_wop_pbs_64(
        stream, gpu_index, (void *)d_lwe_ct_out_array,
        (void *)d_lwe_ct_in_array, (void *)d_lut_vector, (void *)d_fourier_bsk,
        (void *)d_ksk, (void *)d_pksk, wop_pbs_buffer, cbs_delta_log,
        glwe_dimension, lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
        ks_base_log, ks_level, pksk_base_log, pksk_level, cbs_base_log,
        cbs_level, p, p, delta_log, tau, cuda_get_max_shared_memory(gpu_index));
    cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array, ct_bytes,
                             stream, gpu_index);
    cuda_synchronize_stream(stream);
  }
}
// Feeds every WoP-PBS parameter set to the benchmark as one Args() tuple.
// Field order: lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
// pbs_level, ks_base_log, ks_level, pksk_base_log, pksk_level, cbs_base_log,
// cbs_level, tau
static void WopPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  const std::vector<WopPBSBenchmarkParams> parameter_sets = {
      WopPBSBenchmarkParams{481, 2, 512, 4, 9, 1, 9, 4, 9, 6, 4, 1},
  };
  for (const auto &set : parameter_sets) {
    b->Args({set.lwe_dimension, set.glwe_dimension, set.polynomial_size,
             set.pbs_base_log, set.pbs_level, set.ks_base_log, set.ks_level,
             set.pksk_base_log, set.pksk_level, set.cbs_base_log,
             set.cbs_level, set.tau});
  }
}
// Register both WoP-PBS benchmarks; each one runs over the argument tuples
// produced by WopPBSBenchmarkGenerateParams.
BENCHMARK_REGISTER_F(WopPBSBenchmark_u64, WopPBS)
->Apply(WopPBSBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(WopPBSBenchmark_u64, CopiesPlusWopPBS)
->Apply(WopPBSBenchmarkGenerateParams);

View File

@@ -0,0 +1,3 @@
#include <benchmark/benchmark.h>
// Google Benchmark's macro that supplies main() for the benchmark executable.
BENCHMARK_MAIN();

View File

@@ -0,0 +1,146 @@
#ifndef SETUP_AND_TEARDOWN_H
#define SETUP_AND_TEARDOWN_H
#include <bit_extraction.h>
#include <bootstrap.h>
#include <circuit_bootstrap.h>
#include <concrete-cpu.h>
#include <device.h>
#include <keyswitch.h>
#include <linear_algebra.h>
#include <utils.h>
#include <vertical_packing.h>
// Setup/teardown pair for the bootstrap (PBS) tests and benchmarks.
// bootstrap_setup takes every key, LUT, ciphertext array and scratch buffer
// as a pointer-to-pointer output parameter, followed by the scheme
// parameters; payload_modulus and delta are also outputs.
// bootstrap_teardown takes the same objects by value, in the same order.
// NOTE(review): allocation and ownership details live in the implementation,
// which is not visible from this header.
void bootstrap_setup(cudaStream_t *stream, Csprng **csprng,
uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
double **d_fourier_bsk_array, uint64_t **plaintexts,
uint64_t **d_lut_pbs_identity,
uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
uint64_t **d_lwe_ct_out_array,
int8_t **amortized_pbs_buffer, int8_t **lowlat_pbs_buffer,
int lwe_dimension, int glwe_dimension, int polynomial_size,
double lwe_modular_variance, double glwe_modular_variance,
int pbs_base_log, int pbs_level, int message_modulus,
int carry_modulus, int *payload_modulus, uint64_t *delta,
int number_of_inputs, int repetitions, int samples,
int gpu_index);
void bootstrap_teardown(cudaStream_t *stream, Csprng *csprng,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
double *d_fourier_bsk_array, uint64_t *plaintexts,
uint64_t *d_lut_pbs_identity,
uint64_t *d_lut_pbs_indexes,
uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_ct_out_array,
int8_t *amortized_pbs_buffer, int8_t *lowlat_pbs_buffer,
int gpu_index);
// Setup/teardown pair for the keyswitch tests and benchmarks.
// keyswitch_setup takes the keys, plaintexts and ciphertext arrays as
// pointer-to-pointer output parameters, followed by the scheme parameters;
// payload_modulus and delta are also outputs. keyswitch_teardown takes the
// same objects by value, in the same order.
void keyswitch_setup(cudaStream_t *stream, Csprng **csprng,
uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
uint64_t **d_ksk_array, uint64_t **plaintexts,
uint64_t **d_lwe_ct_in_array,
uint64_t **d_lwe_ct_out_array, int input_lwe_dimension,
int output_lwe_dimension, double lwe_modular_variance,
int ksk_base_log, int ksk_level, int message_modulus,
int carry_modulus, int *payload_modulus, uint64_t *delta,
int number_of_inputs, int repetitions, int samples,
int gpu_index);
void keyswitch_teardown(cudaStream_t *stream, Csprng *csprng,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
uint64_t *d_ksk_array, uint64_t *plaintexts,
uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_ct_out_array, int gpu_index);
// Setup/teardown pair for the bit-extraction tests and benchmarks.
// bit_extraction_setup takes the keys, plaintexts, ciphertext arrays and
// scratch buffer as pointer-to-pointer output parameters, followed by the
// scheme parameters; delta_log and delta are also outputs.
// bit_extraction_teardown takes the same objects by value, in the same order.
void bit_extraction_setup(
cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
uint64_t **d_ksk_array, uint64_t **plaintexts, uint64_t **d_lwe_ct_in_array,
uint64_t **d_lwe_ct_out_array, int8_t **bit_extract_buffer,
int lwe_dimension, int glwe_dimension, int polynomial_size,
double lwe_modular_variance, double glwe_modular_variance, int ks_base_log,
int ks_level, int pbs_base_log, int pbs_level,
int number_of_bits_of_message_including_padding,
int number_of_bits_to_extract, int *delta_log, uint64_t *delta,
int number_of_inputs, int repetitions, int samples, int gpu_index);
void bit_extraction_teardown(cudaStream_t *stream, Csprng *csprng,
uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array,
double *d_fourier_bsk_array, uint64_t *d_ksk_array,
uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_ct_out_array,
int8_t *bit_extract_buffer, int gpu_index);
void circuit_bootstrap_setup(
cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
uint64_t **d_pksk_array, uint64_t **plaintexts,
uint64_t **d_lwe_ct_in_array, uint64_t **d_ggsw_ct_out_array,
uint64_t **d_lut_vector_indexes, int8_t **cbs_buffer, int lwe_dimension,
int glwe_dimension, int polynomial_size, double lwe_modular_variance,
double glwe_modular_variance, int pksk_base_log, int pksk_level,
int pbs_base_log, int pbs_level, int cbs_level,
int number_of_bits_of_message_including_padding, int ggsw_size,
int *delta_log, uint64_t *delta, int number_of_inputs, int repetitions,
int samples, int gpu_index);
void circuit_bootstrap_teardown(
cudaStream_t *stream, Csprng *csprng, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, double *d_fourier_bsk_array,
uint64_t *d_pksk_array, uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
uint64_t *d_lut_vector_indexes, uint64_t *d_ggsw_ct_out_array,
int8_t *cbs_buffer, int gpu_index);
void cmux_tree_setup(cudaStream_t *stream, Csprng **csprng, uint64_t **glwe_sk,
uint64_t **d_lut_identity, uint64_t **plaintexts,
uint64_t **d_ggsw_bit_array, int8_t **cmux_tree_buffer,
uint64_t **d_glwe_out, int glwe_dimension,
int polynomial_size, int base_log, int level_count,
double glwe_modular_variance, int r_lut, int tau,
uint64_t delta_log, int repetitions, int samples,
int gpu_index);
void cmux_tree_teardown(cudaStream_t *stream, Csprng **csprng,
uint64_t **glwe_sk, uint64_t **d_lut_identity,
uint64_t **plaintexts, uint64_t **d_ggsw_bit_array,
int8_t **cmux_tree_buffer, uint64_t **d_glwe_out,
int gpu_index);
void wop_pbs_setup(cudaStream_t *stream, Csprng **csprng,
uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
uint64_t **d_ksk_array, double **d_fourier_bsk_array,
uint64_t **d_pksk_array, uint64_t **plaintexts,
uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_ct_out_array,
uint64_t **d_lut_vector, int8_t **wop_pbs_buffer,
int lwe_dimension, int glwe_dimension, int polynomial_size,
double lwe_modular_variance, double glwe_modular_variance,
int ks_base_log, int ks_level, int pksk_base_log,
int pksk_level, int pbs_base_log, int pbs_level,
int cbs_level, int p, int *delta_log, int *cbs_delta_log,
int *delta_log_lut, uint64_t *delta, int tau,
int repetitions, int samples, int gpu_index);
void wop_pbs_teardown(cudaStream_t *stream, Csprng *csprng,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
uint64_t *d_ksk_array, double *d_fourier_bsk_array,
uint64_t *d_pksk_array, uint64_t *plaintexts,
uint64_t *d_lwe_ct_in_array, uint64_t *d_lut_vector,
uint64_t *d_lwe_ct_out_array, int8_t *wop_pbs_buffer,
int gpu_index);
void linear_algebra_setup(cudaStream_t *stream, Csprng **csprng,
uint64_t **lwe_sk_array, uint64_t **d_lwe_in_1_ct,
uint64_t **d_lwe_in_2_ct, uint64_t **d_lwe_out_ct,
uint64_t **lwe_in_1_ct, uint64_t **lwe_in_2_ct,
uint64_t **lwe_out_ct, uint64_t **plaintexts_1,
uint64_t **plaintexts_2, uint64_t **d_plaintext_2,
uint64_t **d_plaintexts_2_mul, int lwe_dimension,
double noise_variance, int payload_modulus,
uint64_t delta, int number_of_inputs, int repetitions,
int samples, int gpu_index);
void linear_algebra_teardown(cudaStream_t *stream, Csprng **csprng,
uint64_t **lwe_sk_array, uint64_t **d_lwe_in_1_ct,
uint64_t **d_lwe_in_2_ct, uint64_t **d_lwe_out_ct,
uint64_t **lwe_in_1_ct, uint64_t **lwe_in_2_ct,
uint64_t **lwe_out_ct, uint64_t **plaintexts_1,
uint64_t **plaintexts_2, uint64_t **d_plaintext_2,
uint64_t **d_plaintexts_2_mul, int gpu_index);
#endif // SETUP_AND_TEARDOWN_H

View File

@@ -1,8 +1,8 @@
#ifndef TEST_UTILS_H
#define TEST_UTILS_H
#ifndef UTILS_H
#define UTILS_H
#include "../include/device.h"
#include "concrete-cpu.h"
#include <concrete-cpu.h>
#include <device.h>
#include <functional>
uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta,

View File

@@ -0,0 +1,808 @@
#include <cmath>
#include <setup_and_teardown.h>
// Prepares everything the bootstrap tests and benchmarks need.
//
// Written through the pointer arguments:
//  - *payload_modulus = message_modulus * carry_modulus
//  - *delta = 2^63 / *payload_modulus
//  - *csprng: host allocation holding a CSPRNG built from an all-zero seed
//  - *lwe_sk_in_array / *lwe_sk_out_array: `repetitions` secret keys each
//  - *d_fourier_bsk_array: bootstrap keys (filled by
//    generate_lwe_bootstrap_keys)
//  - *plaintexts: repetitions * samples * number_of_inputs plaintexts (host)
//  - *d_lut_pbs_identity, *d_lut_pbs_indexes (zeroed), *d_lwe_ct_in_array
//    (encryptions of all plaintexts), *d_lwe_ct_out_array (room for
//    number_of_inputs ciphertexts): device buffers
//  - *amortized_pbs_buffer / *lowlat_pbs_buffer: scratch buffers of the two
//    PBS variants
//
// The stream is synchronized before returning, so the temporary host staging
// buffers can be freed here. Release everything with bootstrap_teardown.
void bootstrap_setup(cudaStream_t *stream, Csprng **csprng,
                     uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
                     double **d_fourier_bsk_array, uint64_t **plaintexts,
                     uint64_t **d_lut_pbs_identity,
                     uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
                     uint64_t **d_lwe_ct_out_array,
                     int8_t **amortized_pbs_buffer, int8_t **lowlat_pbs_buffer,
                     int lwe_dimension, int glwe_dimension, int polynomial_size,
                     double lwe_modular_variance, double glwe_modular_variance,
                     int pbs_base_log, int pbs_level, int message_modulus,
                     int carry_modulus, int *payload_modulus, uint64_t *delta,
                     int number_of_inputs, int repetitions, int samples,
                     int gpu_index) {
  void *v_stream = (void *)stream;

  // Element counts and byte sizes are computed in size_t: the products of
  // dimension, inputs, repetitions and samples can exceed INT_MAX, and a cast
  // applied after an int multiplication does not prevent the overflow.
  const size_t lwe_size = (size_t)lwe_dimension + 1;
  const size_t total_ct_count =
      (size_t)repetitions * samples * number_of_inputs;
  const size_t lwe_ct_in_bytes = total_ct_count * lwe_size * sizeof(uint64_t);
  const size_t lwe_ct_out_bytes =
      ((size_t)glwe_dimension * polynomial_size + 1) * number_of_inputs *
      sizeof(uint64_t);
  const size_t lut_bytes =
      ((size_t)glwe_dimension + 1) * polynomial_size * sizeof(uint64_t);
  const size_t lut_indexes_bytes = (size_t)number_of_inputs * sizeof(uint64_t);

  *payload_modulus = message_modulus * carry_modulus;
  // Value of the shift we multiply our messages by
  *delta = ((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus);

  // Create a Csprng (fixed all-zero seed)
  *csprng =
      (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
  uint8_t seed[16] = {(uint8_t)0};
  concrete_cpu_construct_concrete_csprng(
      *csprng, Uint128{.little_endian_bytes = {*seed}});

  // Generate the keys
  generate_lwe_secret_keys(lwe_sk_in_array, lwe_dimension, *csprng,
                           repetitions);
  generate_lwe_secret_keys(lwe_sk_out_array, glwe_dimension * polynomial_size,
                           *csprng, repetitions);
  generate_lwe_bootstrap_keys(
      stream, gpu_index, d_fourier_bsk_array, *lwe_sk_in_array,
      *lwe_sk_out_array, lwe_dimension, glwe_dimension, polynomial_size,
      pbs_level, pbs_base_log, *csprng, glwe_modular_variance, repetitions);
  *plaintexts = generate_plaintexts(*payload_modulus, *delta, number_of_inputs,
                                    repetitions, samples);

  // Create the LUT (identity function)
  uint64_t *lut_pbs_identity = generate_identity_lut_pbs(
      polynomial_size, glwe_dimension, message_modulus, carry_modulus,
      [](int x) -> int { return x; });

  // Encrypt every plaintext on the host; repetition r uses the r-th input key
  uint64_t *lwe_ct_in_array = (uint64_t *)malloc(lwe_ct_in_bytes);
  for (int r = 0; r < repetitions; r++) {
    uint64_t *lwe_sk_in = *lwe_sk_in_array + (size_t)r * lwe_dimension;
    for (int s = 0; s < samples; s++) {
      for (int i = 0; i < number_of_inputs; i++) {
        const size_t ct_index =
            ((size_t)r * samples + s) * number_of_inputs + i;
        uint64_t plaintext = (*plaintexts)[ct_index];
        uint64_t *lwe_ct_in = lwe_ct_in_array + ct_index * lwe_size;
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk_in, lwe_ct_in, plaintext, lwe_dimension,
            lwe_modular_variance, *csprng, &CONCRETE_CSPRNG_VTABLE);
      }
    }
  }

  // Initialize and copy things in/to the device
  *d_lut_pbs_identity =
      (uint64_t *)cuda_malloc_async(lut_bytes, stream, gpu_index);
  cuda_memcpy_async_to_gpu(*d_lut_pbs_identity, lut_pbs_identity, lut_bytes,
                           stream, gpu_index);
  *d_lut_pbs_indexes =
      (uint64_t *)cuda_malloc_async(lut_indexes_bytes, stream, gpu_index);
  cuda_memset_async(*d_lut_pbs_indexes, 0, lut_indexes_bytes, stream,
                    gpu_index);

  // Input and output LWEs
  *d_lwe_ct_out_array =
      (uint64_t *)cuda_malloc_async(lwe_ct_out_bytes, stream, gpu_index);
  *d_lwe_ct_in_array =
      (uint64_t *)cuda_malloc_async(lwe_ct_in_bytes, stream, gpu_index);
  cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array,
                           lwe_ct_in_bytes, stream, gpu_index);

  // Scratch buffers of both PBS variants
  // NOTE(review): the trailing `true` presumably asks the scratch functions
  // to allocate the buffer on the GPU - confirm against their declaration.
  scratch_cuda_bootstrap_amortized_64(
      stream, gpu_index, amortized_pbs_buffer, glwe_dimension, polynomial_size,
      number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
  scratch_cuda_bootstrap_low_latency_64(
      stream, gpu_index, lowlat_pbs_buffer, glwe_dimension, polynomial_size,
      pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);

  // The async copies read from the host staging buffers: wait before freeing
  cuda_synchronize_stream(v_stream);
  free(lwe_ct_in_array);
  free(lut_pbs_identity);
}
// Releases everything bootstrap_setup produced. Waits for the stream first,
// then frees the host side (CSPRNG, secret keys, plaintexts), drops the
// device buffers and PBS scratch buffers, and finally destroys the stream.
void bootstrap_teardown(cudaStream_t *stream, Csprng *csprng,
                        uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
                        double *d_fourier_bsk_array, uint64_t *plaintexts,
                        uint64_t *d_lut_pbs_identity,
                        uint64_t *d_lut_pbs_indexes,
                        uint64_t *d_lwe_ct_in_array,
                        uint64_t *d_lwe_ct_out_array,
                        int8_t *amortized_pbs_buffer, int8_t *lowlat_pbs_buffer,
                        int gpu_index) {
  // Nothing may still be using the buffers released below
  cuda_synchronize_stream((void *)stream);

  // Host side: the CSPRNG must be destroyed before its storage is freed
  concrete_cpu_destroy_concrete_csprng(csprng);
  free(csprng);
  uint64_t *host_arrays[] = {lwe_sk_in_array, lwe_sk_out_array, plaintexts};
  for (uint64_t *host_array : host_arrays) {
    free(host_array);
  }

  // Device side
  cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
  cuda_drop_async(d_lut_pbs_identity, stream, gpu_index);
  cuda_drop_async(d_lut_pbs_indexes, stream, gpu_index);
  cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
  cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index);
  cleanup_cuda_bootstrap_amortized(stream, gpu_index, &amortized_pbs_buffer);
  cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &lowlat_pbs_buffer);

  // The stream goes last: the drops above are queued on it
  cuda_destroy_stream(stream, gpu_index);
}
// Prepares everything the keyswitch tests and benchmarks need.
//
// Written through the pointer arguments:
//  - *payload_modulus = message_modulus * carry_modulus
//  - *delta = 2^63 / *payload_modulus
//  - *csprng: host allocation holding a CSPRNG built from an all-zero seed
//  - *lwe_sk_in_array / *lwe_sk_out_array: `repetitions` secret keys each
//  - *d_ksk_array: keyswitch keys (filled by generate_lwe_keyswitch_keys)
//  - *plaintexts: repetitions * samples * number_of_inputs plaintexts (host)
//  - *d_lwe_ct_in_array: device buffer holding the encryption of every
//    plaintext under the input key of its repetition
//  - *d_lwe_ct_out_array: device buffer with room for number_of_inputs output
//    ciphertexts
//
// The stream is synchronized before returning. Release everything with
// keyswitch_teardown.
void keyswitch_setup(cudaStream_t *stream, Csprng **csprng,
                     uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
                     uint64_t **d_ksk_array, uint64_t **plaintexts,
                     uint64_t **d_lwe_ct_in_array,
                     uint64_t **d_lwe_ct_out_array, int input_lwe_dimension,
                     int output_lwe_dimension, double lwe_modular_variance,
                     int ksk_base_log, int ksk_level, int message_modulus,
                     int carry_modulus, int *payload_modulus, uint64_t *delta,
                     int number_of_inputs, int repetitions, int samples,
                     int gpu_index) {
  // Element counts and byte sizes are computed in size_t: the products of
  // dimension, inputs, repetitions and samples can exceed INT_MAX, and a cast
  // applied after an int multiplication does not prevent the overflow.
  const size_t input_lwe_size = (size_t)input_lwe_dimension + 1;
  const size_t output_lwe_size = (size_t)output_lwe_dimension + 1;
  const size_t total_ct_count =
      (size_t)repetitions * samples * number_of_inputs;
  const size_t lwe_ct_in_bytes =
      total_ct_count * input_lwe_size * sizeof(uint64_t);
  const size_t lwe_ct_out_bytes =
      output_lwe_size * number_of_inputs * sizeof(uint64_t);

  *payload_modulus = message_modulus * carry_modulus;
  // Value of the shift we multiply our messages by
  *delta = ((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus);
  void *v_stream = (void *)stream;

  // Create a Csprng (fixed all-zero seed)
  *csprng =
      (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
  uint8_t seed[16] = {(uint8_t)0};
  concrete_cpu_construct_concrete_csprng(
      *csprng, Uint128{.little_endian_bytes = {*seed}});

  // Generate the keys
  generate_lwe_secret_keys(lwe_sk_in_array, input_lwe_dimension, *csprng,
                           repetitions);
  generate_lwe_secret_keys(lwe_sk_out_array, output_lwe_dimension, *csprng,
                           repetitions);
  generate_lwe_keyswitch_keys(stream, gpu_index, d_ksk_array, *lwe_sk_in_array,
                              *lwe_sk_out_array, input_lwe_dimension,
                              output_lwe_dimension, ksk_level, ksk_base_log,
                              *csprng, lwe_modular_variance, repetitions);
  *plaintexts = generate_plaintexts(*payload_modulus, *delta, number_of_inputs,
                                    repetitions, samples);

  // Device input/output ciphertext buffers
  *d_lwe_ct_out_array =
      (uint64_t *)cuda_malloc_async(lwe_ct_out_bytes, stream, gpu_index);
  *d_lwe_ct_in_array =
      (uint64_t *)cuda_malloc_async(lwe_ct_in_bytes, stream, gpu_index);

  // Encrypt every plaintext on the host; repetition r uses the r-th input key
  uint64_t *lwe_ct_in_array = (uint64_t *)malloc(lwe_ct_in_bytes);
  for (int r = 0; r < repetitions; r++) {
    uint64_t *lwe_sk_in = *lwe_sk_in_array + (size_t)r * input_lwe_dimension;
    for (int s = 0; s < samples; s++) {
      for (int i = 0; i < number_of_inputs; i++) {
        const size_t ct_index =
            ((size_t)r * samples + s) * number_of_inputs + i;
        uint64_t plaintext = (*plaintexts)[ct_index];
        uint64_t *lwe_ct_in = lwe_ct_in_array + ct_index * input_lwe_size;
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk_in, lwe_ct_in, plaintext, input_lwe_dimension,
            lwe_modular_variance, *csprng, &CONCRETE_CSPRNG_VTABLE);
      }
    }
  }
  cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array,
                           lwe_ct_in_bytes, stream, gpu_index);

  // The async copy reads from the host staging buffer: wait before freeing
  cuda_synchronize_stream(v_stream);
  free(lwe_ct_in_array);
}
// Releases everything keyswitch_setup produced. Waits for the stream first,
// then frees the host side (CSPRNG, secret keys, plaintexts), drops the
// device buffers, and finally destroys the stream.
void keyswitch_teardown(cudaStream_t *stream, Csprng *csprng,
                        uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
                        uint64_t *d_ksk_array, uint64_t *plaintexts,
                        uint64_t *d_lwe_ct_in_array,
                        uint64_t *d_lwe_ct_out_array, int gpu_index) {
  // Nothing may still be using the buffers released below
  cuda_synchronize_stream((void *)stream);

  // Host side: the CSPRNG must be destroyed before its storage is freed
  concrete_cpu_destroy_concrete_csprng(csprng);
  free(csprng);
  uint64_t *host_arrays[] = {lwe_sk_in_array, lwe_sk_out_array, plaintexts};
  for (uint64_t *host_array : host_arrays) {
    free(host_array);
  }

  // Device side
  uint64_t *device_buffers[] = {d_ksk_array, d_lwe_ct_in_array,
                                d_lwe_ct_out_array};
  for (uint64_t *device_buffer : device_buffers) {
    cuda_drop_async(device_buffer, stream, gpu_index);
  }

  // The stream goes last: the drops above are queued on it
  cuda_destroy_stream(stream, gpu_index);
}
// Prepares everything the linear algebra tests and benchmarks need.
//
// payload_modulus and delta are inputs here (passed by value).
//
// Written through the pointer arguments:
//  - *csprng: host allocation holding a CSPRNG built from an all-zero seed
//  - *lwe_sk_array: `repetitions` secret keys
//  - *plaintexts_1 / *plaintexts_2: two sets of
//    repetitions * samples * number_of_inputs plaintexts (host)
//  - *lwe_in_1_ct / *lwe_in_2_ct: their encryptions (host), mirrored on the
//    device in *d_lwe_in_1_ct / *d_lwe_in_2_ct
//  - *lwe_out_ct / *d_lwe_out_ct: uninitialized host / device buffers with
//    room for number_of_inputs ciphertexts
//  - *d_plaintexts_2: device copy of *plaintexts_2
//  - *d_cleartext_2: device array of plaintext_2 / delta
//
// The stream is synchronized before returning. Release everything with
// linear_algebra_teardown.
void linear_algebra_setup(cudaStream_t *stream, Csprng **csprng,
                          uint64_t **lwe_sk_array, uint64_t **d_lwe_in_1_ct,
                          uint64_t **d_lwe_in_2_ct, uint64_t **d_lwe_out_ct,
                          uint64_t **lwe_in_1_ct, uint64_t **lwe_in_2_ct,
                          uint64_t **lwe_out_ct, uint64_t **plaintexts_1,
                          uint64_t **plaintexts_2, uint64_t **d_plaintexts_2,
                          uint64_t **d_cleartext_2, int lwe_dimension,
                          double noise_variance, int payload_modulus,
                          uint64_t delta, int number_of_inputs, int repetitions,
                          int samples, int gpu_index) {
  // Element counts and byte sizes are computed in size_t: the products of
  // dimension, inputs, repetitions and samples can exceed INT_MAX, and a cast
  // applied after an int multiplication does not prevent the overflow.
  const size_t lwe_size = (size_t)lwe_dimension + 1;
  const size_t total_ct_count =
      (size_t)repetitions * samples * number_of_inputs;
  const size_t lwe_in_bytes = total_ct_count * lwe_size * sizeof(uint64_t);
  const size_t scalar_bytes = total_ct_count * sizeof(uint64_t);
  const size_t lwe_out_bytes =
      (size_t)number_of_inputs * lwe_size * sizeof(uint64_t);

  // Create a Csprng (fixed all-zero seed)
  *csprng =
      (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
  uint8_t seed[16] = {(uint8_t)0};
  concrete_cpu_construct_concrete_csprng(
      *csprng, Uint128{.little_endian_bytes = {*seed}});

  // Generate the keys
  generate_lwe_secret_keys(lwe_sk_array, lwe_dimension, *csprng, repetitions);
  *plaintexts_1 = generate_plaintexts(payload_modulus, delta, number_of_inputs,
                                      repetitions, samples);
  *plaintexts_2 = generate_plaintexts(payload_modulus, delta, number_of_inputs,
                                      repetitions, samples);

  // Host buffers
  *lwe_in_1_ct = (uint64_t *)malloc(lwe_in_bytes);
  *lwe_in_2_ct = (uint64_t *)malloc(lwe_in_bytes);
  uint64_t *cleartext_2 = (uint64_t *)malloc(scalar_bytes);
  *lwe_out_ct = (uint64_t *)malloc(lwe_out_bytes);

  // Create the input ciphertexts; repetition r uses the r-th secret key
  for (int r = 0; r < repetitions; r++) {
    uint64_t *lwe_sk = *lwe_sk_array + (size_t)r * lwe_dimension;
    for (int s = 0; s < samples; s++) {
      for (int i = 0; i < number_of_inputs; i++) {
        const size_t ct_index =
            ((size_t)r * samples + s) * number_of_inputs + i;
        uint64_t plaintext_1 = (*plaintexts_1)[ct_index];
        uint64_t plaintext_2 = (*plaintexts_2)[ct_index];
        uint64_t *lwe_1_in = (*lwe_in_1_ct) + ct_index * lwe_size;
        uint64_t *lwe_2_in = (*lwe_in_2_ct) + ct_index * lwe_size;
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_1_in, plaintext_1, lwe_dimension, noise_variance,
            *csprng, &CONCRETE_CSPRNG_VTABLE);
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_2_in, plaintext_2, lwe_dimension, noise_variance,
            *csprng, &CONCRETE_CSPRNG_VTABLE);
        // Undo the delta scaling to get the cleartext operand
        cleartext_2[ct_index] = plaintext_2 / delta;
      }
    }
  }

  // Initialize and copy things in/to the device
  *d_lwe_in_1_ct =
      (uint64_t *)cuda_malloc_async(lwe_in_bytes, stream, gpu_index);
  *d_lwe_in_2_ct =
      (uint64_t *)cuda_malloc_async(lwe_in_bytes, stream, gpu_index);
  *d_plaintexts_2 =
      (uint64_t *)cuda_malloc_async(scalar_bytes, stream, gpu_index);
  *d_cleartext_2 =
      (uint64_t *)cuda_malloc_async(scalar_bytes, stream, gpu_index);
  *d_lwe_out_ct =
      (uint64_t *)cuda_malloc_async(lwe_out_bytes, stream, gpu_index);
  cuda_memcpy_async_to_gpu(*d_lwe_in_1_ct, *lwe_in_1_ct, lwe_in_bytes, stream,
                           gpu_index);
  cuda_memcpy_async_to_gpu(*d_lwe_in_2_ct, *lwe_in_2_ct, lwe_in_bytes, stream,
                           gpu_index);
  cuda_memcpy_async_to_gpu(*d_plaintexts_2, *plaintexts_2, scalar_bytes,
                           stream, gpu_index);
  cuda_memcpy_async_to_gpu(*d_cleartext_2, cleartext_2, scalar_bytes, stream,
                           gpu_index);

  // The async copy reads from cleartext_2: wait before freeing it
  void *v_stream = (void *)stream;
  cuda_synchronize_stream(v_stream);
  free(cleartext_2);
}
// Releases everything linear_algebra_setup produced: the CSPRNG, the device
// buffers and the host arrays. All resources are passed by address, mirroring
// the setup signature; the pointed-to pointers are not reset.
// NOTE(review): unlike the other teardown helpers in this file, this one does
// not call cuda_destroy_stream - confirm the caller destroys the stream.
void linear_algebra_teardown(cudaStream_t *stream, Csprng **csprng,
                             uint64_t **lwe_sk_array, uint64_t **d_lwe_in_1_ct,
                             uint64_t **d_lwe_in_2_ct, uint64_t **d_lwe_out_ct,
                             uint64_t **lwe_in_1_ct, uint64_t **lwe_in_2_ct,
                             uint64_t **lwe_out_ct, uint64_t **plaintexts_1,
                             uint64_t **plaintexts_2, uint64_t **d_plaintexts_2,
                             uint64_t **d_cleartext_2, int gpu_index) {
  // Nothing may still be using the buffers released below
  cuda_synchronize_stream((void *)stream);

  // The CSPRNG must be destroyed before its storage is freed
  concrete_cpu_destroy_concrete_csprng(*csprng);
  free(*csprng);

  // Device side
  uint64_t *device_buffers[] = {*d_lwe_in_1_ct, *d_lwe_in_2_ct,
                                *d_plaintexts_2, *d_cleartext_2,
                                *d_lwe_out_ct};
  for (uint64_t *device_buffer : device_buffers) {
    cuda_drop_async(device_buffer, stream, gpu_index);
  }

  // Host side
  uint64_t *host_arrays[] = {*lwe_out_ct,   *lwe_sk_array, *plaintexts_1,
                             *plaintexts_2, *lwe_in_1_ct,  *lwe_in_2_ct};
  for (uint64_t *host_array : host_arrays) {
    free(host_array);
  }
}
// Prepares everything the bit extraction tests and benchmarks need.
//
// The input ciphertexts live under keys of dimension
// glwe_dimension * polynomial_size; the output keys have dimension
// lwe_dimension.
//
// Written through the pointer arguments:
//  - *delta_log = 64 - number_of_bits_of_message_including_padding
//  - *delta = 1 << *delta_log
//  - *csprng: host allocation holding a CSPRNG built from an all-zero seed
//  - *lwe_sk_in_array / *lwe_sk_out_array: `repetitions` secret keys each
//  - *d_ksk_array: keyswitch keys from the input keys to the output keys
//  - *d_fourier_bsk_array: bootstrap keys, generated with the output keys as
//    first key argument and the input keys as second
//  - *plaintexts: repetitions * samples * number_of_inputs plaintexts (host)
//  - *d_lwe_ct_in_array: device buffer with the encryption of every plaintext
//  - *d_lwe_ct_out_array: device buffer with room for
//    number_of_bits_to_extract * number_of_inputs output ciphertexts
//  - *bit_extract_buffer: scratch buffer
//
// The stream is synchronized before returning. Release everything with
// bit_extraction_teardown.
void bit_extraction_setup(
    cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
    uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
    uint64_t **d_ksk_array, uint64_t **plaintexts, uint64_t **d_lwe_ct_in_array,
    uint64_t **d_lwe_ct_out_array, int8_t **bit_extract_buffer,
    int lwe_dimension, int glwe_dimension, int polynomial_size,
    double lwe_modular_variance, double glwe_modular_variance, int ks_base_log,
    int ks_level, int pbs_base_log, int pbs_level,
    int number_of_bits_of_message_including_padding,
    int number_of_bits_to_extract, int *delta_log, uint64_t *delta,
    int number_of_inputs, int repetitions, int samples, int gpu_index) {
  void *v_stream = (void *)stream;
  *delta_log = 64 - number_of_bits_of_message_including_padding;
  *delta = (uint64_t)(1) << *delta_log;

  // Create a Csprng (fixed all-zero seed)
  *csprng =
      (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
  uint8_t seed[16] = {(uint8_t)0};
  concrete_cpu_construct_concrete_csprng(
      *csprng, Uint128{.little_endian_bytes = {*seed}});

  int input_lwe_dimension = glwe_dimension * polynomial_size;
  int output_lwe_dimension = lwe_dimension;

  // Element counts and byte sizes are computed in size_t: the products of
  // dimension, inputs, repetitions and samples can exceed INT_MAX, and a cast
  // applied after an int multiplication does not prevent the overflow.
  const size_t input_lwe_size = (size_t)input_lwe_dimension + 1;
  const size_t output_lwe_size = (size_t)output_lwe_dimension + 1;
  const size_t total_ct_count =
      (size_t)repetitions * samples * number_of_inputs;
  const size_t lwe_ct_in_bytes =
      total_ct_count * input_lwe_size * sizeof(uint64_t);
  const size_t lwe_ct_out_bytes = output_lwe_size * number_of_bits_to_extract *
                                  number_of_inputs * sizeof(uint64_t);

  // Generate the keys
  generate_lwe_secret_keys(lwe_sk_in_array, input_lwe_dimension, *csprng,
                           repetitions);
  generate_lwe_secret_keys(lwe_sk_out_array, output_lwe_dimension, *csprng,
                           repetitions);
  generate_lwe_keyswitch_keys(stream, gpu_index, d_ksk_array, *lwe_sk_in_array,
                              *lwe_sk_out_array, input_lwe_dimension,
                              output_lwe_dimension, ks_level, ks_base_log,
                              *csprng, lwe_modular_variance, repetitions);
  generate_lwe_bootstrap_keys(
      stream, gpu_index, d_fourier_bsk_array, *lwe_sk_out_array,
      *lwe_sk_in_array, lwe_dimension, glwe_dimension, polynomial_size,
      pbs_level, pbs_base_log, *csprng, glwe_modular_variance, repetitions);
  // NOTE(review): the first argument is a number of bits here, whereas
  // bootstrap_setup and keyswitch_setup pass a payload modulus - confirm this
  // is the intended plaintext range.
  *plaintexts =
      generate_plaintexts(number_of_bits_of_message_including_padding, *delta,
                          number_of_inputs, repetitions, samples);

  // Device input/output ciphertext buffers
  *d_lwe_ct_out_array =
      (uint64_t *)cuda_malloc_async(lwe_ct_out_bytes, stream, gpu_index);
  *d_lwe_ct_in_array =
      (uint64_t *)cuda_malloc_async(lwe_ct_in_bytes, stream, gpu_index);

  // Create the input ciphertexts; repetition r uses the r-th input key
  uint64_t *lwe_ct_in_array = (uint64_t *)malloc(lwe_ct_in_bytes);
  for (int r = 0; r < repetitions; r++) {
    uint64_t *lwe_sk_in = *lwe_sk_in_array + (size_t)r * input_lwe_dimension;
    for (int s = 0; s < samples; s++) {
      for (int i = 0; i < number_of_inputs; i++) {
        const size_t ct_index =
            ((size_t)r * samples + s) * number_of_inputs + i;
        uint64_t plaintext = (*plaintexts)[ct_index];
        uint64_t *lwe_ct_in = lwe_ct_in_array + ct_index * input_lwe_size;
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk_in, lwe_ct_in, plaintext, input_lwe_dimension,
            lwe_modular_variance, *csprng, &CONCRETE_CSPRNG_VTABLE);
      }
    }
  }
  cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array,
                           lwe_ct_in_bytes, stream, gpu_index);

  // Execute scratch
  scratch_cuda_extract_bits_64(stream, gpu_index, bit_extract_buffer,
                               glwe_dimension, lwe_dimension, polynomial_size,
                               pbs_level, number_of_inputs,
                               cuda_get_max_shared_memory(gpu_index), true);

  // The async copy reads from the host staging buffer: wait before freeing
  cuda_synchronize_stream(v_stream);
  free(lwe_ct_in_array);
}
// Releases everything bit_extraction_setup produced. Waits for the stream
// first, then frees the host side (CSPRNG, secret keys, plaintexts), cleans
// up the scratch buffer, drops the device buffers, and finally destroys the
// stream.
void bit_extraction_teardown(cudaStream_t *stream, Csprng *csprng,
                             uint64_t *lwe_sk_in_array,
                             uint64_t *lwe_sk_out_array,
                             double *d_fourier_bsk_array, uint64_t *d_ksk_array,
                             uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
                             uint64_t *d_lwe_ct_out_array,
                             int8_t *bit_extract_buffer, int gpu_index) {
  // Nothing may still be using the buffers released below
  cuda_synchronize_stream((void *)stream);

  // Host side: the CSPRNG must be destroyed before its storage is freed
  concrete_cpu_destroy_concrete_csprng(csprng);
  free(csprng);
  uint64_t *host_arrays[] = {lwe_sk_in_array, lwe_sk_out_array, plaintexts};
  for (uint64_t *host_array : host_arrays) {
    free(host_array);
  }

  // Device side
  cleanup_cuda_extract_bits(stream, gpu_index, &bit_extract_buffer);
  cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
  uint64_t *device_buffers[] = {d_ksk_array, d_lwe_ct_in_array,
                                d_lwe_ct_out_array};
  for (uint64_t *device_buffer : device_buffers) {
    cuda_drop_async(device_buffer, stream, gpu_index);
  }

  // The stream goes last: the drops above are queued on it
  cuda_destroy_stream(stream, gpu_index);
}
// Prepares everything the circuit bootstrap tests and benchmarks need.
//
// Written through the pointer arguments:
//  - *delta_log = 60 and *delta = 1 << 60 (fixed; the value of
//    number_of_bits_of_message_including_padding is only forwarded to
//    generate_plaintexts)
//  - *csprng: host allocation holding a CSPRNG built from an all-zero seed
//  - *lwe_sk_in_array / *lwe_sk_out_array: `repetitions` secret keys each
//  - *d_fourier_bsk_array: bootstrap keys
//  - *d_pksk_array: private functional keyswitch key lists, generated from
//    the output keys on both sides
//  - *plaintexts: repetitions * samples * number_of_inputs plaintexts (host)
//  - *d_lwe_ct_in_array: device buffer with the encryption of every plaintext
//  - *d_ggsw_ct_out_array: device buffer of ggsw_size elements per input
//    ciphertext
//  - *d_lut_vector_indexes: device array of number_of_inputs * cbs_level
//    entries, entry k holding k % cbs_level
//  - *cbs_buffer: scratch buffer
//
// The stream is synchronized before returning. Release everything with
// circuit_bootstrap_teardown.
void circuit_bootstrap_setup(
    cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
    uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
    uint64_t **d_pksk_array, uint64_t **plaintexts,
    uint64_t **d_lwe_ct_in_array, uint64_t **d_ggsw_ct_out_array,
    uint64_t **d_lut_vector_indexes, int8_t **cbs_buffer, int lwe_dimension,
    int glwe_dimension, int polynomial_size, double lwe_modular_variance,
    double glwe_modular_variance, int pksk_base_log, int pksk_level,
    int pbs_base_log, int pbs_level, int cbs_level,
    int number_of_bits_of_message_including_padding, int ggsw_size,
    int *delta_log, uint64_t *delta, int number_of_inputs, int repetitions,
    int samples, int gpu_index) {
  void *v_stream = (void *)stream;

  // Element counts and byte sizes are computed in size_t: the products of
  // dimension, inputs, repetitions and samples can exceed INT_MAX, and a cast
  // applied after an int multiplication does not prevent the overflow.
  const size_t lwe_size = (size_t)lwe_dimension + 1;
  const size_t total_ct_count =
      (size_t)repetitions * samples * number_of_inputs;
  const size_t lwe_ct_in_bytes = total_ct_count * lwe_size * sizeof(uint64_t);
  const size_t ggsw_ct_out_bytes =
      total_ct_count * ggsw_size * sizeof(uint64_t);
  const size_t lut_indexes_bytes =
      (size_t)number_of_inputs * cbs_level * sizeof(uint64_t);

  *delta_log = 60;
  *delta = (uint64_t)(1) << *delta_log;

  // Create a Csprng (fixed all-zero seed)
  *csprng =
      (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
  uint8_t seed[16] = {(uint8_t)0};
  concrete_cpu_construct_concrete_csprng(
      *csprng, Uint128{.little_endian_bytes = {*seed}});

  // Generate the keys
  generate_lwe_secret_keys(lwe_sk_in_array, lwe_dimension, *csprng,
                           repetitions);
  generate_lwe_secret_keys(lwe_sk_out_array, glwe_dimension * polynomial_size,
                           *csprng, repetitions);
  generate_lwe_bootstrap_keys(
      stream, gpu_index, d_fourier_bsk_array, *lwe_sk_in_array,
      *lwe_sk_out_array, lwe_dimension, glwe_dimension, polynomial_size,
      pbs_level, pbs_base_log, *csprng, glwe_modular_variance, repetitions);
  generate_lwe_private_functional_keyswitch_key_lists(
      stream, gpu_index, d_pksk_array, *lwe_sk_out_array, *lwe_sk_out_array,
      glwe_dimension * polynomial_size, glwe_dimension, polynomial_size,
      pksk_level, pksk_base_log, *csprng, lwe_modular_variance, repetitions);
  *plaintexts =
      generate_plaintexts(number_of_bits_of_message_including_padding, *delta,
                          number_of_inputs, repetitions, samples);

  // Device output/input buffers
  *d_ggsw_ct_out_array =
      (uint64_t *)cuda_malloc_async(ggsw_ct_out_bytes, stream, gpu_index);
  *d_lwe_ct_in_array =
      (uint64_t *)cuda_malloc_async(lwe_ct_in_bytes, stream, gpu_index);

  // Create the input ciphertexts; repetition r uses the r-th input key
  uint64_t *lwe_ct_in_array = (uint64_t *)malloc(lwe_ct_in_bytes);
  for (int r = 0; r < repetitions; r++) {
    uint64_t *lwe_sk_in = *lwe_sk_in_array + (size_t)r * lwe_dimension;
    for (int s = 0; s < samples; s++) {
      for (int i = 0; i < number_of_inputs; i++) {
        const size_t ct_index =
            ((size_t)r * samples + s) * number_of_inputs + i;
        uint64_t plaintext = (*plaintexts)[ct_index];
        uint64_t *lwe_ct_in = lwe_ct_in_array + ct_index * lwe_size;
        concrete_cpu_encrypt_lwe_ciphertext_u64(
            lwe_sk_in, lwe_ct_in, plaintext, lwe_dimension,
            lwe_modular_variance, *csprng, &CONCRETE_CSPRNG_VTABLE);
      }
    }
  }
  cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array,
                           lwe_ct_in_bytes, stream, gpu_index);

  // Execute cbs scratch
  scratch_cuda_circuit_bootstrap_64(
      stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
      polynomial_size, cbs_level, number_of_inputs,
      cuda_get_max_shared_memory(gpu_index), true);

  // Build LUT vector indexes: 0, 1, ..., cbs_level - 1 repeated per input
  uint64_t *h_lut_vector_indexes = (uint64_t *)malloc(lut_indexes_bytes);
  for (int index = 0; index < cbs_level * number_of_inputs; index++) {
    h_lut_vector_indexes[index] = index % cbs_level;
  }
  *d_lut_vector_indexes =
      (uint64_t *)cuda_malloc_async(lut_indexes_bytes, stream, gpu_index);
  cuda_memcpy_async_to_gpu(*d_lut_vector_indexes, h_lut_vector_indexes,
                           lut_indexes_bytes, stream, gpu_index);

  // The async copies read from the host staging buffers: wait before freeing
  cuda_synchronize_stream(v_stream);
  free(h_lut_vector_indexes);
  free(lwe_ct_in_array);
}
// Releases everything circuit_bootstrap_setup produced. Waits for the stream
// first, then frees the host side (CSPRNG, secret keys, plaintexts), cleans
// up the scratch buffer, drops the device buffers, and finally destroys the
// stream.
void circuit_bootstrap_teardown(
    cudaStream_t *stream, Csprng *csprng, uint64_t *lwe_sk_in_array,
    uint64_t *lwe_sk_out_array, double *d_fourier_bsk_array,
    uint64_t *d_pksk_array, uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
    uint64_t *d_lut_vector_indexes, uint64_t *d_ggsw_ct_out_array,
    int8_t *cbs_buffer, int gpu_index) {
  // Nothing may still be using the buffers released below
  cuda_synchronize_stream((void *)stream);

  // Host side: the CSPRNG must be destroyed before its storage is freed
  concrete_cpu_destroy_concrete_csprng(csprng);
  free(csprng);
  uint64_t *host_arrays[] = {lwe_sk_in_array, lwe_sk_out_array, plaintexts};
  for (uint64_t *host_array : host_arrays) {
    free(host_array);
  }

  // Device side
  cleanup_cuda_circuit_bootstrap(stream, gpu_index, &cbs_buffer);
  cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
  uint64_t *device_buffers[] = {d_pksk_array, d_lwe_ct_in_array,
                                d_ggsw_ct_out_array, d_lut_vector_indexes};
  for (uint64_t *device_buffer : device_buffers) {
    cuda_drop_async(device_buffer, stream, gpu_index);
  }

  // The stream goes last: the drops above are queued on it
  cuda_destroy_stream(stream, gpu_index);
}
// Prepares everything the cmux tree tests and benchmarks need.
//
// Written through the pointer arguments:
//  - *csprng: host allocation holding a CSPRNG built from an all-zero seed
//  - *glwe_sk: GLWE secret keys (generate_glwe_secret_keys, `repetitions`)
//  - *plaintexts: repetitions * samples values (host), generated with
//    generate_plaintexts(r_lut, 1, 1, repetitions, samples)
//  - *d_lut_identity: device LUT of polynomial_size * (1 << r_lut) * tau
//    elements, filled from generate_identity_lut_cmux_tree
//  - *d_ggsw_bit_array: device buffer with r_lut GGSW ciphertexts per
//    plaintext, one per element returned by bit_decompose_value
//  - *d_glwe_out: device buffer with room for tau GLWE ciphertexts
//  - *cmux_tree_buffer: scratch buffer
//
// NOTE(review): every GGSW is encrypted with *glwe_sk itself, i.e. without a
// per-repetition offset - confirm this is intended when repetitions > 1.
//
// The stream is synchronized before returning. Release everything with
// cmux_tree_teardown.
void cmux_tree_setup(cudaStream_t *stream, Csprng **csprng, uint64_t **glwe_sk,
                     uint64_t **d_lut_identity, uint64_t **plaintexts,
                     uint64_t **d_ggsw_bit_array, int8_t **cmux_tree_buffer,
                     uint64_t **d_glwe_out, int glwe_dimension,
                     int polynomial_size, int base_log, int level_count,
                     double glwe_modular_variance, int r_lut, int tau,
                     uint64_t delta_log, int repetitions, int samples,
                     int gpu_index) {
  void *v_stream = (void *)stream;

  // Element counts and byte sizes are computed in size_t: a GGSW is already
  // polynomial_size * (glwe_dimension + 1)^2 * level_count elements, so the
  // products with r_lut, repetitions and samples can exceed INT_MAX.
  const size_t glwe_size = ((size_t)glwe_dimension + 1) * polynomial_size;
  const size_t ggsw_size =
      glwe_size * ((size_t)glwe_dimension + 1) * level_count;
  const size_t ggsw_bit_array_bytes =
      (size_t)repetitions * samples * r_lut * ggsw_size * sizeof(uint64_t);
  const size_t glwe_out_bytes = (size_t)tau * glwe_size * sizeof(uint64_t);

  // Create a Csprng (fixed all-zero seed)
  *csprng =
      (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
  uint8_t seed[16] = {(uint8_t)0};
  concrete_cpu_construct_concrete_csprng(
      *csprng, Uint128{.little_endian_bytes = {*seed}});

  // Generate the keys
  generate_glwe_secret_keys(glwe_sk, glwe_dimension, polynomial_size, *csprng,
                            repetitions);
  *plaintexts = generate_plaintexts(r_lut, 1, 1, repetitions, samples);

  // Create the LUT
  int num_lut = (1 << r_lut);
  const size_t lut_bytes =
      (size_t)polynomial_size * num_lut * tau * sizeof(uint64_t);
  *d_lut_identity =
      (uint64_t *)cuda_malloc_async(lut_bytes, stream, gpu_index);
  uint64_t *lut_cmux_tree_identity =
      generate_identity_lut_cmux_tree(polynomial_size, num_lut, tau, delta_log);

  // Encrypt one bit per GGSW
  uint64_t *ggsw_bit_array = (uint64_t *)malloc(ggsw_bit_array_bytes);
  for (int r = 0; r < repetitions; r++) {
    for (int s = 0; s < samples; s++) {
      const size_t plaintext_index = (size_t)r * samples + s;
      uint64_t witness = (*plaintexts)[plaintext_index];
      // Instantiate the GGSW m^tree ciphertexts
      // We need r GGSW ciphertexts
      // Bit decomposition of the value from MSB to LSB
      uint64_t *bit_array = bit_decompose_value(witness, r_lut);
      for (int i = 0; i < r_lut; i++) {
        uint64_t *ggsw_slice =
            ggsw_bit_array + (plaintext_index * r_lut + i) * ggsw_size;
        concrete_cpu_encrypt_ggsw_ciphertext_u64(
            *glwe_sk, ggsw_slice, bit_array[i], glwe_dimension, polynomial_size,
            level_count, base_log, glwe_modular_variance, *csprng,
            &CONCRETE_CSPRNG_VTABLE);
      }
      free(bit_array);
    }
  }

  // Allocate and copy things to the device
  *d_glwe_out =
      (uint64_t *)cuda_malloc_async(glwe_out_bytes, stream, gpu_index);
  *d_ggsw_bit_array =
      (uint64_t *)cuda_malloc_async(ggsw_bit_array_bytes, stream, gpu_index);
  cuda_memcpy_async_to_gpu(*d_lut_identity, lut_cmux_tree_identity, lut_bytes,
                           stream, gpu_index);
  cuda_memcpy_async_to_gpu(*d_ggsw_bit_array, ggsw_bit_array,
                           ggsw_bit_array_bytes, stream, gpu_index);
  scratch_cuda_cmux_tree_64(stream, gpu_index, cmux_tree_buffer, glwe_dimension,
                            polynomial_size, level_count, r_lut, tau,
                            cuda_get_max_shared_memory(gpu_index), true);

  // The async copies read from the host staging buffers: wait before freeing
  cuda_synchronize_stream(v_stream);
  free(lut_cmux_tree_identity);
  free(ggsw_bit_array);
}
// Releases everything cmux_tree_setup produced. All resources are passed by
// address, mirroring the setup signature. Waits for the stream first, then
// frees the host side (CSPRNG, plaintexts, GLWE keys), drops the device
// buffers and the scratch buffer, and finally destroys the stream.
void cmux_tree_teardown(cudaStream_t *stream, Csprng **csprng,
                        uint64_t **glwe_sk, uint64_t **d_lut_identity,
                        uint64_t **plaintexts, uint64_t **d_ggsw_bit_array,
                        int8_t **cmux_tree_buffer, uint64_t **d_glwe_out,
                        int gpu_index) {
  // Nothing may still be using the buffers released below
  cuda_synchronize_stream(stream);

  // Host side: the CSPRNG must be destroyed before its storage is freed
  concrete_cpu_destroy_concrete_csprng(*csprng);
  free(*plaintexts);
  free(*csprng);
  free(*glwe_sk);

  // Device side
  uint64_t *device_buffers[] = {*d_lut_identity, *d_ggsw_bit_array,
                                *d_glwe_out};
  for (uint64_t *device_buffer : device_buffers) {
    cuda_drop_async(device_buffer, stream, gpu_index);
  }
  cleanup_cuda_cmux_tree(stream, gpu_index, cmux_tree_buffer);

  // The stream goes last: the drops above are queued on it
  cuda_destroy_stream(stream, gpu_index);
}
void wop_pbs_setup(cudaStream_t *stream, Csprng **csprng,
uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
uint64_t **d_ksk_array, double **d_fourier_bsk_array,
uint64_t **d_pksk_array, uint64_t **plaintexts,
uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_ct_out_array,
uint64_t **d_lut_vector, int8_t **wop_pbs_buffer,
int lwe_dimension, int glwe_dimension, int polynomial_size,
double lwe_modular_variance, double glwe_modular_variance,
int ks_base_log, int ks_level, int pksk_base_log,
int pksk_level, int pbs_base_log, int pbs_level,
int cbs_level, int p, int *delta_log, int *cbs_delta_log,
int *delta_log_lut, uint64_t *delta, int tau,
int repetitions, int samples, int gpu_index) {
void *v_stream = (void *)stream;
int input_lwe_dimension = glwe_dimension * polynomial_size;
*delta_log = 64 - p;
*delta_log_lut = *delta_log;
*delta = (uint64_t)(1) << *delta_log;
// Create a Csprng
*csprng =
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
uint8_t seed[16] = {(uint8_t)0};
concrete_cpu_construct_concrete_csprng(
*csprng, Uint128{.little_endian_bytes = {*seed}});
// Generate the keys
generate_lwe_secret_keys(lwe_sk_in_array, input_lwe_dimension, *csprng,
repetitions);
generate_lwe_secret_keys(lwe_sk_out_array, lwe_dimension, *csprng,
repetitions);
generate_lwe_keyswitch_keys(stream, gpu_index, d_ksk_array, *lwe_sk_in_array,
*lwe_sk_out_array, input_lwe_dimension,
lwe_dimension, ks_level, ks_base_log, *csprng,
lwe_modular_variance, repetitions);
generate_lwe_bootstrap_keys(
stream, gpu_index, d_fourier_bsk_array, *lwe_sk_out_array,
*lwe_sk_in_array, lwe_dimension, glwe_dimension, polynomial_size,
pbs_level, pbs_base_log, *csprng, glwe_modular_variance, repetitions);
generate_lwe_private_functional_keyswitch_key_lists(
stream, gpu_index, d_pksk_array, *lwe_sk_in_array, *lwe_sk_in_array,
input_lwe_dimension, glwe_dimension, polynomial_size, pksk_level,
pksk_base_log, *csprng, lwe_modular_variance, repetitions);
*plaintexts = generate_plaintexts(p, *delta, tau, repetitions, samples);
// LUT creation
int lut_size = polynomial_size;
int lut_num = tau << (tau * p - (int)log2(polynomial_size)); // r
uint64_t *big_lut = (uint64_t *)malloc(lut_num * lut_size * sizeof(uint64_t));
for (int t = tau - 1; t >= 0; t--) {
uint64_t *small_lut = big_lut + (ptrdiff_t)(t * (1 << (tau * p)));
for (uint64_t value = 0; value < (uint64_t)(1 << (tau * p)); value++) {
int nbits = t * p;
uint64_t x = (value >> nbits) & (uint64_t)((1 << p) - 1);
small_lut[value] =
((x % (uint64_t)(1 << (64 - *delta_log))) << *delta_log_lut);
}
}
*d_lut_vector = (uint64_t *)cuda_malloc_async(
lut_num * lut_size * sizeof(uint64_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(*d_lut_vector, big_lut,
lut_num * lut_size * sizeof(uint64_t), stream,
gpu_index);
// Allocate input
*d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
repetitions * samples * (input_lwe_dimension + 1) * tau *
sizeof(uint64_t),
stream, gpu_index);
// Allocate output
*d_lwe_ct_out_array = (uint64_t *)cuda_malloc_async(
repetitions * samples * (input_lwe_dimension + 1) * tau *
sizeof(uint64_t),
stream, gpu_index);
uint64_t *lwe_ct_in_array =
(uint64_t *)malloc(repetitions * samples * (input_lwe_dimension + 1) *
tau * sizeof(uint64_t));
// Create the input ciphertexts
for (int r = 0; r < repetitions; r++) {
uint64_t *lwe_sk_in =
*lwe_sk_in_array + (ptrdiff_t)(r * input_lwe_dimension);
for (int s = 0; s < samples; s++) {
for (int i = 0; i < tau; i++) {
uint64_t plaintext = (*plaintexts)[r * samples * tau + s * tau + i];
uint64_t *lwe_ct_in =
lwe_ct_in_array + (ptrdiff_t)((r * samples * tau + s * tau + i) *
(input_lwe_dimension + 1));
concrete_cpu_encrypt_lwe_ciphertext_u64(
lwe_sk_in, lwe_ct_in, plaintext, input_lwe_dimension,
lwe_modular_variance, *csprng, &CONCRETE_CSPRNG_VTABLE);
}
}
}
cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array,
repetitions * samples * tau *
(input_lwe_dimension + 1) * sizeof(uint64_t),
stream, gpu_index);
// Execute scratch
scratch_cuda_wop_pbs_64(stream, gpu_index, wop_pbs_buffer,
(uint32_t *)delta_log, (uint32_t *)cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size,
cbs_level, pbs_level, p, p, tau,
cuda_get_max_shared_memory(gpu_index), true);
cuda_synchronize_stream(v_stream);
free(lwe_ct_in_array);
free(big_lut);
}
// Releases every resource handed to a WOP-PBS test or benchmark: the CSPRNG,
// the host-side secret keys and plaintexts, the scratch buffer, all device
// buffers, and finally the stream itself.
//
// stream            stream all device work was queued on; destroyed here
// csprng            generator to destroy and free
// lwe_sk_*_array    host secret keys, released with free()
// plaintexts        host plaintexts, released with free()
// d_*               device buffers, dropped asynchronously on `stream`
// wop_pbs_buffer    scratch buffer, released through the cleanup entry point
// gpu_index         device the buffers and the stream belong to
void wop_pbs_teardown(cudaStream_t *stream, Csprng *csprng,
                      uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
                      uint64_t *d_ksk_array, double *d_fourier_bsk_array,
                      uint64_t *d_pksk_array, uint64_t *plaintexts,
                      uint64_t *d_lwe_ct_in_array, uint64_t *d_lut_vector,
                      uint64_t *d_lwe_ct_out_array, int8_t *wop_pbs_buffer,
                      int gpu_index) {
  // Wait for everything still queued on the stream before releasing anything.
  cuda_synchronize_stream((void *)stream);

  // Host side: the generator has to be destroyed before its storage is freed.
  concrete_cpu_destroy_concrete_csprng(csprng);
  void *host_allocations[] = {csprng, lwe_sk_in_array, lwe_sk_out_array,
                              plaintexts};
  for (void *host_ptr : host_allocations)
    free(host_ptr);

  // NOTE(review): the setup routine above obtains this buffer from
  // scratch_cuda_wop_pbs_64, yet it is released through the circuit bootstrap
  // + vertical packing cleanup - confirm the two are interchangeable.
  cleanup_cuda_circuit_bootstrap_vertical_packing(stream, gpu_index,
                                                  &wop_pbs_buffer);

  // Device side: queue the drops in a fixed order, then destroy the stream.
  void *device_allocations[] = {d_fourier_bsk_array, d_ksk_array,
                                d_pksk_array,        d_lwe_ct_in_array,
                                d_lwe_ct_out_array,  d_lut_vector};
  for (void *device_ptr : device_allocations)
    cuda_drop_async(device_ptr, stream, gpu_index);
  cuda_destroy_stream(stream, gpu_index);
}

View File

@@ -7,8 +7,9 @@ set(gtest_force_shared_crt
CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)
set(CONCRETE_CPU_BINARY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../concrete-cpu/implementation/target/release")
set(CONCRETE_CPU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../concrete-cpu/implementation")
set(CONCRETE_CPU_BINARY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../concrete-cpu/implementation/target/release")
set(CONCRETE_CPU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../concrete-cpu/implementation")
set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../")
if(NOT TARGET concrete_cpu)
# Enable ExternalProject CMake module
@@ -28,7 +29,10 @@ if(NOT TARGET concrete_cpu)
LOG_BUILD ON)
endif()
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
include_directories(${CONCRETE_CPU_SOURCE_DIR}/include)
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)
add_library(concrete_cpu_lib STATIC IMPORTED)
set_target_properties(concrete_cpu_lib PROPERTIES IMPORTED_LOCATION ${CONCRETE_CPU_BINARY_DIR}/libconcrete_cpu.a)
@@ -43,7 +47,7 @@ file(
set(SOURCES ${TEST_SOURCES})
add_executable(${BINARY} ${TEST_SOURCES})
add_executable(${BINARY} ${TEST_SOURCES} ../utils.cpp ../setup_and_teardown.cpp)
add_test(NAME ${BINARY} COMMAND ${BINARY})

View File

@@ -1,9 +1,6 @@
#include "../include/bit_extraction.h"
#include "../include/device.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
@@ -47,13 +44,12 @@ protected:
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *lwe_in_ct_array;
uint64_t *lwe_out_ct_array;
uint64_t *lwe_ct_in_array;
uint64_t *plaintexts;
double *d_fourier_bsk_array;
uint64_t *d_ksk_array;
uint64_t *d_lwe_in_ct_array;
uint64_t *d_lwe_out_ct_array;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
int8_t *bit_extract_buffer;
int input_lwe_dimension;
int output_lwe_dimension;
@@ -77,114 +73,56 @@ public:
(int)GetParam().number_of_bits_of_message_including_padding;
number_of_bits_to_extract = (int)GetParam().number_of_bits_to_extract;
number_of_inputs = (int)GetParam().number_of_inputs;
delta_log = 64 - number_of_bits_of_message_including_padding;
delta = (uint64_t)(1) << delta_log;
// Create a Csprng
csprng =
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
uint8_t seed[16] = {(uint8_t)0};
concrete_cpu_construct_concrete_csprng(
csprng, Uint128{.little_endian_bytes = {*seed}});
input_lwe_dimension = glwe_dimension * polynomial_size;
output_lwe_dimension = lwe_dimension;
// Generate the keys
generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng,
REPETITIONS);
generate_lwe_secret_keys(&lwe_sk_out_array, output_lwe_dimension, csprng,
REPETITIONS);
generate_lwe_keyswitch_keys(
stream, gpu_index, &d_ksk_array, lwe_sk_in_array, lwe_sk_out_array,
input_lwe_dimension, output_lwe_dimension, ks_level, ks_base_log,
csprng, lwe_modular_variance, REPETITIONS);
generate_lwe_bootstrap_keys(
stream, gpu_index, &d_fourier_bsk_array, lwe_sk_out_array,
lwe_sk_in_array, output_lwe_dimension, glwe_dimension, polynomial_size,
pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS);
plaintexts =
generate_plaintexts(number_of_bits_of_message_including_padding, delta,
number_of_inputs, REPETITIONS, SAMPLES);
d_lwe_out_ct_array = (uint64_t *)cuda_malloc_async(
(output_lwe_dimension + 1) * number_of_bits_to_extract *
number_of_inputs * sizeof(uint64_t),
stream, gpu_index);
d_lwe_in_ct_array = (uint64_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * number_of_inputs * sizeof(uint64_t), stream,
gpu_index);
lwe_in_ct_array = (uint64_t *)malloc((input_lwe_dimension + 1) *
number_of_inputs * sizeof(uint64_t));
lwe_out_ct_array = (uint64_t *)malloc((output_lwe_dimension + 1) *
number_of_bits_to_extract *
number_of_inputs * sizeof(uint64_t));
// Execute scratch
scratch_cuda_extract_bits_64(stream, gpu_index, &bit_extract_buffer,
glwe_dimension, lwe_dimension, polynomial_size,
pbs_level, number_of_inputs,
cuda_get_max_shared_memory(gpu_index), true);
bit_extraction_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_fourier_bsk_array, &d_ksk_array, &plaintexts, &d_lwe_ct_in_array,
&d_lwe_ct_out_array, &bit_extract_buffer, lwe_dimension, glwe_dimension,
polynomial_size, lwe_modular_variance, glwe_modular_variance,
ks_base_log, ks_level, pbs_base_log, pbs_level,
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
&delta_log, &delta, number_of_inputs, REPETITIONS, SAMPLES, gpu_index);
}
void TearDown() {
void *v_stream = (void *)stream;
cuda_synchronize_stream(v_stream);
concrete_cpu_destroy_concrete_csprng(csprng);
free(csprng);
free(lwe_sk_in_array);
free(lwe_sk_out_array);
free(plaintexts);
free(lwe_in_ct_array);
free(lwe_out_ct_array);
cleanup_cuda_extract_bits(stream, gpu_index, &bit_extract_buffer);
cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
cuda_drop_async(d_ksk_array, stream, gpu_index);
cuda_drop_async(d_lwe_in_ct_array, stream, gpu_index);
cuda_drop_async(d_lwe_out_ct_array, stream, gpu_index);
cuda_destroy_stream(stream, gpu_index);
bit_extraction_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
d_fourier_bsk_array, d_ksk_array, plaintexts,
d_lwe_ct_in_array, d_lwe_ct_out_array,
bit_extract_buffer, gpu_index);
}
};
TEST_P(BitExtractionTestPrimitives_u64, bit_extraction) {
void *v_stream = (void *)stream;
uint64_t *lwe_ct_out_array = (uint64_t *)malloc(
(output_lwe_dimension + 1) * number_of_bits_to_extract *
number_of_inputs * sizeof(uint64_t));
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (output_lwe_dimension + 1);
int ksk_size = ks_level * input_lwe_dimension * (output_lwe_dimension + 1);
for (uint r = 0; r < REPETITIONS; r++) {
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
uint64_t *lwe_in_sk =
lwe_sk_in_array + (ptrdiff_t)(input_lwe_dimension * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * output_lwe_dimension);
for (uint s = 0; s < SAMPLES; s++) {
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
uint64_t *lwe_in_ct =
lwe_in_ct_array + (ptrdiff_t)(i * (input_lwe_dimension + 1));
concrete_cpu_encrypt_lwe_ciphertext_u64(
lwe_in_sk, lwe_in_ct, plaintext, input_lwe_dimension,
lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
}
cuda_memcpy_async_to_gpu(d_lwe_in_ct_array, lwe_in_ct_array,
(input_lwe_dimension + 1) * number_of_inputs *
sizeof(uint64_t),
stream, gpu_index);
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(input_lwe_dimension + 1));
// Execute bit extract
cuda_extract_bits_64(
stream, gpu_index, (void *)d_lwe_out_ct_array,
(void *)d_lwe_in_ct_array, bit_extract_buffer, (void *)d_ksk,
(void *)d_fourier_bsk, number_of_bits_to_extract, delta_log,
input_lwe_dimension, output_lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level, ks_base_log, ks_level,
number_of_inputs, cuda_get_max_shared_memory(gpu_index));
stream, gpu_index, (void *)d_lwe_ct_out_array, (void *)d_lwe_ct_in,
bit_extract_buffer, (void *)d_ksk, (void *)d_fourier_bsk,
number_of_bits_to_extract, delta_log, input_lwe_dimension,
output_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, ks_base_log, ks_level, number_of_inputs,
cuda_get_max_shared_memory(gpu_index));
// Copy result back
cuda_memcpy_async_to_cpu(lwe_out_ct_array, d_lwe_out_ct_array,
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(output_lwe_dimension + 1) *
number_of_bits_to_extract *
number_of_inputs * sizeof(uint64_t),
@@ -192,7 +130,7 @@ TEST_P(BitExtractionTestPrimitives_u64, bit_extraction) {
cuda_synchronize_stream(v_stream);
for (int j = 0; j < number_of_inputs; j++) {
uint64_t *result_array =
lwe_out_ct_array + (ptrdiff_t)(j * number_of_bits_to_extract *
lwe_ct_out_array + (ptrdiff_t)(j * number_of_bits_to_extract *
(output_lwe_dimension + 1));
uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
s * number_of_inputs + j];

View File

@@ -0,0 +1,245 @@
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <utils.h>
// One parameter set for the bootstrap tests. Field order matters: the test
// table initialises this aggregate positionally.
struct BootstrapTestParams {
  int lwe_dimension;            // "n" in the parameter table
  int glwe_dimension;           // "k" in the parameter table
  int polynomial_size;          // "N" in the parameter table
  double lwe_modular_variance;  // forwarded to the setup routine
  double glwe_modular_variance; // forwarded to the setup routine
  int pbs_base_log;             // base log passed to the PBS entry points
  int pbs_level;                // level count passed to the PBS entry points
  int message_modulus;          // forwarded to the setup routine
  int carry_modulus;            // forwarded to the setup routine
  int number_of_inputs;         // ciphertexts bootstrapped per PBS call
  int repetitions;              // outer loop count; each one has its own keys
  int samples;                  // inner loop count per repetition
};
// Fixture shared by the amortized and low latency bootstrap tests.
//
// SetUp copies the current parameter set into members, lets bootstrap_setup
// create the keys, plaintexts, LUTs, input/output ciphertext buffers and
// scratch buffers, and allocates a host buffer big enough for the output of
// one PBS call. TearDown hands everything back to bootstrap_teardown.
class BootstrapTestPrimitives_u64
    : public ::testing::TestWithParam<BootstrapTestParams> {
protected:
  // Cryptographic parameters, copied from the test parameter set.
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  double lwe_modular_variance;
  double glwe_modular_variance;
  int pbs_base_log;
  int pbs_level;
  int message_modulus;
  int carry_modulus;
  // Written by bootstrap_setup.
  int payload_modulus;
  int number_of_inputs;
  int repetitions;
  int samples;
  // Scaling factor of the encoded messages; written by bootstrap_setup.
  uint64_t delta;
  Csprng *csprng;
  cudaStream_t *stream;
  int gpu_index = 0;
  // Host-side keys and plaintexts (one slice per repetition).
  uint64_t *lwe_sk_in_array;
  uint64_t *lwe_sk_out_array;
  uint64_t *plaintexts;
  // Device-side buffers, all owned by bootstrap_setup / bootstrap_teardown.
  double *d_fourier_bsk_array;
  uint64_t *d_lut_pbs_identity;
  uint64_t *d_lut_pbs_indexes;
  uint64_t *d_lwe_ct_in_array;
  uint64_t *d_lwe_ct_out_array;
  // Host copy of the output of one PBS call; owned by this fixture.
  uint64_t *lwe_ct_out_array;
  int8_t *amortized_pbs_buffer;
  int8_t *lowlat_pbs_buffer;

public:
  // Builds every key, ciphertext and scratch buffer the tests need.
  void SetUp() override {
    // Create the stream on the same device every later call is issued on,
    // instead of a hard-coded device 0.
    stream = cuda_create_stream(gpu_index);

    // TestParams
    const BootstrapTestParams &params = GetParam();
    lwe_dimension = params.lwe_dimension;
    glwe_dimension = params.glwe_dimension;
    polynomial_size = params.polynomial_size;
    lwe_modular_variance = params.lwe_modular_variance;
    glwe_modular_variance = params.glwe_modular_variance;
    pbs_base_log = params.pbs_base_log;
    pbs_level = params.pbs_level;
    message_modulus = params.message_modulus;
    carry_modulus = params.carry_modulus;
    number_of_inputs = params.number_of_inputs;
    repetitions = params.repetitions;
    samples = params.samples;

    bootstrap_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
                    &d_fourier_bsk_array, &plaintexts, &d_lut_pbs_identity,
                    &d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_ct_out_array,
                    &amortized_pbs_buffer, &lowlat_pbs_buffer, lwe_dimension,
                    glwe_dimension, polynomial_size, lwe_modular_variance,
                    glwe_modular_variance, pbs_base_log, pbs_level,
                    message_modulus, carry_modulus, &payload_modulus, &delta,
                    number_of_inputs, repetitions, samples, gpu_index);

    // Host buffer for the output ciphertexts of a single PBS call.
    lwe_ct_out_array =
        (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
                           number_of_inputs * sizeof(uint64_t));
  }

  // Releases everything SetUp created.
  void TearDown() override {
    bootstrap_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
                       d_fourier_bsk_array, plaintexts, d_lut_pbs_identity,
                       d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_ct_out_array,
                       amortized_pbs_buffer, lowlat_pbs_buffer, gpu_index);
    // Free the host result buffer only once the stream has been torn down, so
    // a copy still queued on it can never target freed memory (presumably the
    // teardown drains the stream first - confirm).
    free(lwe_ct_out_array);
  }
};
// Runs the amortized PBS on every (repetition, sample) batch of input
// ciphertexts, copies the outputs back to the host, decrypts them with the
// output secret key of that repetition and checks the decoded message.
TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) {
  // Element count of one Fourier bootstrap key. Computed in ptrdiff_t so the
  // product, and the per-repetition offset derived from it, cannot wrap int.
  const ptrdiff_t bsk_size = (ptrdiff_t)(glwe_dimension + 1) *
                             (glwe_dimension + 1) * pbs_level *
                             polynomial_size * (lwe_dimension + 1);
  const int output_lwe_dimension = glwe_dimension * polynomial_size;
  const ptrdiff_t output_lwe_size = (ptrdiff_t)output_lwe_dimension + 1;
  // Here execute the PBS
  for (int r = 0; r < repetitions; r++) {
    double *d_fourier_bsk = d_fourier_bsk_array + bsk_size * r;
    uint64_t *lwe_sk_out =
        lwe_sk_out_array + (ptrdiff_t)r * output_lwe_dimension;
    for (int s = 0; s < samples; s++) {
      // Index of the first input ciphertext of this (repetition, sample).
      const ptrdiff_t batch_start =
          ((ptrdiff_t)r * samples + s) * number_of_inputs;
      uint64_t *d_lwe_ct_in =
          d_lwe_ct_in_array + batch_start * (lwe_dimension + 1);
      // Execute PBS
      cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_ct_out_array,
          (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
          (void *)d_lwe_ct_in, (void *)d_fourier_bsk, amortized_pbs_buffer,
          lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
          pbs_level, number_of_inputs, 1, 0,
          cuda_get_max_shared_memory(gpu_index));
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               output_lwe_size * number_of_inputs *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      // The copy is queued on the stream: wait for it before the host reads
      // the buffer.
      cuda_synchronize_stream((void *)stream);

      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result = lwe_ct_out_array + j * output_lwe_size;
        uint64_t plaintext = plaintexts[batch_start + j];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk_out, result, output_lwe_dimension, &decrypted);
        // The raw decryption is expected to differ from the exact plaintext
        // (presumably because it still carries the encryption noise).
        EXPECT_NE(decrypted, plaintext);
        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Round to the nearest multiple of delta before dividing it out.
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta)
            << "Repetition: " << r << ", sample: " << s;
      }
    }
  }
}
// Runs the low latency PBS on every (repetition, sample) batch of input
// ciphertexts, copies the outputs back to the host, decrypts them with the
// output secret key of that repetition and checks the decoded message.
// Skipped when the batch is too large for the number of multiprocessors of
// the device under test.
TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
  int number_of_sm = 0;
  // Query the device the test actually runs on, and fail loudly if the query
  // itself fails instead of silently skipping with number_of_sm == 0.
  ASSERT_EQ(cudaDeviceGetAttribute(&number_of_sm,
                                   cudaDevAttrMultiProcessorCount, gpu_index),
            cudaSuccess);
  if (number_of_inputs > number_of_sm * 4 / (glwe_dimension + 1) / pbs_level) {
    GTEST_SKIP() << "The Low Latency PBS does not support this configuration";
  }
  // Element count of one Fourier bootstrap key. Computed in ptrdiff_t so the
  // product, and the per-repetition offset derived from it, cannot wrap int.
  const ptrdiff_t bsk_size = (ptrdiff_t)(glwe_dimension + 1) *
                             (glwe_dimension + 1) * pbs_level *
                             polynomial_size * (lwe_dimension + 1);
  const int output_lwe_dimension = glwe_dimension * polynomial_size;
  const ptrdiff_t output_lwe_size = (ptrdiff_t)output_lwe_dimension + 1;
  // Here execute the PBS
  for (int r = 0; r < repetitions; r++) {
    double *d_fourier_bsk = d_fourier_bsk_array + bsk_size * r;
    uint64_t *lwe_sk_out =
        lwe_sk_out_array + (ptrdiff_t)r * output_lwe_dimension;
    for (int s = 0; s < samples; s++) {
      // Index of the first input ciphertext of this (repetition, sample).
      const ptrdiff_t batch_start =
          ((ptrdiff_t)r * samples + s) * number_of_inputs;
      uint64_t *d_lwe_ct_in =
          d_lwe_ct_in_array + batch_start * (lwe_dimension + 1);
      // Execute PBS
      cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_ct_out_array,
          (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
          (void *)d_lwe_ct_in, (void *)d_fourier_bsk, lowlat_pbs_buffer,
          lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
          pbs_level, number_of_inputs, 1, 0,
          cuda_get_max_shared_memory(gpu_index));
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               output_lwe_size * number_of_inputs *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      // The copy is queued on the stream: wait for it before the host reads
      // the buffer.
      cuda_synchronize_stream((void *)stream);

      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result = lwe_ct_out_array + j * output_lwe_size;
        uint64_t plaintext = plaintexts[batch_start + j];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk_out, result, output_lwe_dimension, &decrypted);
        // The raw decryption is expected to differ from the exact plaintext
        // (presumably because it still carries the encryption noise).
        EXPECT_NE(decrypted, plaintext);
        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Round to the nearest multiple of delta before dividing it out.
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta)
            << "Repetition: " << r << ", sample: " << s;
      }
    }
  }
}
// Parameter sets the PBS tests are run with. Every entry below is one
// complete configuration; each test is executed once per entry.
// Column order:
//   n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
//   message_modulus, carry_modulus, number_of_inputs, repetitions, samples
::testing::internal::ParamGenerator<BootstrapTestParams> pbs_params_u64 =
    ::testing::Values(
        BootstrapTestParams{567, 5, 256, 7.52316384526264e-25,
                            7.52316384526264e-25, 15, 1, 2, 1, 5, 2, 5},
        BootstrapTestParams{623, 6, 256, 7.52316384526264e-25,
                            7.52316384526264e-25, 9, 3, 2, 2, 5, 2, 50},
        BootstrapTestParams{694, 3, 512, 7.52316384526264e-25,
                            7.52316384526264e-25, 18, 1, 2, 1, 5, 2, 50},
        BootstrapTestParams{769, 2, 1024, 7.52316384526264e-25,
                            7.52316384526264e-25, 23, 1, 2, 1, 5, 2, 50},
        BootstrapTestParams{754, 1, 2048, 7.52316384526264e-25,
                            7.52316384526264e-25, 23, 1, 4, 1, 5, 2, 50},
        BootstrapTestParams{847, 1, 4096, 7.52316384526264e-25,
                            7.52316384526264e-25, 2, 12, 2, 1, 2, 1, 50},
        BootstrapTestParams{881, 1, 8192, 7.52316384526264e-25,
                            7.52316384526264e-25, 22, 1, 2, 1, 2, 1, 25},
        BootstrapTestParams{976, 1, 16384, 7.52316384526264e-25,
                            7.52316384526264e-25, 11, 3, 4, 1, 2, 1, 10});
// Builds the suffix GoogleTest appends to the test name for one parameter
// set, from the values that tell the configurations apart: n, k, N, the PBS
// base log, the PBS level and the number of inputs.
std::string printParamName(::testing::TestParamInfo<BootstrapTestParams> p) {
  const BootstrapTestParams &cfg = p.param;
  std::string label = "n_" + std::to_string(cfg.lwe_dimension);
  label += "_k_" + std::to_string(cfg.glwe_dimension);
  label += "_N_" + std::to_string(cfg.polynomial_size);
  label += "_pbs_base_log_" + std::to_string(cfg.pbs_base_log);
  label += "_pbs_level_" + std::to_string(cfg.pbs_level);
  label += "_number_of_inputs_" + std::to_string(cfg.number_of_inputs);
  return label;
}

// Registers the bootstrap tests once per entry of pbs_params_u64.
// NOTE(review): recent GoogleTest releases spell this macro
// INSTANTIATE_TEST_SUITE_P - confirm against the pinned version before
// renaming.
INSTANTIATE_TEST_CASE_P(BootstrapInstantiation, BootstrapTestPrimitives_u64,
                        pbs_params_u64, printParamName);

View File

@@ -1,9 +1,6 @@
#include "../include/circuit_bootstrap.h"
#include "../include/device.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
@@ -49,13 +46,11 @@ protected:
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *lwe_in_ct;
uint64_t *ggsw_out_ct;
uint64_t *plaintexts;
double *d_fourier_bsk_array;
uint64_t *d_pksk_array;
uint64_t *d_lwe_in_ct;
uint64_t *d_ggsw_out_ct;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_ggsw_ct_out_array;
uint64_t *d_lut_vector_indexes;
int8_t *cbs_buffer;
@@ -77,90 +72,34 @@ public:
cbs_base_log = (int)GetParam().cbs_base_log;
cbs_level = (int)GetParam().cbs_level;
number_of_inputs = (int)GetParam().number_of_inputs;
// We generate binary messages
number_of_bits_of_message_including_padding = 2;
delta_log = 60;
delta = (uint64_t)(1) << delta_log;
ggsw_size = cbs_level * (glwe_dimension + 1) * (glwe_dimension + 1) *
polynomial_size;
// Create a Csprng
csprng =
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
uint8_t seed[16] = {(uint8_t)0};
concrete_cpu_construct_concrete_csprng(
csprng, Uint128{.little_endian_bytes = {*seed}});
// Generate the keys
generate_lwe_secret_keys(&lwe_sk_in_array, lwe_dimension, csprng,
REPETITIONS);
generate_lwe_secret_keys(&lwe_sk_out_array,
glwe_dimension * polynomial_size, csprng,
REPETITIONS);
generate_lwe_bootstrap_keys(
stream, gpu_index, &d_fourier_bsk_array, lwe_sk_in_array,
lwe_sk_out_array, lwe_dimension, glwe_dimension, polynomial_size,
pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS);
generate_lwe_private_functional_keyswitch_key_lists(
stream, gpu_index, &d_pksk_array, lwe_sk_out_array, lwe_sk_out_array,
glwe_dimension * polynomial_size, glwe_dimension, polynomial_size,
pksk_level, pksk_base_log, csprng, lwe_modular_variance, REPETITIONS);
plaintexts =
generate_plaintexts(number_of_bits_of_message_including_padding, delta,
number_of_inputs, REPETITIONS, SAMPLES);
d_ggsw_out_ct = (uint64_t *)cuda_malloc_async(
number_of_inputs * ggsw_size * sizeof(uint64_t), stream, gpu_index);
d_lwe_in_ct = (uint64_t *)cuda_malloc_async(
number_of_inputs * (lwe_dimension + 1) * sizeof(uint64_t), stream,
circuit_bootstrap_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_fourier_bsk_array, &d_pksk_array, &plaintexts, &d_lwe_ct_in_array,
&d_ggsw_ct_out_array, &d_lut_vector_indexes, &cbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, pksk_base_log, pksk_level, pbs_base_log,
pbs_level, cbs_level, number_of_bits_of_message_including_padding,
ggsw_size, &delta_log, &delta, number_of_inputs, REPETITIONS, SAMPLES,
gpu_index);
lwe_in_ct = (uint64_t *)malloc(number_of_inputs * (lwe_dimension + 1) *
sizeof(uint64_t));
ggsw_out_ct = (uint64_t *)malloc(ggsw_size * sizeof(uint64_t));
// Execute cbs scratch
scratch_cuda_circuit_bootstrap_64(
stream, gpu_index, &cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, cbs_level, number_of_inputs,
cuda_get_max_shared_memory(gpu_index), true);
// Build LUT vector indexes
uint64_t *h_lut_vector_indexes =
(uint64_t *)malloc(number_of_inputs * cbs_level * sizeof(uint64_t));
for (int index = 0; index < cbs_level * number_of_inputs; index++) {
h_lut_vector_indexes[index] = index % cbs_level;
}
d_lut_vector_indexes = (uint64_t *)cuda_malloc_async(
number_of_inputs * cbs_level * sizeof(uint64_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(d_lut_vector_indexes, h_lut_vector_indexes,
number_of_inputs * cbs_level * sizeof(uint64_t),
stream, gpu_index);
free(h_lut_vector_indexes);
}
void TearDown() {
void *v_stream = (void *)stream;
cuda_synchronize_stream(v_stream);
concrete_cpu_destroy_concrete_csprng(csprng);
free(csprng);
free(lwe_sk_in_array);
free(lwe_sk_out_array);
free(plaintexts);
free(lwe_in_ct);
free(ggsw_out_ct);
cleanup_cuda_circuit_bootstrap(stream, gpu_index, &cbs_buffer);
cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
cuda_drop_async(d_pksk_array, stream, gpu_index);
cuda_drop_async(d_lwe_in_ct, stream, gpu_index);
cuda_drop_async(d_ggsw_out_ct, stream, gpu_index);
cuda_drop_async(d_lut_vector_indexes, stream, gpu_index);
cuda_destroy_stream(stream, gpu_index);
circuit_bootstrap_teardown(
stream, csprng, lwe_sk_in_array, lwe_sk_out_array, d_fourier_bsk_array,
d_pksk_array, plaintexts, d_lwe_ct_in_array, d_lut_vector_indexes,
d_ggsw_ct_out_array, cbs_buffer, gpu_index);
}
};
TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) {
void *v_stream = (void *)stream;
uint64_t *ggsw_ct_out = (uint64_t *)malloc(ggsw_size * sizeof(uint64_t));
for (uint r = 0; r < REPETITIONS; r++) {
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
@@ -169,27 +108,17 @@ TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) {
(glwe_dimension * polynomial_size + 1) *
(glwe_dimension + 1);
uint64_t *d_pksk_list = d_pksk_array + (ptrdiff_t)(pksk_list_size * r);
uint64_t *lwe_in_sk = lwe_sk_in_array + (ptrdiff_t)(lwe_dimension * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (uint s = 0; s < SAMPLES; s++) {
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
concrete_cpu_encrypt_lwe_ciphertext_u64(
lwe_in_sk, lwe_in_ct + i * (lwe_dimension + 1), plaintext,
lwe_dimension, lwe_modular_variance, csprng,
&CONCRETE_CSPRNG_VTABLE);
}
cuda_synchronize_stream(v_stream);
cuda_memcpy_async_to_gpu(d_lwe_in_ct, lwe_in_ct,
number_of_inputs * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute circuit bootstrap
cuda_circuit_bootstrap_64(
stream, gpu_index, (void *)d_ggsw_out_ct, (void *)d_lwe_in_ct,
stream, gpu_index, (void *)d_ggsw_ct_out_array, (void *)d_lwe_ct_in,
(void *)d_fourier_bsk, (void *)d_pksk_list,
(void *)d_lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, pbs_level, pbs_base_log, pksk_level,
@@ -203,9 +132,9 @@ TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) {
(uint64_t *)malloc(polynomial_size * (glwe_dimension + 1) *
cbs_level * sizeof(uint64_t));
// Copy result back
cuda_memcpy_async_to_cpu(ggsw_out_ct, d_ggsw_out_ct + i * ggsw_size,
ggsw_size * sizeof(uint64_t), stream,
gpu_index);
cuda_memcpy_async_to_cpu(
ggsw_ct_out, d_ggsw_ct_out_array + i * ggsw_size,
ggsw_size * sizeof(uint64_t), stream, gpu_index);
cuda_synchronize_stream(v_stream);
uint64_t multiplying_factor = -(plaintext >> delta_log);
@@ -215,7 +144,7 @@ TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) {
(glwe_dimension + 1) +
j * polynomial_size);
uint64_t *glwe_ct_out =
ggsw_out_ct +
ggsw_ct_out +
(ptrdiff_t)((l - 1) * polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) +
j * polynomial_size * (glwe_dimension + 1));
@@ -239,7 +168,7 @@ TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) {
(glwe_dimension + 1) +
glwe_dimension * polynomial_size);
uint64_t *glwe_ct_out =
ggsw_out_ct +
ggsw_ct_out +
(ptrdiff_t)((cbs_level - 1) * polynomial_size *
(glwe_dimension + 1) * (glwe_dimension + 1) +
glwe_dimension * polynomial_size *
@@ -258,6 +187,7 @@ TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) {
}
}
}
free(ggsw_ct_out);
}
// Defines for which parameters set the PBS will be tested.

View File

@@ -1,10 +1,7 @@
#include "../include/device.h"
#include "../include/vertical_packing.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <functional>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdlib.h>
const unsigned REPETITIONS = 5;
@@ -39,12 +36,15 @@ protected:
int gpu_index = 0;
uint64_t *glwe_sk;
uint64_t *d_lut_identity;
int8_t *cmux_tree_buffer = nullptr;
uint64_t *d_ggsw_bit_array;
uint64_t *d_glwe_out;
uint64_t *glwe_out;
public:
// Test arithmetic functions
void SetUp() {
stream = cuda_create_stream(0);
void *v_stream = (void *)stream;
// TestParams
glwe_dimension = (int)GetParam().glwe_dimension;
@@ -59,41 +59,20 @@ public:
// Value of the shift we multiply our messages by
delta = ((uint64_t)(1) << delta_log);
// Create a Csprng
csprng =
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
uint8_t seed[16] = {(uint8_t)0};
concrete_cpu_construct_concrete_csprng(
csprng, Uint128{.little_endian_bytes = {*seed}});
// Generate the keys
generate_glwe_secret_keys(&glwe_sk, glwe_dimension, polynomial_size, csprng,
REPETITIONS);
plaintexts = generate_plaintexts(r_lut, 1, 1, REPETITIONS, SAMPLES);
// Create the LUT
int num_lut = (1 << r_lut);
d_lut_identity = (uint64_t *)cuda_malloc_async(
polynomial_size * num_lut * tau * sizeof(uint64_t), stream, gpu_index);
uint64_t *lut_cmux_tree_identity = generate_identity_lut_cmux_tree(
polynomial_size, num_lut, tau, delta_log);
// Copy all LUTs
cuda_memcpy_async_to_gpu(d_lut_identity, lut_cmux_tree_identity,
polynomial_size * num_lut * tau * sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(v_stream);
free(lut_cmux_tree_identity);
cmux_tree_setup(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts,
&d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out,
glwe_dimension, polynomial_size, base_log, level_count,
glwe_modular_variance, r_lut, tau, delta_log, REPETITIONS,
SAMPLES, gpu_index);
glwe_out = (uint64_t *)malloc(tau * (glwe_dimension + 1) * polynomial_size *
sizeof(uint64_t));
}
void TearDown() {
cuda_synchronize_stream(stream);
concrete_cpu_destroy_concrete_csprng(csprng);
free(plaintexts);
free(csprng);
cuda_drop_async(d_lut_identity, stream, gpu_index);
cuda_destroy_stream(stream, gpu_index);
free(glwe_out);
cmux_tree_teardown(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts,
&d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out,
gpu_index);
}
};
@@ -101,54 +80,30 @@ TEST_P(CMUXTreeTestPrimitives_u64, cmux_tree) {
int ggsw_size = polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
int glwe_size = (glwe_dimension + 1) * polynomial_size;
uint64_t *d_ggsw_bit_array = (uint64_t *)cuda_malloc_async(
r_lut * ggsw_size * sizeof(uint64_t), stream, gpu_index);
uint64_t *d_results = (uint64_t *)cuda_malloc_async(
tau * glwe_size * sizeof(uint64_t), stream, gpu_index);
uint64_t *results = (uint64_t *)malloc(tau * glwe_size * sizeof(uint64_t));
uint64_t *ggsw = (uint64_t *)malloc(ggsw_size * sizeof(uint64_t));
int8_t *cmux_tree_buffer = nullptr;
scratch_cuda_cmux_tree_64(stream, gpu_index, &cmux_tree_buffer,
glwe_dimension, polynomial_size, level_count, r_lut,
tau, cuda_get_max_shared_memory(gpu_index), true);
// Here execute the PBS
for (uint r = 0; r < REPETITIONS; r++) {
for (uint s = 0; s < SAMPLES; s++) {
uint64_t witness = plaintexts[r * SAMPLES + s];
// Instantiate the GGSW m^tree ciphertexts
// We need r GGSW ciphertexts
// Bit decomposition of the value from MSB to LSB
uint64_t *bit_array = bit_decompose_value(witness, r_lut);
uint64_t *d_ggsw_bit_array_slice =
d_ggsw_bit_array +
(ptrdiff_t)((r * SAMPLES * r_lut + s * r_lut) * ggsw_size);
for (int i = 0; i < r_lut; i++) {
uint64_t *d_ggsw_slice = d_ggsw_bit_array + i * ggsw_size;
concrete_cpu_encrypt_ggsw_ciphertext_u64(
glwe_sk, ggsw, bit_array[i], glwe_dimension, polynomial_size,
level_count, base_log, glwe_modular_variance, csprng,
&CONCRETE_CSPRNG_VTABLE);
cuda_memcpy_async_to_gpu(d_ggsw_slice, ggsw,
ggsw_size * sizeof(uint64_t), stream,
gpu_index);
}
cuda_synchronize_stream(stream);
// Execute scratch/CMUX tree/cleanup
cuda_cmux_tree_64(stream, gpu_index, (void *)d_results,
(void *)d_ggsw_bit_array, (void *)d_lut_identity,
// Execute CMUX tree
cuda_cmux_tree_64(stream, gpu_index, (void *)d_glwe_out,
(void *)d_ggsw_bit_array_slice, (void *)d_lut_identity,
cmux_tree_buffer, glwe_dimension, polynomial_size,
base_log, level_count, r_lut, tau,
cuda_get_max_shared_memory(gpu_index));
// Copy result back
cuda_memcpy_async_to_cpu(results, d_results,
cuda_memcpy_async_to_cpu(glwe_out, d_glwe_out,
tau * glwe_size * sizeof(uint64_t), stream,
gpu_index);
cuda_synchronize_stream(stream);
for (int tree = 0; tree < tau; tree++) {
uint64_t *result = results + tree * glwe_size;
uint64_t *result = glwe_out + tree * glwe_size;
uint64_t *decrypted =
(uint64_t *)malloc(polynomial_size * sizeof(uint64_t));
concrete_cpu_decrypt_glwe_ciphertext_u64(
@@ -158,27 +113,15 @@ TEST_P(CMUXTreeTestPrimitives_u64, cmux_tree) {
// Compute the rounding bit
uint64_t rounding = (decrypted[0] & rounding_bit) << 1;
uint64_t decoded = (decrypted[0] + rounding) / delta;
EXPECT_EQ(decoded, (witness + tree) % (1 << (64 - delta_log)));
EXPECT_EQ(decoded, (witness + tree) % (1 << (64 - delta_log)))
<< "Repetition: " << r << ", sample: " << s << ", tree: " << tree;
free(decrypted);
}
free(bit_array);
}
}
cuda_synchronize_stream(stream);
cleanup_cuda_cmux_tree(stream, gpu_index, &cmux_tree_buffer);
free(ggsw);
cuda_drop_async(d_ggsw_bit_array, stream, gpu_index);
}
int glwe_dimension;
int polynomial_size;
double glwe_modular_variance;
int base_log;
int level_count;
int message_modulus;
int carry_modulus;
// Defines for which parameters set the PBS will be tested.
// It executes each test for all pairs on phis X qs (Cartesian product)
::testing::internal::ParamGenerator<CMUXTreeTestParams> cmux_tree_params_u64 =

View File

@@ -1,10 +1,6 @@
#include "../include/device.h"
#include "../include/keyswitch.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cstdint>
#include <functional>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
@@ -42,8 +38,8 @@ protected:
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
uint64_t *d_ksk_array;
uint64_t *d_lwe_out_ct;
uint64_t *d_lwe_in_ct;
uint64_t *d_lwe_ct_out_array;
uint64_t *d_lwe_ct_in_array;
uint64_t *lwe_in_ct;
uint64_t *lwe_out_ct;
@@ -51,7 +47,6 @@ public:
// Test arithmetic functions
void SetUp() {
stream = cuda_create_stream(0);
void *v_stream = (void *)stream;
// TestParams
input_lwe_dimension = (int)GetParam().input_lwe_dimension;
@@ -63,94 +58,42 @@ public:
carry_modulus = (int)GetParam().carry_modulus;
number_of_inputs = (int)GetParam().number_of_inputs;
payload_modulus = message_modulus * carry_modulus;
// Value of the shift we multiply our messages by
delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus);
// Create a Csprng
csprng =
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
uint8_t seed[16] = {(uint8_t)0};
concrete_cpu_construct_concrete_csprng(
csprng, Uint128{.little_endian_bytes = {*seed}});
// Generate the keys
generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng,
REPETITIONS);
generate_lwe_secret_keys(&lwe_sk_out_array, output_lwe_dimension, csprng,
REPETITIONS);
generate_lwe_keyswitch_keys(
stream, gpu_index, &d_ksk_array, lwe_sk_in_array, lwe_sk_out_array,
input_lwe_dimension, output_lwe_dimension, ksk_level, ksk_base_log,
csprng, noise_variance, REPETITIONS);
plaintexts = generate_plaintexts(payload_modulus, delta, number_of_inputs,
REPETITIONS, SAMPLES);
d_lwe_out_ct = (uint64_t *)cuda_malloc_async(
number_of_inputs * (output_lwe_dimension + 1) * sizeof(uint64_t),
stream, gpu_index);
d_lwe_in_ct = (uint64_t *)cuda_malloc_async(
number_of_inputs * (input_lwe_dimension + 1) * sizeof(uint64_t), stream,
gpu_index);
lwe_in_ct = (uint64_t *)malloc(
number_of_inputs * (input_lwe_dimension + 1) * sizeof(uint64_t));
lwe_out_ct = (uint64_t *)malloc(
number_of_inputs * (output_lwe_dimension + 1) * sizeof(uint64_t));
cuda_synchronize_stream(v_stream);
keyswitch_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_ksk_array, &plaintexts, &d_lwe_ct_in_array,
&d_lwe_ct_out_array, input_lwe_dimension,
output_lwe_dimension, noise_variance, ksk_base_log,
ksk_level, message_modulus, carry_modulus, &payload_modulus,
&delta, number_of_inputs, REPETITIONS, SAMPLES, gpu_index);
}
void TearDown() {
void *v_stream = (void *)stream;
cuda_synchronize_stream(v_stream);
concrete_cpu_destroy_concrete_csprng(csprng);
free(csprng);
cuda_drop_async(d_lwe_in_ct, stream, gpu_index);
cuda_drop_async(d_lwe_out_ct, stream, gpu_index);
free(lwe_in_ct);
free(lwe_out_ct);
free(lwe_sk_in_array);
free(lwe_sk_out_array);
free(plaintexts);
cuda_drop_async(d_ksk_array, stream, gpu_index);
cuda_destroy_stream(stream, gpu_index);
keyswitch_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
d_ksk_array, plaintexts, d_lwe_ct_in_array,
d_lwe_ct_out_array, gpu_index);
}
};
TEST_P(KeyswitchTestPrimitives_u64, keyswitch) {
void *v_stream = (void *)stream;
uint64_t *lwe_out_ct = (uint64_t *)malloc(
(output_lwe_dimension + 1) * number_of_inputs * sizeof(uint64_t));
for (uint r = 0; r < REPETITIONS; r++) {
uint64_t *lwe_in_sk =
lwe_sk_in_array + (ptrdiff_t)(r * input_lwe_dimension);
uint64_t *lwe_out_sk =
lwe_sk_out_array + (ptrdiff_t)(r * output_lwe_dimension);
int ksk_size = ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension;
uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
for (uint s = 0; s < SAMPLES; s++) {
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
concrete_cpu_encrypt_lwe_ciphertext_u64(
lwe_in_sk, lwe_in_ct + i * (input_lwe_dimension + 1), plaintext,
input_lwe_dimension, noise_variance, csprng,
&CONCRETE_CSPRNG_VTABLE);
}
cuda_synchronize_stream(v_stream);
cuda_memcpy_async_to_gpu(d_lwe_in_ct, lwe_in_ct,
number_of_inputs * (input_lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(input_lwe_dimension + 1));
// Execute keyswitch
cuda_keyswitch_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_ct,
stream, gpu_index, (void *)d_lwe_ct_out_array, (void *)d_lwe_ct_in,
(void *)d_ksk, input_lwe_dimension, output_lwe_dimension,
ksk_base_log, ksk_level, number_of_inputs);
// Copy result back
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_ct_out_array,
number_of_inputs * (output_lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
@@ -162,11 +105,6 @@ TEST_P(KeyswitchTestPrimitives_u64, keyswitch) {
lwe_out_sk, lwe_out_ct + i * (output_lwe_dimension + 1),
output_lwe_dimension, &decrypted);
EXPECT_NE(decrypted, plaintext);
// let err = (decrypted >= plaintext) ? decrypted - plaintext :
// plaintext
// - decrypted;
// error_sample_vec.push(err);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
@@ -176,6 +114,7 @@ TEST_P(KeyswitchTestPrimitives_u64, keyswitch) {
}
}
}
free(lwe_out_ct);
}
// Defines for which parameters set the PBS will be tested.

View File

@@ -0,0 +1,269 @@
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
// Number of outer iterations of every test. Each repetition r uses its own
// secret key, taken as the r-th slice of lwe_sk_array.
const unsigned REPETITIONS = 5;
// Number of input batches (of number_of_inputs ciphertexts each) evaluated
// per repetition.
const unsigned SAMPLES = 100;
// One parameter set for the linear algebra tests.
// The field order is part of the contract: instances are created with
// positional brace initialization.
struct LinearAlgebraTestParams {
  // LWE dimension; ciphertexts in these tests occupy lwe_dimension + 1 words
  int lwe_dimension;
  // Noise parameter forwarded to the test setup helper
  double noise_variance;
  // message_modulus * carry_modulus gives the payload modulus
  int message_modulus;
  int carry_modulus;
  // Number of ciphertexts processed by a single call of the tested operation
  int number_of_inputs;
};
// Fixture shared by all linear algebra tests (addition, plaintext addition,
// cleartext multiplication, negation) on 64-bit LWE ciphertexts.
//
// SetUp() reads the test parameters, derives the encoding (payload_modulus,
// delta) and delegates key generation, input generation and buffer allocation
// to linear_algebra_setup(). TearDown() hands every resource back to
// linear_algebra_teardown().
class LinearAlgebraTestPrimitives_u64
    : public ::testing::TestWithParam<LinearAlgebraTestParams> {
protected:
  // Test parameters, copied from GetParam()
  int lwe_dimension;
  double noise_variance;
  int message_modulus;
  int carry_modulus;
  int number_of_inputs;
  // Derived encoding parameters
  int payload_modulus;
  uint64_t delta;
  Csprng *csprng;
  cudaStream_t *stream;
  int gpu_index = 0;
  // One secret key of lwe_dimension words per repetition
  uint64_t *lwe_sk_array;
  // Buffers handed to the CUDA entry points (d_ prefix)
  uint64_t *d_lwe_in_1_ct;
  uint64_t *d_lwe_in_2_ct;
  uint64_t *d_plaintext_2;
  uint64_t *d_cleartext;
  uint64_t *d_lwe_out_ct;
  // Host-side buffers
  uint64_t *lwe_in_1_ct;
  uint64_t *lwe_in_2_ct;
  uint64_t *lwe_out_ct;
  uint64_t *plaintexts_1;
  uint64_t *plaintexts_2;
  int num_samples;

public:
  // Prepare keys, inputs and buffers for one parameter set
  void SetUp() override {
    // Create the stream on the same device that every later call targets
    stream = cuda_create_stream(gpu_index);

    // TestParams
    const LinearAlgebraTestParams &params = GetParam();
    lwe_dimension = params.lwe_dimension;
    noise_variance = params.noise_variance;
    message_modulus = params.message_modulus;
    carry_modulus = params.carry_modulus;
    number_of_inputs = params.number_of_inputs;

    payload_modulus = message_modulus * carry_modulus;
    // Value of the shift we multiply our messages by
    // In this test we use a smaller delta to avoid an overflow during
    // multiplication. The square is computed in 64 bits so that it cannot
    // overflow the int range for large moduli.
    delta = ((uint64_t)(1) << 63) /
            ((uint64_t)payload_modulus * (uint64_t)payload_modulus);

    linear_algebra_setup(stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct,
                         &d_lwe_in_2_ct, &d_lwe_out_ct, &lwe_in_1_ct,
                         &lwe_in_2_ct, &lwe_out_ct, &plaintexts_1,
                         &plaintexts_2, &d_plaintext_2, &d_cleartext,
                         lwe_dimension, noise_variance, payload_modulus, delta,
                         number_of_inputs, REPETITIONS, SAMPLES, gpu_index);
  }

  // Release everything that SetUp() obtained
  void TearDown() override {
    linear_algebra_teardown(
        stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct, &d_lwe_in_2_ct,
        &d_lwe_out_ct, &lwe_in_1_ct, &lwe_in_2_ct, &lwe_out_ct, &plaintexts_1,
        &plaintexts_2, &d_plaintext_2, &d_cleartext, gpu_index);
  }
};
// Adds two LWE ciphertext vectors with the CUDA backend and checks that each
// output decrypts to the sum of the two corresponding plaintexts.
TEST_P(LinearAlgebraTestPrimitives_u64, addition) {
  // The bit before the message
  const uint64_t rounding_bit = delta >> 1;

  for (uint r = 0; r < REPETITIONS; r++) {
    // Secret key of this repetition
    uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);

    for (uint s = 0; s < SAMPLES; s++) {
      // Offset, in words, of the first ciphertext of this batch
      const ptrdiff_t ct_offset =
          (ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
                      (lwe_dimension + 1));
      uint64_t *d_lhs = d_lwe_in_1_ct + ct_offset;
      uint64_t *d_rhs = d_lwe_in_2_ct + ct_offset;

      // Execute addition
      cuda_add_lwe_ciphertext_vector_64(stream, gpu_index, (void *)d_lwe_out_ct,
                                        (void *)d_lhs, (void *)d_rhs,
                                        lwe_dimension, number_of_inputs);

      // Copy result back and wait for the copy before reading it
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                               number_of_inputs * (lwe_dimension + 1) *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      cuda_synchronize_stream((void *)stream);

      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t lhs_plaintext = plaintexts_1[r * SAMPLES * number_of_inputs +
                                              s * number_of_inputs + i];
        uint64_t rhs_plaintext = plaintexts_2[r * SAMPLES * number_of_inputs +
                                              s * number_of_inputs + i];

        uint64_t decrypted = 0;
        uint64_t *out_ct = lwe_out_ct + i * (lwe_dimension + 1);
        concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, out_ct, lwe_dimension,
                                                &decrypted);

        // Round to the closest multiple of delta, then drop the encoding
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, (lhs_plaintext + rhs_plaintext) / delta)
            << "Repetition: " << r << ", sample: " << s;
      }
    }
  }
}
// Adds a plaintext vector to an LWE ciphertext vector with the CUDA backend
// and checks that each output decrypts to the sum of the two plaintexts.
TEST_P(LinearAlgebraTestPrimitives_u64, plaintext_addition) {
  for (uint r = 0; r < REPETITIONS; r++) {
    // Secret key of this repetition
    uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
    for (uint s = 0; s < SAMPLES; s++) {
      // Inputs of this (repetition, sample) batch
      uint64_t *d_lwe_1_slice =
          d_lwe_in_1_ct +
          (ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
                      (lwe_dimension + 1));
      uint64_t *d_plaintext_2_in =
          d_plaintext_2 +
          (ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs));
      // Execute addition
      cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_1_slice,
          (void *)d_plaintext_2_in, lwe_dimension, number_of_inputs);
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                               number_of_inputs * (lwe_dimension + 1) *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      // The copy is asynchronous: wait for it before the host reads
      // lwe_out_ct, as the other tests of this fixture do
      cuda_synchronize_stream((void *)stream);
      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
                                            s * number_of_inputs + i];
        uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
                                            s * number_of_inputs + i];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
            &decrypted);
        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, (plaintext_1 + plaintext_2) / delta)
            << "Repetition: " << r << ", sample: " << s << " i: " << i << ") "
            << plaintext_1 / delta << " + " << plaintext_2 / delta;
      }
    }
  }
}
// Multiplies an LWE ciphertext vector by a cleartext vector with the CUDA
// backend and checks that each output decrypts to the product of the two
// decoded values.
TEST_P(LinearAlgebraTestPrimitives_u64, cleartext_multiplication) {
  // The bit before the message
  const uint64_t rounding_bit = delta >> 1;

  for (uint r = 0; r < REPETITIONS; r++) {
    // Secret key of this repetition
    uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);

    for (uint s = 0; s < SAMPLES; s++) {
      // Ciphertext and cleartext inputs of this (repetition, sample) batch
      uint64_t *d_ct_in =
          d_lwe_in_1_ct +
          (ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
                      (lwe_dimension + 1));
      uint64_t *d_scalars =
          d_cleartext +
          (ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs));

      // Execute cleartext multiplication
      cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_ct_in,
          (void *)d_scalars, lwe_dimension, number_of_inputs);

      // Copy result back and wait for the copy before reading it
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                               number_of_inputs * (lwe_dimension + 1) *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      cuda_synchronize_stream((void *)stream);

      for (int i = 0; i < number_of_inputs; i++) {
        // Decoded operands: the stored plaintexts carry the delta encoding
        uint64_t lhs_value = plaintexts_1[r * SAMPLES * number_of_inputs +
                                          s * number_of_inputs + i] /
                             delta;
        uint64_t rhs_value = plaintexts_2[r * SAMPLES * number_of_inputs +
                                          s * number_of_inputs + i] /
                             delta;

        uint64_t decrypted = 0;
        uint64_t *out_ct = lwe_out_ct + i * (lwe_dimension + 1);
        concrete_cpu_decrypt_lwe_ciphertext_u64(lwe_sk, out_ct, lwe_dimension,
                                                &decrypted);

        // Round to the closest multiple of delta, then drop the encoding
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, lhs_value * rhs_value)
            << "Repetition: " << r << ", sample: " << s << " i: " << i
            << ", decrypted: " << decrypted;
      }
    }
  }
}
// Negates an LWE ciphertext vector with the CUDA backend and checks that each
// output decrypts to the (modular) negation of the input plaintext.
TEST_P(LinearAlgebraTestPrimitives_u64, negate) {
  for (uint r = 0; r < REPETITIONS; r++) {
    // Secret key of this repetition
    uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
    for (uint s = 0; s < SAMPLES; s++) {
      // Inputs of this (repetition, sample) batch
      uint64_t *d_lwe_1_slice =
          d_lwe_in_1_ct +
          (ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
                      (lwe_dimension + 1));
      // Execute negate
      cuda_negate_lwe_ciphertext_vector_64(
          stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_1_slice,
          lwe_dimension, number_of_inputs);
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
                               number_of_inputs * (lwe_dimension + 1) *
                                   sizeof(uint64_t),
                               stream, gpu_index);
      // The copy is asynchronous: wait for it before the host reads
      // lwe_out_ct, as the other tests of this fixture do
      cuda_synchronize_stream((void *)stream);
      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext = plaintexts_1[r * SAMPLES * number_of_inputs +
                                          s * number_of_inputs + i];
        uint64_t decrypted = 0;
        concrete_cpu_decrypt_lwe_ciphertext_u64(
            lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
            &decrypted);
        // The bit before the message
        uint64_t rounding_bit = delta >> 1;
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        // Unsigned negation wraps modulo 2^64, matching the ciphertext modulus
        EXPECT_EQ(decoded, -plaintext / delta)
            << "Repetition: " << r << ", sample: " << s << " i: " << i;
      }
    }
  }
}
// Defines the parameter sets for which the linear algebra operations are
// tested. Every test of the fixture is instantiated once per entry below.
::testing::internal::ParamGenerator<LinearAlgebraTestParams>
linear_algebra_params_u64 = ::testing::Values(
// lwe_dimension, noise_variance, message_modulus, carry_modulus,
// number_of_inputs
(LinearAlgebraTestParams){600, 7.52316384526264e-37, 2, 2, 10});
// Builds the name suffix of an instantiated test from its parameter set:
// "n_" followed by the LWE dimension.
std::string
printParamName(::testing::TestParamInfo<LinearAlgebraTestParams> p) {
  return "n_" + std::to_string(p.param.lwe_dimension);
}
// Instantiate every test of the fixture for each entry of
// linear_algebra_params_u64; printParamName supplies the test name suffix.
INSTANTIATE_TEST_CASE_P(LinearAlgebraInstantiation,
LinearAlgebraTestPrimitives_u64,
linear_algebra_params_u64, printParamName);

View File

@@ -1,10 +1,6 @@
#include "../include/bootstrap.h"
#include "../include/device.h"
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <cmath>
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
@@ -47,7 +43,7 @@ protected:
int tau;
int p;
uint64_t delta;
uint32_t cbs_delta_log;
int cbs_delta_log;
int delta_log;
int delta_log_lut;
Csprng *csprng;
@@ -71,7 +67,6 @@ public:
// Test arithmetic functions
void SetUp() {
stream = cuda_create_stream(0);
void *v_stream = (void *)stream;
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
@@ -89,104 +84,30 @@ public:
cbs_level = (int)GetParam().cbs_level;
tau = (int)GetParam().tau;
p = 10 / tau;
delta_log = 64 - p;
delta_log_lut = delta_log;
delta = (uint64_t)(1) << delta_log;
// Create a Csprng
csprng =
(Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE);
uint8_t seed[16] = {(uint8_t)0};
concrete_cpu_construct_concrete_csprng(
csprng, Uint128{.little_endian_bytes = {*seed}});
input_lwe_dimension = glwe_dimension * polynomial_size;
// Generate the keys
generate_lwe_secret_keys(&lwe_sk_in_array, input_lwe_dimension, csprng,
REPETITIONS);
generate_lwe_secret_keys(&lwe_sk_out_array, lwe_dimension, csprng,
REPETITIONS);
generate_lwe_keyswitch_keys(
stream, gpu_index, &d_ksk_array, lwe_sk_in_array, lwe_sk_out_array,
input_lwe_dimension, lwe_dimension, ks_level, ks_base_log, csprng,
lwe_modular_variance, REPETITIONS);
generate_lwe_bootstrap_keys(
stream, gpu_index, &d_fourier_bsk_array, lwe_sk_out_array,
lwe_sk_in_array, lwe_dimension, glwe_dimension, polynomial_size,
pbs_level, pbs_base_log, csprng, glwe_modular_variance, REPETITIONS);
generate_lwe_private_functional_keyswitch_key_lists(
stream, gpu_index, &d_pksk_array, lwe_sk_in_array, lwe_sk_in_array,
input_lwe_dimension, glwe_dimension, polynomial_size, pksk_level,
pksk_base_log, csprng, lwe_modular_variance, REPETITIONS);
plaintexts = generate_plaintexts(p, delta, tau, REPETITIONS, SAMPLES);
// LUT creation
int lut_size = polynomial_size;
int lut_num = tau << (tau * p - (int)log2(polynomial_size)); // r
uint64_t *big_lut =
(uint64_t *)malloc(lut_num * lut_size * sizeof(uint64_t));
for (int t = tau - 1; t >= 0; t--) {
uint64_t *small_lut = big_lut + (ptrdiff_t)(t * (1 << (tau * p)));
for (uint64_t value = 0; value < (uint64_t)(1 << (tau * p)); value++) {
int nbits = t * p;
uint64_t x = (value >> nbits) & (uint64_t)((1 << p) - 1);
small_lut[value] =
((x % (uint64_t)(1 << (64 - delta_log))) << delta_log_lut);
}
}
d_lut_vector = (uint64_t *)cuda_malloc_async(
lut_num * lut_size * sizeof(uint64_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(d_lut_vector, big_lut,
lut_num * lut_size * sizeof(uint64_t), stream,
gpu_index);
// Execute scratch
scratch_cuda_wop_pbs_64(stream, gpu_index, &wop_pbs_buffer,
(uint32_t *)&delta_log, &cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size,
cbs_level, pbs_level, p, p, tau,
cuda_get_max_shared_memory(gpu_index), true);
// Allocate input
d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * tau * sizeof(uint64_t), stream, gpu_index);
// Allocate output
d_lwe_ct_out_array = (uint64_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * tau * sizeof(uint64_t), stream, gpu_index);
lwe_in_ct_array =
(uint64_t *)malloc((input_lwe_dimension + 1) * tau * sizeof(uint64_t));
lwe_out_ct_array =
(uint64_t *)malloc((input_lwe_dimension + 1) * tau * sizeof(uint64_t));
cuda_synchronize_stream(v_stream);
free(big_lut);
wop_pbs_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array, &d_ksk_array,
&d_fourier_bsk_array, &d_pksk_array, &plaintexts, &d_lwe_ct_in_array,
&d_lwe_ct_out_array, &d_lut_vector, &wop_pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, ks_base_log, ks_level, pksk_base_log, pksk_level,
pbs_base_log, pbs_level, cbs_level, p, &delta_log, &cbs_delta_log,
&delta_log_lut, &delta, tau, REPETITIONS, SAMPLES, gpu_index);
}
void TearDown() {
void *v_stream = (void *)stream;
cuda_synchronize_stream(v_stream);
concrete_cpu_destroy_concrete_csprng(csprng);
free(csprng);
free(lwe_sk_in_array);
free(lwe_sk_out_array);
free(plaintexts);
free(lwe_in_ct_array);
free(lwe_out_ct_array);
cleanup_cuda_circuit_bootstrap_vertical_packing(stream, gpu_index,
&wop_pbs_buffer);
cuda_drop_async(d_fourier_bsk_array, stream, gpu_index);
cuda_drop_async(d_ksk_array, stream, gpu_index);
cuda_drop_async(d_pksk_array, stream, gpu_index);
cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index);
cuda_drop_async(d_lut_vector, stream, gpu_index);
cuda_destroy_stream(stream, gpu_index);
wop_pbs_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
d_ksk_array, d_fourier_bsk_array, d_pksk_array, plaintexts,
d_lwe_ct_in_array, d_lut_vector, d_lwe_ct_out_array,
wop_pbs_buffer, gpu_index);
}
};
TEST_P(WopBootstrapTestPrimitives_u64, wop_pbs) {
void *v_stream = (void *)stream;
int input_lwe_dimension = glwe_dimension * polynomial_size;
uint64_t *lwe_out_ct_array =
(uint64_t *)malloc((input_lwe_dimension + 1) * tau * sizeof(uint64_t));
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
int ksk_size =
@@ -196,33 +117,23 @@ TEST_P(WopBootstrapTestPrimitives_u64, wop_pbs) {
(glwe_dimension + 1);
for (uint r = 0; r < REPETITIONS; r++) {
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *lwe_sk_in =
lwe_sk_in_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
uint64_t *d_pksk_list = d_pksk_array + (ptrdiff_t)(pksk_list_size * r);
uint64_t *lwe_sk_in =
lwe_sk_in_array + (ptrdiff_t)(input_lwe_dimension * r);
for (uint s = 0; s < SAMPLES; s++) {
for (int t = 0; t < tau; t++) {
uint64_t plaintext = plaintexts[r * SAMPLES * tau + s * tau + t];
uint64_t *lwe_in_ct =
lwe_in_ct_array + (ptrdiff_t)(t * (input_lwe_dimension + 1));
concrete_cpu_encrypt_lwe_ciphertext_u64(
lwe_sk_in, lwe_in_ct, plaintext, input_lwe_dimension,
lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
}
cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_in_ct_array,
(input_lwe_dimension + 1) * tau *
sizeof(uint64_t),
stream, gpu_index);
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array + (ptrdiff_t)((r * SAMPLES * tau + s * tau) *
(input_lwe_dimension + 1));
// Execute wop pbs
cuda_wop_pbs_64(stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_ct_in_array, (void *)d_lut_vector,
(void *)d_fourier_bsk, (void *)d_ksk, (void *)d_pksk_list,
wop_pbs_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
ks_base_log, ks_level, pksk_base_log, pksk_level,
cbs_base_log, cbs_level, p, p, delta_log, tau,
cuda_get_max_shared_memory(gpu_index));
cuda_wop_pbs_64(
stream, gpu_index, (void *)d_lwe_ct_out_array, (void *)d_lwe_ct_in,
(void *)d_lut_vector, (void *)d_fourier_bsk, (void *)d_ksk,
(void *)d_pksk_list, wop_pbs_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, pbs_base_log, pbs_level, ks_base_log,
ks_level, pksk_base_log, pksk_level, cbs_base_log, cbs_level, p, p,
delta_log, tau, cuda_get_max_shared_memory(gpu_index));
//// Copy result back
cuda_memcpy_async_to_cpu(lwe_out_ct_array, d_lwe_ct_out_array,

View File

@@ -1,12 +1,12 @@
#include "utils.h"
#include "../include/bootstrap.h"
#include "../include/device.h"
#include "concrete-cpu.h"
#include <bootstrap.h>
#include <cmath>
#include <concrete-cpu.h>
#include <cstdint>
#include <cstdlib>
#include <device.h>
#include <functional>
#include <random>
#include <utils.h>
// For each sample and repetition, create a plaintext
// The payload_modulus is the message modulus times the carry modulus